sibi-dst 0.3.27__py3-none-any.whl → 0.3.29__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/__init__.py +2 -0
- sibi_dst/df_helper/_df_helper.py +180 -12
- sibi_dst/df_helper/core/_filter_handler.py +16 -0
- sibi_dst/df_helper/data_cleaner.py +132 -0
- sibi_dst/geopy_helper/__init__.py +7 -0
- sibi_dst/geopy_helper/geo_location_service.py +63 -0
- sibi_dst/geopy_helper/utils.py +55 -0
- sibi_dst/osmnx_helper/__init__.py +9 -0
- sibi_dst/osmnx_helper/base_osm_map.py +165 -0
- sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- sibi_dst/osmnx_helper/basemaps/calendar_html.py +122 -0
- sibi_dst/osmnx_helper/basemaps/router_plotter.py +186 -0
- sibi_dst/osmnx_helper/utils.py +267 -0
- sibi_dst/tests/__init__.py +0 -0
- sibi_dst/tests/test_data_wrapper_class.py +78 -0
- {sibi_dst-0.3.27.dist-info → sibi_dst-0.3.29.dist-info}/METADATA +5 -1
- {sibi_dst-0.3.27.dist-info → sibi_dst-0.3.29.dist-info}/RECORD +18 -6
- {sibi_dst-0.3.27.dist-info → sibi_dst-0.3.29.dist-info}/WHEEL +1 -1
sibi_dst/df_helper/__init__.py
CHANGED
```diff
@@ -3,9 +3,11 @@ from __future__ import annotations
 from ._df_helper import DfHelper
 from ._parquet_artifact import ParquetArtifact
 from ._parquet_reader import ParquetReader
+#from .data_cleaner import DataCleaner

 __all__ = [
     'DfHelper',
     'ParquetArtifact',
     'ParquetReader',
+    #'DataCleaner'
 ]
```
sibi_dst/df_helper/_df_helper.py
CHANGED
```diff
@@ -6,6 +6,7 @@ from typing import Any, Dict, TypeVar
 from typing import Union, Optional

 import dask.dataframe as dd
+from dask import delayed, compute
 import pandas as pd
 from pydantic import BaseModel

@@ -29,6 +30,38 @@ warnings.filterwarnings(


 class DfHelper:
+    """
+    DfHelper is a utility class for managing, loading, and processing data from
+    various backends, such as Django databases, Parquet files, HTTP sources, and
+    SQLAlchemy-based databases. The class abstracts the complexities of handling
+    different backends and provides a unified interface for data operations.
+
+    The class is particularly useful for projects that require flexibility in
+    data source configuration and seamless integration with both Dask and Pandas
+    for handling data frames. It includes robust mechanisms for post-processing
+    data, filtering columns, renaming, and setting indices.
+
+    :ivar df: The DataFrame currently being processed or loaded.
+    :type df: Union[dd.DataFrame, pd.DataFrame]
+    :ivar backend_django: Configuration for interacting with Django database backends.
+    :type backend_django: Optional[DjangoConnectionConfig]
+    :ivar _backend_query: Internal configuration for query handling.
+    :type _backend_query: Optional[QueryConfig]
+    :ivar _backend_params: Internal parameters configuration for DataFrame handling.
+    :type _backend_params: Optional[ParamsConfig]
+    :ivar backend_parquet: Configuration for Parquet file handling.
+    :type backend_parquet: Optional[ParquetConfig]
+    :ivar backend_http: Configuration for interacting with HTTP-based backends.
+    :type backend_http: Optional[HttpConfig]
+    :ivar backend_sqlalchemy: Configuration for interacting with SQLAlchemy-based databases.
+    :type backend_sqlalchemy: Optional[SqlAlchemyConnectionConfig]
+    :ivar parquet_filename: The filename for a Parquet file, if applicable.
+    :type parquet_filename: str
+    :ivar logger: Logger instance used for debugging and information logging.
+    :type logger: Logger
+    :ivar default_config: Default configuration dictionary that can be overridden.
+    :type default_config: Dict
+    """
     df: Union[dd.DataFrame, pd.DataFrame] = None
     backend_django: Optional[DjangoConnectionConfig] = None
     _backend_query: Optional[QueryConfig] = None
@@ -60,7 +93,20 @@ class DfHelper:
     def __str__(self):
         return self.__class__.__name__

+    def __call__(self, **options):
+        return self.load(**options)
+
     def __post_init(self, **kwargs):
+        """
+        Initializes backend-specific configurations based on the provided backend type and other
+        parameters. This method performs configuration setup dependent on the selected backend,
+        such as 'django_db', 'parquet', 'http', or 'sqlalchemy'. Configuration for each backend
+        type is fetched or instantiated as necessary using provided parameters or default values.
+
+        :param kwargs: Dictionary of arguments passed during initialization of backend configurations.
+            Additional parameters for specific backend types are extracted here.
+        :return: None
+        """
         self.logger.debug(f"backend used: {self.backend}")
         self._backend_query = self.__get_config(QueryConfig, kwargs)
         self._backend_params = self.__get_config(ParamsConfig, kwargs)
@@ -88,7 +134,35 @@ class DfHelper:
         model_kwargs = {k: kwargs.pop(k) for k in list(kwargs.keys()) if k in recognized_keys}
         return model(**model_kwargs)

+    def load_parallel(self, **options):
+        """
+        Executes the `load` method in parallel using Dask, allowing multiple instances
+        to run concurrently. This function leverages Dask's `delayed` and `compute`
+        methods to schedule and process tasks in parallel. It is designed to handle
+        concurrent workloads efficiently by utilizing up to 4 parallel executions of
+        the `load` function.
+
+        :param options: Keyword arguments to be passed to the `load` method. These options
+            will be applied to all parallel instances of the `load` method.
+        :return: A list of results, where each element represents the output
+            from one of the parallel executions of the `load` method.
+        """
+        # Define tasks using Dask's delayed
+        tasks = [delayed(self.load)(**options) for _ in range(4)]
+        results = compute(*tasks)
+        return results
+
     def load(self, **options):
+        """
+        Loads data from a dataframe backend, ensuring compatibility with multiple
+        data processing backends. Provides the data in a pandas dataframe format
+        if the `as_pandas` attribute is set to True.
+
+        :param options: Arbitrary keyword arguments for dataframe loading customization.
+        :type options: dict
+        :return: The loaded dataframe, computed as a pandas dataframe if
+            `as_pandas` is set to True, or kept in its native backend format otherwise.
+        """
         # this will be the universal method to load data from a df irrespective of the backend
         df = self.__load(**options)
         if self.as_pandas:
```
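Taken together, the new `__call__` and `load_parallel` methods make a configured helper directly callable and let several loads be scheduled through Dask at once. A minimal usage sketch, assuming a helper set up for one of the supported backends; the constructor keywords and the `status` filter below are illustrative, not taken from this diff:

```python
# Hypothetical sketch of the loading API added in 0.3.29.
# `backend`, `as_pandas`, and the `status` filter are assumed/illustrative values.
from sibi_dst.df_helper import DfHelper

helper = DfHelper(backend='parquet', as_pandas=True)  # assumed constructor keywords

df_a = helper.load(status='active')              # the universal load entry point
df_b = helper(status='active')                   # __call__ now simply forwards to load()
results = helper.load_parallel(status='active')  # four delayed load() calls computed together
```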
```diff
@@ -96,7 +170,23 @@ class DfHelper:
         return df

     def __load(self, **options):
-
+        """
+        Private method responsible for loading data using a specified backend. This method
+        abstracts away the details of interacting with the backend and dynamically calls the
+        appropriate function depending on the backend type. It supports multiple backend
+        types, such as `django_db`, `sqlalchemy`, `parquet`, and `http`. If the `http` backend
+        is selected, it checks whether the asyncio event loop is running and either runs the
+        process as a new asyncio task or synchronously.
+
+        :param options: Arbitrary keyword arguments provided for backend-specific configurations.
+            These should align with the requirements of the chosen backend.
+        :type options: dict
+
+        :return: The data loaded from the specified backend. The return type is dependent on
+            the particular backend being used.
+        :rtype: Depending on backend implementation; could be `Task`, `List`, `Dict`, or
+            another format defined by the backend.
+        """
         if self.backend == 'django_db':
             self._backend_params.parse_params(options)
             return self.__load_from_db(**options)
@@ -167,8 +257,13 @@ class DfHelper:

     def __post_process_df(self):
         """
-
-
+        Processes a DataFrame according to the provided parameters defined within the
+        `self._backend_params.df_params` dictionary. This involves filtering columns,
+        renaming columns, setting an index column, and handling datetime indexing.
+        The method modifies the DataFrame in place.
+
+        :raises ValueError: If the lengths of `fieldnames` and `column_names` do not match,
+            or if the specified `index_col` is not found in the DataFrame.
         """
         df_params = self._backend_params.df_params
         fieldnames = df_params.get("fieldnames", None)
@@ -205,6 +300,21 @@ class DfHelper:
         self.logger.debug("Post-processing of DataFrame completed.")

     def __process_loaded_data(self):
+        """
+        Processes the dataframe by applying renaming logic based on the given field map
+        configuration. Inspects the dataframe for missing columns referenced in the field
+        map and flags them with a warning. Applies renaming only for columns that exist
+        in the dataframe while ensuring that no operations take place if the dataframe
+        is empty.
+
+        :param self: The instance of the class where the dataframe is being processed.
+        :type self: object with attributes `df`, `_backend_params`, and `logger`.
+
+        :raises Warning: Logs a warning if specified columns in the `field_map` are not
+            present in the dataframe.
+
+        :return: None
+        """
         self.logger.debug(f"Type of self.df: {type(self.df)}")
         if self.df.map_partitions(len).compute().sum() > 0:
             field_map = self._backend_params.field_map or {}
@@ -239,20 +349,54 @@ class DfHelper:
         self.logger.debug("Save to ClickHouse completed.")

     def __load_from_parquet(self, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-
-
-
-
-
-
-
-
-
+        try:
+            self.df = self.backend_parquet.load_files()
+            if options and self.df is not None:
+                """
+                deprecated specific filter handling to a generic one
+                self.df = ParquetFilterHandler(logger=self.logger).apply_filters_dask(self.df, options)
+
+                """
+                self.df = FilterHandler(backend='dask', logger=self.logger).apply_filters(self.df, filters=options)
+            return self.df
+        except Exception as e:
+            self.logger.debug(f"Failed to load data from parquet: {e}")
+            return dd.from_pandas(pd.DataFrame(), npartitions=1)

     def load_period(self, **kwargs):
         return self.__load_period(**kwargs)

     def __load_period(self, **kwargs):
+        """
+        Validates and processes the temporal filtering parameters `start` and `end` for querying,
+        ensuring correctness and compatibility with a specified backend (Django or SQLAlchemy).
+        This method dynamically maps and validates the provided datetime or date field from the
+        model according to the configured backend, and applies the appropriate filters to query objects.
+
+        This function enforces that both `start` and `end` are provided and checks if the start date
+        is earlier or the same as the end date. It supports parsing string representations of dates
+        and validates them against the date or datetime fields associated with the chosen backend.
+        If the backend or field is incompatible or missing, appropriate errors are raised.
+
+        The resulting filter conditions are integrated into `kwargs` for querying with the
+        appropriate backend model.
+
+        :param kwargs: Keyword arguments, including temporal filtering parameters and optionally a
+            datetime or date field name. Supported parameters include:
+            - **dt_field**: The name of the date or datetime field to use in filtering. Defaults
+              to an internally set field if not explicitly provided.
+            - **start**: The starting date or datetime for the query range. Can be a `str` or
+              `datetime.date/datetime.datetime` object.
+            - **end**: The ending date or datetime for the query range. Can be a `str` or
+              `datetime.date/datetime.datetime` object.
+
+        :return: Queryset or result of the load function with the applied temporal filters.
+        :rtype: Any
+
+        :raises ValueError: If the `dt_field` is not provided, if `start` or `end`
+            are missing, if the `start` date is later than `end`, or if the `dt_field`
+            does not exist in the backend model or its metadata.
+        """
         dt_field = kwargs.pop("dt_field", self.dt_field)
         if dt_field is None:
             raise ValueError("dt_field must be provided")
```
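Per the new docstring, `load_period` validates a `start`/`end` window against a date or datetime field and folds the resulting filters into the query. A hedged sketch, continuing the helper from the sketch above; the field name and dates are illustrative:

```python
# Illustrative values only; `created_at` is a hypothetical field name.
df = helper.load_period(
    dt_field='created_at',
    start='2024-01-01',
    end='2024-01-31 23:59:59',
)
```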
```diff
@@ -316,6 +460,30 @@ class DfHelper:

     @staticmethod
     def parse_date(date_str: str) -> Union[datetime.datetime, datetime.date]:
+        """
+        Parses a date string and converts it to a `datetime.datetime` or
+        `datetime.date` object.
+
+        This method attempts to parse the given string in two distinct formats:
+        1. First, it tries to interpret the string as a datetime with the format
+           ``%Y-%m-%d %H:%M:%S``. If successful, it returns a `datetime.datetime`
+           object.
+        2. If the first format parsing fails, it attempts to parse the string as
+           a date with the format ``%Y-%m-%d``. If successful, it returns a
+           `datetime.date` object.
+
+        If the string cannot be parsed in either of these formats, the method will
+        raise a `ValueError`.
+
+        :param date_str: The date string to be parsed. Expected to match one of the
+            formats: ``%Y-%m-%d %H:%M:%S`` or ``%Y-%m-%d``.
+        :type date_str: str
+        :return: A `datetime.datetime` object if the string matches the first format,
+            or a `datetime.date` object if the string matches the second format.
+        :rtype: Union[datetime.datetime, datetime.date]
+        :raises ValueError: Raised if neither date format can be successfully parsed
+            from the provided string.
+        """
         try:
             return datetime.datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S')
         except ValueError:
```
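`parse_date` only understands the two formats spelled out in the docstring, for example:

```python
DfHelper.parse_date('2024-01-31 23:59:59')  # datetime.datetime(2024, 1, 31, 23, 59, 59)
DfHelper.parse_date('2024-01-31')           # datetime.date(2024, 1, 31), per the second format
DfHelper.parse_date('31/01/2024')           # raises ValueError
```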
sibi_dst/df_helper/core/_filter_handler.py
CHANGED
```diff
@@ -9,6 +9,22 @@ from sibi_dst.utils import Logger


 class FilterHandler:
+    """
+    Handles the application of filters to data sources with support for SQLAlchemy and Dask backends.
+
+    The FilterHandler class abstracts the process of applying filters to various backends, specifically
+    SQLAlchemy queries and Dask DataFrames. It supports multiple filtering operations, including
+    exact matches, comparisons, and string-related operations such as contains and regex. The handler
+    automatically determines and applies backend-specific processing, enabling seamless integration with
+    different data models or backends.
+
+    :ivar backend: The backend in use ('sqlalchemy' or 'dask').
+    :type backend: str
+    :ivar logger: An optional logger instance for debugging and logging purposes.
+    :type logger: Logger
+    :ivar backend_methods: A dictionary mapping backend-specific methods for column retrieval and operation application.
+    :type backend_methods: dict
+    """
     def __init__(self, backend, logger=None):
         """
         Initialize the FilterHandler.
```
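A hedged sketch of the Dask-side usage described by the new docstring. The import path and the Django-style `column__operation` filter keys are assumptions based on the package's Django-oriented filtering elsewhere, and the columns are invented for illustration; the `apply_filters(df, filters=...)` call mirrors how `DfHelper.__load_from_parquet` uses the handler:

```python
import pandas as pd
import dask.dataframe as dd
from sibi_dst.df_helper.core import FilterHandler  # import path assumed

pdf = pd.DataFrame({'status': ['new', 'paid', 'paid'], 'amount': [5, 50, 500]})
ddf = dd.from_pandas(pdf, npartitions=1)

handler = FilterHandler(backend='dask')
# Assumed Django-style lookups: an exact match plus a >= comparison.
filtered = handler.apply_filters(ddf, filters={'status': 'paid', 'amount__gte': 100})
print(filtered.compute())
```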
sibi_dst/df_helper/data_cleaner.py
ADDED
```python
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import dask.dataframe as dd
from dask_ml.preprocessing import OneHotEncoder, LabelEncoder
import nltk

class DataCleaner:
    def __init__(self, dataframe):
        self.original_df = dataframe
        self.df = dataframe.copy()
        self.duplicates_df = None

    def handle_missing_values(self, strategy='mean'):
        if strategy == 'mean':
            self.df = self.df.fillna(self.df.mean())
        elif strategy == 'median':
            self.df = self.df.fillna(self.df.median())
        elif strategy == 'mode':
            self.df = self.df.fillna(self.df.mode().iloc[0])
        elif strategy == 'drop':
            self.df = self.df.dropna()
        return self

    def identify_duplicates(self, subset=None):
        self.duplicates_df = self.df.map_partitions(lambda df: df[df.duplicated(subset=subset, keep=False)])
        return self.duplicates_df

    def remove_duplicates(self):
        if self.duplicates_df is not None:
            self.df = self.df[~self.df.index.isin(self.duplicates_df.index)]
        return self

    def validate_date_fields(self, date_columns=None):
        if date_columns is None:
            date_columns = self.df.select_dtypes(include=['datetime', 'datetime64[ns]', 'datetime64[ns, UTC]']).columns
        for col in date_columns:
            print('Validating date field: ', col)
            self.df[col] = dd.to_datetime(self.df[col], errors='coerce')
        return self

    def clean_text(self, text_columns=None, language='english'):
        nltk.download('stopwords')
        stop_words = set(stopwords.words(language))
        stemmer = SnowballStemmer(language)

        def clean_text(text):
            if isinstance(text, str):
                text = text.strip().lower()  # Remove leading/trailing whitespace and convert to lowercase
                text = re.sub(r'[^\w\s]', '', text)  # Remove special characters and punctuation
                words = text.split()
                words = [word for word in words if word not in stop_words]  # Remove stop words
                words = [stemmer.stem(word) for word in words]  # Apply stemming
                return ' '.join(words)
            return text

        if text_columns is None:
            text_columns = self.df.select_dtypes(include=['object', 'string']).columns
            text_columns = [col for col in text_columns if self.df[col].dtype != 'bool']

        for col in text_columns:
            print('Cleaning text field: ', col)
            self.df[col] = self.df[col].map(clean_text, meta=('cleaned_text', 'object'))
        return self

    def validate_numeric_fields(self, int_columns=None, float_columns=None):
        if int_columns is None:
            int_columns = self.df.select_dtypes(include=['int64', 'int32']).columns
        if float_columns is None:
            float_columns = self.df.select_dtypes(include=['float64', 'float32']).columns

        for col in int_columns:
            print('Validating integer field: ', col)
            self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='integer')

        for col in float_columns:
            print('Validating float field: ', col)
            self.df[col] = dd.to_numeric(self.df[col], errors='coerce', downcast='float')

        return self

    def detect_categorical_columns(self, threshold=0.05):
        """
        Detect columns that can be converted to 'category' dtype.

        Parameters:
        threshold (float): The maximum ratio of unique values to total values for a column to be considered categorical.

        Returns:
        List of column names that can be converted to 'category' dtype.
        """
        categorical_columns = []

        def unique_ratio(partition, col):
            return partition[col].nunique() / len(partition)

        for col in self.df.columns:
            print("Detecting categorical columns: ", col)
            unique_ratios = self.df.map_partitions(unique_ratio, col=col).compute()
            overall_unique_ratio = unique_ratios.sum() / len(self.df)
            if overall_unique_ratio < threshold:
                print(f'Column {col} is categorical')
                categorical_columns.append(col)

        return categorical_columns

    def handle_categorical_variables(self, columns=None, method='onehot', threshold=0.05):
        if columns is None:
            columns = self.detect_categorical_columns(threshold)

        if method == 'onehot':
            for col in columns:
                self.df[col] = self.df[col].astype('category')
            encoder = OneHotEncoder(sparse_output=False)
            self.df = encoder.fit_transform(self.df)
        elif method == 'label':
            encoder = LabelEncoder()
            for col in columns:
                self.df[col] = encoder.fit_transform(self.df[col])
        return self

    def analyze_dtypes(self):
        return self.df.dtypes

    def get_cleaned_dataframe(self):
        return self.df

    def get_original_dataframe(self):
        return self.original_df

    def get_duplicates_dataframe(self):
        return self.duplicates_df
```
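Each mutating method on the new `DataCleaner` returns `self`, so calls can be chained on a Dask DataFrame. A minimal sketch with invented column names; note that the package-level re-export in `df_helper/__init__.py` is still commented out, so the module is imported directly:

```python
import pandas as pd
import dask.dataframe as dd
from sibi_dst.df_helper.data_cleaner import DataCleaner  # direct module import

pdf = pd.DataFrame({'name': ['  Foo  ', 'Bar', None], 'qty': [1.0, None, 3.0]})
ddf = dd.from_pandas(pdf, npartitions=1)

cleaner = DataCleaner(ddf).handle_missing_values(strategy='drop')
print(cleaner.analyze_dtypes())
print(cleaner.get_cleaned_dataframe().compute())
```

`clean_text` additionally calls `nltk.download('stopwords')` at runtime, so the first use needs network access or a pre-fetched NLTK corpus.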
sibi_dst/geopy_helper/geo_location_service.py
ADDED
```python
import os
from urllib.parse import urlparse
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut, GeocoderServiceError

app_nominatim_url = os.environ.get('NOMINATIM_URL', None)
app_geo_locator_test_place = os.environ.get('GEO_LOCATOR_TEST_PLACE', "San Jose, Costa Rica")


class GeolocationService:
    debug: bool = False

    def __init__(self, debug=False):
        self.geolocator = None
        self._initialize_geolocator()
        self.debug = debug

    def _initialize_geolocator(self):
        nominatim_url = app_nominatim_url
        if not nominatim_url:
            if self.debug:
                print("Nominatim URL not provided in environment variables.")
            return

        try:
            parsed_url = urlparse(nominatim_url)
            nominatim_url = parsed_url.netloc
            self.geolocator = Nominatim(user_agent="ibis", scheme="http", domain=nominatim_url)

            # Test the geolocator to ensure it is available
            location = self.geolocator.geocode(app_geo_locator_test_place)
            if location:
                if self.debug:
                    print("Geolocator is available.")
            else:
                if self.debug:
                    print("Geolocator service is not responding correctly.")
                self.geolocator = None
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Error initializing geolocator: {e}")
            self.geolocator = None

    def geocode(self, address):
        if not self.geolocator:
            if self.debug:
                print("Geolocator is not available.")
            return None
        try:
            return self.geolocator.geocode(address)
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Error during geocoding: {e}")
            return None

    def reverse(self, coordinates, exactly_one=True):
        if not self.geolocator:
            if self.debug:
                print("Geolocator is not available.")
            return None
        try:
            return self.geolocator.reverse(coordinates, exactly_one=exactly_one)
        except (GeocoderTimedOut, GeocoderServiceError) as e:
            print(f"Error during reverse geocoding: {e}")
            return None
```
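`GeolocationService` reads `NOMINATIM_URL` when the module is imported, so the variable must be set beforehand. A short sketch, assuming a reachable Nominatim instance; the host below is a placeholder:

```python
import os

# NOMINATIM_URL is read at module import time; the host below is a placeholder.
os.environ['NOMINATIM_URL'] = 'http://nominatim.example.local:8080'

from sibi_dst.geopy_helper.geo_location_service import GeolocationService

service = GeolocationService(debug=True)
location = service.geocode('San Jose, Costa Rica')
if location:
    print(location.latitude, location.longitude)
```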
sibi_dst/geopy_helper/utils.py
ADDED
```python
from .geo_location_service import GeolocationService, GeocoderTimedOut, GeocoderServiceError

# Initialize the geolocator once
geolocator = None


def get_geolocator():
    global geolocator
    if geolocator is None:
        geolocator = GeolocationService(debug=True)
    return geolocator


#geolocator = GeolocationService(debug=True)


def get_address_by_coordinates(latitude, longitude, exactly_one=True):
    geolocator = get_geolocator()
    try:
        location = geolocator.reverse((latitude, longitude), exactly_one=exactly_one)
        if not location:
            return "No address found for this location."
        address = location.address
        return address
    except GeocoderTimedOut:
        return "GeocoderTimedOut: Failed to reach the server."


def get_coordinates_for_address(address):
    """
    Geocode an address using a custom Nominatim server.

    :param address: The address to geocode.
    :return: A dictionary with the location's latitude, longitude, and full address, or a message if an error occurs.
    """
    geolocator = get_geolocator()
    try:
        location = geolocator.geocode(address)

        # Check if location was found
        if location:
            return {
                "Address": location.address,
                "Latitude": location.latitude,
                "Longitude": location.longitude
            }
        else:
            return "Location not found."

    except GeocoderTimedOut:
        return "GeocoderTimedOut: Request timed out."
    except GeocoderServiceError as e:
        return f"GeocoderServiceError: {str(e)}"
    except Exception as e:
        return f"Error: {str(e)}"
```