sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/__init__.py +3 -2
  3. sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  4. sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  5. sibi_dst/df_helper/_df_helper.py +418 -118
  6. sibi_dst/df_helper/_parquet_artifact.py +275 -283
  7. sibi_dst/df_helper/_parquet_reader.py +9 -10
  8. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  9. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  10. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  12. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  13. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  14. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  15. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  16. sibi_dst/utils/__init__.py +2 -0
  17. sibi_dst/utils/base.py +235 -100
  18. sibi_dst/utils/business_days.py +248 -0
  19. sibi_dst/utils/clickhouse_writer.py +472 -206
  20. sibi_dst/utils/data_utils.py +139 -186
  21. sibi_dst/utils/data_wrapper.py +392 -88
  22. sibi_dst/utils/date_utils.py +711 -393
  23. sibi_dst/utils/df_utils.py +193 -213
  24. sibi_dst/utils/file_age_checker.py +301 -0
  25. sibi_dst/utils/file_utils.py +3 -2
  26. sibi_dst/utils/filepath_generator.py +314 -152
  27. sibi_dst/utils/log_utils.py +581 -242
  28. sibi_dst/utils/manifest_manager.py +60 -76
  29. sibi_dst/utils/parquet_saver.py +33 -27
  30. sibi_dst/utils/periods.py +42 -0
  31. sibi_dst/utils/phone_formatter.py +88 -95
  32. sibi_dst/utils/update_planner.py +180 -178
  33. sibi_dst/utils/webdav_client.py +116 -166
  34. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
  35. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
  36. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
  37. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
@@ -1,5 +1,6 @@
+from __future__ import annotations
 
-from typing import Union, List
+from typing import Union, List, Dict, Any, Iterable
 
 import dask.dataframe as dd
 import pandas as pd
@@ -9,240 +10,192 @@ from .log_utils import Logger
 
 class DataUtils:
     """
-    Utility class for data transformation, manipulation, and merging.
-
-    This class provides functionalities for transforming numeric and boolean columns, merging
-    lookup data, checking DataFrame emptiness, and converting columns to datetime format in
-    Pandas or Dask DataFrames. It is designed to handle data preprocessing steps efficiently
-    for both small-scale and large-scale datasets. Logging and debug options are available
-    to trace execution and monitor operations.
-
-    :ivar logger: Logger instance for logging messages.
-    :type logger: logging.Logger
-    :ivar debug: Flag to enable or disable debug mode.
-    :type debug: bool
+    Helpers for transforming columns, safe emptiness checks, datetime coercion,
+    and joining lookup data for Pandas or Dask DataFrames.
     """
-    def __init__(self, logger=None, **kwargs):
+
+    def __init__(self, logger: Logger | None = None, **kwargs: Any) -> None:
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.debug = kwargs.get('debug', False)
+        self.debug: bool = bool(kwargs.get("debug", False))
+
+    # ---------- numeric / boolean transforms ----------
 
     @staticmethod
-    def _transform_column(series, fill_value, dtype):
-        """
-        Helper method to transform a column by converting it to numeric, filling missing values,
-        and casting to the specified dtype.
-
-        :param series: The column to transform.
-        :type series: pd.Series or dd.Series
-        :param fill_value: Value to replace missing or invalid data.
-        :type fill_value: int or float
-        :param dtype: Target data type for the column.
-        :type dtype: type
-        :return: Transformed column.
-        :rtype: pd.Series or dd.Series
-        """
-        return (
-            pd.to_numeric(series, errors="coerce")  # Convert to numeric, invalid to NaN
-            .fillna(fill_value)  # Replace NaN with fill_value
-            .astype(dtype)  # Convert to target dtype
-        )
-
-    def transform_numeric_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0,
-                                  dtype=int):
+    def _transform_column_pandas(series: pd.Series, fill_value: Any, dtype: type) -> pd.Series:
+        return pd.to_numeric(series, errors="coerce").fillna(fill_value).astype(dtype)
+
+    def transform_numeric_columns(
+        self,
+        df: Union[pd.DataFrame, dd.DataFrame],
+        columns: List[str],
+        fill_value: Any = 0,
+        dtype: type = int,
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
         """
-        Transform specified numeric columns in the DataFrame by converting their data types
-        to the specified dtype and replacing missing values with the given fill_value.
-
-        :param df: DataFrame to be transformed.
-        :type df: pd.DataFrame or dd.DataFrame
-        :param columns: List of column names to transform.
-        :type columns: list[str]
-        :param fill_value: Value to replace missing or invalid data. Default is 0.
-        :type fill_value: int or float
-        :param dtype: Target data type for the columns. Default is int.
-        :type dtype: type
-        :return: Transformed DataFrame.
-        :rtype: pd.DataFrame or dd.DataFrame
+        Convert selected columns to numeric → fillna → cast dtype.
+        Works for Pandas and Dask (partition-wise).
         """
         if not columns:
-            self.logger.warning("No columns specified.")
+            self.logger.warning("No columns specified for transform_numeric_columns.")
             return df
 
-        self.logger.debug(f"DataFrame type: {type(df)}")
-        columns = [col for col in columns if col in df.columns]
-
-        for col in columns:
-            df[col] = df[col].map_partitions(
-                self._transform_column, fill_value, dtype, meta=(col, dtype)
-            )
+        cols = [c for c in columns if c in df.columns]
+        if not cols:
+            self.logger.warning("None of the requested columns exist in the DataFrame.")
+            return df
 
-        return df
+        if isinstance(df, pd.DataFrame):
+            for col in cols:
+                df[col] = self._transform_column_pandas(df[col], fill_value, dtype)
+            return df
 
-    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
-        """
-        This function transforms the specified numeric columns in the given dataframe by converting
-        their data types to the specified dtype, with an optional parameter for replacing missing
-        values. It first checks if the provided columns exist in the dataframe, processes each column
-        to replace non-numeric values with NaN, fills NaN values with the given fill_value, and finally
-        converts the column to the specified dtype.
-
-        :param df: DataFrame to be transformed.
-        :type df: dask.dataframe.DataFrame
-        :param columns: List of column names to be transformed.
-        :type columns: list[str]
-        :param fill_value: Value used to replace missing or invalid data. Default is 0.
-        :type fill_value: int or float
-        :param dtype: Target data type for the columns after transformation. Default is int.
-        :type dtype: type
-        :return: Transformed dataframe with the specified numeric columns converted and modified.
-        :rtype: dask.dataframe.DataFrame
-        """
-        if not columns:
-            self.logger.warning('No columns specified')
-        self.logger.debug(f'Dataframe type:{type(df)}')
-        columns = [column for column in columns if column in df.columns]
-        for col in columns:
-            # Replace NaN with 0, then convert to boolean
+        # Dask path
+        for col in cols:
             df[col] = df[col].map_partitions(
-                lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
-                .fillna(fill_value)  # Replace NaN with 0
-                .astype(dtype),
-                meta=(col, dtype)
+                self._transform_column_pandas,
+                fill_value,
+                dtype,
+                meta=(col, dtype),
             )
-
         return df
 
-    def transform_boolean_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0):
-        """
-        Convert specified columns in the DataFrame to boolean, replacing missing values with
-        the given fill_value.
-
-        :param df: DataFrame to be transformed.
-        :type df: pd.DataFrame or dd.DataFrame
-        :param columns: List of column names to transform.
-        :type columns: list[str]
-        :param fill_value: Value to replace missing or invalid data. Default is 0.
-        :type fill_value: int or float
-        :return: Transformed DataFrame.
-        :rtype: pd.DataFrame or dd.DataFrame
-        """
+    def transform_boolean_columns(
+        self,
+        df: Union[pd.DataFrame, dd.DataFrame],
+        columns: List[str],
+        fill_value: Any = 0,
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
+        """Convenience wrapper: cast to boolean via numeric→fillna→astype(bool)."""
         return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)
 
-    def merge_lookup_data(self, classname, df, **kwargs):
-        """
-        Merge lookup data into the DataFrame based on specified columns.
-
-        Parameters:
-        - classname: The class instance to use for loading lookup data.
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - kwargs: Additional keyword arguments for configuration.
+    # ---------- lookup merge ----------
 
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
+    def merge_lookup_data(
+        self,
+        classname,
+        df: Union[pd.DataFrame, dd.DataFrame],
+        **kwargs: Any,
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
         """
-        # Return early if the DataFrame is empty
+        Merge lookup data for ids present in `source_col`.
+
+        Required kwargs:
+        - source_col
+        - lookup_col
+        - lookup_description_col
+        - source_description_alias
+
+        Optional kwargs:
+        - fillna_source_description_alias: bool = False
+        - fieldnames: tuple[str, str] = (lookup_col, lookup_description_col)
+        - column_names: list[str] = ['temp_join_col', source_description_alias]
+        - any other filters passed to `classname.load(...)`
+        """
+        # Early outs for emptiness and required args
         if self.is_dataframe_empty(df):
-            self.logger.debug("merge_lookup_data was given an empty dataFrame")
+            self.logger.debug("merge_lookup_data: input DataFrame empty — nothing to merge.")
             return df
 
-        # Extract and validate required parameters
-        required_params = ['source_col', 'lookup_col', 'lookup_description_col', 'source_description_alias']
-        missing_params = [param for param in required_params if param not in kwargs]
-        if missing_params:
-            raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")
+        required = ["source_col", "lookup_col", "lookup_description_col", "source_description_alias"]
+        missing = [k for k in required if k not in kwargs]
+        if missing:
+            raise ValueError(f"Missing required parameters: {', '.join(missing)}")
 
-        source_col = kwargs.pop('source_col')
-        lookup_col = kwargs.pop('lookup_col')
-        lookup_description_col = kwargs.pop('lookup_description_col')
-        source_description_alias = kwargs.pop('source_description_alias')
+        source_col = kwargs.pop("source_col")
+        lookup_col = kwargs.pop("lookup_col")
+        lookup_description_col = kwargs.pop("lookup_description_col")
+        source_description_alias = kwargs.pop("source_description_alias")
 
-        # Optional parameters with default values
-        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
-        fieldnames = kwargs.pop('fieldnames', (lookup_col, lookup_description_col))
-        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
+        fillna_alias = bool(kwargs.pop("fillna_source_description_alias", False))
+        fieldnames = kwargs.pop("fieldnames", (lookup_col, lookup_description_col))
+        column_names = kwargs.pop("column_names", ["temp_join_col", source_description_alias])
 
         if source_col not in df.columns:
-            self.logger.debug(f"{source_col} not in DataFrame columns")
+            self.logger.debug(f"merge_lookup_data: '{source_col}' not found in frame — skipping merge.")
             return df
 
-        # Get unique IDs from source column
-        ids = df[source_col].dropna().unique()
-        # Compute if it's a Dask Series
-        if isinstance(ids, dd.Series):
-            ids = ids.compute()
+        # Collect ids safely
+        try:
+            ids_series = df[source_col].dropna()
+            if isinstance(df, dd.DataFrame):
+                # Dask: unique() is lazy → compute smallish result
+                ids = ids_series.unique().compute()
+            else:
+                ids = ids_series.unique()
+            ids = sorted(ids.tolist() if not isinstance(ids, list) else ids)
+        except Exception as e:
+            self.logger.error(f"merge_lookup_data: failed extracting ids from '{source_col}': {e}")
+            return df
 
-        # Check if any IDs are found
-        if not len(ids):
-            self.logger.debug(f"No IDs found in the source column: {source_col}")
+        if not ids:
+            self.logger.debug(f"merge_lookup_data: no ids found in '{source_col}'.")
             return df
 
-        # Convert to a list only if necessary and sort
-        if not isinstance(ids, list):
-            ids = ids.tolist()
-        ids = sorted(ids)
-        # Prepare kwargs for loading lookup data
-        load_kwargs = kwargs.copy()
-        load_kwargs.update({
-            'fieldnames': fieldnames,
-            'column_names': column_names,
-            f'{lookup_col}__in': ids
-        })
-        # Load lookup data
+        # Load lookup data (expected to be small after filtering)
+        load_kwargs = {
+            **kwargs,
+            "fieldnames": fieldnames,
+            "column_names": column_names,
+            f"{lookup_col}__in": ids,
+        }
+
         lookup_instance = classname(debug=self.debug, logger=self.logger)
         result = lookup_instance.load(**load_kwargs)
-        if len(result.index) == 0:
-            self.logger.debug(f"No IDs found in the source column: {source_col}")
-            return df
-        # Determine the join column on the result DataFrame
-        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col
 
-        # Merge DataFrames
-        df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
+        # If lookup returns Dask, compute to pandas (broadcastable) or keep small Dask?
+        if isinstance(result, dd.DataFrame):
+            # we expect this to be small after filtering by ids; materialize
+            result = result.compute()
 
-        if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias] = df[source_description_alias].fillna('')
+        if not isinstance(result, pd.DataFrame):
+            raise TypeError("merge_lookup_data: lookup 'load' must return a pandas or dask DataFrame.")
 
-        # Drop temp_join_col if present
-        df = df.drop(columns='temp_join_col', errors='ignore')
+        if result.empty:
+            self.logger.debug("merge_lookup_data: lookup returned 0 rows — nothing to merge.")
+            return df
 
-        return df
+        # Determine join key in the lookup result
+        temp_join_col = "temp_join_col" if "temp_join_col" in column_names else lookup_col
 
-    def is_dataframe_empty(self, df):
-        """
-        Check if a DataFrame (Pandas or Dask) is empty.
+        # Perform merge (Dask can merge with a small pandas right side)
+        merged = df.merge(result, how="left", left_on=source_col, right_on=temp_join_col)
+
+        if fillna_alias and source_description_alias in merged.columns:
+            if isinstance(merged, dd.DataFrame):
+                merged[source_description_alias] = merged[source_description_alias].fillna("")
+            else:
+                merged[source_description_alias] = merged[source_description_alias].fillna("")
 
-        Parameters:
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
+        # Drop helper join column if present
+        merged = merged.drop(columns="temp_join_col", errors="ignore")
+        return merged
 
-        Returns:
-        - bool: True if the DataFrame is empty, False otherwise.
+    # ---------- emptiness & datetime ----------
+
+    def is_dataframe_empty(self, df: Union[pd.DataFrame, dd.DataFrame]) -> bool:
+        """
+        Safe emptiness check. For Dask, uses head(1) to avoid full compute.
         """
         if isinstance(df, dd.DataFrame):
             try:
-                return len(df.index) == 0
+                head = df.head(1, npartitions=-1, compute=True)
+                return head.empty
             except Exception as e:
-                self.logger.error(f"Error while processing Dask DataFrame: {e}")
+                self.logger.error(f"is_dataframe_empty: Dask head() failed: {e}")
                 return False
-        elif isinstance(df, pd.DataFrame):
+        if isinstance(df, pd.DataFrame):
             return df.empty
-        else:
-            self.logger.error("Input must be a pandas or dask DataFrame.")
-            return False
+        self.logger.error("is_dataframe_empty: input must be a pandas or dask DataFrame.")
+        return False
 
     @staticmethod
-    def convert_to_datetime_dask(df, date_fields):
+    def convert_to_datetime_dask(df: dd.DataFrame, date_fields: Iterable[str]) -> dd.DataFrame:
         """
-        Convert specified columns in a Dask DataFrame to datetime, handling errors gracefully.
-
-        Parameters:
-        - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
-        - date_fields (list of str): List of column names to convert to datetime.
-
-        Returns:
-        - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+        Partition-wise datetime coercion with errors='coerce'.
         """
         for col in date_fields:
             if col in df.columns:
-                df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
-        return df
-
+                df[col] = df[col].map_partitions(
+                    pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]")
+                )
+        return df
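
Usage note: a minimal sketch of the reworked DataUtils API, based only on the signatures visible in this diff. The StatusLookup class below is a hypothetical stand-in for whatever loader a caller would pass as classname; merge_lookup_data hands it fieldnames, column_names and an f"{lookup_col}__in" filter via load(...), which this stub simply ignores.

    import pandas as pd
    import dask.dataframe as dd
    from sibi_dst.utils.data_utils import DataUtils

    class StatusLookup:
        # Hypothetical lookup loader: anything exposing load(**kwargs) that
        # returns a pandas or dask DataFrame satisfies merge_lookup_data.
        def __init__(self, debug=False, logger=None):
            self.debug, self.logger = debug, logger

        def load(self, **kwargs):
            # A real loader would honour fieldnames, column_names and the
            # "<lookup_col>__in" filter; this stub returns a fixed lookup table.
            return pd.DataFrame({"temp_join_col": [1, 2],
                                 "status_name": ["open", "closed"]})

    utils = DataUtils(debug=True)
    ddf = dd.from_pandas(
        pd.DataFrame({"status_id": [1, 2, 2], "amount": ["10", "x", None]}),
        npartitions=2,
    )

    # Numeric coercion runs per partition on the Dask path.
    ddf = utils.transform_numeric_columns(ddf, ["amount"], fill_value=0, dtype=int)

    # Ids are collected from source_col, passed to the loader, then left-merged back.
    ddf = utils.merge_lookup_data(
        StatusLookup,
        ddf,
        source_col="status_id",
        lookup_col="id",
        lookup_description_col="name",
        source_description_alias="status_name",
        fillna_source_description_alias=True,
    )

    print(utils.is_dataframe_empty(ddf))   # False
    print(ddf.compute().head())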