sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/__init__.py +3 -2
- sibi_dst/df_helper/_artifact_updater_async.py +238 -0
- sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
- sibi_dst/df_helper/_df_helper.py +418 -118
- sibi_dst/df_helper/_parquet_artifact.py +275 -283
- sibi_dst/df_helper/_parquet_reader.py +9 -10
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/route_path_builder.py +45 -46
- sibi_dst/utils/__init__.py +2 -0
- sibi_dst/utils/base.py +235 -100
- sibi_dst/utils/business_days.py +248 -0
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +392 -88
- sibi_dst/utils/date_utils.py +711 -393
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_age_checker.py +301 -0
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/periods.py +42 -0
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
sibi_dst/utils/df_utils.py
CHANGED
@@ -1,44 +1,88 @@
+import warnings
+from typing import Union, List, Dict, Tuple, Iterable
+
 import dask.dataframe as dd
 import pandas as pd
 
 from .log_utils import Logger
-
+
 warnings.filterwarnings("ignore", message="Sorting a Dask DataFrame is expensive and may not be efficient")
 
+
 class DfUtils:
-
-
-
+    """
+    Utilities that work with both pandas and Dask DataFrames, with Dask-first behavior.
+    """
 
-
-        logger (Logger, optional): Logger instance for logging information.
-        """
+    def __init__(self, logger=None, *, debug: bool = False):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = debug
+
+    # -------------------------
+    # helpers
+    # -------------------------
+    @staticmethod
+    def _is_dask(obj) -> bool:
+        return isinstance(obj, (dd.DataFrame, dd.Series))
 
     @classmethod
     def compute_to_list(cls, series):
         return series.compute().tolist() if hasattr(series, "compute") else series.tolist()
 
-    def
-
+    def _astype_safe(self, df, col: str, dtype) -> None:
+        """
+        Cast a single column in-place; handles Dask meta generation implicitly by letting Dask infer.
+        """
+        if col not in df.columns:
+            return
+        if self._is_dask(df):
+            df[col] = df[col].astype(dtype)
+        else:
+            df[col] = df[col].astype(dtype)
 
-    def
+    def _df_len_zero(self, df) -> bool:
+        """
+        Dask-safe emptiness check (avoids df.empty with Dask).
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if self._is_dask(df):
+            try:
+                # Much faster than materializing the whole df
+                n = df.map_partitions(len).sum().compute()
+                return int(n) == 0
+            except Exception as e:
+                self.logger.error(f"Error computing Dask length: {e}")
+                return False
+        return df.empty
+
+    # -------------------------
+    # public API
+    # -------------------------
+    def extract_unique_values(self, df, *columns):
+        result: Dict[str, List] = {}
+        for col in columns:
+            if col not in df.columns:
+                result[col] = []
+                continue
+            vals = df[col].dropna()
+            # Prefer drop_duplicates over unique() for Dask robustness
+            if self._is_dask(vals):
+                vals = vals.drop_duplicates().compute()
+            else:
+                vals = vals.drop_duplicates()
+            result[col] = vals.tolist()
+        return result
+
+    def align_and_merge_by_type(self, df_left, df_right, type_mapping: Dict[str, Iterable[Tuple[str, str]]], how='left'):
+        """
+        Align dtypes for pairs of columns then merge on aligned pairs.
+        type_mapping example:
+            {
+                'integer': [('customer_id','temp1'), ('product_type_id','temp2')],
+                'string': [('group2','temp4')],
+                'datetime':[('ts','ts2')],
+                'boolean':[('is_ok','flag')]
+            }
         """
-        # Map string keys to actual dtypes
         dtype_map = {
            'integer': 'int64',
            'float': 'float64',
@@ -47,238 +91,174 @@ class DfUtils:
            'boolean': 'bool',
        }
 
-        #
-        for target_type,
+        # Cast columns as requested
+        for target_type, pairs in (type_mapping or {}).items():
            if target_type not in dtype_map:
                self.logger.error(f"Unsupported type: {target_type}")
-
-                for left_col, right_col in
-
-
-
-                    df_right
-
-
-
-
-
-
-
-
-
-
-
-
-
+                continue
+            for left_col, right_col in pairs:
+                if left_col in df_left.columns:
+                    self._astype_safe(df_left, left_col, dtype_map[target_type])
+                if right_col in df_right.columns:
+                    self._astype_safe(df_right, right_col, dtype_map[target_type])
+
+        all_pairs = [p for pairs in (type_mapping or {}).values() for p in pairs]
+        left_keys = [p[0] for p in all_pairs]
+        right_keys = [p[1] for p in all_pairs]
+
+        # Dask merge works fine if both are Dask; if mixed, coerce right to Dask for scalability.
+        if self._is_dask(df_left) and not self._is_dask(df_right):
+            df_right = dd.from_pandas(df_right, npartitions=max(1, df_left.npartitions))
+        if self._is_dask(df_right) and not self._is_dask(df_left):
+            df_left = dd.from_pandas(df_left, npartitions=max(1, df_right.npartitions))
+
+        return df_left.merge(df_right, how=how, left_on=left_keys, right_on=right_keys)
+
+    def exclude_from_dataframe(self, df, conditions: List[Tuple[str, str, object]]):
        """
-
-
-
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame to filter.
-        - conditions (list of tuples): List of conditions to apply for filtering.
-          Each condition is a tuple: (column_name, operator, value).
-
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Filtered DataFrame.
+        Filter rows out based on combined conditions (AND). Returns df[~combined].
+        conditions: list of (column, operator, value)
+        operators supported: ==, !=, <, <=, >, >=
        """
        import operator
+        ops = {"==": operator.eq, "!=": operator.ne, "<": operator.lt, "<=": operator.le, ">": operator.gt, ">=": operator.ge}
 
-
-        ops = {
-            "==": operator.eq,
-            "!=": operator.ne,
-            "<": operator.lt,
-            "<=": operator.le,
-            ">": operator.gt,
-            ">=": operator.ge,
-        }
-        # Ensure all specified columns exist in the DataFrame
-        missing_columns = [col for col, _, _ in conditions if col not in df.columns]
-        if missing_columns:
-            self.logger.debug(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
+        if not conditions:
            return df
 
-
-
-
+        missing = [c for c, _, _ in conditions if c not in df.columns]
+        if missing:
+            self.logger.debug(f"Missing columns in DataFrame: {', '.join(missing)}")
+            return df
+
+        combined = None
+        for col, op, val in conditions:
            if op not in ops:
                raise ValueError(f"Unsupported operator: {op}")
+            cond = ops[op](df[col], val)
+            combined = cond if combined is None else (combined & cond)
 
-
-
-
-            # Combine the condition with AND (&)
-            combined_condition = condition if combined_condition is None else (combined_condition & condition)
-
-        # Apply the filtering and return the DataFrame
-        return df[~combined_condition]
+        if combined is None:
+            return df
+        return df[~combined]
 
-
-
-
+    # ---- numeric/boolean casting
+    @staticmethod
+    def _transform_column(series, fill_value, dtype):
+        return pd.to_numeric(series, errors="coerce").fillna(fill_value).astype(dtype)
 
-
-
-
-
-
+    def transform_numeric_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0, dtype=int):
+        if not columns:
+            self.logger.warning("No columns specified.")
+            return df
+        columns = [c for c in columns if c in df.columns]
+        for col in columns:
+            if self._is_dask(df):
+                df[col] = df[col].map_partitions(self._transform_column, fill_value, dtype, meta=(col, dtype))
+            else:
+                df[col] = self._transform_column(df[col], fill_value, dtype)
+        return df
 
-
-
-
-        if debug:
-            self.logger.debug(f"Grouping by: {group_by_expr}")
+    # kept for backward-compat
+    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=dtype)
 
-
-        return
+    def transform_boolean_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0):
+        return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)
 
-
+    # ---- duplicate handling
+    def eval_duplicate_removal(self, df, duplicate_expr, sort_field: str | None = None, keep='last', debug=False):
        """
-
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame from which duplicates are to be removed.
-            duplicate_expr (str or list): Column(s) to identify duplicates.
-            sort_field (str, optional): Column to sort by before dropping duplicates.
-            keep (str): Which duplicate to keep ('first' or 'last').
-            debug (bool): If True, logs duplicate rows.
-
-        Returns:
-            DataFrame: DataFrame with duplicates removed.
+        Drop duplicates. For Dask, uses its shuffle-based drop_duplicates.
+        If sort_field is provided, we avoid global sorts for Dask.
        """
        if duplicate_expr is None:
            return df
 
        if debug:
-
-
+            try:
+                dups = df[df.duplicated(subset=duplicate_expr)]
+                # Do not .compute() here; just log that duplicates exist in Dask
+                self.logger.debug(f"Duplicate rows based on {duplicate_expr}: (preview only)")
+                if not self._is_dask(dups):
+                    self.logger.debug(f"\n{dups}")
+            except Exception:
+                pass
 
        if sort_field:
-            if
-                self.logger.warning("Sorting a Dask DataFrame is expensive
-
+            if self._is_dask(df):
+                self.logger.warning("Sorting a Dask DataFrame is expensive; skipping global sort.")
+            else:
+                df = df.sort_values(sort_field)
 
-
-
-            df = df.drop_duplicates(subset=duplicate_expr, keep=keep
+        if self._is_dask(df):
+            # Let Dask handle the global de-dup with a shuffle under the hood
+            df = df.drop_duplicates(subset=duplicate_expr, keep=keep)
        else:
            df = df.drop_duplicates(subset=duplicate_expr, keep=keep)
-
        return df
 
    def load_latest(self, df, duplicate_expr, sort_field=None, debug=False):
-        """
-        Removes duplicates keeping the latest occurrence.
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-            duplicate_expr (str or list): Column(s) to identify duplicates.
-            sort_field (str, optional): Column to sort by before dropping duplicates.
-            debug (bool): If True, logs duplicate rows.
-
-        Returns:
-            DataFrame: DataFrame with latest duplicates removed.
-        """
        return self.eval_duplicate_removal(df, duplicate_expr, sort_field=sort_field, keep='last', debug=debug)
 
    def load_earliest(self, df, duplicate_expr, sort_field=None, debug=False):
-        """
-        Removes duplicates keeping the earliest occurrence.
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-            duplicate_expr (str or list): Column(s) to identify duplicates.
-            sort_field (str, optional): Column to sort by before dropping duplicates.
-            debug (bool): If True, logs duplicate rows.
-
-        Returns:
-            DataFrame: DataFrame with the earliest duplicates removed.
-        """
        return self.eval_duplicate_removal(df, duplicate_expr, sort_field=sort_field, keep='first', debug=debug)
 
-
-    def add_df_totals(df):
+    # ---- totals
+    def add_df_totals(self, df):
        """
-        Adds
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-
-        Returns:
-            DataFrame: DataFrame with total row and column added.
+        Adds totals; for Dask, this computes to pandas (be careful with large frames).
        """
-        if
-
+        if self._is_dask(df):
+            self.logger.warning("add_df_totals will compute to pandas; may be large.")
            col_totals = df.sum(numeric_only=True).compute()
            row_totals = df.sum(axis=1, numeric_only=True).compute()
-
-
-
-
+            pdf = df.compute()
+            pdf.loc['Total'] = col_totals
+            pdf['Total'] = row_totals
+            return pdf
        else:
            df.loc['Total'] = df.sum(numeric_only=True)
            df['Total'] = df.sum(axis=1, numeric_only=True)
-
+            return df
 
+    # ---- summarization / resampling
    def summarise_data(self, df, summary_column, values_column, rule='D', agg_func='count'):
        """
-
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-            summary_column (str or list): Column(s) for summarization.
-            values_column (str or list): Column(s) to aggregate.
-            rule (str): Resampling frequency (e.g., 'D' for daily).
-            agg_func (str or function): Aggregation function.
-
-        Returns:
-            DataFrame: Resampled pivot table.
-        """
-        if isinstance(df, dd.core.DataFrame):
-            # Implement Dask-compatible pivot and resample
-            self.logger.debug("Performing summarization with Dask DataFrame.")
-            # Ensure the index is a datetime for resampling
-            if not isinstance(df.index, (pd.DatetimeIndex, dd.core.DatetimeIndex)):
-                self.logger.warning("Index is not a DatetimeIndex. Converting index to datetime.")
-                df = df.set_index(dd.to_datetime(df.index))
-
-            # Group by index and summary columns
-            df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
-                agg_func).reset_index()
-
-            # Pivot the table
-            df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
-                                              aggfunc='sum').fillna(0)
-
-            # Resample
-            df_pivot.index = dd.to_datetime(df_pivot.index)
-            df_pivot = df_pivot.repartition(freq=rule)
-            df_resampled = df_pivot.map_partitions(lambda df: df.resample(rule).sum())
-
-            return df_resampled.compute()
-        else:
-            df_pivot = df.pivot_table(
-                index=df.index,
-                columns=summary_column,
-                values=values_column,
-                aggfunc=agg_func
-            ).fillna(0)
-            df_resampled = df_pivot.resample(rule).sum()
-            return df_resampled
-
-    @staticmethod
-    def summarize_and_resample_data(df, summary_columns, value_columns, rule='D', agg_func='count'):
-        """
-        Summarizes and resamples data.
-
-        Parameters:
-            df (DataFrame): Pandas or Dask DataFrame.
-            summary_columns (str or list): Column(s) for summarization.
-            value_columns (str or list): Column(s) to aggregate.
-            rule (str): Resampling frequency.
-            agg_func (str or function): Aggregation function.
-
-        Returns:
-            DataFrame: Resampled pivot table.
+        For pandas: pivot+resample on DatetimeIndex.
+        For Dask: create time bins and aggregate in Dask, then (optionally) pivot in pandas.
        """
-
+        # pandas path
+        if not self._is_dask(df):
+            idx = df.index
+            if not isinstance(idx, pd.DatetimeIndex):
+                self.logger.warning("Index is not DatetimeIndex; converting from current index.")
+                df = df.copy()
+                df.index = pd.to_datetime(idx, errors="coerce")
+            pivot = df.pivot_table(index=df.index, columns=summary_column, values=values_column, aggfunc=agg_func).fillna(0)
+            return pivot.resample(rule).sum()
+
+        # Dask path
+        # 1) Build a datetime column from index (no global sort)
+        ddf = df
+        ddf = ddf.assign(_ts_bin=dd.to_datetime(ddf.index, errors="coerce"))
+
+        # 2) Bucket to rule using floor; do it per partition
+        def _floor_partition(pdf: pd.DataFrame, col: str, rule: str) -> pd.DataFrame:
+            out = pdf.copy()
+            out[col] = pd.to_datetime(out[col], errors="coerce")
+            out['_bin'] = out[col].dt.floor(rule)
+            return out
+
+        ddf = ddf.map_partitions(_floor_partition, col="_ts_bin", rule=rule, meta=dd.utils.make_meta(ddf))
+
+        # 3) Group in Dask on ['_bin', summary_column] and aggregate
+        grouped = ddf.groupby(['_bin', summary_column])[values_column].agg(agg_func).reset_index()
+
+        # 4) If you need a pivoted result, compute to pandas then pivot (Dask pivot_table is not generally supported)
+        gpdf = grouped.compute()
+        pivot = gpdf.pivot_table(index="_bin", columns=summary_column, values=values_column, aggfunc='sum').fillna(0)
+
+        # 5) Ensure regular resample (already bucketed; resampling is now cheap in pandas)
+        pivot.index = pd.to_datetime(pivot.index)
+        return pivot.asfreq(rule, fill_value=0)