sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
  3. sibi_dst/df_helper/_df_helper.py +417 -117
  4. sibi_dst/df_helper/_parquet_artifact.py +255 -283
  5. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  6. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  7. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  8. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  9. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  10. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  12. sibi_dst/osmnx_helper/__init__.py +1 -0
  13. sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +203 -0
  14. sibi_dst/osmnx_helper/route_path_builder.py +97 -0
  15. sibi_dst/osmnx_helper/utils.py +2 -0
  16. sibi_dst/utils/base.py +302 -96
  17. sibi_dst/utils/clickhouse_writer.py +472 -206
  18. sibi_dst/utils/data_utils.py +139 -186
  19. sibi_dst/utils/data_wrapper.py +317 -73
  20. sibi_dst/utils/date_utils.py +1 -0
  21. sibi_dst/utils/df_utils.py +193 -213
  22. sibi_dst/utils/file_utils.py +3 -2
  23. sibi_dst/utils/filepath_generator.py +314 -152
  24. sibi_dst/utils/log_utils.py +581 -242
  25. sibi_dst/utils/manifest_manager.py +60 -76
  26. sibi_dst/utils/parquet_saver.py +33 -27
  27. sibi_dst/utils/phone_formatter.py +88 -95
  28. sibi_dst/utils/update_planner.py +180 -178
  29. sibi_dst/utils/webdav_client.py +116 -166
  30. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
  31. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +32 -28
  32. {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/utils/df_utils.py
@@ -1,44 +1,88 @@
+import warnings
+from typing import Union, List, Dict, Tuple, Iterable
+
 import dask.dataframe as dd
 import pandas as pd
 
 from .log_utils import Logger
-import warnings
+
 warnings.filterwarnings("ignore", message="Sorting a Dask DataFrame is expensive and may not be efficient")
 
+
 class DfUtils:
-    def __init__(self, logger=None):
-        """
-        Utility class for DataFrame operations compatible with both pandas and Dask DataFrames.
+    """
+    Utilities that work with both pandas and Dask DataFrames, with Dask-first behavior.
+    """
 
-        Parameters:
-        logger (Logger, optional): Logger instance for logging information.
-        """
+    def __init__(self, logger=None, *, debug: bool = False):
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+        self.debug = debug
+
+    # -------------------------
+    # helpers
+    # -------------------------
+    @staticmethod
+    def _is_dask(obj) -> bool:
+        return isinstance(obj, (dd.DataFrame, dd.Series))
 
     @classmethod
     def compute_to_list(cls, series):
         return series.compute().tolist() if hasattr(series, "compute") else series.tolist()
 
-    def extract_unique_values(self, df, *columns):
-        return {col: self.compute_to_list(df[col].dropna().unique()) for col in columns}
+    def _astype_safe(self, df, col: str, dtype) -> None:
+        """
+        Cast a single column in-place; handles Dask meta generation implicitly by letting Dask infer.
+        """
+        if col not in df.columns:
+            return
+        if self._is_dask(df):
+            df[col] = df[col].astype(dtype)
+        else:
+            df[col] = df[col].astype(dtype)
 
-    def align_and_merge_by_type(self, df_left, df_right, type_mapping, how='left'):
+    def _df_len_zero(self, df) -> bool:
+        """
+        Dask-safe emptiness check (avoids df.empty with Dask).
         """
-        Align column data types in two DataFrames based on a type mapping dictionary and perform the merge.
-
-        Parameters:
-        - df_left (pd.DataFrame or dd.DataFrame): Left DataFrame
-        - df_right (pd.DataFrame or dd.DataFrame): Right DataFrame
-        - type_mapping (dict): Dictionary mapping target dtypes to column pairs.
-          Example: {
-              'integer': [('customer_id', 'temp1'), ('product_type_id', 'temp2')],
-              'string': [('group2', 'temp4')]
-          }
-
-        Returns:
-        - Merged DataFrame
+        if self._is_dask(df):
+            try:
+                # Much faster than materializing the whole df
+                n = df.map_partitions(len).sum().compute()
+                return int(n) == 0
+            except Exception as e:
+                self.logger.error(f"Error computing Dask length: {e}")
+                return False
+        return df.empty
+
+    # -------------------------
+    # public API
+    # -------------------------
+    def extract_unique_values(self, df, *columns):
+        result: Dict[str, List] = {}
+        for col in columns:
+            if col not in df.columns:
+                result[col] = []
+                continue
+            vals = df[col].dropna()
+            # Prefer drop_duplicates over unique() for Dask robustness
+            if self._is_dask(vals):
+                vals = vals.drop_duplicates().compute()
+            else:
+                vals = vals.drop_duplicates()
+            result[col] = vals.tolist()
+        return result
+
+    def align_and_merge_by_type(self, df_left, df_right, type_mapping: Dict[str, Iterable[Tuple[str, str]]], how='left'):
+        """
+        Align dtypes for pairs of columns then merge on aligned pairs.
+        type_mapping example:
+            {
+                'integer': [('customer_id','temp1'), ('product_type_id','temp2')],
+                'string': [('group2','temp4')],
+                'datetime':[('ts','ts2')],
+                'boolean':[('is_ok','flag')]
+            }
         """
-        # Map string keys to actual dtypes
         dtype_map = {
            'integer': 'int64',
            'float': 'float64',
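
Aside: the _df_len_zero helper added in the hunk above avoids df.empty on Dask frames and counts rows per partition instead. A minimal, self-contained sketch of that pattern follows; the sample frame and column name are hypothetical and only illustrate the idiom.

import pandas as pd
import dask.dataframe as dd

# Hypothetical two-partition frame; only the row-count idiom matters here.
ddf = dd.from_pandas(pd.DataFrame({"customer_id": [1, 2, 3]}), npartitions=2)

# Sum of per-partition lengths, computed once; zero means the frame is empty.
n_rows = ddf.map_partitions(len).sum().compute()
print(int(n_rows) == 0)  # False for this sample
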
@@ -47,238 +91,174 @@ class DfUtils:
             'boolean': 'bool',
         }
 
-        # Iterate over each dtype and align the column pairs
-        for target_type, column_pairs in type_mapping.items():
+        # Cast columns as requested
+        for target_type, pairs in (type_mapping or {}).items():
             if target_type not in dtype_map:
                 self.logger.error(f"Unsupported type: {target_type}")
-
-            for left_col, right_col in column_pairs:
-                # Align dtypes in left and right DataFrames
-                if left_col in df_left.columns and right_col in df_right.columns:
-                    df_left[left_col] = df_left[left_col].astype(dtype_map[target_type])
-                    df_right[right_col] = df_right[right_col].astype(dtype_map[target_type])
-
-        # Flatten all column pairs for the merge operation
-        all_pairs = [pair for pairs in type_mapping.values() for pair in pairs]
-
-        # Perform the merge
-        return df_left.merge(
-            df_right,
-            how=how,
-            left_on=[pair[0] for pair in all_pairs],
-            right_on=[pair[1] for pair in all_pairs]
-        )
-
-    def exclude_from_dataframe(self, df, conditions):
+                continue
+            for left_col, right_col in pairs:
+                if left_col in df_left.columns:
+                    self._astype_safe(df_left, left_col, dtype_map[target_type])
+                if right_col in df_right.columns:
+                    self._astype_safe(df_right, right_col, dtype_map[target_type])
+
+        all_pairs = [p for pairs in (type_mapping or {}).values() for p in pairs]
+        left_keys = [p[0] for p in all_pairs]
+        right_keys = [p[1] for p in all_pairs]
+
+        # Dask merge works fine if both are Dask; if mixed, coerce right to Dask for scalability.
+        if self._is_dask(df_left) and not self._is_dask(df_right):
+            df_right = dd.from_pandas(df_right, npartitions=max(1, df_left.npartitions))
+        if self._is_dask(df_right) and not self._is_dask(df_left):
+            df_left = dd.from_pandas(df_left, npartitions=max(1, df_right.npartitions))
+
+        return df_left.merge(df_right, how=how, left_on=left_keys, right_on=right_keys)
+
+    def exclude_from_dataframe(self, df, conditions: List[Tuple[str, str, object]]):
         """
-        Generic function to filter rows from a DataFrame (Pandas or Dask).
-
-        Parameters:
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame to filter.
-        - conditions (list of tuples): List of conditions to apply for filtering.
-          Each condition is a tuple: (column_name, operator, value).
-
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Filtered DataFrame.
+        Filter rows out based on combined conditions (AND). Returns df[~combined].
+        conditions: list of (column, operator, value)
+        operators supported: ==, !=, <, <=, >, >=
         """
         import operator
+        ops = {"==": operator.eq, "!=": operator.ne, "<": operator.lt, "<=": operator.le, ">": operator.gt, ">=": operator.ge}
 
-        # Mapping string operators to actual Python operators
-        ops = {
-            "==": operator.eq,
-            "!=": operator.ne,
-            "<": operator.lt,
-            "<=": operator.le,
-            ">": operator.gt,
-            ">=": operator.ge,
-        }
-        # Ensure all specified columns exist in the DataFrame
-        missing_columns = [col for col, _, _ in conditions if col not in df.columns]
-        if missing_columns:
-            self.logger.debug(f"The following columns are missing in the DataFrame: {', '.join(missing_columns)}")
+        if not conditions:
             return df
 
-        # Build the combined filtering condition
-        combined_condition = None
-        for col, op, value in conditions:
+        missing = [c for c, _, _ in conditions if c not in df.columns]
+        if missing:
+            self.logger.debug(f"Missing columns in DataFrame: {', '.join(missing)}")
+            return df
+
+        combined = None
+        for col, op, val in conditions:
             if op not in ops:
                 raise ValueError(f"Unsupported operator: {op}")
+            cond = ops[op](df[col], val)
+            combined = cond if combined is None else (combined & cond)
 
-            # Get the individual condition
-            condition = ops[op](df[col], value)
-
-            # Combine the condition with AND (&)
-            combined_condition = condition if combined_condition is None else (combined_condition & condition)
-
-        # Apply the filtering and return the DataFrame
-        return df[~combined_condition]
+        if combined is None:
+            return df
+        return df[~combined]
 
-    def load_grouped_activity(self, df, group_by_expr, group_expr='count', debug=False):
-        """
-        Groups the DataFrame by the specified expression and computes the size.
+    # ---- numeric/boolean casting
+    @staticmethod
+    def _transform_column(series, fill_value, dtype):
+        return pd.to_numeric(series, errors="coerce").fillna(fill_value).astype(dtype)
 
-        Parameters:
-        df (DataFrame): Pandas or Dask DataFrame to be grouped.
-        group_by_expr (str or list): Column(s) to group by.
-        group_expr (str): Name of the size/count column.
-        debug (bool): If True, logs grouping information.
+    def transform_numeric_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0, dtype=int):
+        if not columns:
+            self.logger.warning("No columns specified.")
+            return df
+        columns = [c for c in columns if c in df.columns]
+        for col in columns:
+            if self._is_dask(df):
+                df[col] = df[col].map_partitions(self._transform_column, fill_value, dtype, meta=(col, dtype))
+            else:
+                df[col] = self._transform_column(df[col], fill_value, dtype)
+        return df
 
-        Returns:
-        DataFrame: Grouped DataFrame with counts.
-        """
-        if debug:
-            self.logger.debug(f"Grouping by: {group_by_expr}")
+    # kept for backward-compat
+    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
+        return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=dtype)
 
-        df_grouped = df.groupby(by=group_by_expr).size().reset_index(name=group_expr)
-        return df_grouped
+    def transform_boolean_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0):
+        return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)
 
-    def eval_duplicate_removal(self, df, duplicate_expr, sort_field=None, keep='last', debug=False):
+    # ---- duplicate handling
+    def eval_duplicate_removal(self, df, duplicate_expr, sort_field: str | None = None, keep='last', debug=False):
         """
-        Removes duplicate rows based on the specified columns.
-
-        Parameters:
-        df (DataFrame): Pandas or Dask DataFrame from which duplicates are to be removed.
-        duplicate_expr (str or list): Column(s) to identify duplicates.
-        sort_field (str, optional): Column to sort by before dropping duplicates.
-        keep (str): Which duplicate to keep ('first' or 'last').
-        debug (bool): If True, logs duplicate rows.
-
-        Returns:
-        DataFrame: DataFrame with duplicates removed.
+        Drop duplicates. For Dask, uses its shuffle-based drop_duplicates.
+        If sort_field is provided, we avoid global sorts for Dask.
         """
         if duplicate_expr is None:
             return df
 
         if debug:
-            df_duplicates = df[df.duplicated(subset=duplicate_expr)]
-            self.logger.debug(f"Duplicate Rows based on columns {duplicate_expr} are:\n{df_duplicates}")
+            try:
+                dups = df[df.duplicated(subset=duplicate_expr)]
+                # Do not .compute() here; just log that duplicates exist in Dask
+                self.logger.debug(f"Duplicate rows based on {duplicate_expr}: (preview only)")
+                if not self._is_dask(dups):
+                    self.logger.debug(f"\n{dups}")
+            except Exception:
+                pass
 
         if sort_field:
-            if isinstance(df, dd.DataFrame):
-                self.logger.warning("Sorting a Dask DataFrame is expensive and may not be efficient.")
-            df = df.sort_values(sort_field)
+            if self._is_dask(df):
+                self.logger.warning("Sorting a Dask DataFrame is expensive; skipping global sort.")
+            else:
+                df = df.sort_values(sort_field)
 
-        # Optimize duplicate removal for Dask DataFrames
-        if isinstance(df, dd.DataFrame):
-            df = df.drop_duplicates(subset=duplicate_expr, keep=keep, split_every=False)
+        if self._is_dask(df):
+            # Let Dask handle the global de-dup with a shuffle under the hood
+            df = df.drop_duplicates(subset=duplicate_expr, keep=keep)
         else:
             df = df.drop_duplicates(subset=duplicate_expr, keep=keep)
-
         return df
 
     def load_latest(self, df, duplicate_expr, sort_field=None, debug=False):
-        """
-        Removes duplicates keeping the latest occurrence.
-
-        Parameters:
-        df (DataFrame): Pandas or Dask DataFrame.
-        duplicate_expr (str or list): Column(s) to identify duplicates.
-        sort_field (str, optional): Column to sort by before dropping duplicates.
-        debug (bool): If True, logs duplicate rows.
-
-        Returns:
-        DataFrame: DataFrame with latest duplicates removed.
-        """
         return self.eval_duplicate_removal(df, duplicate_expr, sort_field=sort_field, keep='last', debug=debug)
 
     def load_earliest(self, df, duplicate_expr, sort_field=None, debug=False):
-        """
-        Removes duplicates keeping the earliest occurrence.
-
-        Parameters:
-        df (DataFrame): Pandas or Dask DataFrame.
-        duplicate_expr (str or list): Column(s) to identify duplicates.
-        sort_field (str, optional): Column to sort by before dropping duplicates.
-        debug (bool): If True, logs duplicate rows.
-
-        Returns:
-        DataFrame: DataFrame with the earliest duplicates removed.
-        """
         return self.eval_duplicate_removal(df, duplicate_expr, sort_field=sort_field, keep='first', debug=debug)
 
-    @staticmethod
-    def add_df_totals(df):
+    # ---- totals
+    def add_df_totals(self, df):
         """
-        Adds total row and column to the DataFrame.
-
-        Parameters:
-        df (DataFrame): Pandas or Dask DataFrame.
-
-        Returns:
-        DataFrame: DataFrame with total row and column added.
+        Adds totals; for Dask, this computes to pandas (be careful with large frames).
         """
-        if isinstance(df, dd.DataFrame):
-            # Dask DataFrames are immutable; compute sums and convert to pandas
+        if self._is_dask(df):
+            self.logger.warning("add_df_totals will compute to pandas; may be large.")
             col_totals = df.sum(numeric_only=True).compute()
             row_totals = df.sum(axis=1, numeric_only=True).compute()
-
-            df = df.compute()
-            df.loc['Total'] = col_totals
-            df['Total'] = row_totals
+            pdf = df.compute()
+            pdf.loc['Total'] = col_totals
+            pdf['Total'] = row_totals
+            return pdf
         else:
             df.loc['Total'] = df.sum(numeric_only=True)
             df['Total'] = df.sum(axis=1, numeric_only=True)
-        return df
+            return df
 
+    # ---- summarization / resampling
     def summarise_data(self, df, summary_column, values_column, rule='D', agg_func='count'):
         """
-        Summarizes data by creating a pivot table and resampling.
-
-        Parameters:
-        df (DataFrame): Pandas or Dask DataFrame.
-        summary_column (str or list): Column(s) for summarization.
-        values_column (str or list): Column(s) to aggregate.
-        rule (str): Resampling frequency (e.g., 'D' for daily).
-        agg_func (str or function): Aggregation function.
-
-        Returns:
-        DataFrame: Resampled pivot table.
-        """
-        if isinstance(df, dd.core.DataFrame):
-            # Implement Dask-compatible pivot and resample
-            self.logger.debug("Performing summarization with Dask DataFrame.")
-            # Ensure the index is a datetime for resampling
-            if not isinstance(df.index, (pd.DatetimeIndex, dd.core.DatetimeIndex)):
-                self.logger.warning("Index is not a DatetimeIndex. Converting index to datetime.")
-                df = df.set_index(dd.to_datetime(df.index))
-
-            # Group by index and summary columns
-            df_grouped = df.groupby([dd.to_datetime(df.index)] + [summary_column])[values_column].agg(
-                agg_func).reset_index()
-
-            # Pivot the table
-            df_pivot = df_grouped.pivot_table(index='index', columns=summary_column, values=values_column,
-                                              aggfunc='sum').fillna(0)
-
-            # Resample
-            df_pivot.index = dd.to_datetime(df_pivot.index)
-            df_pivot = df_pivot.repartition(freq=rule)
-            df_resampled = df_pivot.map_partitions(lambda df: df.resample(rule).sum())
-
-            return df_resampled.compute()
-        else:
-            df_pivot = df.pivot_table(
-                index=df.index,
-                columns=summary_column,
-                values=values_column,
-                aggfunc=agg_func
-            ).fillna(0)
-            df_resampled = df_pivot.resample(rule).sum()
-            return df_resampled
-
-    @staticmethod
-    def summarize_and_resample_data(df, summary_columns, value_columns, rule='D', agg_func='count'):
-        """
-        Summarizes and resamples data.
-
-        Parameters:
-        df (DataFrame): Pandas or Dask DataFrame.
-        summary_columns (str or list): Column(s) for summarization.
-        value_columns (str or list): Column(s) to aggregate.
-        rule (str): Resampling frequency.
-        agg_func (str or function): Aggregation function.
-
-        Returns:
-        DataFrame: Resampled pivot table.
+        For pandas: pivot+resample on DatetimeIndex.
+        For Dask: create time bins and aggregate in Dask, then (optionally) pivot in pandas.
         """
-        return DfUtils.summarise_data(df, summary_columns, value_columns, rule=rule, agg_func=agg_func)
+        # pandas path
+        if not self._is_dask(df):
+            idx = df.index
+            if not isinstance(idx, pd.DatetimeIndex):
+                self.logger.warning("Index is not DatetimeIndex; converting from current index.")
+                df = df.copy()
+                df.index = pd.to_datetime(idx, errors="coerce")
+            pivot = df.pivot_table(index=df.index, columns=summary_column, values=values_column, aggfunc=agg_func).fillna(0)
+            return pivot.resample(rule).sum()
+
+        # Dask path
+        # 1) Build a datetime column from index (no global sort)
+        ddf = df
+        ddf = ddf.assign(_ts_bin=dd.to_datetime(ddf.index, errors="coerce"))
+
+        # 2) Bucket to rule using floor; do it per partition
+        def _floor_partition(pdf: pd.DataFrame, col: str, rule: str) -> pd.DataFrame:
+            out = pdf.copy()
+            out[col] = pd.to_datetime(out[col], errors="coerce")
+            out['_bin'] = out[col].dt.floor(rule)
+            return out
+
+        ddf = ddf.map_partitions(_floor_partition, col="_ts_bin", rule=rule, meta=dd.utils.make_meta(ddf))
+
+        # 3) Group in Dask on ['_bin', summary_column] and aggregate
+        grouped = ddf.groupby(['_bin', summary_column])[values_column].agg(agg_func).reset_index()
+
+        # 4) If you need a pivoted result, compute to pandas then pivot (Dask pivot_table is not generally supported)
+        gpdf = grouped.compute()
+        pivot = gpdf.pivot_table(index="_bin", columns=summary_column, values=values_column, aggfunc='sum').fillna(0)
+
+        # 5) Ensure regular resample (already bucketed; resampling is now cheap in pandas)
+        pivot.index = pd.to_datetime(pivot.index)
+        return pivot.asfreq(rule, fill_value=0)
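
The rewritten DfUtils above can be exercised roughly as follows. This is an illustrative sketch based only on the signatures visible in this diff: the sample data and column names are made up, and the import path is inferred from the file layout listed at the top.

import pandas as pd
import dask.dataframe as dd
from sibi_dst.utils.df_utils import DfUtils

utils = DfUtils(debug=True)

# Hypothetical sample data; any pandas or Dask DataFrame would do.
pdf = pd.DataFrame({
    "customer_id": [1, 1, 2, None],
    "status": ["a", "a", "b", "b"],
    "amount": ["10", "20", "x", "5"],
})
ddf = dd.from_pandas(pdf, npartitions=2)

# Per-column unique values, Dask or pandas alike.
uniques = utils.extract_unique_values(ddf, "customer_id", "status")

# Drop rows matching all listed conditions (AND); the rest are kept.
filtered = utils.exclude_from_dataframe(ddf, [("status", "==", "b")])

# Coerce a column to numeric, filling unparseable values with 0.
cleaned = utils.transform_numeric_columns(ddf, ["amount"], fill_value=0, dtype=int)

# Keep the latest row per customer_id.
latest = utils.load_latest(cleaned, duplicate_expr=["customer_id"])
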
sibi_dst/utils/file_utils.py
@@ -8,8 +8,9 @@ from .log_utils import Logger
 
 
 class FileUtils:
-    def __init__(self, logger=None):
-        self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
+    def __init__(self, **kwargs):
+        self.logger = kwargs.get('logger', Logger.default_logger(logger_name=self.__class__.__name__))
+        self.debug = kwargs.get('debug', False)
 
     @staticmethod
     def ensure_directory_exists(directory_path, clear_existing=False):
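
FileUtils now takes its configuration as keyword arguments instead of a positional logger. A short, hypothetical construction example (the directory path is a placeholder):

from sibi_dst.utils.file_utils import FileUtils

# logger falls back to Logger.default_logger(...) when not supplied.
fu = FileUtils(debug=True)
fu.ensure_directory_exists("/tmp/sibi_dst_demo", clear_existing=False)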