sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/__init__.py +3 -2
  3. sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  4. sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  5. sibi_dst/df_helper/_df_helper.py +418 -118
  6. sibi_dst/df_helper/_parquet_artifact.py +275 -283
  7. sibi_dst/df_helper/_parquet_reader.py +9 -10
  8. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  9. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  10. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  12. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  13. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  14. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  15. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  16. sibi_dst/utils/__init__.py +2 -0
  17. sibi_dst/utils/base.py +235 -100
  18. sibi_dst/utils/business_days.py +248 -0
  19. sibi_dst/utils/clickhouse_writer.py +472 -206
  20. sibi_dst/utils/data_utils.py +139 -186
  21. sibi_dst/utils/data_wrapper.py +392 -88
  22. sibi_dst/utils/date_utils.py +711 -393
  23. sibi_dst/utils/df_utils.py +193 -213
  24. sibi_dst/utils/file_age_checker.py +301 -0
  25. sibi_dst/utils/file_utils.py +3 -2
  26. sibi_dst/utils/filepath_generator.py +314 -152
  27. sibi_dst/utils/log_utils.py +581 -242
  28. sibi_dst/utils/manifest_manager.py +60 -76
  29. sibi_dst/utils/parquet_saver.py +33 -27
  30. sibi_dst/utils/periods.py +42 -0
  31. sibi_dst/utils/phone_formatter.py +88 -95
  32. sibi_dst/utils/update_planner.py +180 -178
  33. sibi_dst/utils/webdav_client.py +116 -166
  34. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
  35. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
  36. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
  37. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
@@ -1,5 +1,6 @@
+from __future__ import annotations
 
-from typing import Union, List
+from typing import Union, List, Dict, Any, Iterable
 
 import dask.dataframe as dd
 import pandas as pd
@@ -9,240 +10,192 @@ from .log_utils import Logger
 
 class DataUtils:
     """
-    Utility class for data transformation, manipulation, and merging.
-
-    This class provides functionalities for transforming numeric and boolean columns, merging
-    lookup data, checking DataFrame emptiness, and converting columns to datetime format in
-    Pandas or Dask DataFrames. It is designed to handle data preprocessing steps efficiently
-    for both small-scale and large-scale datasets. Logging and debug options are available
-    to trace execution and monitor operations.
-
-    :ivar logger: Logger instance for logging messages.
-    :type logger: logging.Logger
-    :ivar debug: Flag to enable or disable debug mode.
-    :type debug: bool
+    Helpers for transforming columns, safe emptiness checks, datetime coercion,
+    and joining lookup data for Pandas or Dask DataFrames.
     """
-    def __init__(self, logger=None, **kwargs):
+
+    def __init__(self, logger: Logger | None = None, **kwargs: Any) -> None:
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.debug = kwargs.get('debug', False)
+        self.debug: bool = bool(kwargs.get("debug", False))
+
+    # ---------- numeric / boolean transforms ----------
 
     @staticmethod
-    def _transform_column(series, fill_value, dtype):
-        """
-        Helper method to transform a column by converting it to numeric, filling missing values,
-        and casting to the specified dtype.
-
-        :param series: The column to transform.
-        :type series: pd.Series or dd.Series
-        :param fill_value: Value to replace missing or invalid data.
-        :type fill_value: int or float
-        :param dtype: Target data type for the column.
-        :type dtype: type
-        :return: Transformed column.
-        :rtype: pd.Series or dd.Series
-        """
-        return (
-            pd.to_numeric(series, errors="coerce")  # Convert to numeric, invalid to NaN
-            .fillna(fill_value)  # Replace NaN with fill_value
-            .astype(dtype)  # Convert to target dtype
-        )
-
-    def transform_numeric_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0,
-                                  dtype=int):
+    def _transform_column_pandas(series: pd.Series, fill_value: Any, dtype: type) -> pd.Series:
+        return pd.to_numeric(series, errors="coerce").fillna(fill_value).astype(dtype)
+
+    def transform_numeric_columns(
+        self,
+        df: Union[pd.DataFrame, dd.DataFrame],
+        columns: List[str],
+        fill_value: Any = 0,
+        dtype: type = int,
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
         """
-        Transform specified numeric columns in the DataFrame by converting their data types
-        to the specified dtype and replacing missing values with the given fill_value.
-
-        :param df: DataFrame to be transformed.
-        :type df: pd.DataFrame or dd.DataFrame
-        :param columns: List of column names to transform.
-        :type columns: list[str]
-        :param fill_value: Value to replace missing or invalid data. Default is 0.
-        :type fill_value: int or float
-        :param dtype: Target data type for the columns. Default is int.
-        :type dtype: type
-        :return: Transformed DataFrame.
-        :rtype: pd.DataFrame or dd.DataFrame
+        Convert selected columns to numeric → fillna → cast dtype.
+        Works for Pandas and Dask (partition-wise).
         """
         if not columns:
-            self.logger.warning("No columns specified.")
+            self.logger.warning("No columns specified for transform_numeric_columns.")
             return df
 
-        self.logger.debug(f"DataFrame type: {type(df)}")
-        columns = [col for col in columns if col in df.columns]
-
-        for col in columns:
-            df[col] = df[col].map_partitions(
-                self._transform_column, fill_value, dtype, meta=(col, dtype)
-            )
+        cols = [c for c in columns if c in df.columns]
+        if not cols:
+            self.logger.warning("None of the requested columns exist in the DataFrame.")
+            return df
 
-        return df
+        if isinstance(df, pd.DataFrame):
+            for col in cols:
+                df[col] = self._transform_column_pandas(df[col], fill_value, dtype)
+            return df
 
-    def transform_numeric_cols(self, df, columns, fill_value=0, dtype=int):
-        """
-        This function transforms the specified numeric columns in the given dataframe by converting
-        their data types to the specified dtype, with an optional parameter for replacing missing
-        values. It first checks if the provided columns exist in the dataframe, processes each column
-        to replace non-numeric values with NaN, fills NaN values with the given fill_value, and finally
-        converts the column to the specified dtype.
-
-        :param df: DataFrame to be transformed.
-        :type df: dask.dataframe.DataFrame
-        :param columns: List of column names to be transformed.
-        :type columns: list[str]
-        :param fill_value: Value used to replace missing or invalid data. Default is 0.
-        :type fill_value: int or float
-        :param dtype: Target data type for the columns after transformation. Default is int.
-        :type dtype: type
-        :return: Transformed dataframe with the specified numeric columns converted and modified.
-        :rtype: dask.dataframe.DataFrame
-        """
-        if not columns:
-            self.logger.warning('No columns specified')
-        self.logger.debug(f'Dataframe type:{type(df)}')
-        columns = [column for column in columns if column in df.columns]
-        for col in columns:
-            # Replace NaN with 0, then convert to boolean
+        # Dask path
+        for col in cols:
             df[col] = df[col].map_partitions(
-                lambda s: pd.to_numeric(s, errors='coerce')  # Convert to numeric, invalid to NaN
-                .fillna(fill_value)  # Replace NaN with 0
-                .astype(dtype),
-                meta=(col, dtype)
+                self._transform_column_pandas,
+                fill_value,
+                dtype,
+                meta=(col, dtype),
             )
-
         return df
 
-    def transform_boolean_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0):
-        """
-        Convert specified columns in the DataFrame to boolean, replacing missing values with
-        the given fill_value.
-
-        :param df: DataFrame to be transformed.
-        :type df: pd.DataFrame or dd.DataFrame
-        :param columns: List of column names to transform.
-        :type columns: list[str]
-        :param fill_value: Value to replace missing or invalid data. Default is 0.
-        :type fill_value: int or float
-        :return: Transformed DataFrame.
-        :rtype: pd.DataFrame or dd.DataFrame
-        """
+    def transform_boolean_columns(
+        self,
+        df: Union[pd.DataFrame, dd.DataFrame],
+        columns: List[str],
+        fill_value: Any = 0,
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
+        """Convenience wrapper: cast to boolean via numeric→fillna→astype(bool)."""
         return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)
 
-    def merge_lookup_data(self, classname, df, **kwargs):
-        """
-        Merge lookup data into the DataFrame based on specified columns.
-
-        Parameters:
-        - classname: The class instance to use for loading lookup data.
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - kwargs: Additional keyword arguments for configuration.
+    # ---------- lookup merge ----------
 
-        Returns:
-        - pandas.DataFrame or dask.dataframe.DataFrame: Updated DataFrame with merged lookup data.
+    def merge_lookup_data(
+        self,
+        classname,
+        df: Union[pd.DataFrame, dd.DataFrame],
+        **kwargs: Any,
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
         """
-        # Return early if the DataFrame is empty
+        Merge lookup data for ids present in `source_col`.
+
+        Required kwargs:
+        - source_col
+        - lookup_col
+        - lookup_description_col
+        - source_description_alias
+
+        Optional kwargs:
+        - fillna_source_description_alias: bool = False
+        - fieldnames: tuple[str, str] = (lookup_col, lookup_description_col)
+        - column_names: list[str] = ['temp_join_col', source_description_alias]
+        - any other filters passed to `classname.load(...)`
+        """
+        # Early outs for emptiness and required args
         if self.is_dataframe_empty(df):
-            self.logger.debug("merge_lookup_data was given an empty dataFrame")
+            self.logger.debug("merge_lookup_data: input DataFrame empty — nothing to merge.")
             return df
 
-        # Extract and validate required parameters
-        required_params = ['source_col', 'lookup_col', 'lookup_description_col', 'source_description_alias']
-        missing_params = [param for param in required_params if param not in kwargs]
-        if missing_params:
-            raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")
+        required = ["source_col", "lookup_col", "lookup_description_col", "source_description_alias"]
+        missing = [k for k in required if k not in kwargs]
+        if missing:
+            raise ValueError(f"Missing required parameters: {', '.join(missing)}")
 
-        source_col = kwargs.pop('source_col')
-        lookup_col = kwargs.pop('lookup_col')
-        lookup_description_col = kwargs.pop('lookup_description_col')
-        source_description_alias = kwargs.pop('source_description_alias')
+        source_col = kwargs.pop("source_col")
+        lookup_col = kwargs.pop("lookup_col")
+        lookup_description_col = kwargs.pop("lookup_description_col")
+        source_description_alias = kwargs.pop("source_description_alias")
 
-        # Optional parameters with default values
-        fillna_source_description_alias = kwargs.pop('fillna_source_description_alias', False)
-        fieldnames = kwargs.pop('fieldnames', (lookup_col, lookup_description_col))
-        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
+        fillna_alias = bool(kwargs.pop("fillna_source_description_alias", False))
+        fieldnames = kwargs.pop("fieldnames", (lookup_col, lookup_description_col))
+        column_names = kwargs.pop("column_names", ["temp_join_col", source_description_alias])
 
         if source_col not in df.columns:
-            self.logger.debug(f"{source_col} not in DataFrame columns")
+            self.logger.debug(f"merge_lookup_data: '{source_col}' not found in frame — skipping merge.")
             return df
 
-        # Get unique IDs from source column
-        ids = df[source_col].dropna().unique()
-        # Compute if it's a Dask Series
-        if isinstance(ids, dd.Series):
-            ids = ids.compute()
+        # Collect ids safely
+        try:
+            ids_series = df[source_col].dropna()
+            if isinstance(df, dd.DataFrame):
+                # Dask: unique() is lazy → compute smallish result
+                ids = ids_series.unique().compute()
+            else:
+                ids = ids_series.unique()
+            ids = sorted(ids.tolist() if not isinstance(ids, list) else ids)
+        except Exception as e:
+            self.logger.error(f"merge_lookup_data: failed extracting ids from '{source_col}': {e}")
+            return df
 
-        # Check if any IDs are found
-        if not len(ids):
-            self.logger.debug(f"No IDs found in the source column: {source_col}")
+        if not ids:
+            self.logger.debug(f"merge_lookup_data: no ids found in '{source_col}'.")
             return df
 
-        # Convert to a list only if necessary and sort
-        if not isinstance(ids, list):
-            ids = ids.tolist()
-        ids = sorted(ids)
-        # Prepare kwargs for loading lookup data
-        load_kwargs = kwargs.copy()
-        load_kwargs.update({
-            'fieldnames': fieldnames,
-            'column_names': column_names,
-            f'{lookup_col}__in': ids
-        })
-        # Load lookup data
+        # Load lookup data (expected to be small after filtering)
+        load_kwargs = {
+            **kwargs,
+            "fieldnames": fieldnames,
+            "column_names": column_names,
+            f"{lookup_col}__in": ids,
+        }
+
         lookup_instance = classname(debug=self.debug, logger=self.logger)
         result = lookup_instance.load(**load_kwargs)
-        if len(result.index) == 0:
-            self.logger.debug(f"No IDs found in the source column: {source_col}")
-            return df
-        # Determine the join column on the result DataFrame
-        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col
 
-        # Merge DataFrames
-        df = df.merge(result, how='left', left_on=source_col, right_on=temp_join_col)
+        # If lookup returns Dask, compute to pandas (broadcastable) or keep small Dask?
+        if isinstance(result, dd.DataFrame):
+            # we expect this to be small after filtering by ids; materialize
+            result = result.compute()
 
-        if fillna_source_description_alias and source_description_alias in df.columns:
-            df[source_description_alias] = df[source_description_alias].fillna('')
+        if not isinstance(result, pd.DataFrame):
+            raise TypeError("merge_lookup_data: lookup 'load' must return a pandas or dask DataFrame.")
 
-        # Drop temp_join_col if present
-        df = df.drop(columns='temp_join_col', errors='ignore')
+        if result.empty:
+            self.logger.debug("merge_lookup_data: lookup returned 0 rows — nothing to merge.")
+            return df
 
-        return df
+        # Determine join key in the lookup result
+        temp_join_col = "temp_join_col" if "temp_join_col" in column_names else lookup_col
 
-    def is_dataframe_empty(self, df):
-        """
-        Check if a DataFrame (Pandas or Dask) is empty.
+        # Perform merge (Dask can merge with a small pandas right side)
+        merged = df.merge(result, how="left", left_on=source_col, right_on=temp_join_col)
+
+        if fillna_alias and source_description_alias in merged.columns:
+            if isinstance(merged, dd.DataFrame):
+                merged[source_description_alias] = merged[source_description_alias].fillna("")
+            else:
+                merged[source_description_alias] = merged[source_description_alias].fillna("")
 
-        Parameters:
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
+        # Drop helper join column if present
+        merged = merged.drop(columns="temp_join_col", errors="ignore")
+        return merged
 
-        Returns:
-        - bool: True if the DataFrame is empty, False otherwise.
+    # ---------- emptiness & datetime ----------
+
+    def is_dataframe_empty(self, df: Union[pd.DataFrame, dd.DataFrame]) -> bool:
+        """
+        Safe emptiness check. For Dask, uses head(1) to avoid full compute.
         """
         if isinstance(df, dd.DataFrame):
             try:
-                return len(df.index) == 0
+                head = df.head(1, npartitions=-1, compute=True)
+                return head.empty
             except Exception as e:
-                self.logger.error(f"Error while processing Dask DataFrame: {e}")
+                self.logger.error(f"is_dataframe_empty: Dask head() failed: {e}")
                 return False
-        elif isinstance(df, pd.DataFrame):
+        if isinstance(df, pd.DataFrame):
             return df.empty
-        else:
-            self.logger.error("Input must be a pandas or dask DataFrame.")
-            return False
+        self.logger.error("is_dataframe_empty: input must be a pandas or dask DataFrame.")
+        return False
 
     @staticmethod
-    def convert_to_datetime_dask(df, date_fields):
+    def convert_to_datetime_dask(df: dd.DataFrame, date_fields: Iterable[str]) -> dd.DataFrame:
         """
-        Convert specified columns in a Dask DataFrame to datetime, handling errors gracefully.
-
-        Parameters:
-        - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
-        - date_fields (list of str): List of column names to convert to datetime.
-
-        Returns:
-        - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+        Partition-wise datetime coercion with errors='coerce'.
         """
         for col in date_fields:
             if col in df.columns:
-                df[col] = df[col].map_partitions(pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]"))
-        return df
-
+                df[col] = df[col].map_partitions(
+                    pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]")
+                )
+        return df
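
Usage note: a minimal sketch of the reworked DataUtils API, based only on the signatures visible in this diff. The StatusLookup class below is a hypothetical stand-in for whatever loader a caller would pass as classname; merge_lookup_data hands it fieldnames, column_names and an f"{lookup_col}__in" filter via load(...), which this stub simply ignores.

    import pandas as pd
    import dask.dataframe as dd
    from sibi_dst.utils.data_utils import DataUtils

    class StatusLookup:
        # Hypothetical lookup loader: anything exposing load(**kwargs) that
        # returns a pandas or dask DataFrame satisfies merge_lookup_data.
        def __init__(self, debug=False, logger=None):
            self.debug, self.logger = debug, logger

        def load(self, **kwargs):
            # A real loader would honour fieldnames, column_names and the
            # "<lookup_col>__in" filter; this stub returns a fixed lookup table.
            return pd.DataFrame({"temp_join_col": [1, 2],
                                 "status_name": ["open", "closed"]})

    utils = DataUtils(debug=True)
    ddf = dd.from_pandas(
        pd.DataFrame({"status_id": [1, 2, 2], "amount": ["10", "x", None]}),
        npartitions=2,
    )

    # Numeric coercion runs per partition on the Dask path.
    ddf = utils.transform_numeric_columns(ddf, ["amount"], fill_value=0, dtype=int)

    # Ids are collected from source_col, passed to the loader, then left-merged back.
    ddf = utils.merge_lookup_data(
        StatusLookup,
        ddf,
        source_col="status_id",
        lookup_col="id",
        lookup_description_col="name",
        source_description_alias="status_name",
        fillna_source_description_alias=True,
    )

    print(utils.is_dataframe_empty(ddf))   # False
    print(ddf.compute().head())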