sibi-dst 2025.1.12__py3-none-any.whl → 2025.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
- sibi_dst/df_helper/_df_helper.py +417 -117
- sibi_dst/df_helper/_parquet_artifact.py +255 -283
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/__init__.py +1 -0
- sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +203 -0
- sibi_dst/osmnx_helper/route_path_builder.py +97 -0
- sibi_dst/osmnx_helper/utils.py +2 -0
- sibi_dst/utils/base.py +302 -96
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +317 -73
- sibi_dst/utils/date_utils.py +1 -0
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +32 -28
- {sibi_dst-2025.1.12.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/utils/data_utils.py
CHANGED
@@ -1,5 +1,6 @@
+from __future__ import annotations

-from typing import Union, List
+from typing import Union, List, Dict, Any, Iterable

 import dask.dataframe as dd
 import pandas as pd
@@ -9,240 +10,192 @@ from .log_utils import Logger

 class DataUtils:
     """
-    This class provides functionalities for transforming numeric and boolean columns, merging
-    lookup data, checking DataFrame emptiness, and converting columns to datetime format in
-    Pandas or Dask DataFrames. It is designed to handle data preprocessing steps efficiently
-    for both small-scale and large-scale datasets. Logging and debug options are available
-    to trace execution and monitor operations.
-
-    :ivar logger: Logger instance for logging messages.
-    :type logger: logging.Logger
-    :ivar debug: Flag to enable or disable debug mode.
-    :type debug: bool
+    Helpers for transforming columns, safe emptiness checks, datetime coercion,
+    and joining lookup data for Pandas or Dask DataFrames.
     """
+
+    def __init__(self, logger: Logger | None = None, **kwargs: Any) -> None:
         self.logger = logger or Logger.default_logger(logger_name=self.__class__.__name__)
-        self.debug = kwargs.get(
+        self.debug: bool = bool(kwargs.get("debug", False))
+
+    # ---------- numeric / boolean transforms ----------

     @staticmethod
-        :type dtype: type
-        :return: Transformed column.
-        :rtype: pd.Series or dd.Series
-        """
-        return (
-            pd.to_numeric(series, errors="coerce")  # Convert to numeric, invalid to NaN
-            .fillna(fill_value)  # Replace NaN with fill_value
-            .astype(dtype)  # Convert to target dtype
-        )
-
-    def transform_numeric_columns(self, df: Union[pd.DataFrame, dd.DataFrame], columns: List[str], fill_value=0,
-                                  dtype=int):
+    def _transform_column_pandas(series: pd.Series, fill_value: Any, dtype: type) -> pd.Series:
+        return pd.to_numeric(series, errors="coerce").fillna(fill_value).astype(dtype)
+
+    def transform_numeric_columns(
+        self,
+        df: Union[pd.DataFrame, dd.DataFrame],
+        columns: List[str],
+        fill_value: Any = 0,
+        dtype: type = int,
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
         """
-        :param df: DataFrame to be transformed.
-        :type df: pd.DataFrame or dd.DataFrame
-        :param columns: List of column names to transform.
-        :type columns: list[str]
-        :param fill_value: Value to replace missing or invalid data. Default is 0.
-        :type fill_value: int or float
-        :param dtype: Target data type for the columns. Default is int.
-        :type dtype: type
-        :return: Transformed DataFrame.
-        :rtype: pd.DataFrame or dd.DataFrame
+        Convert selected columns to numeric → fillna → cast dtype.
+        Works for Pandas and Dask (partition-wise).
         """
         if not columns:
-            self.logger.warning("No columns specified.")
+            self.logger.warning("No columns specified for transform_numeric_columns.")
             return df

-            df[col] = df[col].map_partitions(
-                self._transform_column, fill_value, dtype, meta=(col, dtype)
-            )
+        cols = [c for c in columns if c in df.columns]
+        if not cols:
+            self.logger.warning("None of the requested columns exist in the DataFrame.")
+            return df

+        if isinstance(df, pd.DataFrame):
+            for col in cols:
+                df[col] = self._transform_column_pandas(df[col], fill_value, dtype)
+            return df

-        This function transforms the specified numeric columns in the given dataframe by converting
-        their data types to the specified dtype, with an optional parameter for replacing missing
-        values. It first checks if the provided columns exist in the dataframe, processes each column
-        to replace non-numeric values with NaN, fills NaN values with the given fill_value, and finally
-        converts the column to the specified dtype.
-
-        :param df: DataFrame to be transformed.
-        :type df: dask.dataframe.DataFrame
-        :param columns: List of column names to be transformed.
-        :type columns: list[str]
-        :param fill_value: Value used to replace missing or invalid data. Default is 0.
-        :type fill_value: int or float
-        :param dtype: Target data type for the columns after transformation. Default is int.
-        :type dtype: type
-        :return: Transformed dataframe with the specified numeric columns converted and modified.
-        :rtype: dask.dataframe.DataFrame
-        """
-        if not columns:
-            self.logger.warning('No columns specified')
-        self.logger.debug(f'Dataframe type:{type(df)}')
-        columns = [column for column in columns if column in df.columns]
-        for col in columns:
-            # Replace NaN with 0, then convert to boolean
+        # Dask path
+        for col in cols:
             df[col] = df[col].map_partitions(
-                meta=(col, dtype)
+                self._transform_column_pandas,
+                fill_value,
+                dtype,
+                meta=(col, dtype),
             )
-
         return df

-    def transform_boolean_columns(
-        :param columns: List of column names to transform.
-        :type columns: list[str]
-        :param fill_value: Value to replace missing or invalid data. Default is 0.
-        :type fill_value: int or float
-        :return: Transformed DataFrame.
-        :rtype: pd.DataFrame or dd.DataFrame
-        """
+    def transform_boolean_columns(
+        self,
+        df: Union[pd.DataFrame, dd.DataFrame],
+        columns: List[str],
+        fill_value: Any = 0,
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
+        """Convenience wrapper: cast to boolean via numeric→fillna→astype(bool)."""
         return self.transform_numeric_columns(df, columns, fill_value=fill_value, dtype=bool)

-        """
-        Merge lookup data into the DataFrame based on specified columns.
-
-        Parameters:
-        - classname: The class instance to use for loading lookup data.
-        - df (pandas.DataFrame or dask.dataframe.DataFrame): The DataFrame.
-        - kwargs: Additional keyword arguments for configuration.
+    # ---------- lookup merge ----------

+    def merge_lookup_data(
+        self,
+        classname,
+        df: Union[pd.DataFrame, dd.DataFrame],
+        **kwargs: Any,
+    ) -> Union[pd.DataFrame, dd.DataFrame]:
         """
+        Merge lookup data for ids present in `source_col`.
+
+        Required kwargs:
+          - source_col
+          - lookup_col
+          - lookup_description_col
+          - source_description_alias
+
+        Optional kwargs:
+          - fillna_source_description_alias: bool = False
+          - fieldnames: tuple[str, str] = (lookup_col, lookup_description_col)
+          - column_names: list[str] = ['temp_join_col', source_description_alias]
+          - any other filters passed to `classname.load(...)`
+        """
+        # Early outs for emptiness and required args
         if self.is_dataframe_empty(df):
-            self.logger.debug("merge_lookup_data
+            self.logger.debug("merge_lookup_data: input DataFrame empty — nothing to merge.")
             return df

-            raise ValueError(f"Missing required parameters: {', '.join(missing_params)}")
+        required = ["source_col", "lookup_col", "lookup_description_col", "source_description_alias"]
+        missing = [k for k in required if k not in kwargs]
+        if missing:
+            raise ValueError(f"Missing required parameters: {', '.join(missing)}")

-        source_col = kwargs.pop(
-        lookup_col = kwargs.pop(
-        lookup_description_col = kwargs.pop(
-        source_description_alias = kwargs.pop(
+        source_col = kwargs.pop("source_col")
+        lookup_col = kwargs.pop("lookup_col")
+        lookup_description_col = kwargs.pop("lookup_description_col")
+        source_description_alias = kwargs.pop("source_description_alias")

-        column_names = kwargs.pop('column_names', ['temp_join_col', source_description_alias])
+        fillna_alias = bool(kwargs.pop("fillna_source_description_alias", False))
+        fieldnames = kwargs.pop("fieldnames", (lookup_col, lookup_description_col))
+        column_names = kwargs.pop("column_names", ["temp_join_col", source_description_alias])

         if source_col not in df.columns:
-            self.logger.debug(f"{source_col} not in
+            self.logger.debug(f"merge_lookup_data: '{source_col}' not found in frame — skipping merge.")
             return df

+        # Collect ids safely
+        try:
+            ids_series = df[source_col].dropna()
+            if isinstance(df, dd.DataFrame):
+                # Dask: unique() is lazy → compute smallish result
+                ids = ids_series.unique().compute()
+            else:
+                ids = ids_series.unique()
+            ids = sorted(ids.tolist() if not isinstance(ids, list) else ids)
+        except Exception as e:
+            self.logger.error(f"merge_lookup_data: failed extracting ids from '{source_col}': {e}")
+            return df

-            self.logger.debug(f"No IDs found in the source column: {source_col}")
+        if not ids:
+            self.logger.debug(f"merge_lookup_data: no ids found in '{source_col}'.")
             return df

-            'column_names': column_names,
-            f'{lookup_col}__in': ids
-        })
-        # Load lookup data
+        # Load lookup data (expected to be small after filtering)
+        load_kwargs = {
+            **kwargs,
+            "fieldnames": fieldnames,
+            "column_names": column_names,
+            f"{lookup_col}__in": ids,
+        }
+
         lookup_instance = classname(debug=self.debug, logger=self.logger)
         result = lookup_instance.load(**load_kwargs)
-        if len(result.index) == 0:
-            self.logger.debug(f"No IDs found in the source column: {source_col}")
-            return df
-        # Determine the join column on the result DataFrame
-        temp_join_col = 'temp_join_col' if 'temp_join_col' in column_names else lookup_col

+        # If lookup returns Dask, compute to pandas (broadcastable) or keep small Dask?
+        if isinstance(result, dd.DataFrame):
+            # we expect this to be small after filtering by ids; materialize
+            result = result.compute()

+        if not isinstance(result, pd.DataFrame):
+            raise TypeError("merge_lookup_data: lookup 'load' must return a pandas or dask DataFrame.")

+        if result.empty:
+            self.logger.debug("merge_lookup_data: lookup returned 0 rows — nothing to merge.")
+            return df

+        # Determine join key in the lookup result
+        temp_join_col = "temp_join_col" if "temp_join_col" in column_names else lookup_col

+        # Perform merge (Dask can merge with a small pandas right side)
+        merged = df.merge(result, how="left", left_on=source_col, right_on=temp_join_col)
+
+        if fillna_alias and source_description_alias in merged.columns:
+            if isinstance(merged, dd.DataFrame):
+                merged[source_description_alias] = merged[source_description_alias].fillna("")
+            else:
+                merged[source_description_alias] = merged[source_description_alias].fillna("")

+        # Drop helper join column if present
+        merged = merged.drop(columns="temp_join_col", errors="ignore")
+        return merged

+    # ---------- emptiness & datetime ----------
+
+    def is_dataframe_empty(self, df: Union[pd.DataFrame, dd.DataFrame]) -> bool:
+        """
+        Safe emptiness check. For Dask, uses head(1) to avoid full compute.
         """
         if isinstance(df, dd.DataFrame):
             try:
+                head = df.head(1, npartitions=-1, compute=True)
+                return head.empty
             except Exception as e:
-                self.logger.error(f"
+                self.logger.error(f"is_dataframe_empty: Dask head() failed: {e}")
                 return False
+        if isinstance(df, pd.DataFrame):
             return df.empty
-        return False
+        self.logger.error("is_dataframe_empty: input must be a pandas or dask DataFrame.")
+        return False

     @staticmethod
-    def convert_to_datetime_dask(df, date_fields):
+    def convert_to_datetime_dask(df: dd.DataFrame, date_fields: Iterable[str]) -> dd.DataFrame:
         """
-        Parameters:
-        - df (dask.dataframe.DataFrame): The Dask DataFrame containing the columns.
-        - date_fields (list of str): List of column names to convert to datetime.
-
-        Returns:
-        - dask.dataframe.DataFrame: Updated DataFrame with specified columns converted to datetime.
+        Partition-wise datetime coercion with errors='coerce'.
         """
         for col in date_fields:
             if col in df.columns:
-                df[col] = df[col].map_partitions(
+                df[col] = df[col].map_partitions(
+                    pd.to_datetime, errors="coerce", meta=(col, "datetime64[ns]")
+                )
+        return df