sibi-dst 2025.1.13__py3-none-any.whl → 2025.8.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/__init__.py +7 -1
- sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +235 -342
- sibi_dst/df_helper/_df_helper.py +417 -117
- sibi_dst/df_helper/_parquet_artifact.py +255 -283
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
- sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
- sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
- sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
- sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
- sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
- sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
- sibi_dst/osmnx_helper/route_path_builder.py +45 -46
- sibi_dst/utils/base.py +302 -96
- sibi_dst/utils/clickhouse_writer.py +472 -206
- sibi_dst/utils/data_utils.py +139 -186
- sibi_dst/utils/data_wrapper.py +317 -73
- sibi_dst/utils/date_utils.py +1 -0
- sibi_dst/utils/df_utils.py +193 -213
- sibi_dst/utils/file_utils.py +3 -2
- sibi_dst/utils/filepath_generator.py +314 -152
- sibi_dst/utils/log_utils.py +581 -242
- sibi_dst/utils/manifest_manager.py +60 -76
- sibi_dst/utils/parquet_saver.py +33 -27
- sibi_dst/utils/phone_formatter.py +88 -95
- sibi_dst/utils/update_planner.py +180 -178
- sibi_dst/utils/webdav_client.py +116 -166
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/METADATA +1 -1
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/RECORD +29 -27
- {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.1.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -1,92 +1,84 @@
 from __future__ import annotations

-import warnings
-from typing import Any, Dict, Optional, Union, TypeVar
+from typing import Any, Dict, Optional, TypeVar, Union

 import dask.dataframe as dd
 import pandas as pd
+from fsspec import AbstractFileSystem
 from pydantic import BaseModel

 from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
-from sibi_dst.utils import ManagedResource
-from sibi_dst.utils import ParquetSaver, ClickHouseWriter
+from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
 from .backends.http import HttpConfig
 from .backends.parquet import ParquetConfig
 from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb

-warnings.filterwarnings("ignore")
 T = TypeVar("T", bound=BaseModel)


-# --- Backend Strategy Pattern Implementation ---
-
+# ---- Backend Strategy Pattern ----
 class BaseBackend:
-    """Abstract base class defining clear sync and async loading interfaces."""
-
-    def __init__(self, helper: DfHelper):
+    def __init__(self, helper: "DfHelper"):
         self.helper = helper
         self.logger = helper.logger
         self.debug = helper.debug
-        self.total_records = helper.total_records
-        self._entered = helper._entered  # Track if the helper is used in a context manager
+        self.total_records = helper.total_records

-    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-        """Synchronous data loading method. Must be implemented by sync backends."""
-        raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
+    def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+        raise NotImplementedError

-    async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-        """Asynchronous data loading method. By default, it calls the sync version."""
+    async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
         return self.load(**options)


 class SqlAlchemyBackend(BaseBackend):
-    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+    def load(self, **options):
         try:
-            # Process incoming filter options into the ParamsConfig object
-            if options and hasattr(self.helper._backend_params, 'parse_params'):
+            if options and hasattr(self.helper._backend_params, "parse_params"):
                 self.helper._backend_params.parse_params(options)

-
+            with SqlAlchemyLoadFromDb(
                 plugin_sqlalchemy=self.helper.backend_db_connection,
                 plugin_query=self.helper._backend_query,
                 plugin_params=self.helper._backend_params,
                 logger=self.logger,
-                debug=self.debug
-            )
-
-
+                debug=self.debug,
+            ) as db_loader:
+                self.total_records, result = db_loader.build_and_load()
+                return self.total_records, result
         except Exception as e:
             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)


 class ParquetBackend(BaseBackend):
-    """This backend is also purely synchronous."""
-
-    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+    def load(self, **options):
         try:
             df = self.helper.backend_parquet.load_files()
+            if len(df.head(1)) == 0:
+                return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+
             if options and df is not None:
-                df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
+                df = FilterHandler("dask", logger=self.logger, debug=False).apply_filters(df, filters=options)
+                if len(df.head(1)) == 0:
+                    self.logger.debug("No records after filters; returning empty DataFrame.")
+                    return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)

             df = df.persist()
-
-            self.total_records = len(df) or -1  # If df is empty, set total_records to -1
+            self.total_records = len(df) or -1
             return self.total_records, df
         except Exception as e:
-            self.total_records = -1
-            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=
+            self.total_records = -1
+            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)


 class HttpBackend(BaseBackend):
-    """This backend is purely asynchronous."""
-
-    def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-        # This will correctly fail by raising NotImplementedError from the base class.
+    def load(self, **options):
+        # Will raise NotImplementedError from helper.backend_http if sync not supported
         return self.helper.backend_http.fetch_data(**options)

-    async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
+    async def aload(self, **options):
         if not self.helper.backend_http:
             self.logger.warning("HTTP plugin not configured properly.")
             self.total_records = -1
@@ -96,174 +88,189 @@ class HttpBackend(BaseBackend):
         return self.total_records, result


-# --- Main DfHelper Facade Class ---
-
+# ---- Main DfHelper ----
 class DfHelper(ManagedResource):
-    """
-    A reusable utility for loading data. It provides both sync (`load`) and
-    async (`aload`) methods to accommodate different backends.
-    """
     _BACKEND_STRATEGIES = {
-        'sqlalchemy': SqlAlchemyBackend,
-        'parquet': ParquetBackend,
-        'http': HttpBackend,
+        "sqlalchemy": SqlAlchemyBackend,
+        "parquet": ParquetBackend,
+        "http": HttpBackend,
     }

     _BACKEND_ATTR_MAP = {
-        'sqlalchemy': 'backend_db_connection',
-        'parquet': 'backend_parquet',
-        'http': 'backend_http',
+        "sqlalchemy": "backend_db_connection",
+        "parquet": "backend_parquet",
+        "http": "backend_http",
     }

-    default_config: Dict = None
+    default_config: Dict[str, Any] = None

-    def __init__(self, backend='sqlalchemy', **kwargs):
+    def __init__(self, backend="sqlalchemy", **kwargs):
         self.default_config = self.default_config or {}
         kwargs = {**self.default_config.copy(), **kwargs}
         super().__init__(**kwargs)
         self.backend = backend

-        # Need to set default values for backend-specific configurations
+        # Ensure defaults flow to plugin configs
         kwargs.setdefault("debug", self.debug)
         kwargs.setdefault("fs", self.fs)
         kwargs.setdefault("logger", self.logger)
-        self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
+
+        self.total_records = -1
         self._backend_query = self._get_config(QueryConfig, kwargs)
         self._backend_params = self._get_config(ParamsConfig, kwargs)
+
         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
         self.backend_parquet: Optional[ParquetConfig] = None
         self.backend_http: Optional[HttpConfig] = None

-        if self.backend == 'sqlalchemy':
+        if self.backend == "sqlalchemy":
             self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
-        elif self.backend == 'parquet':
+        elif self.backend == "parquet":
             self.backend_parquet = self._get_config(ParquetConfig, kwargs)
-        elif self.backend == 'http':
+        elif self.backend == "http":
             self.backend_http = self._get_config(HttpConfig, kwargs)

-        strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
-        if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
-        self.backend_strategy = strategy_class(self)
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        self._cleanup()
-        super().__exit__(exc_type, exc_value, traceback)
+        strategy_cls = self._BACKEND_STRATEGIES.get(self.backend)
+        if not strategy_cls:
+            raise ValueError(f"Unsupported backend: {self.backend}")
+        self.backend_strategy = strategy_cls(self)

+    # ---------- ManagedResource hooks ----------
     def _cleanup(self):
         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
         if not attr_name:
             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
             return
-        # Get the actual config object (e.g., self.backend_db_connection)
         active_config = getattr(self, attr_name, None)
-
         if active_config and hasattr(active_config, "close"):
             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
             active_config.close()

+    async def _acleanup(self):
+        self.logger.warning(
+            "DfHelper instance was not used in an async context manager; cleanup is being called manually."
+        )
+        attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+        if not attr_name:
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            return
+        active_config = getattr(self, attr_name, None)
+        if active_config and hasattr(active_config, "aclose"):
+            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+            await active_config.aclose()
+
+    # ---------- config helpers ----------
     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
-        recognized_keys = set(model.model_fields.keys())
-        model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
+        recognized = set(model.model_fields.keys())
+        model_kwargs = {k: kwargs[k] for k in recognized if k in kwargs}
         return model(**model_kwargs)

+    # ---------- load/aload ----------
     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        """Loads data synchronously. Fails if backend is async-only."""
         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
-
-        self.logger.warning(
-            "DfHelper instance was not used in a context manager; cleanup is being called manually.")
-        self._cleanup()
+        self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
         return df.compute() if as_pandas else df

     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-        """Loads data asynchronously from any backend."""
         self.total_records, df = await self.backend_strategy.aload(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
         return df.compute() if as_pandas else df

+    # ---------- dataframe post-processing ----------
     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-        df_params = self._backend_params.df_params
-        if not df_params: return df
-        fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
-                                               df_params.get("index_col"))
-        if not any([fieldnames, column_names, index_col]): return df
         self.logger.debug("Post-processing DataFrame.")
+        df_params = self._backend_params.df_params
+        if not df_params:
+            return df
+        fieldnames = df_params.get("fieldnames")
+        column_names = df_params.get("column_names")
+        index_col = df_params.get("index_col")
+
         if fieldnames:
-            valid_fieldnames = [f for f in fieldnames if f in df.columns]
-            if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
-                f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
-            df = df[valid_fieldnames]
+            valid = [f for f in fieldnames if f in df.columns]
+            if len(valid) < len(fieldnames):
+                self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}")
+            df = df[valid]
         if column_names:
-            if len(df.columns) != len(column_names): raise ValueError(
-                f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
+            if len(df.columns) != len(column_names):
+                raise ValueError(
+                    f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided."
+                )
             df = df.rename(columns=dict(zip(df.columns, column_names)))
         if index_col:
-            if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+            if index_col not in df.columns:
+                raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
             df = df.set_index(index_col)
+
+        self.logger.debug("Post-processing complete.")
         return df

     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
         field_map = self._backend_params.field_map or {}
-        if not isinstance(field_map, dict) or not field_map:
-        if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
-        self.logger.debug("Processing loaded data...applying rename mapping if necessary.")
-        rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
-        if rename_mapping: df = df.rename(columns=rename_mapping)
+        if not isinstance(field_map, dict) or not field_map:
+            return df
+        if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
+            return df
+        self.logger.debug("Applying rename mapping if necessary.")
+        rename_map = {k: v for k, v in field_map.items() if k in df.columns}
+        if rename_map:
+            df = df.rename(columns=rename_map)
         return df

+    # ---------- sinks ----------
     def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
-
-
-            return
-        fs = kwargs.pop('fs', self.fs)
+        fs: AbstractFileSystem = kwargs.get("fs", self.fs)
+        path: str = kwargs.get("parquet_storage_path")
         if not fs:
-            raise ValueError("
-        path = kwargs.pop('parquet_storage_path', None)
+            raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
         if not path:
-            raise ValueError("parquet_storage_path must be provided
-
-
-
-
-
-
-
-
-
+            raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
+        if len(df.head(1)) == 0:
+            self.logger.warning("Skipping save: The provided DataFrame is empty.")
+            return
+
+        with ParquetSaver(
+            df_result=df,
+            parquet_storage_path=path,
+            fs=fs,
+            debug=self.debug,
+            logger=self.logger,
+            verbose=self.verbose,
+            **kwargs,
+        ) as saver:
             saver.save_to_parquet(parquet_filename)

-        self.logger.debug(f"
+        self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")

     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-        if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
+        if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
             return
-
         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
             writer.save_to_clickhouse(df)
         self.logger.debug("Save to ClickHouse completed.")

-    def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
-        """Synchronous convenience method for loading a date range."""
+    # ---------- convenience period loaders ----------
+    def load_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
         return self.load(**final_kwargs)

-    async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
-        """Asynchronous convenience method for loading a date range."""
+    async def aload_period(self, dt_field: str, start: str, end: str, **kwargs):
         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
         return await self.aload(**final_kwargs)

     def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
         start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
-        if start_date > end_date: raise ValueError("'start' date cannot be later than 'end' date.")
+        if start_date > end_date:
+            raise ValueError("'start' date cannot be later than 'end' date.")
         field_map = self._backend_params.field_map or {}
         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
-        if len(reverse_map) != len(field_map): self.logger.warning(
-            "field_map values are not unique; reverse mapping may be unreliable.")
+        if len(reverse_map) != len(field_map):
+            self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
         mapped_field = reverse_map.get(dt_field, dt_field)
         if start_date == end_date:
             kwargs[f"{mapped_field}__date"] = start_date
@@ -271,3 +278,296 @@ class DfHelper(ManagedResource):
             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
         self.logger.debug(f"Period load generated filters: {kwargs}")
         return kwargs
+
+# from __future__ import annotations
+#
+# from typing import Any, Dict, Optional, Union, TypeVar
+#
+# import dask.dataframe as dd
+# import pandas as pd
+# from fsspec import AbstractFileSystem
+# from pydantic import BaseModel
+#
+# from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
+# from sibi_dst.utils import ManagedResource
+# from sibi_dst.utils import ParquetSaver, ClickHouseWriter
+# from .backends.http import HttpConfig
+# from .backends.parquet import ParquetConfig
+# from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
+#
+# T = TypeVar("T", bound=BaseModel)
+#
+#
+# # --- Backend Strategy Pattern Implementation ---
+#
+# class BaseBackend:
+#     """Abstract base class defining clear sync and async loading interfaces."""
+#
+#     def __init__(self, helper: DfHelper):
+#         self.helper = helper
+#         self.logger = helper.logger
+#         self.debug = helper.debug
+#         self.total_records = helper.total_records  # no records loaded yet
+#
+#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+#         """Synchronous data loading method. Must be implemented by sync backends."""
+#         raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
+#
+#     async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+#         """Asynchronous data loading method. By default, it calls the sync version."""
+#         return self.load(**options)
+#
+#
+# class SqlAlchemyBackend(BaseBackend):
+#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+#         try:
+#             # Process incoming filter options into the ParamsConfig object
+#             if options and hasattr(self.helper._backend_params, 'parse_params'):
+#                 self.helper._backend_params.parse_params(options)
+#
+#             with SqlAlchemyLoadFromDb(
+#                 plugin_sqlalchemy=self.helper.backend_db_connection,
+#                 plugin_query=self.helper._backend_query,
+#                 plugin_params=self.helper._backend_params,
+#                 logger=self.logger,
+#                 debug=self.debug
+#             ) as db_loader:
+#                 self.total_records, result = db_loader.build_and_load()
+#                 return self.total_records, result
+#         except Exception as e:
+#             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
+#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#
+# class ParquetBackend(BaseBackend):
+#     """This backend is also purely synchronous."""
+#
+#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+#         try:
+#             df = self.helper.backend_parquet.load_files()
+#             if len(df.head(1)) == 0:
+#                 return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#             if options and df is not None:
+#                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
+#                 if len(df.head(1)) == 0:
+#                     self.logger.debug("No records found after applying filters; returning empty DataFrame.")
+#                     return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#             df = df.persist()
+#
+#             self.total_records = len(df) or -1  # If df is empty, set total_records to -1
+#             return self.total_records, df
+#         except Exception as e:
+#             self.total_records = -1  # Reset total_records on failure
+#             self.logger.error(f"Failed to load data from parquet: {e}")
+#             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#
+#
+# class HttpBackend(BaseBackend):
+#     """This backend is purely asynchronous."""
+#
+#     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+#         # This will correctly fail by raising NotImplementedError from the base class.
+#         return self.helper.backend_http.fetch_data(**options)
+#
+#     async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
+#         if not self.helper.backend_http:
+#             self.logger.warning("HTTP plugin not configured properly.")
+#             self.total_records = -1
+#             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+#         result = await self.helper.backend_http.fetch_data(**options)
+#         self.total_records = len(result)
+#         return self.total_records, result
+#
+#
+# # --- Main DfHelper Facade Class ---
+#
+# class DfHelper(ManagedResource):
+#     """
+#     A reusable utility for loading data. It provides both sync (`load`) and
+#     async (`aload`) methods to accommodate different backends.
+#     """
+#     _BACKEND_STRATEGIES = {
+#         'sqlalchemy': SqlAlchemyBackend,
+#         'parquet': ParquetBackend,
+#         'http': HttpBackend,
+#     }
+#
+#     _BACKEND_ATTR_MAP = {
+#         'sqlalchemy': 'backend_db_connection',
+#         'parquet': 'backend_parquet',
+#         'http': 'backend_http',
+#     }
+#
+#     default_config: Dict = None
+#
+#     def __init__(self, backend='sqlalchemy', **kwargs):
+#         self.default_config = self.default_config or {}
+#         kwargs = {**self.default_config.copy(), **kwargs}
+#         super().__init__(**kwargs)
+#         self.backend = backend
+#
+#         # Need to set default values for backend-specific configurations
+#         kwargs.setdefault("debug", self.debug)
+#         kwargs.setdefault("fs", self.fs)
+#         kwargs.setdefault("logger", self.logger)
+#         self.total_records = -1  # Initialize total_records to -1 to indicate no records loaded yet
+#         self._backend_query = self._get_config(QueryConfig, kwargs)
+#         self._backend_params = self._get_config(ParamsConfig, kwargs)
+#         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
+#         self.backend_parquet: Optional[ParquetConfig] = None
+#         self.backend_http: Optional[HttpConfig] = None
+#
+#         if self.backend == 'sqlalchemy':
+#             self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
+#         elif self.backend == 'parquet':
+#             self.backend_parquet = self._get_config(ParquetConfig, kwargs)
+#         elif self.backend == 'http':
+#             self.backend_http = self._get_config(HttpConfig, kwargs)
+#
+#         strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
+#         if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
+#         self.backend_strategy = strategy_class(self)
+#
+#     def _cleanup(self):
+#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+#         if not attr_name:
+#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+#             return
+#         # Get the actual config object (e.g., self.backend_db_connection)
+#         active_config = getattr(self, attr_name, None)
+#
+#         if active_config and hasattr(active_config, "close"):
+#             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+#             active_config.close()
+#
+#     async def _acleanup(self):
+#         self.logger.warning("DfHelper instance was not used in an async context manager; cleanup is being called manually.")
+#         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+#         if not attr_name:
+#             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+#             return
+#         # Get the actual config object (e.g., self.backend_db_connection)
+#         active_config = getattr(self, attr_name, None)
+#         if active_config and hasattr(active_config, "aclose"):
+#             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+#             await active_config.aclose()
+#
+#     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
+#         recognized_keys = set(model.model_fields.keys())
+#         model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
+#         return model(**model_kwargs)
+#
+#     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+#         """Loads data synchronously. Fails if backend is async-only."""
+#         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
+#         self.total_records, df = self.backend_strategy.load(**options)
+#         df = self._process_loaded_data(df)
+#         df = self._post_process_df(df)
+#         self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+#         return df.compute() if as_pandas else df
+#
+#     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+#         """Loads data asynchronously from any backend."""
+#         self.total_records, df = await self.backend_strategy.aload(**options)
+#         df = self._process_loaded_data(df)
+#         df = self._post_process_df(df)
+#         return df.compute() if as_pandas else df
+#
+#     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
+#         self.logger.debug("Post-processing DataFrame.")
+#         df_params = self._backend_params.df_params
+#         if not df_params: return df
+#         fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
+#                                                df_params.get("index_col"))
+#         if not any([fieldnames, column_names, index_col]): return df
+#
+#         if fieldnames:
+#             valid_fieldnames = [f for f in fieldnames if f in df.columns]
+#             if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
+#                 f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
+#             df = df[valid_fieldnames]
+#         if column_names:
+#             if len(df.columns) != len(column_names): raise ValueError(
+#                 f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
+#             df = df.rename(columns=dict(zip(df.columns, column_names)))
+#         if index_col:
+#             if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+#             df = df.set_index(index_col)
+#         self.logger.debug("Post-processing complete.")
+#         return df
+#
+#     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
+#         field_map = self._backend_params.field_map or {}
+#         if not isinstance(field_map, dict) or not field_map: return df
+#         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
+#         self.logger.debug("Processing loaded data...applying rename mapping if necessary.")
+#         rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
+#         if rename_mapping: df = df.rename(columns=rename_mapping)
+#         self.logger.debug("Rename mapping complete...")
+#         return df
+#
+#     def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
+#         """Saves a Dask DataFrame to a Parquet file with validation."""
+#
+#         # Use .get() for cleaner access to optional arguments.
+#         fs: AbstractFileSystem = kwargs.get('fs', self.fs)
+#         path: str = kwargs.get('parquet_storage_path')
+#
+#         # Guard clauses to fail fast with clear errors.
+#         if not fs:
+#             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
+#         if not path:
+#             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
+#
+#         # An efficient, idiomatic way to check if a Dask DataFrame is empty.
+#         if len(df.head(1)) == 0:
+#             self.logger.warning("Skipping save: The provided DataFrame is empty.")
+#             return
+#
+#         with ParquetSaver(
+#             df_result=df,
+#             parquet_storage_path=path,
+#             fs=fs,
+#             debug=self.debug,
+#             logger=self.logger,
+#             verbose=self.verbose,
+#             **kwargs
+#         ) as saver:
+#             saver.save_to_parquet(parquet_filename)
+#
+#         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
+#
+#     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
+#         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
+#             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
+#             return
+#
+#         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+#             writer.save_to_clickhouse(df)
+#         self.logger.debug("Save to ClickHouse completed.")
+#
+#     def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+#         """Synchronous convenience method for loading a date range."""
+#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+#         return self.load(**final_kwargs)
+#
+#     async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+#         """Asynchronous convenience method for loading a date range."""
+#         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+#         return await self.aload(**final_kwargs)
+#
+#     def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
+#         start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
+#         if start_date > end_date:
+#             raise ValueError("'start' date cannot be later than 'end' date.")
+#         field_map = self._backend_params.field_map or {}
+#         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
+#         if len(reverse_map) != len(field_map):
+#             self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
+#         mapped_field = reverse_map.get(dt_field, dt_field)
+#         if start_date == end_date:
+#             kwargs[f"{mapped_field}__date"] = start_date
+#         else:
+#             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
+#         self.logger.debug(f"Period load generated filters: {kwargs}")
+#         return kwargs