sibi-dst 2025.1.13-py3-none-any.whl → 2025.8.2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. sibi_dst/__init__.py +7 -1
  2. sibi_dst/df_helper/__init__.py +3 -2
  3. sibi_dst/df_helper/_artifact_updater_async.py +238 -0
  4. sibi_dst/df_helper/_artifact_updater_threaded.py +195 -0
  5. sibi_dst/df_helper/_df_helper.py +418 -118
  6. sibi_dst/df_helper/_parquet_artifact.py +275 -283
  7. sibi_dst/df_helper/_parquet_reader.py +9 -10
  8. sibi_dst/df_helper/backends/parquet/_parquet_options.py +8 -4
  9. sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +68 -107
  10. sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +15 -0
  11. sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +105 -255
  12. sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +90 -42
  13. sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +192 -0
  14. sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +122 -72
  15. sibi_dst/osmnx_helper/route_path_builder.py +45 -46
  16. sibi_dst/utils/__init__.py +2 -0
  17. sibi_dst/utils/base.py +235 -100
  18. sibi_dst/utils/business_days.py +248 -0
  19. sibi_dst/utils/clickhouse_writer.py +472 -206
  20. sibi_dst/utils/data_utils.py +139 -186
  21. sibi_dst/utils/data_wrapper.py +392 -88
  22. sibi_dst/utils/date_utils.py +711 -393
  23. sibi_dst/utils/df_utils.py +193 -213
  24. sibi_dst/utils/file_age_checker.py +301 -0
  25. sibi_dst/utils/file_utils.py +3 -2
  26. sibi_dst/utils/filepath_generator.py +314 -152
  27. sibi_dst/utils/log_utils.py +581 -242
  28. sibi_dst/utils/manifest_manager.py +60 -76
  29. sibi_dst/utils/parquet_saver.py +33 -27
  30. sibi_dst/utils/periods.py +42 -0
  31. sibi_dst/utils/phone_formatter.py +88 -95
  32. sibi_dst/utils/update_planner.py +180 -178
  33. sibi_dst/utils/webdav_client.py +116 -166
  34. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/METADATA +1 -1
  35. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/RECORD +36 -30
  36. sibi_dst/df_helper/_artifact_updater_multi_wrapper.py +0 -422
  37. {sibi_dst-2025.1.13.dist-info → sibi_dst-2025.8.2.dist-info}/WHEEL +0 -0
@@ -1,92 +1,84 @@
  from __future__ import annotations

- import warnings
- from typing import Any, Dict, Optional, Union, TypeVar
+ from typing import Any, Dict, Optional, TypeVar, Union

  import dask.dataframe as dd
  import pandas as pd
+ from fsspec import AbstractFileSystem
  from pydantic import BaseModel

  from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
- from sibi_dst.utils import ManagedResource
- from sibi_dst.utils import ParquetSaver, ClickHouseWriter
+ from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
  from .backends.http import HttpConfig
  from .backends.parquet import ParquetConfig
  from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb

- warnings.filterwarnings("ignore")
  T = TypeVar("T", bound=BaseModel)


- # --- Backend Strategy Pattern Implementation ---
-
+ # ---- Backend Strategy Pattern ----
  class BaseBackend:
-     """Abstract base class defining clear sync and async loading interfaces."""
-
-     def __init__(self, helper: DfHelper):
+     def __init__(self, helper: "DfHelper"):
          self.helper = helper
          self.logger = helper.logger
          self.debug = helper.debug
-         self.total_records = helper.total_records # no records loaded yet
-         self._entered = helper._entered # Track if the helper is used in a context manager
+         self.total_records = helper.total_records

-     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-         """Synchronous data loading method. Must be implemented by sync backends."""
-         raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
+     def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+         raise NotImplementedError

-     async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-         """Asynchronous data loading method. By default, it calls the sync version."""
+     async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
          return self.load(**options)


  class SqlAlchemyBackend(BaseBackend):
-     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+     def load(self, **options):
          try:
-             # Process incoming filter options into the ParamsConfig object
-             if options and hasattr(self.helper._backend_params, 'parse_params'):
+             if options and hasattr(self.helper._backend_params, "parse_params"):
                  self.helper._backend_params.parse_params(options)

-             db_loader = SqlAlchemyLoadFromDb(
+             with SqlAlchemyLoadFromDb(
                  plugin_sqlalchemy=self.helper.backend_db_connection,
                  plugin_query=self.helper._backend_query,
                  plugin_params=self.helper._backend_params,
                  logger=self.logger,
-                 debug=self.debug
-             )
-             self.total_records, result = db_loader.build_and_load()
-             return self.total_records, result
+                 debug=self.debug,
+             ) as db_loader:
+                 self.total_records, result = db_loader.build_and_load()
+                 return self.total_records, result
          except Exception as e:
              self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
              return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)


  class ParquetBackend(BaseBackend):
-     """This backend is also purely synchronous."""
-
-     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+     def load(self, **options):
          try:
              df = self.helper.backend_parquet.load_files()
+             if len(df.head(1)) == 0:
+                 return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+
              if options and df is not None:
-                 df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
+                 df = FilterHandler("dask", logger=self.logger, debug=False).apply_filters(df, filters=options)
+                 if len(df.head(1)) == 0:
+                     self.logger.debug("No records after filters; returning empty DataFrame.")
+                     return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)

              df = df.persist()
-
-             self.total_records = len(df) or -1 # If df is empty, set total_records to -1
+             self.total_records = len(df) or -1
              return self.total_records, df
          except Exception as e:
-             self.total_records = -1 # Reset total_records on failure
-             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=True)
+             self.total_records = -1
+             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
              return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)


  class HttpBackend(BaseBackend):
-     """This backend is purely asynchronous."""
-
-     def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
-         # This will correctly fail by raising NotImplementedError from the base class.
+     def load(self, **options):
+         # Will raise NotImplementedError from helper.backend_http if sync not supported
          return self.helper.backend_http.fetch_data(**options)

-     async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
+     async def aload(self, **options):
          if not self.helper.backend_http:
              self.logger.warning("HTTP plugin not configured properly.")
              self.total_records = -1
@@ -96,174 +88,189 @@ class HttpBackend(BaseBackend):
          return self.total_records, result


- # --- Main DfHelper Facade Class ---
-
+ # ---- Main DfHelper ----
  class DfHelper(ManagedResource):
-     """
-     A reusable utility for loading data. It provides both sync (`load`) and
-     async (`aload`) methods to accommodate different backends.
-     """
      _BACKEND_STRATEGIES = {
-         'sqlalchemy': SqlAlchemyBackend,
-         'parquet': ParquetBackend,
-         'http': HttpBackend,
+         "sqlalchemy": SqlAlchemyBackend,
+         "parquet": ParquetBackend,
+         "http": HttpBackend,
      }

      _BACKEND_ATTR_MAP = {
-         'sqlalchemy': 'backend_db_connection',
-         'parquet': 'backend_parquet',
-         'http': 'backend_http',
+         "sqlalchemy": "backend_db_connection",
+         "parquet": "backend_parquet",
+         "http": "backend_http",
      }

-     default_config: Dict = None
+     default_config: Dict[str, Any] = None

-     def __init__(self, backend='sqlalchemy', **kwargs):
+     def __init__(self, backend="sqlalchemy", **kwargs):
          self.default_config = self.default_config or {}
          kwargs = {**self.default_config.copy(), **kwargs}
          super().__init__(**kwargs)
          self.backend = backend

-         # Need to set default values for backend-specific configurations
+         # Ensure defaults flow to plugin configs
          kwargs.setdefault("debug", self.debug)
          kwargs.setdefault("fs", self.fs)
          kwargs.setdefault("logger", self.logger)
-         self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
+
+         self.total_records = -1
          self._backend_query = self._get_config(QueryConfig, kwargs)
          self._backend_params = self._get_config(ParamsConfig, kwargs)
+
          self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
          self.backend_parquet: Optional[ParquetConfig] = None
          self.backend_http: Optional[HttpConfig] = None

-         if self.backend == 'sqlalchemy':
+         if self.backend == "sqlalchemy":
              self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
-         elif self.backend == 'parquet':
+         elif self.backend == "parquet":
              self.backend_parquet = self._get_config(ParquetConfig, kwargs)
-         elif self.backend == 'http':
+         elif self.backend == "http":
              self.backend_http = self._get_config(HttpConfig, kwargs)

-         strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
-         if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
-         self.backend_strategy = strategy_class(self)
-
-     def __exit__(self, exc_type, exc_value, traceback):
-         self._cleanup()
-         super().__exit__(exc_type, exc_value, traceback)
+         strategy_cls = self._BACKEND_STRATEGIES.get(self.backend)
+         if not strategy_cls:
+             raise ValueError(f"Unsupported backend: {self.backend}")
+         self.backend_strategy = strategy_cls(self)

+     # ---------- ManagedResource hooks ----------
      def _cleanup(self):
          attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
          if not attr_name:
              self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
              return
-         # Get the actual config object (e.g., self.backend_db_connection)
          active_config = getattr(self, attr_name, None)
-
          if active_config and hasattr(active_config, "close"):
              self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
              active_config.close()

+     async def _acleanup(self):
+         self.logger.warning(
+             "DfHelper instance was not used in an async context manager; cleanup is being called manually."
+         )
+         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+         if not attr_name:
+             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+             return
+         active_config = getattr(self, attr_name, None)
+         if active_config and hasattr(active_config, "aclose"):
+             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+             await active_config.aclose()
+
+     # ---------- config helpers ----------
      def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
-         recognized_keys = set(model.model_fields.keys())
-         model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
+         recognized = set(model.model_fields.keys())
+         model_kwargs = {k: kwargs[k] for k in recognized if k in kwargs}
          return model(**model_kwargs)

+     # ---------- load/aload ----------
      def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-         """Loads data synchronously. Fails if backend is async-only."""
          self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
          self.total_records, df = self.backend_strategy.load(**options)
          df = self._process_loaded_data(df)
          df = self._post_process_df(df)
-         if not self._entered:
-             self.logger.warning(
-                 "DfHelper instance was not used in a context manager; cleanup is being called manually.")
-             self._cleanup()
+         self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
          return df.compute() if as_pandas else df

      async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
-         """Loads data asynchronously from any backend."""
          self.total_records, df = await self.backend_strategy.aload(**options)
          df = self._process_loaded_data(df)
          df = self._post_process_df(df)
          return df.compute() if as_pandas else df

+     # ---------- dataframe post-processing ----------
      def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
-         df_params = self._backend_params.df_params
-         if not df_params: return df
-         fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
-                                                df_params.get("index_col"))
-         if not any([fieldnames, column_names, index_col]): return df
          self.logger.debug("Post-processing DataFrame.")
+         df_params = self._backend_params.df_params
+         if not df_params:
+             return df
+         fieldnames = df_params.get("fieldnames")
+         column_names = df_params.get("column_names")
+         index_col = df_params.get("index_col")
+
          if fieldnames:
-             valid_fieldnames = [f for f in fieldnames if f in df.columns]
-             if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
-                 f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
-             df = df[valid_fieldnames]
+             valid = [f for f in fieldnames if f in df.columns]
+             if len(valid) < len(fieldnames):
+                 self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}")
+             df = df[valid]
          if column_names:
-             if len(df.columns) != len(column_names): raise ValueError(
-                 f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
+             if len(df.columns) != len(column_names):
+                 raise ValueError(
+                     f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided."
+                 )
              df = df.rename(columns=dict(zip(df.columns, column_names)))
          if index_col:
-             if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+             if index_col not in df.columns:
+                 raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
              df = df.set_index(index_col)
+
+         self.logger.debug("Post-processing complete.")
          return df

      def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
          field_map = self._backend_params.field_map or {}
-         if not isinstance(field_map, dict) or not field_map: return df
-         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
-         self.logger.debug("Processing loaded data...")
-         rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
-         if rename_mapping: df = df.rename(columns=rename_mapping)
+         if not isinstance(field_map, dict) or not field_map:
+             return df
+         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
+             return df
+         self.logger.debug("Applying rename mapping if necessary.")
+         rename_map = {k: v for k, v in field_map.items() if k in df.columns}
+         if rename_map:
+             df = df.rename(columns=rename_map)
          return df

+     # ---------- sinks ----------
      def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
-         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
-             self.logger.warning("Cannot save to parquet; DataFrame is empty.")
-             return
-         fs = kwargs.pop('fs', self.fs)
+         fs: AbstractFileSystem = kwargs.get("fs", self.fs)
+         path: str = kwargs.get("parquet_storage_path")
          if not fs:
-             raise ValueError("Filesystem (fs) must be provided to save to parquet.")
-         path = kwargs.pop('parquet_storage_path', None)
+             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
          if not path:
-             raise ValueError("parquet_storage_path must be provided to save to parquet.")
-         writer_config = {
-             'df_result': df,
-             'parquet_storage_path': path,
-             'fs': fs,
-             'debug': self.debug,
-             'logger': self.logger,
-             'verbose': self.verbose,
-         }
-         with ParquetSaver(**writer_config) as saver:
+             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
+         if len(df.head(1)) == 0:
+             self.logger.warning("Skipping save: The provided DataFrame is empty.")
+             return
+
+         with ParquetSaver(
+             df_result=df,
+             parquet_storage_path=path,
+             fs=fs,
+             debug=self.debug,
+             logger=self.logger,
+             verbose=self.verbose,
+             **kwargs,
+         ) as saver:
              saver.save_to_parquet(parquet_filename)

-         self.logger.debug(f"Parquet saved to {parquet_filename} in {path}.")
+         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")

      def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
-         if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
+         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
              self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
              return
-
-         with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+         with ClickHouseWriter(debug=self.debug, logger=self.logger, fs=self.fs, verbose=self.verbose, **credentials) as writer:
              writer.save_to_clickhouse(df)
          self.logger.debug("Save to ClickHouse completed.")

-     def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
-         """Synchronous convenience method for loading a date range."""
+     # ---------- convenience period loaders ----------
+     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
          final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
          return self.load(**final_kwargs)

-     async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
-         """Asynchronous convenience method for loading a date range."""
+     async def aload_period(self, dt_field: str, start: str, end: str, **kwargs):
          final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
          return await self.aload(**final_kwargs)

      def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
          start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
-         if start_date > end_date: raise ValueError("'start' date cannot be later than 'end' date.")
+         if start_date > end_date:
+             raise ValueError("'start' date cannot be later than 'end' date.")
          field_map = self._backend_params.field_map or {}
          reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
-         if len(reverse_map) != len(field_map): self.logger.warning(
-             "field_map values are not unique; reverse mapping may be unreliable.")
+         if len(reverse_map) != len(field_map):
+             self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
          mapped_field = reverse_map.get(dt_field, dt_field)
          if start_date == end_date:
              kwargs[f"{mapped_field}__date"] = start_date
@@ -271,3 +278,296 @@ class DfHelper(ManagedResource):
              kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
          self.logger.debug(f"Period load generated filters: {kwargs}")
          return kwargs
+
+ # from __future__ import annotations
+ #
+ # from typing import Any, Dict, Optional, Union, TypeVar
+ #
+ # import dask.dataframe as dd
+ # import pandas as pd
+ # from fsspec import AbstractFileSystem
+ # from pydantic import BaseModel
+ #
+ # from sibi_dst.df_helper.core import QueryConfig, ParamsConfig, FilterHandler
+ # from sibi_dst.utils import ManagedResource
+ # from sibi_dst.utils import ParquetSaver, ClickHouseWriter
+ # from .backends.http import HttpConfig
+ # from .backends.parquet import ParquetConfig
+ # from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
+ #
+ # T = TypeVar("T", bound=BaseModel)
+ #
+ #
+ # # --- Backend Strategy Pattern Implementation ---
+ #
+ # class BaseBackend:
+ # """Abstract base class defining clear sync and async loading interfaces."""
+ #
+ # def __init__(self, helper: DfHelper):
+ # self.helper = helper
+ # self.logger = helper.logger
+ # self.debug = helper.debug
+ # self.total_records = helper.total_records # no records loaded yet
+ #
+ # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+ # """Synchronous data loading method. Must be implemented by sync backends."""
+ # raise NotImplementedError(f"Backend '{self.__class__.__name__}' does not support synchronous loading.")
+ #
+ # async def aload(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+ # """Asynchronous data loading method. By default, it calls the sync version."""
+ # return self.load(**options)
+ #
+ #
+ # class SqlAlchemyBackend(BaseBackend):
+ # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+ # try:
+ # # Process incoming filter options into the ParamsConfig object
+ # if options and hasattr(self.helper._backend_params, 'parse_params'):
+ # self.helper._backend_params.parse_params(options)
+ #
+ # with SqlAlchemyLoadFromDb(
+ # plugin_sqlalchemy=self.helper.backend_db_connection,
+ # plugin_query=self.helper._backend_query,
+ # plugin_params=self.helper._backend_params,
+ # logger=self.logger,
+ # debug=self.debug
+ # ) as db_loader:
+ # self.total_records, result = db_loader.build_and_load()
+ # return self.total_records, result
+ # except Exception as e:
+ # self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
+ # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+ #
+ #
+ # class ParquetBackend(BaseBackend):
+ # """This backend is also purely synchronous."""
+ #
+ # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+ # try:
+ # df = self.helper.backend_parquet.load_files()
+ # if len(df.head(1)) == 0:
+ # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+ # if options and df is not None:
+ # df = FilterHandler('dask', logger=self.logger, debug=False).apply_filters(df, filters=options)
+ # if len(df.head(1)) == 0:
+ # self.logger.debug("No records found after applying filters; returning empty DataFrame.")
+ # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+ # df = df.persist()
+ #
+ # self.total_records = len(df) or -1 # If df is empty, set total_records to -1
+ # return self.total_records, df
+ # except Exception as e:
+ # self.total_records = -1 # Reset total_records on failure
+ # self.logger.error(f"Failed to load data from parquet: {e}")
+ # return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+ #
+ #
+ # class HttpBackend(BaseBackend):
+ # """This backend is purely asynchronous."""
+ #
+ # def load(self, **options) -> tuple[Any, Any] | Union[dd.DataFrame | pd.DataFrame]:
+ # # This will correctly fail by raising NotImplementedError from the base class.
+ # return self.helper.backend_http.fetch_data(**options)
+ #
+ # async def aload(self, **options) -> tuple[Any, Any] | Union[pd.DataFrame, dd.DataFrame]:
+ # if not self.helper.backend_http:
+ # self.logger.warning("HTTP plugin not configured properly.")
+ # self.total_records = -1
+ # return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+ # result = await self.helper.backend_http.fetch_data(**options)
+ # self.total_records = len(result)
+ # return self.total_records, result
+ #
+ #
+ # # --- Main DfHelper Facade Class ---
+ #
+ # class DfHelper(ManagedResource):
+ # """
+ # A reusable utility for loading data. It provides both sync (`load`) and
+ # async (`aload`) methods to accommodate different backends.
+ # """
+ # _BACKEND_STRATEGIES = {
+ # 'sqlalchemy': SqlAlchemyBackend,
+ # 'parquet': ParquetBackend,
+ # 'http': HttpBackend,
+ # }
+ #
+ # _BACKEND_ATTR_MAP = {
+ # 'sqlalchemy': 'backend_db_connection',
+ # 'parquet': 'backend_parquet',
+ # 'http': 'backend_http',
+ # }
+ #
+ # default_config: Dict = None
+ #
+ # def __init__(self, backend='sqlalchemy', **kwargs):
+ # self.default_config = self.default_config or {}
+ # kwargs = {**self.default_config.copy(), **kwargs}
+ # super().__init__(**kwargs)
+ # self.backend = backend
+ #
+ # # Need to set default values for backend-specific configurations
+ # kwargs.setdefault("debug", self.debug)
+ # kwargs.setdefault("fs", self.fs)
+ # kwargs.setdefault("logger", self.logger)
+ # self.total_records = -1 # Initialize total_records to -1 to indicate no records loaded yet
+ # self._backend_query = self._get_config(QueryConfig, kwargs)
+ # self._backend_params = self._get_config(ParamsConfig, kwargs)
+ # self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
+ # self.backend_parquet: Optional[ParquetConfig] = None
+ # self.backend_http: Optional[HttpConfig] = None
+ #
+ # if self.backend == 'sqlalchemy':
+ # self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
+ # elif self.backend == 'parquet':
+ # self.backend_parquet = self._get_config(ParquetConfig, kwargs)
+ # elif self.backend == 'http':
+ # self.backend_http = self._get_config(HttpConfig, kwargs)
+ #
+ # strategy_class = self._BACKEND_STRATEGIES.get(self.backend)
+ # if not strategy_class: raise ValueError(f"Unsupported backend: {self.backend}")
+ # self.backend_strategy = strategy_class(self)
+ #
+ # def _cleanup(self):
+ # attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+ # if not attr_name:
+ # self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+ # return
+ # # Get the actual config object (e.g., self.backend_db_connection)
+ # active_config = getattr(self, attr_name, None)
+ #
+ # if active_config and hasattr(active_config, "close"):
+ # self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+ # active_config.close()
+ #
+ # async def _acleanup(self):
+ # self.logger.warning("DfHelper instance was not used in an async context manager; cleanup is being called manually.")
+ # attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+ # if not attr_name:
+ # self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+ # return
+ # # Get the actual config object (e.g., self.backend_db_connection)
+ # active_config = getattr(self, attr_name, None)
+ # if active_config and hasattr(active_config, "aclose"):
+ # self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+ # await active_config.aclose()
+ #
+ # def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
+ # recognized_keys = set(model.model_fields.keys())
+ # model_kwargs = {k: kwargs[k] for k in recognized_keys if k in kwargs}
+ # return model(**model_kwargs)
+ #
+ # def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ # """Loads data synchronously. Fails if backend is async-only."""
+ # self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
+ # self.total_records, df = self.backend_strategy.load(**options)
+ # df = self._process_loaded_data(df)
+ # df = self._post_process_df(df)
+ # self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+ # return df.compute() if as_pandas else df
+ #
+ # async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+ # """Loads data asynchronously from any backend."""
+ # self.total_records, df = await self.backend_strategy.aload(**options)
+ # df = self._process_loaded_data(df)
+ # df = self._post_process_df(df)
+ # return df.compute() if as_pandas else df
+ #
+ # def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
+ # self.logger.debug("Post-processing DataFrame.")
+ # df_params = self._backend_params.df_params
+ # if not df_params: return df
+ # fieldnames, column_names, index_col = (df_params.get("fieldnames"), df_params.get("column_names"),
+ # df_params.get("index_col"))
+ # if not any([fieldnames, column_names, index_col]): return df
+ #
+ # if fieldnames:
+ # valid_fieldnames = [f for f in fieldnames if f in df.columns]
+ # if len(valid_fieldnames) < len(fieldnames): self.logger.warning(
+ # f"Missing columns for filtering: {set(fieldnames) - set(valid_fieldnames)}")
+ # df = df[valid_fieldnames]
+ # if column_names:
+ # if len(df.columns) != len(column_names): raise ValueError(
+ # f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided.")
+ # df = df.rename(columns=dict(zip(df.columns, column_names)))
+ # if index_col:
+ # if index_col not in df.columns: raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+ # df = df.set_index(index_col)
+ # self.logger.debug("Post-processing complete.")
+ # return df
+ #
+ # def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
+ # field_map = self._backend_params.field_map or {}
+ # if not isinstance(field_map, dict) or not field_map: return df
+ # if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)): return df
+ # self.logger.debug("Processing loaded data...applying rename mapping if necessary.")
+ # rename_mapping = {k: v for k, v in field_map.items() if k in df.columns}
+ # if rename_mapping: df = df.rename(columns=rename_mapping)
+ # self.logger.debug("Rename mapping complete...")
+ # return df
+ #
+ # def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
+ # """Saves a Dask DataFrame to a Parquet file with validation."""
+ #
+ # # Use .get() for cleaner access to optional arguments.
+ # fs: AbstractFileSystem = kwargs.get('fs', self.fs)
+ # path: str = kwargs.get('parquet_storage_path')
+ #
+ # # Guard clauses to fail fast with clear errors.
+ # if not fs:
+ # raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
+ # if not path:
+ # raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
+ #
+ # # An efficient, idiomatic way to check if a Dask DataFrame is empty.
+ # if len(df.head(1)) == 0:
+ # self.logger.warning("Skipping save: The provided DataFrame is empty.")
+ # return
+ #
+ # with ParquetSaver(
+ # df_result=df,
+ # parquet_storage_path=path,
+ # fs=fs,
+ # debug=self.debug,
+ # logger=self.logger,
+ # verbose=self.verbose,
+ # **kwargs
+ # ) as saver:
+ # saver.save_to_parquet(parquet_filename)
+ #
+ # self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
+ #
+ # def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
+ # if hasattr(df, 'npartitions') and df.npartitions == 1 and not len(df.head(1)):
+ # self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
+ # return
+ #
+ # with ClickHouseWriter(debug=self.debug, logger=self.logger, verbose=self.verbose, **credentials) as writer:
+ # writer.save_to_clickhouse(df)
+ # self.logger.debug("Save to ClickHouse completed.")
+ #
+ # def load_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+ # """Synchronous convenience method for loading a date range."""
+ # final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+ # return self.load(**final_kwargs)
+ #
+ # async def aload_period(self, dt_field: str, start: str, end: str, **kwargs) -> Union[pd.DataFrame, dd.DataFrame]:
+ # """Asynchronous convenience method for loading a date range."""
+ # final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+ # return await self.aload(**final_kwargs)
+ #
+ # def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
+ # start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
+ # if start_date > end_date:
+ # raise ValueError("'start' date cannot be later than 'end' date.")
+ # field_map = self._backend_params.field_map or {}
+ # reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
+ # if len(reverse_map) != len(field_map):
+ # self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
+ # mapped_field = reverse_map.get(dt_field, dt_field)
+ # if start_date == end_date:
+ # kwargs[f"{mapped_field}__date"] = start_date
+ # else:
+ # kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
+ # self.logger.debug(f"Period load generated filters: {kwargs}")
+ # return kwargs
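
For orientation, the sketch below shows how the refactored DfHelper in 2025.8.2 is typically driven, based only on the hunks above. It is illustrative rather than taken from the package: the `connection_url` and `table` keyword names, the fsspec setup, and the import path are assumptions, and it presumes that ManagedResource supplies the `with`/`async with` protocol that the new `_cleanup()` and `_acleanup()` hooks plug into.

# Illustrative sketch only; the SQLAlchemy-config keyword names below are assumptions.
import asyncio

import fsspec

from sibi_dst.df_helper import DfHelper  # import path assumed from the package layout

fs = fsspec.filesystem("file")  # any fsspec-compatible filesystem

# Sync path: leaving the block triggers _cleanup(), which closes the active backend config.
with DfHelper(
    backend="sqlalchemy",
    connection_url="sqlite:///example.db",  # hypothetical kwarg consumed by SqlAlchemyConnectionConfig
    table="orders",                         # hypothetical kwarg
    fs=fs,
    debug=True,
) as helper:
    ddf = helper.load_period(dt_field="order_date", start="2025-01-01", end="2025-01-31")
    helper.save_to_parquet(ddf, "orders_jan.parquet", parquet_storage_path="/tmp/orders", fs=fs)

# Async path: aload()/aload_period() reuse the same post-processing, and _acleanup() awaits aclose().
async def main() -> None:
    async with DfHelper(
        backend="sqlalchemy",
        connection_url="sqlite:///example.db",  # hypothetical kwarg
        table="orders",                         # hypothetical kwarg
    ) as helper:
        pdf = await helper.aload(as_pandas=True)
        print(len(pdf))

asyncio.run(main())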