sibi-dst 2025.8.2.tar.gz → 2025.8.4.tar.gz

This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (86)
  1. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/PKG-INFO +1 -1
  2. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/pyproject.toml +1 -1
  3. sibi_dst-2025.8.4/sibi_dst/df_helper/_df_helper.py +298 -0
  4. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -2
  5. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +1 -1
  6. sibi_dst-2025.8.4/sibi_dst/df_helper/core/_filter_handler.py +469 -0
  7. sibi_dst-2025.8.2/sibi_dst/df_helper/_df_helper.py +0 -573
  8. sibi_dst-2025.8.2/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -126
  9. sibi_dst-2025.8.2/sibi_dst/df_helper/core/_filter_handler.py +0 -257
  10. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/README.md +0 -0
  11. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/__init__.py +0 -0
  12. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/__init__.py +0 -0
  13. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
  14. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
  15. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
  16. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/_parquet_reader.py +0 -0
  17. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/__init__.py +0 -0
  18. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
  19. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
  20. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
  21. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  22. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
  23. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  24. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  25. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
  26. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
  27. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/core/__init__.py +0 -0
  28. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/core/_defaults.py +0 -0
  29. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/core/_params_config.py +0 -0
  30. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/core/_query_config.py +0 -0
  31. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/data_cleaner.py +0 -0
  32. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/geopy_helper/__init__.py +0 -0
  33. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
  34. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/geopy_helper/utils.py +0 -0
  35. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/osmnx_helper/__init__.py +0 -0
  36. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
  37. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
  38. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
  39. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
  40. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
  41. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
  42. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/osmnx_helper/utils.py +0 -0
  43. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/tests/__init__.py +0 -0
  44. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
  45. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/__init__.py +0 -0
  46. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/base.py +0 -0
  47. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/business_days.py +0 -0
  48. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/clickhouse_writer.py +0 -0
  49. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/credentials.py +0 -0
  50. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/data_from_http_source.py +0 -0
  51. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/data_utils.py +0 -0
  52. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/data_wrapper.py +0 -0
  53. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/date_utils.py +0 -0
  54. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/df_utils.py +0 -0
  55. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/file_age_checker.py +0 -0
  56. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/file_utils.py +0 -0
  57. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/filepath_generator.py +0 -0
  58. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/log_utils.py +0 -0
  59. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/manifest_manager.py +0 -0
  60. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/parquet_saver.py +0 -0
  61. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/periods.py +0 -0
  62. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/phone_formatter.py +0 -0
  63. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/storage_config.py +0 -0
  64. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/storage_manager.py +0 -0
  65. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/update_planner.py +0 -0
  66. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/utils/webdav_client.py +0 -0
  67. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/__init__.py +0 -0
  68. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/__init__.py +0 -0
  69. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
  70. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
  71. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
  72. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
  73. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
  74. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
  75. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
  76. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
  77. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
  78. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
  79. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
  80. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
  81. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
  82. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
  83. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
  84. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
  85. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/utils/__init__.py +0 -0
  86. {sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/v2/utils/log_utils.py +0 -0
{sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: sibi-dst
- Version: 2025.8.2
+ Version: 2025.8.4
  Summary: Data Science Toolkit
  Author: Luis Valverde
  Author-email: lvalverdeb@gmail.com
{sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/pyproject.toml
@@ -1,6 +1,6 @@
  [tool.poetry]
  name = "sibi-dst"
- version = "2025.8.2"
+ version = "2025.8.4"
  description = "Data Science Toolkit"
  authors = ["Luis Valverde <lvalverdeb@gmail.com>"]
  readme = "README.md"
sibi_dst-2025.8.4/sibi_dst/df_helper/_df_helper.py
@@ -0,0 +1,298 @@
+ from __future__ import annotations
+
+ from typing import Any, Dict, Optional, TypeVar, Union
+
+ import dask.dataframe as dd
+ import pandas as pd
+ from fsspec import AbstractFileSystem
+ from pydantic import BaseModel
+
+ from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+ from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
+ from .backends.http import HttpConfig
+ from .backends.parquet import ParquetConfig
+ from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
+
+ T = TypeVar("T", bound=BaseModel)
+
+
+ # ---- Backend Strategy Pattern ----
+ class BaseBackend:
+     def __init__(self, helper: "DfHelper"):
+         self.helper = helper
+         self.logger = helper.logger
+         self.debug = helper.debug
+         self.total_records = helper.total_records
+
+     def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+         raise NotImplementedError
+
+     async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+         return self.load(**options)
+
+
+ class SqlAlchemyBackend(BaseBackend):
+     def load(self, **options):
+         try:
+             if options and hasattr(self.helper._backend_params, "parse_params"):
+                 self.helper._backend_params.parse_params(options)
+
+             with SqlAlchemyLoadFromDb(
+                 plugin_sqlalchemy=self.helper.backend_db_connection,
+                 plugin_query=self.helper._backend_query,
+                 plugin_params=self.helper._backend_params,
+                 logger=self.logger,
+                 debug=self.debug,
+             ) as db_loader:
+                 self.total_records, result = db_loader.build_and_load()
+                 return self.total_records, result
+         except Exception as e:
+             self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
+             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+
+ class ParquetBackend(BaseBackend):
+     def load(self, **options):
+         try:
+             df = self.helper.backend_parquet.load_files(**options)
+             if self._is_empty(df):
+                 return -1, self._empty_like(df)
+             nrows = self._row_count(df)
+             if nrows == 0:
+                 self.logger.debug("No records after filters; returning empty DataFrame.")
+                 return 0, self._empty_like(df)
+
+             df = df.persist()
+             self.total_records = self._row_count(df) or -1
+             return self.total_records, df
+
+         except Exception as e:
+             self.total_records = -1  # Reset total_records on failure
+             self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
+             return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+     def _is_empty(self, ddf) -> bool:
+         """True if no rows across all partitions."""
+         try:
+             # head with npartitions=-1 walks partitions until it gets n rows
+             return ddf.head(1, npartitions=-1).shape[0] == 0
+         except Exception:
+             return True
+
+     def _row_count(self, ddf) -> int:
+         """Reliable row count for Dask DataFrame."""
+         return int(ddf.map_partitions(len).sum().compute())
+
+     def _empty_like(self, ddf):
+         """Return an empty Dask DF with the SAME columns/dtypes."""
+         empty_pdf = ddf._meta.iloc[0:0]
+         return dd.from_pandas(empty_pdf, npartitions=1)
+
+
+ class HttpBackend(BaseBackend):
+     def load(self, **options):
+         # Will raise NotImplementedError from helper.backend_http if sync not supported
+         return self.helper.backend_http.fetch_data(**options)
+
+     async def aload(self, **options):
+         if not self.helper.backend_http:
+             self.logger.warning("HTTP plugin not configured properly.")
+             self.total_records = -1
+             return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+         result = await self.helper.backend_http.fetch_data(**options)
+         self.total_records = len(result)
+         return self.total_records, result
+
+
+ # ---- Main DfHelper ----
+ class DfHelper(ManagedResource):
+     _BACKEND_STRATEGIES = {
+         "sqlalchemy": SqlAlchemyBackend,
+         "parquet": ParquetBackend,
+         "http": HttpBackend,
+     }
+
+     _BACKEND_ATTR_MAP = {
+         "sqlalchemy": "backend_db_connection",
+         "parquet": "backend_parquet",
+         "http": "backend_http",
+     }
+
+     default_config: Dict[str, Any] = None
+
+     def __init__(self, backend="sqlalchemy", **kwargs):
+         self.default_config = self.default_config or {}
+         kwargs = {**self.default_config.copy(), **kwargs}
+         super().__init__(**kwargs)
+         self.backend = backend
+
+         # Ensure defaults flow to plugin configs
+         kwargs.setdefault("debug", self.debug)
+         kwargs.setdefault("fs", self.fs)
+         kwargs.setdefault("logger", self.logger)
+
+         self.total_records = -1
+         self._backend_query = self._get_config(QueryConfig, kwargs)
+         self._backend_params = self._get_config(ParamsConfig, kwargs)
+
+         self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
+         self.backend_parquet: Optional[ParquetConfig] = None
+         self.backend_http: Optional[HttpConfig] = None
+
+         if self.backend == "sqlalchemy":
+             self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
+         elif self.backend == "parquet":
+             self.backend_parquet = self._get_config(ParquetConfig, kwargs)
+         elif self.backend == "http":
+             self.backend_http = self._get_config(HttpConfig, kwargs)
+
+         strategy_cls = self._BACKEND_STRATEGIES.get(self.backend)
+         if not strategy_cls:
+             raise ValueError(f"Unsupported backend: {self.backend}")
+         self.backend_strategy = strategy_cls(self)
+
+     # ---------- ManagedResource hooks ----------
+     def _cleanup(self):
+         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+         if not attr_name:
+             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+             return
+         active_config = getattr(self, attr_name, None)
+         if active_config and hasattr(active_config, "close"):
+             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+             active_config.close()
+
+     async def _acleanup(self):
+         self.logger.warning(
+             "DfHelper instance was not used in an async context manager; cleanup is being called manually."
+         )
+         attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+         if not attr_name:
+             self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+             return
+         active_config = getattr(self, attr_name, None)
+         if active_config and hasattr(active_config, "aclose"):
+             self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+             await active_config.aclose()
+
+     # ---------- config helpers ----------
+     def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
+         recognized = set(model.model_fields.keys())
+         model_kwargs = {k: kwargs[k] for k in recognized if k in kwargs}
+         return model(**model_kwargs)
+
+     # ---------- load/aload ----------
+     def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
+         self.total_records, df = self.backend_strategy.load(**options)
+         df = self._process_loaded_data(df)
+         df = self._post_process_df(df)
+         self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+         return df.compute() if as_pandas else df
+
+     async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+         self.total_records, df = await self.backend_strategy.aload(**options)
+         df = self._process_loaded_data(df)
+         df = self._post_process_df(df)
+         return df.compute() if as_pandas else df
+
+     # ---------- dataframe post-processing ----------
+     def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
+         self.logger.debug("Post-processing DataFrame.")
+         df_params = self._backend_params.df_params
+         if not df_params:
+             return df
+         fieldnames = df_params.get("fieldnames")
+         column_names = df_params.get("column_names")
+         index_col = df_params.get("index_col")
+
+         if fieldnames:
+             valid = [f for f in fieldnames if f in df.columns]
+             if len(valid) < len(fieldnames):
+                 self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}")
+             df = df[valid]
+         if column_names:
+             if len(df.columns) != len(column_names):
+                 raise ValueError(
+                     f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided."
+                 )
+             df = df.rename(columns=dict(zip(df.columns, column_names)))
+         if index_col:
+             if index_col not in df.columns:
+                 raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+             df = df.set_index(index_col)
+
+         self.logger.debug("Post-processing complete.")
+         return df
+
+     def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
+         field_map = self._backend_params.field_map or {}
+         if not isinstance(field_map, dict) or not field_map:
+             return df
+         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
+             return df
+         self.logger.debug("Applying rename mapping if necessary.")
+         rename_map = {k: v for k, v in field_map.items() if k in df.columns}
+         if rename_map:
+             df = df.rename(columns=rename_map)
+         return df
+
+     # ---------- sinks ----------
+     def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
+         fs: AbstractFileSystem = kwargs.get("fs", self.fs)
+         path: str = kwargs.get("parquet_storage_path")
+         if not fs:
+             raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
+         if not path:
+             raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
+         if len(df.head(1)) == 0:
+             self.logger.warning("Skipping save: The provided DataFrame is empty.")
+             return
+
+         with ParquetSaver(
+             df_result=df,
+             parquet_storage_path=path,
+             fs=fs,
+             debug=self.debug,
+             logger=self.logger,
+             verbose=self.verbose,
+             **kwargs,
+         ) as saver:
+             saver.save_to_parquet(parquet_filename)
+
+         self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
+
+     def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
+         if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
+             self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
+             return
+         with ClickHouseWriter(debug=self.debug, logger=self.logger, fs=self.fs, verbose=self.verbose, **credentials) as writer:
+             writer.save_to_clickhouse(df)
+             self.logger.debug("Save to ClickHouse completed.")
+
+     # ---------- convenience period loaders ----------
+     def load_period(self, dt_field: str, start: str, end: str, **kwargs):
+         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+         return self.load(**final_kwargs)
+
+     async def aload_period(self, dt_field: str, start: str, end: str, **kwargs):
+         final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+         return await self.aload(**final_kwargs)
+
+     def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
+         start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
+         if start_date > end_date:
+             raise ValueError("'start' date cannot be later than 'end' date.")
+         field_map = self._backend_params.field_map or {}
+         reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
+         if len(reverse_map) != len(field_map):
+             self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
+         mapped_field = reverse_map.get(dt_field, dt_field)
+         if start_date == end_date:
+             kwargs[f"{mapped_field}__date"] = start_date
+         else:
+             kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
+         self.logger.debug(f"Period load generated filters: {kwargs}")
+         return kwargs
+
+
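Taken as a whole, this new module replaces the previous 573-line _df_helper.py (deleted per the file list) with a strategy-pattern design: DfHelper builds its config objects once and delegates all I/O to one of three backend classes. The sketch below is an editor illustration of how the class appears intended to be used, assuming ManagedResource supplies the context-manager protocol; the connection keywords (connection_url, table) are hypothetical stand-ins for whatever fields SqlAlchemyConnectionConfig actually defines.

    # Hypothetical usage sketch; config field names are assumptions.
    from sibi_dst.df_helper import DfHelper

    # "backend" selects the strategy class via _BACKEND_STRATEGIES;
    # an unknown name raises ValueError.
    with DfHelper(backend="sqlalchemy",
                  connection_url="sqlite:///example.db",  # hypothetical field
                  table="orders") as helper:              # hypothetical field
        # load_period() maps dt_field through field_map, then builds a
        # `<field>__date` filter (equal dates) or a `<field>__date__range`.
        ddf = helper.load_period(dt_field="order_date",
                                 start="2025-08-01", end="2025-08-04")
        helper.save_to_parquet(ddf, "orders.parquet",
                               parquet_storage_path="/tmp/archive")  # required kwarg
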
{sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/parquet/__init__.py
@@ -1,9 +1,7 @@
  from __future__ import annotations

- from ._filter_handler import ParquetFilterHandler
  from ._parquet_options import *

  __all__ = [
      "ParquetConfig",
-     "ParquetFilterHandler",
  ]
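Per the file list, the parquet-specific ParquetFilterHandler and its module are gone, and filter logic is consolidated in the rewritten core/_filter_handler.py (+469 lines, not shown here). A hedged migration note follows; the replacement export name is an assumption inferred from the module path, since this diff does not display the new module's contents.

    # Before (2025.8.2) -- no longer importable in 2025.8.4:
    # from sibi_dst.df_helper.backends.parquet import ParquetFilterHandler

    # After (2025.8.4): filter handling lives in the core package; the
    # exported name below is assumed and should be verified there.
    from sibi_dst.df_helper.core import FilterHandler  # assumed export
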
{sibi_dst-2025.8.2 → sibi_dst-2025.8.4}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
@@ -175,7 +175,7 @@ class ParquetConfig(BaseModel):
              total_size += self.fs.size(path)
          return total_size

-     def load_files(self):
+     def load_files(self, **filters):
          """
          Loads parquet files into a Dask DataFrame based on the specified conditions. This
          method checks if parquet file loading is enabled and loads either from a list of
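This signature change is what lets ParquetBackend.load() (in the new _df_helper.py above) forward keyword filters directly into the parquet reader. A short editor sketch of the pass-through, using a hypothetical filter key:

    # Filters now flow DfHelper.load(**options)
    #   -> ParquetBackend.load(**options)
    #   -> ParquetConfig.load_files(**filters)
    helper = DfHelper(backend="parquet",
                      parquet_storage_path="/data/orders")  # hypothetical field
    ddf = helper.load(status="shipped")  # "status" is a hypothetical filter key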