sibi-dst 2025.8.3__tar.gz → 2025.8.5__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/PKG-INFO +1 -1
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/pyproject.toml +1 -1
- sibi_dst-2025.8.5/sibi_dst/df_helper/_df_helper.py +300 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/parquet/__init__.py +0 -2
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/parquet/_parquet_options.py +57 -16
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/core/_filter_handler.py +212 -79
- sibi_dst-2025.8.3/sibi_dst/df_helper/_df_helper.py +0 -592
- sibi_dst-2025.8.3/sibi_dst/df_helper/backends/parquet/_filter_handler.py +0 -126
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/README.md +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/_artifact_updater_async.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/_artifact_updater_threaded.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/_parquet_artifact.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/_parquet_reader.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/http/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/http/_http_config.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/sqlalchemy/_model_registry.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/sqlalchemy/_sql_model_builder.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/core/_defaults.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/data_cleaner.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/geopy_helper/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/geopy_helper/geo_location_service.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/geopy_helper/utils.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/osmnx_helper/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/osmnx_helper/base_osm_map.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/osmnx_helper/basemaps/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/osmnx_helper/basemaps/calendar_html.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/osmnx_helper/basemaps/route_map_plotter.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/osmnx_helper/basemaps/router_plotter.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/osmnx_helper/route_path_builder.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/osmnx_helper/utils.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/tests/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/tests/test_data_wrapper_class.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/base.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/business_days.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/clickhouse_writer.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/credentials.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/data_from_http_source.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/data_utils.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/data_wrapper.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/date_utils.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/df_utils.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/file_age_checker.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/file_utils.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/filepath_generator.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/log_utils.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/manifest_manager.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/parquet_saver.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/periods.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/phone_formatter.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/storage_config.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/storage_manager.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/update_planner.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/utils/webdav_client.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/_df_helper.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_db_connection.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_io_dask.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_load_from_db.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlalchemy/_model_builder.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlmodel/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_db_connection.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_io_dask.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_load_from_db.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/backends/sqlmodel/_model_builder.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/core/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/core/_filter_handler.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/core/_params_config.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/df_helper/core/_query_config.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/utils/__init__.py +0 -0
- {sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/v2/utils/log_utils.py +0 -0
sibi_dst-2025.8.5/sibi_dst/df_helper/_df_helper.py
ADDED
@@ -0,0 +1,300 @@
+from __future__ import annotations
+
+from typing import Any, Dict, Optional, TypeVar, Union
+
+import dask.dataframe as dd
+import pandas as pd
+from fsspec import AbstractFileSystem
+from pydantic import BaseModel
+
+from sibi_dst.df_helper.core import QueryConfig, ParamsConfig
+from sibi_dst.utils import ManagedResource, ParquetSaver, ClickHouseWriter
+from .backends.http import HttpConfig
+from .backends.parquet import ParquetConfig
+from .backends.sqlalchemy import SqlAlchemyConnectionConfig, SqlAlchemyLoadFromDb
+
+T = TypeVar("T", bound=BaseModel)
+
+
+# ---- Backend Strategy Pattern ----
+class BaseBackend:
+    def __init__(self, helper: "DfHelper"):
+        self.helper = helper
+        self.logger = helper.logger
+        self.debug = helper.debug
+        self.total_records = helper.total_records
+
+    def load(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+        raise NotImplementedError
+
+    async def aload(self, **options) -> Union[tuple[Any, Any], dd.DataFrame, pd.DataFrame]:
+        return self.load(**options)
+
+
+class SqlAlchemyBackend(BaseBackend):
+    def load(self, **options):
+        try:
+            if options and hasattr(self.helper._backend_params, "parse_params"):
+                self.helper._backend_params.parse_params(options)
+
+            with SqlAlchemyLoadFromDb(
+                plugin_sqlalchemy=self.helper.backend_db_connection,
+                plugin_query=self.helper._backend_query,
+                plugin_params=self.helper._backend_params,
+                logger=self.logger,
+                debug=self.debug,
+            ) as db_loader:
+                self.total_records, result = db_loader.build_and_load()
+                return self.total_records, result
+        except Exception as e:
+            self.logger.error(f"Failed to load data from sqlalchemy: {e}", exc_info=self.debug)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+
+class ParquetBackend(BaseBackend):
+    def load(self, **options):
+        try:
+            df = self.helper.backend_parquet.load_files(**options)
+            if self._is_empty(df):
+                return -1, self._empty_like(df)
+            nrows = self._row_count(df)
+            if nrows == 0:
+                self.logger.debug("No records after filters; returning empty DataFrame.")
+                return 0, self._empty_like(df)
+
+            df = df.persist()
+            self.total_records = self._row_count(df) or -1
+            return self.total_records, df
+
+        except Exception as e:
+            self.total_records = -1  # Reset total_records on failure
+            self.logger.error(f"Failed to load data from parquet: {e}", exc_info=self.debug)
+            return -1, dd.from_pandas(pd.DataFrame(), npartitions=1)
+
+    def _is_empty(self, ddf) -> bool:
+        """True if no rows across all partitions."""
+        try:
+            # head with npartitions=-1 walks partitions until it gets n rows
+            return ddf.head(1, npartitions=-1).shape[0] == 0
+        except Exception:
+            return True
+
+    def _row_count(self, ddf) -> int:
+        """Reliable row count for Dask DataFrame."""
+        return int(ddf.map_partitions(len).sum().compute())
+
+    def _empty_like(self, ddf):
+        """Return an empty Dask DF with the SAME columns/dtypes."""
+        empty_pdf = ddf._meta.iloc[0:0]
+        return dd.from_pandas(empty_pdf, npartitions=1)
+
+
+class HttpBackend(BaseBackend):
+    def load(self, **options):
+        # Will raise NotImplementedError from helper.backend_http if sync not supported
+        return self.helper.backend_http.fetch_data(**options)
+
+    async def aload(self, **options):
+        if not self.helper.backend_http:
+            self.logger.warning("HTTP plugin not configured properly.")
+            self.total_records = -1
+            return self.total_records, dd.from_pandas(pd.DataFrame(), npartitions=1)
+        result = await self.helper.backend_http.fetch_data(**options)
+        self.total_records = len(result)
+        return self.total_records, result
+
+
+# ---- Main DfHelper ----
+class DfHelper(ManagedResource):
+    _BACKEND_STRATEGIES = {
+        "sqlalchemy": SqlAlchemyBackend,
+        "parquet": ParquetBackend,
+        "http": HttpBackend,
+    }
+
+    _BACKEND_ATTR_MAP = {
+        "sqlalchemy": "backend_db_connection",
+        "parquet": "backend_parquet",
+        "http": "backend_http",
+    }
+
+    default_config: Dict[str, Any] = None
+
+    def __init__(self, backend="sqlalchemy", **kwargs):
+        self.default_config = self.default_config or {}
+        kwargs = {**self.default_config.copy(), **kwargs}
+        super().__init__(**kwargs)
+        self.backend = backend
+
+        # Ensure defaults flow to plugin configs
+        kwargs.setdefault("debug", self.debug)
+        kwargs.setdefault("fs", self.fs)
+        kwargs.setdefault("logger", self.logger)
+
+        self.total_records = -1
+        self._backend_query = self._get_config(QueryConfig, kwargs)
+        self._backend_params = self._get_config(ParamsConfig, kwargs)
+
+        self.backend_db_connection: Optional[SqlAlchemyConnectionConfig] = None
+        self.backend_parquet: Optional[ParquetConfig] = None
+        self.backend_http: Optional[HttpConfig] = None
+
+        if self.backend == "sqlalchemy":
+            self.backend_db_connection = self._get_config(SqlAlchemyConnectionConfig, kwargs)
+        elif self.backend == "parquet":
+            self.backend_parquet = self._get_config(ParquetConfig, kwargs)
+        elif self.backend == "http":
+            self.backend_http = self._get_config(HttpConfig, kwargs)
+
+        strategy_cls = self._BACKEND_STRATEGIES.get(self.backend)
+        if not strategy_cls:
+            raise ValueError(f"Unsupported backend: {self.backend}")
+        self.backend_strategy = strategy_cls(self)
+
+    # ---------- ManagedResource hooks ----------
+    def _cleanup(self):
+        attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+        if not attr_name:
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            return
+        active_config = getattr(self, attr_name, None)
+        if active_config and hasattr(active_config, "close"):
+            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+            active_config.close()
+
+    async def _acleanup(self):
+        self.logger.warning(
+            "DfHelper instance was not used in an async context manager; cleanup is being called manually."
+        )
+        attr_name = self._BACKEND_ATTR_MAP.get(self.backend)
+        if not attr_name:
+            self.logger.warning(f"No attribute mapping found for backend '{self.backend}'. Cleanup skipped.")
+            return
+        active_config = getattr(self, attr_name, None)
+        if active_config and hasattr(active_config, "aclose"):
+            self.logger.debug(f"Closing resources for '{self.backend}' backend using attribute '{attr_name}'.")
+            await active_config.aclose()

+    # ---------- config helpers ----------
+    def _get_config(self, model: T, kwargs: Dict[str, Any]) -> T:
+        recognized = set(model.model_fields.keys())
+        model_kwargs = {k: kwargs[k] for k in recognized if k in kwargs}
+        return model(**model_kwargs)
+
+    # ---------- load/aload ----------
+    def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+        self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
+        self.total_records, df = self.backend_strategy.load(**options)
+        df = self._process_loaded_data(df)
+        df = self._post_process_df(df)
+        #self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+        df = df.persist() if persist else df
+        return df.compute() if as_pandas else df
+
+    async def aload(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+        self.total_records, df = await self.backend_strategy.aload(**options)
+        df = self._process_loaded_data(df)
+        df = self._post_process_df(df)
+        df = df.persist() if persist else df
+        return df.compute() if as_pandas else df
+
+    # ---------- dataframe post-processing ----------
+    def _post_process_df(self, df: dd.DataFrame) -> dd.DataFrame:
+        self.logger.debug("Post-processing DataFrame.")
+        df_params = self._backend_params.df_params
+        if not df_params:
+            return df
+        fieldnames = df_params.get("fieldnames")
+        column_names = df_params.get("column_names")
+        index_col = df_params.get("index_col")
+
+        if fieldnames:
+            valid = [f for f in fieldnames if f in df.columns]
+            if len(valid) < len(fieldnames):
+                self.logger.warning(f"Missing columns for filtering: {set(fieldnames) - set(valid)}")
+            df = df[valid]
+        if column_names:
+            if len(df.columns) != len(column_names):
+                raise ValueError(
+                    f"Length mismatch: DataFrame has {len(df.columns)} columns, but {len(column_names)} names were provided."
+                )
+            df = df.rename(columns=dict(zip(df.columns, column_names)))
+        if index_col:
+            if index_col not in df.columns:
+                raise ValueError(f"Index column '{index_col}' not found in DataFrame.")
+            df = df.set_index(index_col)
+
+        self.logger.debug("Post-processing complete.")
+        return df
+
+    def _process_loaded_data(self, df: dd.DataFrame) -> dd.DataFrame:
+        field_map = self._backend_params.field_map or {}
+        if not isinstance(field_map, dict) or not field_map:
+            return df
+        if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
+            return df
+        self.logger.debug("Applying rename mapping if necessary.")
+        rename_map = {k: v for k, v in field_map.items() if k in df.columns}
+        if rename_map:
+            df = df.rename(columns=rename_map)
+        return df
+
+    # ---------- sinks ----------
+    def save_to_parquet(self, df: dd.DataFrame, parquet_filename: str, **kwargs):
+        fs: AbstractFileSystem = kwargs.get("fs", self.fs)
+        path: str = kwargs.get("parquet_storage_path")
+        if not fs:
+            raise ValueError("A filesystem (fs) must be provided to save the parquet file.")
+        if not path:
+            raise ValueError("A 'parquet_storage_path' keyword argument must be provided.")
+        if len(df.head(1)) == 0:
+            self.logger.warning("Skipping save: The provided DataFrame is empty.")
+            return
+
+        with ParquetSaver(
+            df_result=df,
+            parquet_storage_path=path,
+            fs=fs,
+            debug=self.debug,
+            logger=self.logger,
+            verbose=self.verbose,
+            **kwargs,
+        ) as saver:
+            saver.save_to_parquet(parquet_filename)
+
+        self.logger.debug(f"Successfully saved '{parquet_filename}' to '{path}'.")
+
+    def save_to_clickhouse(self, df: dd.DataFrame, **credentials):
+        if hasattr(df, "npartitions") and df.npartitions == 1 and not len(df.head(1)):
+            self.logger.warning("Cannot write to ClickHouse; DataFrame is empty.")
+            return
+        with ClickHouseWriter(debug=self.debug, logger=self.logger, fs=self.fs, verbose=self.verbose, **credentials) as writer:
+            writer.save_to_clickhouse(df)
+        self.logger.debug("Save to ClickHouse completed.")
+
+    # ---------- convenience period loaders ----------
+    def load_period(self, dt_field: str, start: str, end: str, **kwargs):
+        final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+        return self.load(**final_kwargs)
+
+    async def aload_period(self, dt_field: str, start: str, end: str, **kwargs):
+        final_kwargs = self._prepare_period_filters(dt_field, start, end, **kwargs)
+        return await self.aload(**final_kwargs)
+
+    def _prepare_period_filters(self, dt_field: str, start: str, end: str, **kwargs) -> dict:
+        start_date, end_date = pd.to_datetime(start).date(), pd.to_datetime(end).date()
+        if start_date > end_date:
+            raise ValueError("'start' date cannot be later than 'end' date.")
+        field_map = self._backend_params.field_map or {}
+        reverse_map = {v: k for k, v in field_map.items()} if field_map else {}
+        if len(reverse_map) != len(field_map):
+            self.logger.warning("field_map values are not unique; reverse mapping may be unreliable.")
+        mapped_field = reverse_map.get(dt_field, dt_field)
+        if start_date == end_date:
+            kwargs[f"{mapped_field}__date"] = start_date
+        else:
+            kwargs[f"{mapped_field}__date__range"] = [start_date, end_date]
+        self.logger.debug(f"Period load generated filters: {kwargs}")
+        return kwargs
+
+
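For orientation, here is a minimal usage sketch of the `DfHelper` API added above. It is not part of the diff: the backend name, `load`, `load_period`, and `save_to_parquet` signatures come from the new `_df_helper.py`, but the connection settings, table and column names, and storage path are hypothetical placeholders, and it assumes `ManagedResource` provides the synchronous context-manager behaviour implied by the `_cleanup` hook.

import fsspec

from sibi_dst.df_helper import DfHelper

fs = fsspec.filesystem("file")  # any fsspec filesystem; forwarded to the plugin configs

# Hypothetical connection kwargs -- the real field names are defined by
# SqlAlchemyConnectionConfig / QueryConfig / ParamsConfig, which are not shown in this diff.
with DfHelper(
    backend="sqlalchemy",
    connection_url="sqlite:///example.db",  # hypothetical field name
    table="orders",                         # hypothetical field name
    fs=fs,
    debug=True,
) as helper:
    # load() returns a Dask DataFrame; as_pandas=True computes it eagerly.
    df = helper.load(as_pandas=True)

    # load_period() turns start/end into Django-style __date / __date__range filters.
    recent = helper.load_period(dt_field="created_at", start="2025-08-01", end="2025-08-05")

    # save_to_parquet() requires a filesystem and a 'parquet_storage_path' keyword.
    helper.save_to_parquet(recent, "orders_aug.parquet", parquet_storage_path="/tmp/sibi_dst")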
{sibi_dst-2025.8.3 → sibi_dst-2025.8.5}/sibi_dst/df_helper/backends/parquet/_parquet_options.py
RENAMED
@@ -6,6 +6,8 @@ import dask.dataframe as dd
 import fsspec
 import pandas as pd
 from pydantic import BaseModel, model_validator, ConfigDict
+
+from sibi_dst.df_helper.core import FilterHandler
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger
 
@@ -175,40 +177,79 @@ class ParquetConfig(BaseModel):
             total_size += self.fs.size(path)
         return total_size
 
-    def load_files(self):
+    def load_files(self, **filters):
         """
-        Loads parquet files into a Dask DataFrame based on the specified conditions.
-
-        parquet folder paths or a single specified parquet path.
-
-        :return: A Dask DataFrame containing loaded parquet file data.
-        :rtype: dask.dataframe.DataFrame
+        Loads parquet files into a Dask DataFrame based on the specified conditions.
+        Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
         """
         if not self.load_parquet:
             self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+        # Resolve paths
         paths_to_load = []
         if self.parquet_folder_list:
-
-            paths_to_load = [p for p in self.parquet_folder_list if p is not None]
+            paths_to_load = [p for p in self.parquet_folder_list if p]
         elif self.parquet_full_path:
-            # Treat the single path as a list with one item
             paths_to_load = [self.parquet_full_path]
 
         if not paths_to_load:
             self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)
 
+        # Prepare filters
+        fh = None
+        expr = None
+        pq_filters = None
+        residual_filters = None
+        if filters:
+            fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
+
+            # Use the compiler + pushdown split so we don't double-apply
+            try:
+                # If you added split_pushdown_and_residual earlier:
+                pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
+                expr = fh.compile_filters(residual_filters) if residual_filters else None
+            except AttributeError:
+                # Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
+                expr = fh.compile_filters(filters)
+                pq_filters = expr.to_parquet_filters()
+
         try:
             self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-
-
-
-
-
-
+
+            # Optional: prune columns. Keep it simple unless you want to compute from filters.
+            columns = None  # or a concrete list if you know it
+
+            if fh and pq_filters:
+                self.logger.debug(f"Applying Parquet filters: {pq_filters}")
+                dd_result = dd.read_parquet(
+                    paths_to_load,
+                    engine="pyarrow",
+                    filesystem=self.fs,  # your fsspec filesystem (e.g., s3fs)
+                    filters=pq_filters,
+                    columns=columns,
+                    gather_statistics=False,  # uncomment if you have *many* files and don't need global stats
+                )
+                # Apply only residual mask (if any)
+                if expr is not None:
+                    dd_result = dd_result[expr.mask(dd_result)]
+            else:
+                dd_result = dd.read_parquet(
+                    paths_to_load,
+                    engine="pyarrow",
+                    filesystem=self.fs,
+                    columns=columns,
+                    gather_statistics=False,
+                )
+                # If we didn't push down, but have filters, apply them here
+                if expr is None and fh and filters:
+                    expr = fh.compile_filters(filters)
+                if expr is not None:
+                    dd_result = dd_result[expr.mask(dd_result)]
+
             return dd_result
+
         except FileNotFoundError as e:
             self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
             self.logger.debug("Returning empty DataFrame due to missing parquet files.")