sibi-dst 2025.8.4__py3-none-any.whl → 2025.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +5 -3
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +56 -15
- {sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/METADATA +1 -1
- {sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/RECORD +5 -5
- {sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -182,18 +182,20 @@ class DfHelper(ManagedResource):
         return model(**model_kwargs)

     # ---------- load/aload ----------
-    def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
-        self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+        #self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+        df = df.persist() if persist else df
         return df.compute() if as_pandas else df

-    async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    async def aload(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.total_records, df = await self.backend_strategy.aload(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
+        df = df.persist() if persist else df
         return df.compute() if as_pandas else df

     # ---------- dataframe post-processing ----------
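The signature change makes both flags keyword-only and adds an opt-in `persist` step, which asks Dask to materialize the loaded partitions in memory before the DataFrame is returned. A minimal usage sketch, assuming an already-configured `DfHelper` instance named `helper` (the `status="active"` filter kwarg is a hypothetical backend option, not from the diff):

# Hedged sketch: `helper` is assumed to be a configured DfHelper;
# `status="active"` stands in for whatever filter options the backend accepts.
ddf = helper.load(persist=True, status="active")    # Dask DataFrame, kept in memory
pdf = helper.load(as_pandas=True, status="active")  # computed to a pandas DataFrame

# The async variant mirrors the same keyword-only signature:
# ddf = await helper.aload(persist=True, status="active")

Because the flags are now keyword-only, a positional call such as `helper.load(True)` that worked against 2025.8.4 raises a `TypeError` under 2025.8.5.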
sibi_dst/df_helper/backends/parquet/_parquet_options.py
CHANGED
@@ -6,6 +6,8 @@ import dask.dataframe as dd
 import fsspec
 import pandas as pd
 from pydantic import BaseModel, model_validator, ConfigDict
+
+from sibi_dst.df_helper.core import FilterHandler
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger

@@ -177,38 +179,77 @@ class ParquetConfig(BaseModel):

     def load_files(self, **filters):
         """
-        Loads parquet files into a Dask DataFrame based on the specified conditions.
-
-        parquet folder paths or a single specified parquet path.
-
-        :return: A Dask DataFrame containing loaded parquet file data.
-        :rtype: dask.dataframe.DataFrame
+        Loads parquet files into a Dask DataFrame based on the specified conditions.
+        Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
         """
         if not self.load_parquet:
             self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)

+        # Resolve paths
         paths_to_load = []
         if self.parquet_folder_list:
-
-            paths_to_load = [p for p in self.parquet_folder_list if p is not None]
+            paths_to_load = [p for p in self.parquet_folder_list if p]
         elif self.parquet_full_path:
-            # Treat the single path as a list with one item
             paths_to_load = [self.parquet_full_path]

         if not paths_to_load:
             self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)

+        # Prepare filters
+        fh = None
+        expr = None
+        pq_filters = None
+        residual_filters = None
+        if filters:
+            fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
+
+            # Use the compiler + pushdown split so we don't double-apply
+            try:
+                # If you added split_pushdown_and_residual earlier:
+                pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
+                expr = fh.compile_filters(residual_filters) if residual_filters else None
+            except AttributeError:
+                # Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
+                expr = fh.compile_filters(filters)
+                pq_filters = expr.to_parquet_filters()
+
         try:
             self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-
-
-
-
-
-
+
+            # Optional: prune columns. Keep it simple unless you want to compute from filters.
+            columns = None  # or a concrete list if you know it
+
+            if fh and pq_filters:
+                self.logger.debug(f"Applying Parquet filters: {pq_filters}")
+                dd_result = dd.read_parquet(
+                    paths_to_load,
+                    engine="pyarrow",
+                    filesystem=self.fs,  # your fsspec filesystem (e.g., s3fs)
+                    filters=pq_filters,
+                    columns=columns,
+                    gather_statistics=False,  # uncomment if you have *many* files and don't need global stats
+                )
+                # Apply only residual mask (if any)
+                if expr is not None:
+                    dd_result = dd_result[expr.mask(dd_result)]
+            else:
+                dd_result = dd.read_parquet(
+                    paths_to_load,
+                    engine="pyarrow",
+                    filesystem=self.fs,
+                    columns=columns,
+                    gather_statistics=False,
+                )
+                # If we didn't push down, but have filters, apply them here
+                if expr is None and fh and filters:
+                    expr = fh.compile_filters(filters)
+                if expr is not None:
+                    dd_result = dd_result[expr.mask(dd_result)]
+
             return dd_result
+
         except FileNotFoundError as e:
             self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
             self.logger.debug("Returning empty DataFrame due to missing parquet files.")
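The rewritten `load_files` splits each filter set in two: predicates expressible in pyarrow's disjunctive-normal-form tuples go to `dd.read_parquet(filters=...)`, where row groups are pruned at read time from file statistics, and any residual predicate is applied afterward as an ordinary Dask boolean mask. A standalone sketch of that pattern, with hypothetical paths and column names and without the package's `FilterHandler`:

import dask.dataframe as dd

# Pushdown predicates in pyarrow's DNF tuple form: each tuple is
# (column, op, value); row groups whose statistics cannot satisfy
# them are skipped before any data is read.
pq_filters = [("status", "==", "active"), ("year", ">=", 2024)]

ddf = dd.read_parquet(
    "s3://bucket/dataset/",  # hypothetical location
    engine="pyarrow",
    filters=pq_filters,
)

# A predicate that has no DNF-tuple form (e.g. a substring match)
# stays behind as a residual mask over the loaded partitions.
ddf = ddf[ddf["name"].str.contains("smith", case=False, na=False)]

This mirrors the try/except in the hunk: when `split_pushdown_and_residual` is unavailable, every filter is both pushed down and re-applied as a mask, which is redundant but still correct.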
{sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/RECORD
CHANGED
@@ -2,14 +2,14 @@ sibi_dst/__init__.py,sha256=D01Z2Ds4zES8uz5Zp7qOWD0EcfCllWgew7AWt2X1SQg,445
 sibi_dst/df_helper/__init__.py,sha256=CyDXtFhRnMrycktxNO8jGGkP0938QiScl56kMZS1Sf8,578
 sibi_dst/df_helper/_artifact_updater_async.py,sha256=0lUwel-IkmKewRnmMv9GtuT-P6SivkIKtgOHvKchHlc,8462
 sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=g1ftfSMO40l60EJWRLE0DDZvbIowrqvG1GMf2zXqYGw,12957
 sibi_dst/df_helper/_parquet_artifact.py,sha256=tqYOjwxHV1MsADmn-RNFuVI_RrEvvmCJHZieRcsVXuc,12334
 sibi_dst/df_helper/_parquet_reader.py,sha256=tFq0OQVczozbKZou93vscokp2R6O2DIJ1zHbZqVjagc,3069
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
 sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
 sibi_dst/df_helper/backends/parquet/__init__.py,sha256=0A6BGHZLwiLBmuBBaUvEHfeWTcInvy2NbymlrI_nuXE,104
-sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=
+sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=V6y1Vco3_uY4UBF79_JPd1CFK5DpNsnGYHCc5PDPGZo,13798
 sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
 sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=R3_WY_lsQrfQwD6yAzH66MqvsgZdMd0HKcVChDQcbpM,8401
 sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py,sha256=GQwDy2JwPUx37vpwxPM5hg4ZydilPIP824y5C_clsl0,383
@@ -78,6 +78,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.8.
-sibi_dst-2025.8.
-sibi_dst-2025.8.
+sibi_dst-2025.8.5.dist-info/METADATA,sha256=ADWrf_9UI4NiTWslrJ0LgfmHTTdxSSCIc0AaP-mqSQg,2610
+sibi_dst-2025.8.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.8.5.dist-info/RECORD,,
{sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/WHEEL
File without changes