sibi-dst 2025.8.4__py3-none-any.whl → 2025.8.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -182,18 +182,20 @@ class DfHelper(ManagedResource):
182
182
  return model(**model_kwargs)
183
183
 
184
184
  # ---------- load/aload ----------
185
- def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
185
+ def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
186
186
  self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
187
187
  self.total_records, df = self.backend_strategy.load(**options)
188
188
  df = self._process_loaded_data(df)
189
189
  df = self._post_process_df(df)
190
- self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
190
+ #self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
191
+ df = df.persist() if persist else df
191
192
  return df.compute() if as_pandas else df
192
193
 
193
- async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
194
+ async def aload(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
194
195
  self.total_records, df = await self.backend_strategy.aload(**options)
195
196
  df = self._process_loaded_data(df)
196
197
  df = self._post_process_df(df)
198
+ df = df.persist() if persist else df
197
199
  return df.compute() if as_pandas else df
198
200
 
199
201
  # ---------- dataframe post-processing ----------
@@ -6,6 +6,8 @@ import dask.dataframe as dd
6
6
  import fsspec
7
7
  import pandas as pd
8
8
  from pydantic import BaseModel, model_validator, ConfigDict
9
+
10
+ from sibi_dst.df_helper.core import FilterHandler
9
11
  from sibi_dst.utils import FilePathGenerator
10
12
  from sibi_dst.utils import Logger
11
13
 
@@ -177,38 +179,77 @@ class ParquetConfig(BaseModel):
177
179
 
178
180
  def load_files(self, **filters):
179
181
  """
180
- Loads parquet files into a Dask DataFrame based on the specified conditions. This
181
- method checks if parquet file loading is enabled and loads either from a list of
182
- parquet folder paths or a single specified parquet path.
183
-
184
- :return: A Dask DataFrame containing loaded parquet file data.
185
- :rtype: dask.dataframe.DataFrame
182
+ Loads parquet files into a Dask DataFrame based on the specified conditions.
183
+ Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
186
184
  """
187
185
  if not self.load_parquet:
188
186
  self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
189
187
  return dd.from_pandas(pd.DataFrame(), npartitions=1)
190
188
 
189
+ # Resolve paths
191
190
  paths_to_load = []
192
191
  if self.parquet_folder_list:
193
- # Filter out any None values from the list
194
- paths_to_load = [p for p in self.parquet_folder_list if p is not None]
192
+ paths_to_load = [p for p in self.parquet_folder_list if p]
195
193
  elif self.parquet_full_path:
196
- # Treat the single path as a list with one item
197
194
  paths_to_load = [self.parquet_full_path]
198
195
 
199
196
  if not paths_to_load:
200
197
  self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
201
198
  return dd.from_pandas(pd.DataFrame(), npartitions=1)
202
199
 
200
+ # Prepare filters
201
+ fh = None
202
+ expr = None
203
+ pq_filters = None
204
+ residual_filters = None
205
+ if filters:
206
+ fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
207
+
208
+ # Use the compiler + pushdown split so we don't double-apply
209
+ try:
210
+ # Preferred path: split filters into Parquet-pushdown and residual parts.
211
+ pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
212
+ expr = fh.compile_filters(residual_filters) if residual_filters else None
213
+ except AttributeError:
214
+ # Fallback when split_pushdown_and_residual is unavailable: push all filters down and also apply the mask (redundant but correct).
215
+ expr = fh.compile_filters(filters)
216
+ pq_filters = expr.to_parquet_filters()
217
+
203
218
  try:
204
219
  self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
205
- dd_result=dd.read_parquet(
206
- paths_to_load,
207
- engine="pyarrow",
208
- filesystem=self.fs,
209
- exclude=["_*", ".*"]
210
- )
220
+
221
+ # Optional: prune columns. Keep it simple unless you want to compute from filters.
222
+ columns = None # or a concrete list if you know it
223
+
224
+ if fh and pq_filters:
225
+ self.logger.debug(f"Applying Parquet filters: {pq_filters}")
226
+ dd_result = dd.read_parquet(
227
+ paths_to_load,
228
+ engine="pyarrow",
229
+ filesystem=self.fs, # fsspec filesystem (e.g., s3fs)
230
+ filters=pq_filters,
231
+ columns=columns,
232
+ gather_statistics=False, # skip gathering global statistics; faster when there are many files
233
+ )
234
+ # Apply only residual mask (if any)
235
+ if expr is not None:
236
+ dd_result = dd_result[expr.mask(dd_result)]
237
+ else:
238
+ dd_result = dd.read_parquet(
239
+ paths_to_load,
240
+ engine="pyarrow",
241
+ filesystem=self.fs,
242
+ columns=columns,
243
+ gather_statistics=False, # skip gathering global statistics; faster when there are many files
244
+ )
245
+ # If we didn't push down, but have filters, apply them here
246
+ if expr is None and fh and filters:
247
+ expr = fh.compile_filters(filters)
248
+ if expr is not None:
249
+ dd_result = dd_result[expr.mask(dd_result)]
250
+
211
251
  return dd_result
252
+
212
253
  except FileNotFoundError as e:
213
254
  self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
214
255
  self.logger.debug("Returning empty DataFrame due to missing parquet files.")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: sibi-dst
3
- Version: 2025.8.4
3
+ Version: 2025.8.5
4
4
  Summary: Data Science Toolkit
5
5
  Author: Luis Valverde
6
6
  Author-email: lvalverdeb@gmail.com
@@ -2,14 +2,14 @@ sibi_dst/__init__.py,sha256=D01Z2Ds4zES8uz5Zp7qOWD0EcfCllWgew7AWt2X1SQg,445
2
2
  sibi_dst/df_helper/__init__.py,sha256=CyDXtFhRnMrycktxNO8jGGkP0938QiScl56kMZS1Sf8,578
3
3
  sibi_dst/df_helper/_artifact_updater_async.py,sha256=0lUwel-IkmKewRnmMv9GtuT-P6SivkIKtgOHvKchHlc,8462
4
4
  sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
5
- sibi_dst/df_helper/_df_helper.py,sha256=nG5iITvwyRsdnPgTOql6-w47LEOsZUXYF7-tIM2yGBE,12798
5
+ sibi_dst/df_helper/_df_helper.py,sha256=g1ftfSMO40l60EJWRLE0DDZvbIowrqvG1GMf2zXqYGw,12957
6
6
  sibi_dst/df_helper/_parquet_artifact.py,sha256=tqYOjwxHV1MsADmn-RNFuVI_RrEvvmCJHZieRcsVXuc,12334
7
7
  sibi_dst/df_helper/_parquet_reader.py,sha256=tFq0OQVczozbKZou93vscokp2R6O2DIJ1zHbZqVjagc,3069
8
8
  sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
9
  sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
10
10
  sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
11
11
  sibi_dst/df_helper/backends/parquet/__init__.py,sha256=0A6BGHZLwiLBmuBBaUvEHfeWTcInvy2NbymlrI_nuXE,104
12
- sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=yQ5pZuF2Tf7eM_krOPkxhPkDFtEKzV7BKjUerTqX0tg,12028
12
+ sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=V6y1Vco3_uY4UBF79_JPd1CFK5DpNsnGYHCc5PDPGZo,13798
13
13
  sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
14
14
  sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=R3_WY_lsQrfQwD6yAzH66MqvsgZdMd0HKcVChDQcbpM,8401
15
15
  sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py,sha256=GQwDy2JwPUx37vpwxPM5hg4ZydilPIP824y5C_clsl0,383
@@ -78,6 +78,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
78
78
  sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
79
79
  sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
80
80
  sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
81
- sibi_dst-2025.8.4.dist-info/METADATA,sha256=LFL_mbMveA_TrO5zelvtZ1rBiEuMWtvhjrAs42DnOd0,2610
82
- sibi_dst-2025.8.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
83
- sibi_dst-2025.8.4.dist-info/RECORD,,
81
+ sibi_dst-2025.8.5.dist-info/METADATA,sha256=ADWrf_9UI4NiTWslrJ0LgfmHTTdxSSCIc0AaP-mqSQg,2610
82
+ sibi_dst-2025.8.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
83
+ sibi_dst-2025.8.5.dist-info/RECORD,,