sibi-dst 2025.8.4__py3-none-any.whl → 2025.8.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sibi_dst/df_helper/_df_helper.py +5 -3
- sibi_dst/df_helper/backends/parquet/_parquet_options.py +56 -15
- {sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/METADATA +1 -1
- {sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/RECORD +5 -5
- {sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/WHEEL +0 -0
sibi_dst/df_helper/_df_helper.py
CHANGED
@@ -182,18 +182,20 @@ class DfHelper(ManagedResource):
         return model(**model_kwargs)

     # ---------- load/aload ----------
-    def load(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    def load(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.logger.debug(f"Loading data from {self.backend} backend with options: {options}")
         self.total_records, df = self.backend_strategy.load(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
-        self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+        #self.logger.debug(f"Finished loading data from {self.backend} backend with options: {options}")
+        df = df.persist() if persist else df
         return df.compute() if as_pandas else df

-    async def aload(self, as_pandas=False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
+    async def aload(self, *, persist: bool = False, as_pandas: bool = False, **options) -> Union[pd.DataFrame, dd.DataFrame]:
         self.total_records, df = await self.backend_strategy.aload(**options)
         df = self._process_loaded_data(df)
         df = self._post_process_df(df)
+        df = df.persist() if persist else df
         return df.compute() if as_pandas else df

     # ---------- dataframe post-processing ----------
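The signature change makes both flags keyword-only and adds an opt-in `persist` step, which asks Dask to materialize the loaded partitions in memory before the DataFrame is returned. A minimal usage sketch, assuming an already-configured `DfHelper` instance named `helper` (the `status="active"` filter kwarg is a hypothetical backend option, not from the diff):

# Hedged sketch: `helper` is assumed to be a configured DfHelper;
# `status="active"` stands in for whatever filter options the backend accepts.
ddf = helper.load(persist=True, status="active")    # Dask DataFrame, kept in memory
pdf = helper.load(as_pandas=True, status="active")  # computed to a pandas DataFrame

# The async variant mirrors the same keyword-only signature:
# ddf = await helper.aload(persist=True, status="active")

Because the flags are now keyword-only, a positional call such as `helper.load(True)` that worked against 2025.8.4 raises a `TypeError` under 2025.8.5.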
sibi_dst/df_helper/backends/parquet/_parquet_options.py
CHANGED
@@ -6,6 +6,8 @@ import dask.dataframe as dd
 import fsspec
 import pandas as pd
 from pydantic import BaseModel, model_validator, ConfigDict
+
+from sibi_dst.df_helper.core import FilterHandler
 from sibi_dst.utils import FilePathGenerator
 from sibi_dst.utils import Logger

@@ -177,38 +179,77 @@ class ParquetConfig(BaseModel):

     def load_files(self, **filters):
         """
-        Loads parquet files into a Dask DataFrame based on the specified conditions.
-
-        parquet folder paths or a single specified parquet path.
-
-        :return: A Dask DataFrame containing loaded parquet file data.
-        :rtype: dask.dataframe.DataFrame
+        Loads parquet files into a Dask DataFrame based on the specified conditions.
+        Supports Parquet predicate pushdown (pyarrow) + residual Dask mask.
         """
         if not self.load_parquet:
             self.logger.warning("Parquet loading is disabled. Returning empty DataFrame.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)

+        # Resolve paths
         paths_to_load = []
         if self.parquet_folder_list:
-
-            paths_to_load = [p for p in self.parquet_folder_list if p is not None]
+            paths_to_load = [p for p in self.parquet_folder_list if p]
         elif self.parquet_full_path:
-            # Treat the single path as a list with one item
             paths_to_load = [self.parquet_full_path]

         if not paths_to_load:
             self.logger.warning("No valid parquet file paths were provided. Returning empty DataFrame.")
             return dd.from_pandas(pd.DataFrame(), npartitions=1)

+        # Prepare filters
+        fh = None
+        expr = None
+        pq_filters = None
+        residual_filters = None
+        if filters:
+            fh = FilterHandler(backend="dask", debug=self.debug, logger=self.logger)
+
+            # Use the compiler + pushdown split so we don't double-apply
+            try:
+                # If you added split_pushdown_and_residual earlier:
+                pq_filters, residual_filters = fh.split_pushdown_and_residual(filters)
+                expr = fh.compile_filters(residual_filters) if residual_filters else None
+            except AttributeError:
+                # Fallback if you didn't add split_*: push everything down and also mask (redundant but correct)
+                expr = fh.compile_filters(filters)
+                pq_filters = expr.to_parquet_filters()
+
         try:
             self.logger.debug(f"Attempting to load Parquet data from: {paths_to_load}")
-
-
-
-
-
-
+
+            # Optional: prune columns. Keep it simple unless you want to compute from filters.
+            columns = None  # or a concrete list if you know it
+
+            if fh and pq_filters:
+                self.logger.debug(f"Applying Parquet filters: {pq_filters}")
+                dd_result = dd.read_parquet(
+                    paths_to_load,
+                    engine="pyarrow",
+                    filesystem=self.fs,  # your fsspec filesystem (e.g., s3fs)
+                    filters=pq_filters,
+                    columns=columns,
+                    gather_statistics=False,  # uncomment if you have *many* files and don't need global stats
+                )
+                # Apply only residual mask (if any)
+                if expr is not None:
+                    dd_result = dd_result[expr.mask(dd_result)]
+            else:
+                dd_result = dd.read_parquet(
+                    paths_to_load,
+                    engine="pyarrow",
+                    filesystem=self.fs,
+                    columns=columns,
+                    gather_statistics=False,
+                )
+                # If we didn't push down, but have filters, apply them here
+                if expr is None and fh and filters:
+                    expr = fh.compile_filters(filters)
+                if expr is not None:
+                    dd_result = dd_result[expr.mask(dd_result)]
+
             return dd_result
+
         except FileNotFoundError as e:
             self.logger.debug(f"Parquet files not found at paths {paths_to_load}: {e}")
             self.logger.debug("Returning empty DataFrame due to missing parquet files.")
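The rewritten `load_files` splits each filter set in two: predicates expressible in pyarrow's disjunctive-normal-form tuples go to `dd.read_parquet(filters=...)`, where row groups are pruned at read time from file statistics, and any residual predicate is applied afterward as an ordinary Dask boolean mask. A standalone sketch of that pattern, with hypothetical paths and column names and without the package's `FilterHandler`:

import dask.dataframe as dd

# Pushdown predicates in pyarrow's DNF tuple form: each tuple is
# (column, op, value); row groups whose statistics cannot satisfy
# them are skipped before any data is read.
pq_filters = [("status", "==", "active"), ("year", ">=", 2024)]

ddf = dd.read_parquet(
    "s3://bucket/dataset/",  # hypothetical location
    engine="pyarrow",
    filters=pq_filters,
)

# A predicate that has no DNF-tuple form (e.g. a substring match)
# stays behind as a residual mask over the loaded partitions.
ddf = ddf[ddf["name"].str.contains("smith", case=False, na=False)]

This mirrors the try/except in the hunk: when `split_pushdown_and_residual` is unavailable, every filter is both pushed down and re-applied as a mask, which is redundant but still correct.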
{sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/RECORD
CHANGED
@@ -2,14 +2,14 @@ sibi_dst/__init__.py,sha256=D01Z2Ds4zES8uz5Zp7qOWD0EcfCllWgew7AWt2X1SQg,445
 sibi_dst/df_helper/__init__.py,sha256=CyDXtFhRnMrycktxNO8jGGkP0938QiScl56kMZS1Sf8,578
 sibi_dst/df_helper/_artifact_updater_async.py,sha256=0lUwel-IkmKewRnmMv9GtuT-P6SivkIKtgOHvKchHlc,8462
 sibi_dst/df_helper/_artifact_updater_threaded.py,sha256=M5GNZismOqMmBrcyfolP1DPv87VILQf_P18is_epn50,7238
-sibi_dst/df_helper/_df_helper.py,sha256=
+sibi_dst/df_helper/_df_helper.py,sha256=g1ftfSMO40l60EJWRLE0DDZvbIowrqvG1GMf2zXqYGw,12957
 sibi_dst/df_helper/_parquet_artifact.py,sha256=tqYOjwxHV1MsADmn-RNFuVI_RrEvvmCJHZieRcsVXuc,12334
 sibi_dst/df_helper/_parquet_reader.py,sha256=tFq0OQVczozbKZou93vscokp2R6O2DIJ1zHbZqVjagc,3069
 sibi_dst/df_helper/backends/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sibi_dst/df_helper/backends/http/__init__.py,sha256=d1pfgYxbiYg7E0Iw8RbJ7xfqIfJShqqTBQQGU_S6OOo,105
 sibi_dst/df_helper/backends/http/_http_config.py,sha256=eGPFdqZ5M3Tscqx2P93B6XoBEEzlmdt7yNg7PXUQnNQ,4726
 sibi_dst/df_helper/backends/parquet/__init__.py,sha256=0A6BGHZLwiLBmuBBaUvEHfeWTcInvy2NbymlrI_nuXE,104
-sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=
+sibi_dst/df_helper/backends/parquet/_parquet_options.py,sha256=V6y1Vco3_uY4UBF79_JPd1CFK5DpNsnGYHCc5PDPGZo,13798
 sibi_dst/df_helper/backends/sqlalchemy/__init__.py,sha256=LjWm9B7CweTvlvFOgB90XjSe0lVLILAIYMWKPkFXFm8,265
 sibi_dst/df_helper/backends/sqlalchemy/_db_connection.py,sha256=R3_WY_lsQrfQwD6yAzH66MqvsgZdMd0HKcVChDQcbpM,8401
 sibi_dst/df_helper/backends/sqlalchemy/_db_gatekeeper.py,sha256=GQwDy2JwPUx37vpwxPM5hg4ZydilPIP824y5C_clsl0,383
@@ -78,6 +78,6 @@ sibi_dst/v2/df_helper/core/_params_config.py,sha256=DYx2drDz3uF-lSPzizPkchhy-kxR
 sibi_dst/v2/df_helper/core/_query_config.py,sha256=Y8LVSyaKuVkrPluRDkQoOwuXHQxner1pFWG3HPfnDHM,441
 sibi_dst/v2/utils/__init__.py,sha256=6H4cvhqTiFufnFPETBF0f8beVVMpfJfvUs6Ne0TQZNY,58
 sibi_dst/v2/utils/log_utils.py,sha256=rfk5VsLAt-FKpv6aPTC1FToIPiyrnHAFFBAkHme24po,4123
-sibi_dst-2025.8.
-sibi_dst-2025.8.
-sibi_dst-2025.8.
+sibi_dst-2025.8.5.dist-info/METADATA,sha256=ADWrf_9UI4NiTWslrJ0LgfmHTTdxSSCIc0AaP-mqSQg,2610
+sibi_dst-2025.8.5.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+sibi_dst-2025.8.5.dist-info/RECORD,,
{sibi_dst-2025.8.4.dist-info → sibi_dst-2025.8.5.dist-info}/WHEEL
File without changes