flowerpower-0.11.5.8-py3-none-any.whl → flowerpower-0.11.6.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- flowerpower/fs/ext.py +176 -34
- flowerpower/pipeline/base.py +3 -1
- flowerpower/pipeline/registry.py +9 -9
- flowerpower/plugins/io/base.py +501 -78
- flowerpower/plugins/io/helpers/polars.py +346 -124
- flowerpower/plugins/io/helpers/pyarrow.py +406 -0
- flowerpower/settings/general.py +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/METADATA +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/RECORD +13 -12
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/top_level.txt +0 -0
flowerpower/fs/ext.py
CHANGED
@@ -10,13 +10,19 @@ else:
     raise ImportError("To use this module, please install `flowerpower[io]`.")
 
 import orjson
-import polars as pl
+# import polars as pl
 import pyarrow as pa
 import pyarrow.dataset as pds
 import pyarrow.parquet as pq
 from fsspec import AbstractFileSystem
 from pydala.dataset import ParquetDataset
 
+from ..plugins.io.helpers.polars import opt_dtype as opt_dtype_pl
+from ..plugins.io.helpers.polars import pl
+# from ..plugins.io.helpers.polars import unify_schemas as unfify_schemas_pl
+from ..plugins.io.helpers.pyarrow import cast_schema
+from ..plugins.io.helpers.pyarrow import opt_dtype as opt_dtype_pa
+from ..plugins.io.helpers.pyarrow import unify_schemas as unify_schemas_pa
 from ..utils.misc import (_dict_to_dataframe, convert_large_types_to_standard,
                           run_parallel, to_pyarrow_table)
 
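The `opt_dtype` helpers imported above live in the new `flowerpower/plugins/io/helpers/` modules; their implementations are not part of this diff. For orientation only, a minimal sketch of what a dtype optimizer with this signature typically does — the function below is illustrative, not the package's actual code:

import polars as pl


def opt_dtype_sketch(df: pl.DataFrame, strict: bool = False) -> pl.DataFrame:
    # Shrink numeric columns to the narrowest dtype that still fits the data.
    out = df.with_columns(pl.all().shrink_dtype())
    # Try to parse string columns as integers; with strict=False a column is
    # left unchanged when parsing fails instead of raising.
    for name in [c for c, t in out.schema.items() if t == pl.Utf8]:
        try:
            out = out.with_columns(pl.col(name).cast(pl.Int64, strict=True))
        except Exception:
            if strict:
                raise
    return out


df = pl.DataFrame({"a": ["1", "2"], "b": [1.0, 2.0], "c": ["x", "y"]})
print(opt_dtype_sketch(df).schema)  # a: Int64, b: Float32, c: String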
@@ -172,6 +178,7 @@ def _read_json(
     as_dataframe: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs,
 ) -> dict | list[dict] | pl.DataFrame | list[pl.DataFrame]:
     """
@@ -186,6 +193,7 @@ def _read_json(
         as_dataframe: (bool, optional) If True, return a DataFrame. Defaults to True.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
         verbose: (bool, optional) If True, print verbose output. Defaults to False.
+        opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
         **kwargs: Additional keyword arguments.
 
     Returns:
@@ -236,8 +244,13 @@ def _read_json(
             ][0]
             for _data in data
         ]
+        if opt_dtypes:
+            data = [opt_dtype_pl(df, strict=False) for df in data]
         if concat:
-
+            result = pl.concat(data, how="diagonal_relaxed")
+            # if opt_dtypes:
+            #     result = opt_dtype_pl(result, strict=False)
+            return result
     return data
 
 
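The concat strategy used throughout these readers is polars' "diagonal_relaxed", which unions the column sets of the inputs (filling missing values with null) and widens mismatched dtypes to a common supertype instead of raising. A stock-polars illustration:

import polars as pl

a = pl.DataFrame({"id": [1, 2], "x": [1.0, 2.0]})
b = pl.DataFrame({"id": [3], "y": ["z"]})

# Column sets are unioned; "id" would also be widened if the two frames
# disagreed on its dtype.
print(pl.concat([a, b], how="diagonal_relaxed"))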
@@ -251,6 +264,7 @@ def _read_json_batches(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]:
     """Process JSON files in batches with optional parallel reading.
@@ -267,6 +281,7 @@ def _read_json_batches(
         concat: Combine files within each batch
         use_threads: Enable parallel file reading within batches
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments for DataFrame conversion
 
     Yields:
@@ -341,10 +356,16 @@ def _read_json_batches(
                     ][0]
                     for _data in batch_data
                 ]
-
+                if opt_dtypes:
+                    batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
                 if concat and len(batch_dfs) > 1:
-
+                    batch_df = pl.concat(batch_dfs, how="diagonal_relaxed")
+                    # if opt_dtypes:
+                    #     batch_df = opt_dtype_pl(batch_df, strict=False)
+                    yield batch_df
                 else:
+                    # if opt_dtypes:
+                    #     batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
                     yield batch_dfs
             else:
                 yield batch_data
@@ -360,6 +381,7 @@ def read_json(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     dict
@@ -389,6 +411,7 @@ def read_json(
         concat: Combine multiple files/batches into single result
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes for performance
         **kwargs: Additional arguments passed to DataFrame conversion
 
     Returns:
@@ -439,6 +462,7 @@ def read_json(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_json(
@@ -450,12 +474,17 @@ def read_json(
         concat=concat,
         use_threads=use_threads,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
 
 def _read_csv_file(
-    path: str,
+    path: str,
+    self: AbstractFileSystem,
+    include_file_path: bool = False,
+    opt_dtypes: bool = False,
+    **kwargs: Any,
 ) -> pl.DataFrame:
     """Read a single CSV file from any filesystem.
 
@@ -466,6 +495,7 @@ def _read_csv_file(
         path: Path to CSV file
         self: Filesystem instance to use for reading
        include_file_path: Add source filepath as a column
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments passed to pl.read_csv()
 
     Returns:
@@ -486,15 +516,21 @@ def _read_csv_file(
     with self.open(path) as f:
         df = pl.read_csv(f, **kwargs)
         if include_file_path:
-
+            df = df.with_columns(pl.lit(path).alias("file_path"))
+        if opt_dtypes:
+            df = opt_dtype_pl(df, strict=False)
         return df
 
 
 def read_csv_file(
-    self, path: str, include_file_path: bool = False, **kwargs
+    self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
 ) -> pl.DataFrame:
     return _read_csv_file(
-    path=path,
+        path=path,
+        self=self,
+        include_file_path=include_file_path,
+        opt_dtypes=opt_dtypes,
+        **kwargs,
     )
 
 
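Since `read_csv_file` receives `self: AbstractFileSystem` as its first parameter, these wrappers are evidently attached to fsspec filesystem objects as methods. A hedged usage sketch — it assumes importing `flowerpower.fs.ext` is what attaches the methods, and the file path is made up:

from fsspec import filesystem

import flowerpower.fs.ext  # noqa: F401  (assumed to attach read_csv_file et al.)

fs = filesystem("file")
# opt_dtypes (new in 0.11.6.x) runs the dtype optimizer on the parsed frame.
df = fs.read_csv_file("data/events.csv", include_file_path=True, opt_dtypes=True)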
@@ -505,6 +541,7 @@ def _read_csv(
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs,
 ) -> pl.DataFrame | list[pl.DataFrame]:
     """
@@ -517,6 +554,7 @@ def _read_csv(
         use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
         verbose: (bool, optional) If True, print verbose output. Defaults to False.
+        opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
         **kwargs: Additional keyword arguments.
 
     Returns:
@@ -533,21 +571,36 @@ def _read_csv(
                 path,
                 self=self,
                 include_file_path=include_file_path,
+                opt_dtypes=opt_dtypes,
                 n_jobs=-1,
                 backend="threading",
                 verbose=verbose,
                 **kwargs,
             )
-
-
-
-
+        else:
+            dfs = [
+                _read_csv_file(
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
+                )
+                for p in path
+            ]
     else:
         dfs = _read_csv_file(
-            path,
+            path,
+            self=self,
+            include_file_path=include_file_path,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )
     if concat:
-
+        result = pl.concat(dfs, how="diagonal_relaxed")
+        # if opt_dtypes:
+        #     result = opt_dtype_pl(result, strict=False)
+        return result
     return dfs
 
 
@@ -559,6 +612,7 @@ def _read_csv_batches(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> Generator[pl.DataFrame | list[pl.DataFrame], None, None]:
     """Process CSV files in batches with optional parallel reading.
@@ -573,6 +627,7 @@ def _read_csv_batches(
         concat: Combine files within each batch
         use_threads: Enable parallel file reading within batches
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments passed to pl.read_csv()
 
     Yields:
@@ -624,18 +679,29 @@ def _read_csv_batches(
                 n_jobs=-1,
                 backend="threading",
                 verbose=verbose,
+                opt_dtypes=opt_dtypes,
                 **kwargs,
             )
         else:
             batch_dfs = [
                 _read_csv_file(
-                    p,
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
                 )
                 for p in batch_paths
             ]
 
+        # if opt_dtypes:
+        #     batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
+
         if concat and len(batch_dfs) > 1:
-
+            result = pl.concat(batch_dfs, how="diagonal_relaxed")
+            # if opt_dtypes:
+            #     result = opt_dtype_pl(result, strict=False)
+            yield result
         else:
             yield batch_dfs
 
@@ -648,6 +714,7 @@ def read_csv(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     pl.DataFrame
@@ -716,6 +783,7 @@ def read_csv(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_csv(
@@ -725,12 +793,17 @@ def read_csv(
         concat=concat,
         use_threads=use_threads,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
 
 def _read_parquet_file(
-    path: str,
+    path: str,
+    self: AbstractFileSystem,
+    include_file_path: bool = False,
+    opt_dtypes: bool = False,
+    **kwargs: Any,
 ) -> pa.Table:
     """Read a single Parquet file from any filesystem.
 
@@ -759,15 +832,21 @@ def _read_parquet_file(
     """
     table = pq.read_table(path, filesystem=self, **kwargs)
     if include_file_path:
-
+        table = table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
+    if opt_dtypes:
+        table = opt_dtype_pa(table, strict=False)
     return table
 
 
 def read_parquet_file(
-    self, path: str, include_file_path: bool = False, **kwargs
+    self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
 ) -> pa.Table:
     return _read_parquet_file(
-    path=path,
+        path=path,
+        self=self,
+        include_file_path=include_file_path,
+        opt_dtypes=opt_dtypes,
+        **kwargs,
     )
 
 
@@ -778,6 +857,7 @@ def _read_parquet(
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs,
 ) -> pa.Table | list[pa.Table]:
     """
@@ -797,7 +877,8 @@ def _read_parquet(
     if not include_file_path and concat:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-
+        table = _read_parquet_file(path, self=self, opt_dtypes=opt_dtypes, **kwargs)
+        return table
     else:
         if isinstance(path, str):
             path = path_to_glob(path, format="parquet")
@@ -805,30 +886,54 @@ def _read_parquet(
 
         if isinstance(path, list):
             if use_threads:
-
+                tables = run_parallel(
                     _read_parquet_file,
                     path,
                     self=self,
                     include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
                     n_jobs=-1,
                     backend="threading",
                     verbose=verbose,
                     **kwargs,
                 )
             else:
-
+                tables = [
                     _read_parquet_file(
-                        p,
+                        p,
+                        self=self,
+                        include_file_path=include_file_path,
+                        opt_dtypes=opt_dtypes,
+                        **kwargs,
                     )
                     for p in path
                 ]
         else:
-
-            path=path,
+            tables = _read_parquet_file(
+                path=path,
+                self=self,
+                include_file_path=include_file_path,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
             )
     if concat:
-
-
+        # Unify schemas before concatenation if opt_dtypes or multiple tables
+        if isinstance(tables, list):
+            if len(tables) > 1:
+                schemas = [t.schema for t in tables]
+                unified_schema = unify_schemas_pa(schemas)
+                tables = [cast_schema(t, unified_schema) for t in tables]
+            result = pa.concat_tables(tables, promote_options="permissive")
+            # if opt_dtypes:
+            #     result = opt_dtype_pa(result, strict=False)
+            return result
+        elif isinstance(tables, pa.Table):
+            # if opt_dtypes:
+            #     tables = opt_dtype_pa(tables, strict=False)
+            return tables
+        else:
+            return pa.concat_tables(tables, promote_options="permissive")
+    return tables
 
 
 def _read_parquet_batches(
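`unify_schemas_pa` and `cast_schema` come from the new `helpers/pyarrow.py` added in this release (406 lines, not expanded in this diff). The pre-concat alignment they perform can be illustrated with stock pyarrow, which is presumably what they build on (`promote_options` on `pa.unify_schemas` needs pyarrow >= 14):

import pyarrow as pa

t1 = pa.table({"id": pa.array([1, 2], type=pa.int32())})
t2 = pa.table({"id": pa.array([3], type=pa.int64())})

# Compute one schema covering both tables (int32 is promoted to int64),
# cast each table to it, then concatenate without schema surprises.
schema = pa.unify_schemas([t1.schema, t2.schema], promote_options="permissive")
aligned = [t.cast(schema) for t in (t1, t2)]
print(pa.concat_tables(aligned).schema)  # id: int64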
@@ -839,6 +944,7 @@ def _read_parquet_batches(
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> Generator[pa.Table | list[pa.Table], None, None]:
     """Process Parquet files in batches with performance optimizations.
@@ -892,7 +998,10 @@ def _read_parquet_batches(
     if not include_file_path and concat and batch_size is None:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-
+        table = _read_parquet_file(
+            path=path, self=self, opt_dtypes=opt_dtypes, **kwargs
+        )
+        yield table
         return
 
     # Resolve path(s) to list
@@ -902,7 +1011,11 @@ def _read_parquet_batches(
 
     if not isinstance(path, list):
         yield _read_parquet_file(
-    path=path,
+            path=path,
+            self=self,
+            include_file_path=include_file_path,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )
         return
 
@@ -915,6 +1028,7 @@ def _read_parquet_batches(
             batch_paths,
             self=self,
             include_file_path=include_file_path,
+            opt_dtypes=opt_dtypes,
             n_jobs=-1,
             backend="threading",
             verbose=verbose,
@@ -923,14 +1037,28 @@ def _read_parquet_batches(
         else:
             batch_tables = [
                 _read_parquet_file(
-                    p,
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
                 )
                 for p in batch_paths
             ]
 
         if concat and batch_tables:
-
+            # Unify schemas before concatenation
+            if len(batch_tables) > 1:
+                schemas = [t.schema for t in batch_tables]
+                unified_schema = unify_schemas_pa(schemas)
+                batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
+            result = pa.concat_tables(batch_tables, promote_options="permissive")
+            # if opt_dtypes:
+            #     result = opt_dtype_pa(result, strict=False)
+            yield result
         else:
+            # if opt_dtypes and isinstance(batch_tables, list):
+            #     batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
             yield batch_tables
 
@@ -942,6 +1070,7 @@ def read_parquet(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> pa.Table | list[pa.Table] | Generator[pa.Table | list[pa.Table], None, None]:
     """Read Parquet data with advanced features and optimizations.
@@ -969,6 +1098,7 @@ def read_parquet(
         concat: Combine multiple files/batches into single Table
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize Table dtypes for performance
         **kwargs: Additional arguments passed to pq.read_table()
 
     Returns:
@@ -1011,6 +1141,7 @@ def read_parquet(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_parquet(
@@ -1020,6 +1151,7 @@ def read_parquet(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
@@ -1034,6 +1166,7 @@ def read_files(
     jsonlines: bool = False,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     pl.DataFrame
@@ -1067,6 +1200,7 @@ def read_files(
         jsonlines: For JSON format, whether to read as JSON Lines
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame/Arrow Table dtypes for performance
         **kwargs: Additional format-specific arguments
 
     Returns:
@@ -1116,6 +1250,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return read_json(
@@ -1126,6 +1261,7 @@ def read_files(
         concat=concat,
         use_threads=use_threads,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 elif format == "csv":
@@ -1138,6 +1274,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return read_csv(
@@ -1147,6 +1284,7 @@ def read_files(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 elif format == "parquet":
@@ -1159,6 +1297,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return read_parquet(
@@ -1168,6 +1307,7 @@ def read_files(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
@@ -1415,7 +1555,7 @@ def write_parquet(
     data = to_pyarrow_table(data, concat=False, unique=False)
 
     if schema is not None:
-        data = data
+        data = cast_schema(data, schema)
     metadata = []
     pq.write_table(data, path, filesystem=self, metadata_collector=metadata, **kwargs)
     metadata = metadata[0]
@@ -1469,7 +1609,9 @@ def write_json(
         data = data.collect()
     if isinstance(data, pl.DataFrame):
         data = data.to_arrow()
-        data =
+        data = cast_schema(
+            data, convert_large_types_to_standard(data.schema)
+        ).to_pydict()
     elif isinstance(data, pd.DataFrame):
         data = pa.Table.from_pandas(data, preserve_index=False).to_pydict()
     elif isinstance(data, pa.Table):
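`write_json` now normalizes the Arrow schema before `.to_pydict()`: polars' `to_arrow()` emits `large_string`/`large_list` columns, and `convert_large_types_to_standard` (from `..utils.misc`, not shown in this diff) presumably maps those back to the standard variants. An illustrative stand-in using stock pyarrow:

import pyarrow as pa


def convert_large_types_sketch(schema: pa.Schema) -> pa.Schema:
    # Map large_* types to their standard counterparts, recursing into lists.
    def fix(t: pa.DataType) -> pa.DataType:
        if pa.types.is_large_string(t):
            return pa.string()
        if pa.types.is_large_binary(t):
            return pa.binary()
        if pa.types.is_large_list(t):
            return pa.list_(fix(t.value_type))
        return t

    return pa.schema(
        [pa.field(f.name, fix(f.type), nullable=f.nullable) for f in schema]
    )


tbl = pa.table({"s": pa.array(["a"], type=pa.large_string())})
print(tbl.cast(convert_large_types_sketch(tbl.schema)).schema)  # s: string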
flowerpower/pipeline/base.py
CHANGED
@@ -81,7 +81,9 @@ class BasePipeline:
         """
         if self._fs.is_cache_fs:
             self._fs.sync_cache()
-            modules_path = posixpath.join(
+            modules_path = posixpath.join(
+                self._fs._mapper.directory, self._fs.cache_path, self._pipelines_dir
+            )
         else:
             modules_path = posixpath.join(self._fs.path, self._pipelines_dir)
         if modules_path not in sys.path:
flowerpower/pipeline/registry.py
CHANGED
@@ -190,7 +190,9 @@ class PipelineRegistry:
         )
 
         # Sync filesystem if needed (using _fs)
-        if hasattr(self._fs, "sync_cache") and callable(
+        if hasattr(self._fs, "sync_cache") and callable(
+            getattr(self._fs, "sync_cache")
+        ):
             self._fs.sync_cache()
 
     def _get_files(self) -> list[str]:
@@ -447,14 +449,12 @@ class PipelineRegistry:
                 logger.warning(f"Could not get size for {path}: {e}")
                 size = "Error"
 
-            pipeline_info.append(
-                {
-                    "name": name,
-                    "path": path,
-                    "mod_time": mod_time,
-                    "size": size,
-                }
-            )
+            pipeline_info.append({
+                "name": name,
+                "path": path,
+                "mod_time": mod_time,
+                "size": size,
+            })
 
         if show:
             table = Table(title="Available Pipelines")