FlowerPower 0.11.5.8__py3-none-any.whl → 0.11.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/fs/ext.py +140 -30
- flowerpower/pipeline/base.py +3 -1
- flowerpower/pipeline/registry.py +9 -9
- flowerpower/plugins/io/base.py +13 -7
- flowerpower/plugins/io/helpers/polars.py +346 -124
- flowerpower/plugins/io/helpers/pyarrow.py +406 -0
- flowerpower/settings/general.py +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/METADATA +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/RECORD +13 -12
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.dist-info}/top_level.txt +0 -0
flowerpower/fs/ext.py
CHANGED
```diff
@@ -10,13 +10,19 @@ else:
     raise ImportError("To use this module, please install `flowerpower[io]`.")

 import orjson
-import polars as pl
+# import polars as pl
 import pyarrow as pa
 import pyarrow.dataset as pds
 import pyarrow.parquet as pq
 from fsspec import AbstractFileSystem
 from pydala.dataset import ParquetDataset

+from ..plugins.io.helpers.polars import opt_dtype as opt_dtype_pl
+from ..plugins.io.helpers.polars import pl
+# from ..plugins.io.helpers.polars import unify_schemas as unfify_schemas_pl
+from ..plugins.io.helpers.pyarrow import cast_schema
+from ..plugins.io.helpers.pyarrow import opt_dtype as opt_dtype_pa
+from ..plugins.io.helpers.pyarrow import unify_schemas as unify_schemas_pa
 from ..utils.misc import (_dict_to_dataframe, convert_large_types_to_standard,
                           run_parallel, to_pyarrow_table)

```
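The helpers imported above (`opt_dtype_pl`, `opt_dtype_pa`, `cast_schema`, `unify_schemas_pa`) come from the new `flowerpower/plugins/io/helpers/pyarrow.py` and the reworked `helpers/polars.py`, neither of which is shown in detail in this diff. The new `opt_dtypes` flags introduced throughout this file route data through these `opt_dtype` helpers after reading. As a rough, hypothetical sketch of what such a dtype-optimization pass might do (the real helpers are certainly more thorough), consider:

```python
import polars as pl


def opt_dtype_sketch(df: pl.DataFrame, strict: bool = False) -> pl.DataFrame:
    """Hypothetical stand-in for helpers.polars.opt_dtype: try to turn string
    columns into Int64/Float64 and keep the cast only if no value is lost."""
    for name, dtype in df.schema.items():
        if dtype != pl.Utf8:
            continue
        col = df[name]
        for target in (pl.Int64, pl.Float64):
            casted = col.cast(target, strict=strict)
            if casted.null_count() == col.null_count():  # no values failed to parse
                df = df.with_columns(casted.alias(name))
                break
    return df


df = pl.DataFrame({"a": ["1", "2", "3"], "b": ["x", "y", "z"]})
print(opt_dtype_sketch(df).schema)  # "a" becomes Int64, "b" stays a string column
```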
```diff
@@ -172,6 +178,7 @@ def _read_json(
     as_dataframe: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs,
 ) -> dict | list[dict] | pl.DataFrame | list[pl.DataFrame]:
     """
@@ -236,8 +243,13 @@ def _read_json(
             ][0]
             for _data in data
         ]
+        if opt_dtypes:
+            data = [opt_dtype_pl(df, strict=False) for df in data]
         if concat:
-
+            result = pl.concat(data, how="diagonal_relaxed")
+            if opt_dtypes:
+                result = opt_dtype_pl(result, strict=False)
+            return result
     return data


```
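The concatenation above now uses `how="diagonal_relaxed"`, which aligns frames by column name, fills columns missing from a frame with nulls, and promotes mismatched dtypes to a common supertype instead of raising. A minimal standalone illustration (not taken from the package):

```python
import polars as pl

a = pl.DataFrame({"id": [1, 2], "value": [1.5, 2.5]})
b = pl.DataFrame({"id": ["3"], "note": ["late arrival"]})  # "id" is a string here, no "value"

# Union of columns, missing columns filled with null, and the conflicting
# dtypes for "id" (Int64 vs String) promoted to a common supertype.
out = pl.concat([a, b], how="diagonal_relaxed")
print(out)
```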
```diff
@@ -251,6 +263,7 @@ def _read_json_batches(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]:
     """Process JSON files in batches with optional parallel reading.
@@ -360,6 +373,7 @@ def read_json(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     dict
@@ -439,6 +453,7 @@ def read_json(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_json(
@@ -450,12 +465,17 @@ def read_json(
         concat=concat,
         use_threads=use_threads,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
        **kwargs,
     )


 def _read_csv_file(
-    path: str,
+    path: str,
+    self: AbstractFileSystem,
+    include_file_path: bool = False,
+    opt_dtypes: bool = False,
+    **kwargs: Any,
 ) -> pl.DataFrame:
     """Read a single CSV file from any filesystem.

@@ -486,15 +506,21 @@ def _read_csv_file(
     with self.open(path) as f:
         df = pl.read_csv(f, **kwargs)
     if include_file_path:
-
+        df = df.with_columns(pl.lit(path).alias("file_path"))
+    if opt_dtypes:
+        df = opt_dtype_pl(df, strict=False)
     return df


 def read_csv_file(
-    self, path: str, include_file_path: bool = False, **kwargs
+    self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
 ) -> pl.DataFrame:
     return _read_csv_file(
-        path=path,
+        path=path,
+        self=self,
+        include_file_path=include_file_path,
+        opt_dtypes=opt_dtypes,
+        **kwargs,
     )


@@ -505,6 +531,7 @@ def _read_csv(
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs,
 ) -> pl.DataFrame | list[pl.DataFrame]:
     """
@@ -533,21 +560,36 @@ def _read_csv(
                 path,
                 self=self,
                 include_file_path=include_file_path,
+                opt_dtypes=opt_dtypes,
                 n_jobs=-1,
                 backend="threading",
                 verbose=verbose,
                 **kwargs,
             )
-
-
-
-
+        else:
+            dfs = [
+                _read_csv_file(
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
+                )
+                for p in path
+            ]
     else:
         dfs = _read_csv_file(
-            path,
+            path,
+            self=self,
+            include_file_path=include_file_path,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )
     if concat:
-
+        result = pl.concat(dfs, how="diagonal_relaxed")
+        if opt_dtypes:
+            result = opt_dtype_pl(result, strict=False)
+        return result
     return dfs


@@ -559,6 +601,7 @@ def _read_csv_batches(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> Generator[pl.DataFrame | list[pl.DataFrame], None, None]:
     """Process CSV files in batches with optional parallel reading.
@@ -634,8 +677,14 @@ def _read_csv_batches(
                 for p in batch_paths
             ]

+        if opt_dtypes:
+            batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
+
         if concat and len(batch_dfs) > 1:
-
+            result = pl.concat(batch_dfs, how="diagonal_relaxed")
+            if opt_dtypes:
+                result = opt_dtype_pl(result, strict=False)
+            yield result
         else:
             yield batch_dfs

@@ -648,6 +697,7 @@ def read_csv(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     pl.DataFrame
@@ -730,7 +780,11 @@ def read_csv(


 def _read_parquet_file(
-    path: str,
+    path: str,
+    self: AbstractFileSystem,
+    include_file_path: bool = False,
+    opt_dtypes: bool = False,
+    **kwargs: Any,
 ) -> pa.Table:
     """Read a single Parquet file from any filesystem.

@@ -759,15 +813,21 @@ def _read_parquet_file(
     """
     table = pq.read_table(path, filesystem=self, **kwargs)
     if include_file_path:
-
+        table = table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
+    if opt_dtypes:
+        table = opt_dtype_pa(table, strict=False)
     return table


 def read_parquet_file(
-    self, path: str, include_file_path: bool = False, **kwargs
+    self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
 ) -> pa.Table:
     return _read_parquet_file(
-        path=path,
+        path=path,
+        self=self,
+        include_file_path=include_file_path,
+        opt_dtypes=opt_dtypes,
+        **kwargs,
     )


@@ -778,6 +838,7 @@ def _read_parquet(
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs,
 ) -> pa.Table | list[pa.Table]:
     """
@@ -797,7 +858,10 @@ def _read_parquet(
     if not include_file_path and concat:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-
+        table = pq.read_table(path, filesystem=self, **kwargs)
+        if opt_dtypes:
+            table = opt_dtype_pa(table, strict=False)
+        return table
     else:
         if isinstance(path, str):
             path = path_to_glob(path, format="parquet")
@@ -805,30 +869,54 @@ def _read_parquet(

         if isinstance(path, list):
             if use_threads:
-
+                tables = run_parallel(
                     _read_parquet_file,
                     path,
                     self=self,
                     include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
                     n_jobs=-1,
                     backend="threading",
                     verbose=verbose,
                     **kwargs,
                 )
             else:
-
+                tables = [
                     _read_parquet_file(
-                        p,
+                        p,
+                        self=self,
+                        include_file_path=include_file_path,
+                        opt_dtypes=opt_dtypes,
+                        **kwargs,
                     )
                     for p in path
                 ]
         else:
-
-            path=path,
+            tables = _read_parquet_file(
+                path=path,
+                self=self,
+                include_file_path=include_file_path,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
             )
         if concat:
-
-
+            # Unify schemas before concatenation if opt_dtypes or multiple tables
+            if isinstance(tables, list):
+                if len(tables) > 1:
+                    schemas = [t.schema for t in tables]
+                    unified_schema = unify_schemas_pa(schemas)
+                    tables = [cast_schema(t, unified_schema) for t in tables]
+                result = pa.concat_tables(tables, promote_options="permissive")
+                if opt_dtypes:
+                    result = opt_dtype_pa(result, strict=False)
+                return result
+            elif isinstance(tables, pa.Table):
+                if opt_dtypes:
+                    tables = opt_dtype_pa(tables, strict=False)
+                return tables
+            else:
+                return pa.concat_tables(tables, promote_options="permissive")
+        return tables


 def _read_parquet_batches(
```
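The `if concat:` branch above unifies the per-file schemas with the new `unify_schemas_pa` and `cast_schema` helpers before calling `pa.concat_tables(..., promote_options="permissive")`. Their implementation lives in the new `helpers/pyarrow.py` and is not part of this diff; the sketch below reproduces the same unify-cast-concatenate pattern using only plain PyArrow (a recent version is assumed for `promote_options`):

```python
import pyarrow as pa

t1 = pa.table({"id": pa.array([1, 2], type=pa.int32()), "x": [1.0, 2.0]})
t2 = pa.table({"id": pa.array([3], type=pa.int64()), "y": ["extra"]})

# Build one target schema: int32 + int64 -> int64, and the union of all columns.
unified = pa.unify_schemas([t1.schema, t2.schema], promote_options="permissive")

aligned = []
for t in (t1, t2):
    # Add any column this table is missing as an all-null array of the right type,
    # then reorder and cast to the unified schema.
    for field in unified:
        if field.name not in t.schema.names:
            t = t.append_column(field, pa.nulls(t.num_rows, type=field.type))
    aligned.append(t.select(unified.names).cast(unified))

result = pa.concat_tables(aligned, promote_options="permissive")
print(result.schema)  # id: int64, x: double, y: string
```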
```diff
@@ -839,6 +927,7 @@ def _read_parquet_batches(
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> Generator[pa.Table | list[pa.Table], None, None]:
     """Process Parquet files in batches with performance optimizations.
@@ -892,7 +981,10 @@ def _read_parquet_batches(
     if not include_file_path and concat and batch_size is None:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-
+        table = pq.read_table(path, filesystem=self, **kwargs)
+        if opt_dtypes:
+            table = opt_dtype_pa(table, strict=False)
+        yield table
         return

     # Resolve path(s) to list
@@ -915,6 +1007,7 @@ def _read_parquet_batches(
                 batch_paths,
                 self=self,
                 include_file_path=include_file_path,
+                opt_dtypes=opt_dtypes,
                 n_jobs=-1,
                 backend="threading",
                 verbose=verbose,
@@ -923,14 +1016,28 @@ def _read_parquet_batches(
         else:
             batch_tables = [
                 _read_parquet_file(
-                    p,
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
                 )
                 for p in batch_paths
             ]

         if concat and batch_tables:
-
+            # Unify schemas before concatenation
+            if len(batch_tables) > 1:
+                schemas = [t.schema for t in batch_tables]
+                unified_schema = unify_schemas_pa(schemas)
+                batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
+            result = pa.concat_tables(batch_tables, promote_options="permissive")
+            if opt_dtypes:
+                result = opt_dtype_pa(result, strict=False)
+            yield result
         else:
+            if opt_dtypes and isinstance(batch_tables, list):
+                batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
             yield batch_tables


@@ -942,6 +1049,7 @@ def read_parquet(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> pa.Table | list[pa.Table] | Generator[pa.Table | list[pa.Table], None, None]:
     """Read Parquet data with advanced features and optimizations.
@@ -1415,7 +1523,7 @@ def write_parquet(
     data = to_pyarrow_table(data, concat=False, unique=False)

     if schema is not None:
-        data = data
+        data = cast_schema(data, schema)
     metadata = []
     pq.write_table(data, path, filesystem=self, metadata_collector=metadata, **kwargs)
     metadata = metadata[0]
@@ -1469,7 +1577,9 @@ def write_json(
         data = data.collect()
     if isinstance(data, pl.DataFrame):
         data = data.to_arrow()
-        data =
+        data = cast_schema(
+            data, convert_large_types_to_standard(data.schema)
+        ).to_pydict()
     elif isinstance(data, pd.DataFrame):
         data = pa.Table.from_pandas(data, preserve_index=False).to_pydict()
     elif isinstance(data, pa.Table):
```
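In the `write_json` hunk above, the Arrow table is cast through `convert_large_types_to_standard` before `to_pydict()`. That helper comes from `..utils.misc` and is not shown in this diff; a plausible reading is that it maps Arrow's `large_*` types (which Polars emits from `to_arrow()`) to their standard counterparts. A hedged, self-contained illustration of that idea in plain PyArrow:

```python
import polars as pl
import pyarrow as pa

table = pl.DataFrame({"name": ["a", "b"], "vals": [[1, 2], [3]]}).to_arrow()
print(table.schema)  # Polars typically emits large_string / large_list here


def to_standard(dtype: pa.DataType) -> pa.DataType:
    # Hypothetical equivalent of utils.misc.convert_large_types_to_standard,
    # applied type by type: downcast large_* types to their standard forms.
    if pa.types.is_large_string(dtype):
        return pa.string()
    if pa.types.is_large_list(dtype):
        return pa.list_(to_standard(dtype.value_type))
    return dtype


standard = pa.schema([pa.field(f.name, to_standard(f.type)) for f in table.schema])
print(table.cast(standard).to_pydict())  # plain Python dict, ready for JSON output
```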
flowerpower/pipeline/base.py
CHANGED
```diff
@@ -81,7 +81,9 @@ class BasePipeline:
         """
         if self._fs.is_cache_fs:
             self._fs.sync_cache()
-            modules_path = posixpath.join(
+            modules_path = posixpath.join(
+                self._fs._mapper.directory, self._fs.cache_path, self._pipelines_dir
+            )
         else:
             modules_path = posixpath.join(self._fs.path, self._pipelines_dir)
         if modules_path not in sys.path:
```
flowerpower/pipeline/registry.py
CHANGED
```diff
@@ -190,7 +190,9 @@ class PipelineRegistry:
         )

         # Sync filesystem if needed (using _fs)
-        if hasattr(self._fs, "sync_cache") and callable(
+        if hasattr(self._fs, "sync_cache") and callable(
+            getattr(self._fs, "sync_cache")
+        ):
             self._fs.sync_cache()

     def _get_files(self) -> list[str]:
@@ -447,14 +449,12 @@ class PipelineRegistry:
                 logger.warning(f"Could not get size for {path}: {e}")
                 size = "Error"

-            pipeline_info.append(
-
-
-
-
-
-                }
-            )
+            pipeline_info.append({
+                "name": name,
+                "path": path,
+                "mod_time": mod_time,
+                "size": size,
+            })

         if show:
             table = Table(title="Available Pipelines")
```
flowerpower/plugins/io/base.py
CHANGED
```diff
@@ -22,11 +22,11 @@ from sqlalchemy import create_engine, text
 from ...fs import get_filesystem
 from ...fs.ext import _dict_to_dataframe, path_to_glob
 from ...fs.storage_options import (AwsStorageOptions, AzureStorageOptions,
-
-
-                                   StorageOptions)
+                                   GcsStorageOptions, GitHubStorageOptions,
+                                   GitLabStorageOptions, StorageOptions)
 from ...utils.misc import convert_large_types_to_standard, to_pyarrow_table
 from .helpers.polars import pl
+from .helpers.pyarrow import opt_dtype
 from .helpers.sql import sql2polars_filter, sql2pyarrow_filter
 from .metadata import get_dataframe_metadata, get_pyarrow_dataset_metadata

@@ -236,12 +236,18 @@ class BaseFileReader(BaseFileIO, gc=False):
                 df=self._data,
                 path=self.path,
                 format=self.format,
-                num_files=pl.from_arrow(self._data.select(["file_path"])).select(
-
-                )[0, 0],
+                # num_files=pl.from_arrow(self._data.select(["file_path"])).select(
+                #     pl.n_unique("file_path")
+                # )[0, 0],
             )
             if not self.include_file_path:
-                self._data
+                if isinstance(self._data, pa.Table):
+                    self._data = self._data.drop("file_path")
+                elif isinstance(self._data, list | tuple):
+                    self._data = [
+                        df.drop("file_path") if isinstance(df, pa.Table) else df
+                        for df in self._data
+                    ]
         else:
             self._metadata = {}

```