flowerpower-0.11.5.8-py3-none-any.whl → flowerpower-0.11.6.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
- flowerpower/fs/ext.py +176 -34
- flowerpower/pipeline/base.py +3 -1
- flowerpower/pipeline/registry.py +9 -9
- flowerpower/plugins/io/base.py +501 -78
- flowerpower/plugins/io/helpers/polars.py +346 -124
- flowerpower/plugins/io/helpers/pyarrow.py +406 -0
- flowerpower/settings/general.py +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/METADATA +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/RECORD +13 -12
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/top_level.txt +0 -0
flowerpower/fs/ext.py
CHANGED
@@ -10,13 +10,19 @@ else:
     raise ImportError("To use this module, please install `flowerpower[io]`.")
 
 import orjson
-import polars as pl
+# import polars as pl
 import pyarrow as pa
 import pyarrow.dataset as pds
 import pyarrow.parquet as pq
 from fsspec import AbstractFileSystem
 from pydala.dataset import ParquetDataset
 
+from ..plugins.io.helpers.polars import opt_dtype as opt_dtype_pl
+from ..plugins.io.helpers.polars import pl
+# from ..plugins.io.helpers.polars import unify_schemas as unfify_schemas_pl
+from ..plugins.io.helpers.pyarrow import cast_schema
+from ..plugins.io.helpers.pyarrow import opt_dtype as opt_dtype_pa
+from ..plugins.io.helpers.pyarrow import unify_schemas as unify_schemas_pa
 from ..utils.misc import (_dict_to_dataframe, convert_large_types_to_standard,
                           run_parallel, to_pyarrow_table)
 
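The `opt_dtype` helpers imported above live in the new `flowerpower/plugins/io/helpers/` modules; their implementations are not part of this diff. For orientation only, a minimal sketch of what a dtype optimizer with this signature typically does — the function below is illustrative, not the package's actual code:

import polars as pl


def opt_dtype_sketch(df: pl.DataFrame, strict: bool = False) -> pl.DataFrame:
    # Shrink numeric columns to the narrowest dtype that still fits the data.
    out = df.with_columns(pl.all().shrink_dtype())
    # Try to parse string columns as integers; with strict=False a column is
    # left unchanged when parsing fails instead of raising.
    for name in [c for c, t in out.schema.items() if t == pl.Utf8]:
        try:
            out = out.with_columns(pl.col(name).cast(pl.Int64, strict=True))
        except Exception:
            if strict:
                raise
    return out


df = pl.DataFrame({"a": ["1", "2"], "b": [1.0, 2.0], "c": ["x", "y"]})
print(opt_dtype_sketch(df).schema)  # a: Int64, b: Float32, c: String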
@@ -172,6 +178,7 @@ def _read_json(
     as_dataframe: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs,
 ) -> dict | list[dict] | pl.DataFrame | list[pl.DataFrame]:
     """
@@ -186,6 +193,7 @@ def _read_json(
         as_dataframe: (bool, optional) If True, return a DataFrame. Defaults to True.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
         verbose: (bool, optional) If True, print verbose output. Defaults to False.
+        opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
         **kwargs: Additional keyword arguments.
 
     Returns:
@@ -236,8 +244,13 @@ def _read_json(
             ][0]
             for _data in data
         ]
+        if opt_dtypes:
+            data = [opt_dtype_pl(df, strict=False) for df in data]
         if concat:
-
+            result = pl.concat(data, how="diagonal_relaxed")
+            # if opt_dtypes:
+            #     result = opt_dtype_pl(result, strict=False)
+            return result
     return data
 
 
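The concat strategy used throughout these readers is polars' "diagonal_relaxed", which unions the column sets of the inputs (filling missing values with null) and widens mismatched dtypes to a common supertype instead of raising. A stock-polars illustration:

import polars as pl

a = pl.DataFrame({"id": [1, 2], "x": [1.0, 2.0]})
b = pl.DataFrame({"id": [3], "y": ["z"]})

# Column sets are unioned; "id" would also be widened if the two frames
# disagreed on its dtype.
print(pl.concat([a, b], how="diagonal_relaxed"))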
@@ -251,6 +264,7 @@ def _read_json_batches(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]:
     """Process JSON files in batches with optional parallel reading.
@@ -267,6 +281,7 @@ def _read_json_batches(
         concat: Combine files within each batch
         use_threads: Enable parallel file reading within batches
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments for DataFrame conversion
 
     Yields:
@@ -341,10 +356,16 @@ def _read_json_batches(
                     ][0]
                     for _data in batch_data
                 ]
-
+                if opt_dtypes:
+                    batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
                 if concat and len(batch_dfs) > 1:
-
+                    batch_df = pl.concat(batch_dfs, how="diagonal_relaxed")
+                    # if opt_dtypes:
+                    #     batch_df = opt_dtype_pl(batch_df, strict=False)
+                    yield batch_df
                 else:
+                    # if opt_dtypes:
+                    #     batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
                     yield batch_dfs
             else:
                 yield batch_data
@@ -360,6 +381,7 @@ def read_json(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     dict
@@ -389,6 +411,7 @@ def read_json(
         concat: Combine multiple files/batches into single result
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes for performance
         **kwargs: Additional arguments passed to DataFrame conversion
 
     Returns:
@@ -439,6 +462,7 @@ def read_json(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_json(
@@ -450,12 +474,17 @@ def read_json(
         concat=concat,
         use_threads=use_threads,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
 
 def _read_csv_file(
-    path: str,
+    path: str,
+    self: AbstractFileSystem,
+    include_file_path: bool = False,
+    opt_dtypes: bool = False,
+    **kwargs: Any,
 ) -> pl.DataFrame:
     """Read a single CSV file from any filesystem.
 
@@ -466,6 +495,7 @@ def _read_csv_file(
         path: Path to CSV file
         self: Filesystem instance to use for reading
        include_file_path: Add source filepath as a column
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments passed to pl.read_csv()
 
     Returns:
@@ -486,15 +516,21 @@ def _read_csv_file(
     with self.open(path) as f:
         df = pl.read_csv(f, **kwargs)
         if include_file_path:
-
+            df = df.with_columns(pl.lit(path).alias("file_path"))
+        if opt_dtypes:
+            df = opt_dtype_pl(df, strict=False)
         return df
 
 
 def read_csv_file(
-    self, path: str, include_file_path: bool = False, **kwargs
+    self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
 ) -> pl.DataFrame:
     return _read_csv_file(
-    path=path,
+        path=path,
+        self=self,
+        include_file_path=include_file_path,
+        opt_dtypes=opt_dtypes,
+        **kwargs,
     )
 
 
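Since `read_csv_file` receives `self: AbstractFileSystem` as its first parameter, these wrappers are evidently attached to fsspec filesystem objects as methods. A hedged usage sketch — it assumes importing `flowerpower.fs.ext` is what attaches the methods, and the file path is made up:

from fsspec import filesystem

import flowerpower.fs.ext  # noqa: F401  (assumed to attach read_csv_file et al.)

fs = filesystem("file")
# opt_dtypes (new in 0.11.6.x) runs the dtype optimizer on the parsed frame.
df = fs.read_csv_file("data/events.csv", include_file_path=True, opt_dtypes=True)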
@@ -505,6 +541,7 @@ def _read_csv(
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs,
 ) -> pl.DataFrame | list[pl.DataFrame]:
     """
@@ -517,6 +554,7 @@ def _read_csv(
         use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
         verbose: (bool, optional) If True, print verbose output. Defaults to False.
+        opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
         **kwargs: Additional keyword arguments.
 
     Returns:
@@ -533,21 +571,36 @@ def _read_csv(
                 path,
                 self=self,
                 include_file_path=include_file_path,
+                opt_dtypes=opt_dtypes,
                 n_jobs=-1,
                 backend="threading",
                 verbose=verbose,
                 **kwargs,
             )
-
-
-
-
+        else:
+            dfs = [
+                _read_csv_file(
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
+                )
+                for p in path
+            ]
     else:
         dfs = _read_csv_file(
-            path,
+            path,
+            self=self,
+            include_file_path=include_file_path,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )
     if concat:
-
+        result = pl.concat(dfs, how="diagonal_relaxed")
+        # if opt_dtypes:
+        #     result = opt_dtype_pl(result, strict=False)
+        return result
     return dfs
 
 
@@ -559,6 +612,7 @@ def _read_csv_batches(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> Generator[pl.DataFrame | list[pl.DataFrame], None, None]:
     """Process CSV files in batches with optional parallel reading.
@@ -573,6 +627,7 @@ def _read_csv_batches(
         concat: Combine files within each batch
         use_threads: Enable parallel file reading within batches
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments passed to pl.read_csv()
 
     Yields:
@@ -624,18 +679,29 @@ def _read_csv_batches(
                 n_jobs=-1,
                 backend="threading",
                 verbose=verbose,
+                opt_dtypes=opt_dtypes,
                 **kwargs,
             )
         else:
             batch_dfs = [
                 _read_csv_file(
-                    p,
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
                 )
                 for p in batch_paths
             ]
 
+        # if opt_dtypes:
+        #     batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
+
         if concat and len(batch_dfs) > 1:
-
+            result = pl.concat(batch_dfs, how="diagonal_relaxed")
+            # if opt_dtypes:
+            #     result = opt_dtype_pl(result, strict=False)
+            yield result
         else:
             yield batch_dfs
 
@@ -648,6 +714,7 @@ def read_csv(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     pl.DataFrame
@@ -716,6 +783,7 @@ def read_csv(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_csv(
@@ -725,12 +793,17 @@ def read_csv(
         concat=concat,
         use_threads=use_threads,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
 
 def _read_parquet_file(
-    path: str,
+    path: str,
+    self: AbstractFileSystem,
+    include_file_path: bool = False,
+    opt_dtypes: bool = False,
+    **kwargs: Any,
 ) -> pa.Table:
     """Read a single Parquet file from any filesystem.
 
@@ -759,15 +832,21 @@ def _read_parquet_file(
     """
     table = pq.read_table(path, filesystem=self, **kwargs)
     if include_file_path:
-
+        table = table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
+    if opt_dtypes:
+        table = opt_dtype_pa(table, strict=False)
     return table
 
 
 def read_parquet_file(
-    self, path: str, include_file_path: bool = False, **kwargs
+    self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
 ) -> pa.Table:
     return _read_parquet_file(
-    path=path,
+        path=path,
+        self=self,
+        include_file_path=include_file_path,
+        opt_dtypes=opt_dtypes,
+        **kwargs,
     )
 
 
@@ -778,6 +857,7 @@ def _read_parquet(
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs,
 ) -> pa.Table | list[pa.Table]:
     """
@@ -797,7 +877,8 @@ def _read_parquet(
     if not include_file_path and concat:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-
+        table = _read_parquet_file(path, self=self, opt_dtypes=opt_dtypes, **kwargs)
+        return table
     else:
         if isinstance(path, str):
             path = path_to_glob(path, format="parquet")
@@ -805,30 +886,54 @@ def _read_parquet(
 
         if isinstance(path, list):
             if use_threads:
-
+                tables = run_parallel(
                     _read_parquet_file,
                     path,
                     self=self,
                     include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
                     n_jobs=-1,
                     backend="threading",
                     verbose=verbose,
                     **kwargs,
                 )
             else:
-
+                tables = [
                     _read_parquet_file(
-                        p,
+                        p,
+                        self=self,
+                        include_file_path=include_file_path,
+                        opt_dtypes=opt_dtypes,
+                        **kwargs,
                     )
                     for p in path
                 ]
         else:
-
-            path=path,
+            tables = _read_parquet_file(
+                path=path,
+                self=self,
+                include_file_path=include_file_path,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
             )
     if concat:
-
-
+        # Unify schemas before concatenation if opt_dtypes or multiple tables
+        if isinstance(tables, list):
+            if len(tables) > 1:
+                schemas = [t.schema for t in tables]
+                unified_schema = unify_schemas_pa(schemas)
+                tables = [cast_schema(t, unified_schema) for t in tables]
+            result = pa.concat_tables(tables, promote_options="permissive")
+            # if opt_dtypes:
+            #     result = opt_dtype_pa(result, strict=False)
+            return result
+        elif isinstance(tables, pa.Table):
+            # if opt_dtypes:
+            #     tables = opt_dtype_pa(tables, strict=False)
+            return tables
+        else:
+            return pa.concat_tables(tables, promote_options="permissive")
+    return tables
 
 
 def _read_parquet_batches(
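`unify_schemas_pa` and `cast_schema` come from the new `helpers/pyarrow.py` added in this release (406 lines, not expanded in this diff). The pre-concat alignment they perform can be illustrated with stock pyarrow, which is presumably what they build on (`promote_options` on `pa.unify_schemas` needs pyarrow >= 14):

import pyarrow as pa

t1 = pa.table({"id": pa.array([1, 2], type=pa.int32())})
t2 = pa.table({"id": pa.array([3], type=pa.int64())})

# Compute one schema covering both tables (int32 is promoted to int64),
# cast each table to it, then concatenate without schema surprises.
schema = pa.unify_schemas([t1.schema, t2.schema], promote_options="permissive")
aligned = [t.cast(schema) for t in (t1, t2)]
print(pa.concat_tables(aligned).schema)  # id: int64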
@@ -839,6 +944,7 @@ def _read_parquet_batches(
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> Generator[pa.Table | list[pa.Table], None, None]:
     """Process Parquet files in batches with performance optimizations.
@@ -892,7 +998,10 @@ def _read_parquet_batches(
     if not include_file_path and concat and batch_size is None:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-
+        table = _read_parquet_file(
+            path=path, self=self, opt_dtypes=opt_dtypes, **kwargs
+        )
+        yield table
         return
 
     # Resolve path(s) to list
@@ -902,7 +1011,11 @@ def _read_parquet_batches(
 
     if not isinstance(path, list):
         yield _read_parquet_file(
-    path=path,
+            path=path,
+            self=self,
+            include_file_path=include_file_path,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )
         return
 
@@ -915,6 +1028,7 @@ def _read_parquet_batches(
             batch_paths,
             self=self,
             include_file_path=include_file_path,
+            opt_dtypes=opt_dtypes,
             n_jobs=-1,
             backend="threading",
             verbose=verbose,
@@ -923,14 +1037,28 @@ def _read_parquet_batches(
         else:
             batch_tables = [
                 _read_parquet_file(
-                    p,
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
                 )
                 for p in batch_paths
             ]
 
         if concat and batch_tables:
-
+            # Unify schemas before concatenation
+            if len(batch_tables) > 1:
+                schemas = [t.schema for t in batch_tables]
+                unified_schema = unify_schemas_pa(schemas)
+                batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
+            result = pa.concat_tables(batch_tables, promote_options="permissive")
+            # if opt_dtypes:
+            #     result = opt_dtype_pa(result, strict=False)
+            yield result
         else:
+            # if opt_dtypes and isinstance(batch_tables, list):
+            #     batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
             yield batch_tables
 
@@ -942,6 +1070,7 @@ def read_parquet(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> pa.Table | list[pa.Table] | Generator[pa.Table | list[pa.Table], None, None]:
     """Read Parquet data with advanced features and optimizations.
@@ -969,6 +1098,7 @@ def read_parquet(
         concat: Combine multiple files/batches into single Table
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize Table dtypes for performance
         **kwargs: Additional arguments passed to pq.read_table()
 
     Returns:
@@ -1011,6 +1141,7 @@ def read_parquet(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_parquet(
@@ -1020,6 +1151,7 @@ def read_parquet(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
@@ -1034,6 +1166,7 @@ def read_files(
     jsonlines: bool = False,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     pl.DataFrame
@@ -1067,6 +1200,7 @@ def read_files(
         jsonlines: For JSON format, whether to read as JSON Lines
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame/Arrow Table dtypes for performance
         **kwargs: Additional format-specific arguments
 
     Returns:
@@ -1116,6 +1250,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return read_json(
@@ -1126,6 +1261,7 @@ def read_files(
         concat=concat,
         use_threads=use_threads,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 elif format == "csv":
@@ -1138,6 +1274,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return read_csv(
@@ -1147,6 +1284,7 @@ def read_files(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 elif format == "parquet":
@@ -1159,6 +1297,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return read_parquet(
@@ -1168,6 +1307,7 @@ def read_files(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
@@ -1415,7 +1555,7 @@ def write_parquet(
     data = to_pyarrow_table(data, concat=False, unique=False)
 
     if schema is not None:
-        data = data
+        data = cast_schema(data, schema)
     metadata = []
     pq.write_table(data, path, filesystem=self, metadata_collector=metadata, **kwargs)
     metadata = metadata[0]
@@ -1469,7 +1609,9 @@ def write_json(
         data = data.collect()
     if isinstance(data, pl.DataFrame):
         data = data.to_arrow()
-        data =
+        data = cast_schema(
+            data, convert_large_types_to_standard(data.schema)
+        ).to_pydict()
     elif isinstance(data, pd.DataFrame):
         data = pa.Table.from_pandas(data, preserve_index=False).to_pydict()
     elif isinstance(data, pa.Table):
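`write_json` now normalizes the Arrow schema before `.to_pydict()`: polars' `to_arrow()` emits `large_string`/`large_list` columns, and `convert_large_types_to_standard` (from `..utils.misc`, not shown in this diff) presumably maps those back to the standard variants. An illustrative stand-in using stock pyarrow:

import pyarrow as pa


def convert_large_types_sketch(schema: pa.Schema) -> pa.Schema:
    # Map large_* types to their standard counterparts, recursing into lists.
    def fix(t: pa.DataType) -> pa.DataType:
        if pa.types.is_large_string(t):
            return pa.string()
        if pa.types.is_large_binary(t):
            return pa.binary()
        if pa.types.is_large_list(t):
            return pa.list_(fix(t.value_type))
        return t

    return pa.schema(
        [pa.field(f.name, fix(f.type), nullable=f.nullable) for f in schema]
    )


tbl = pa.table({"s": pa.array(["a"], type=pa.large_string())})
print(tbl.cast(convert_large_types_sketch(tbl.schema)).schema)  # s: string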
flowerpower/pipeline/base.py
CHANGED
@@ -81,7 +81,9 @@ class BasePipeline:
         """
         if self._fs.is_cache_fs:
             self._fs.sync_cache()
-            modules_path = posixpath.join(
+            modules_path = posixpath.join(
+                self._fs._mapper.directory, self._fs.cache_path, self._pipelines_dir
+            )
         else:
             modules_path = posixpath.join(self._fs.path, self._pipelines_dir)
         if modules_path not in sys.path:
flowerpower/pipeline/registry.py
CHANGED
@@ -190,7 +190,9 @@ class PipelineRegistry:
         )
 
         # Sync filesystem if needed (using _fs)
-        if hasattr(self._fs, "sync_cache") and callable(
+        if hasattr(self._fs, "sync_cache") and callable(
+            getattr(self._fs, "sync_cache")
+        ):
             self._fs.sync_cache()
 
     def _get_files(self) -> list[str]:
@@ -447,14 +449,12 @@ class PipelineRegistry:
                 logger.warning(f"Could not get size for {path}: {e}")
                 size = "Error"
 
-            pipeline_info.append(
-                {
-                    "name": name,
-                    "path": path,
-                    "mod_time": mod_time,
-                    "size": size,
-                }
-            )
+            pipeline_info.append({
+                "name": name,
+                "path": path,
+                "mod_time": mod_time,
+                "size": size,
+            })
 
         if show:
             table = Table(title="Available Pipelines")