FlowerPower 0.11.5.8__py3-none-any.whl → 0.11.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
flowerpower/fs/ext.py CHANGED
@@ -10,13 +10,19 @@ else:
  raise ImportError("To use this module, please install `flowerpower[io]`.")

  import orjson
- import polars as pl
+ # import polars as pl
  import pyarrow as pa
  import pyarrow.dataset as pds
  import pyarrow.parquet as pq
  from fsspec import AbstractFileSystem
  from pydala.dataset import ParquetDataset

+ from ..plugins.io.helpers.polars import opt_dtype as opt_dtype_pl
+ from ..plugins.io.helpers.polars import pl
+ # from ..plugins.io.helpers.polars import unify_schemas as unfify_schemas_pl
+ from ..plugins.io.helpers.pyarrow import cast_schema
+ from ..plugins.io.helpers.pyarrow import opt_dtype as opt_dtype_pa
+ from ..plugins.io.helpers.pyarrow import unify_schemas as unify_schemas_pa
  from ..utils.misc import (_dict_to_dataframe, convert_large_types_to_standard,
  run_parallel, to_pyarrow_table)

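The `opt_dtype` helpers imported above from `flowerpower.plugins.io.helpers` drive the new `opt_dtypes` flags that appear throughout the hunks below; their implementations are not part of this diff. As a rough, hypothetical stand-in for what "optimizing dtypes" can mean on the Polars side (not the actual `opt_dtype_pl`):

    import polars as pl

    def optimize_string_columns(df: pl.DataFrame) -> pl.DataFrame:
        # Hypothetical stand-in for opt_dtype_pl: cast string columns that parse
        # cleanly as integers and leave everything else untouched.
        for name, dtype in zip(df.columns, df.dtypes):
            if dtype == pl.Utf8:
                casted = df[name].cast(pl.Int64, strict=False)
                if casted.null_count() == df[name].null_count():  # every value parsed
                    df = df.with_columns(casted.alias(name))
        return df
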
@@ -172,6 +178,7 @@ def _read_json(
  as_dataframe: bool = True,
  concat: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs,
  ) -> dict | list[dict] | pl.DataFrame | list[pl.DataFrame]:
  """
@@ -186,6 +193,7 @@ def _read_json(
  as_dataframe: (bool, optional) If True, return a DataFrame. Defaults to True.
  concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
  verbose: (bool, optional) If True, print verbose output. Defaults to False.
+ opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
  **kwargs: Additional keyword arguments.

  Returns:
@@ -236,8 +244,13 @@ def _read_json(
  ][0]
  for _data in data
  ]
+ if opt_dtypes:
+ data = [opt_dtype_pl(df, strict=False) for df in data]
  if concat:
- return pl.concat(data, how="diagonal_relaxed")
+ result = pl.concat(data, how="diagonal_relaxed")
+ # if opt_dtypes:
+ # result = opt_dtype_pl(result, strict=False)
+ return result
  return data


@@ -251,6 +264,7 @@ def _read_json_batches(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]:
  """Process JSON files in batches with optional parallel reading.
@@ -267,6 +281,7 @@ def _read_json_batches(
  concat: Combine files within each batch
  use_threads: Enable parallel file reading within batches
  verbose: Print progress information
+ opt_dtypes: Optimize DataFrame dtypes
  **kwargs: Additional arguments for DataFrame conversion

  Yields:
@@ -341,10 +356,16 @@ def _read_json_batches(
  ][0]
  for _data in batch_data
  ]
-
+ if opt_dtypes:
+ batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
  if concat and len(batch_dfs) > 1:
- yield pl.concat(batch_dfs, how="diagonal_relaxed")
+ batch_df = pl.concat(batch_dfs, how="diagonal_relaxed")
+ # if opt_dtypes:
+ # batch_df = opt_dtype_pl(batch_df, strict=False)
+ yield batch_df
  else:
+ # if opt_dtypes:
+ # batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
  yield batch_dfs
  else:
  yield batch_data
@@ -360,6 +381,7 @@ def read_json(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> (
  dict
@@ -389,6 +411,7 @@ def read_json(
  concat: Combine multiple files/batches into single result
  use_threads: Enable parallel file reading
  verbose: Print progress information
+ opt_dtypes: Optimize DataFrame dtypes for performance
  **kwargs: Additional arguments passed to DataFrame conversion

  Returns:
@@ -439,6 +462,7 @@ def read_json(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  return _read_json(
@@ -450,12 +474,17 @@ def read_json(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )


  def _read_csv_file(
- path: str, self: AbstractFileSystem, include_file_path: bool = False, **kwargs: Any
+ path: str,
+ self: AbstractFileSystem,
+ include_file_path: bool = False,
+ opt_dtypes: bool = False,
+ **kwargs: Any,
  ) -> pl.DataFrame:
  """Read a single CSV file from any filesystem.

@@ -466,6 +495,7 @@ def _read_csv_file(
  path: Path to CSV file
  self: Filesystem instance to use for reading
  include_file_path: Add source filepath as a column
+ opt_dtypes: Optimize DataFrame dtypes
  **kwargs: Additional arguments passed to pl.read_csv()

  Returns:
@@ -486,15 +516,21 @@ def _read_csv_file(
  with self.open(path) as f:
  df = pl.read_csv(f, **kwargs)
  if include_file_path:
- return df.with_columns(pl.lit(path).alias("file_path"))
+ df = df.with_columns(pl.lit(path).alias("file_path"))
+ if opt_dtypes:
+ df = opt_dtype_pl(df, strict=False)
  return df


  def read_csv_file(
- self, path: str, include_file_path: bool = False, **kwargs
+ self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
  ) -> pl.DataFrame:
  return _read_csv_file(
- path=path, self=self, include_file_path=include_file_path, **kwargs
+ path=path,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )

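The per-file readers remain plain module functions that take the filesystem as `self`, so they can also be called directly. A minimal usage sketch (the CSV path is illustrative; `flowerpower[io]` must be installed for this module to import):

    import fsspec
    from flowerpower.fs.ext import read_csv_file

    fs = fsspec.filesystem("file")
    # include_file_path adds a "file_path" column; opt_dtypes runs the new dtype optimization.
    df = read_csv_file(fs, "data/orders.csv", include_file_path=True, opt_dtypes=True)
    print(df.schema)
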
@@ -505,6 +541,7 @@ def _read_csv(
  use_threads: bool = True,
  concat: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs,
  ) -> pl.DataFrame | list[pl.DataFrame]:
  """
@@ -517,6 +554,7 @@ def _read_csv(
  use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
  concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
  verbose: (bool, optional) If True, print verbose output. Defaults to False.
+ opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
  **kwargs: Additional keyword arguments.

  Returns:
@@ -533,21 +571,36 @@ def _read_csv(
  path,
  self=self,
  include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
  n_jobs=-1,
  backend="threading",
  verbose=verbose,
  **kwargs,
  )
- dfs = [
- _read_csv_file(p, self=self, include_file_path=include_file_path, **kwargs)
- for p in path
- ]
+ else:
+ dfs = [
+ _read_csv_file(
+ p,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
+ )
+ for p in path
+ ]
  else:
  dfs = _read_csv_file(
- path, self=self, include_file_path=include_file_path, **kwargs
+ path,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  if concat:
- return pl.concat(dfs, how="diagonal_relaxed")
+ result = pl.concat(dfs, how="diagonal_relaxed")
+ # if opt_dtypes:
+ # result = opt_dtype_pl(result, strict=False)
+ return result
  return dfs


@@ -559,6 +612,7 @@ def _read_csv_batches(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> Generator[pl.DataFrame | list[pl.DataFrame], None, None]:
  """Process CSV files in batches with optional parallel reading.
@@ -573,6 +627,7 @@ def _read_csv_batches(
  concat: Combine files within each batch
  use_threads: Enable parallel file reading within batches
  verbose: Print progress information
+ opt_dtypes: Optimize DataFrame dtypes
  **kwargs: Additional arguments passed to pl.read_csv()

  Yields:
@@ -624,18 +679,29 @@ def _read_csv_batches(
  n_jobs=-1,
  backend="threading",
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  else:
  batch_dfs = [
  _read_csv_file(
- p, self=self, include_file_path=include_file_path, **kwargs
+ p,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  for p in batch_paths
  ]

+ # if opt_dtypes:
+ # batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
+
  if concat and len(batch_dfs) > 1:
- yield pl.concat(batch_dfs, how="diagonal_relaxed")
+ result = pl.concat(batch_dfs, how="diagonal_relaxed")
+ # if opt_dtypes:
+ # result = opt_dtype_pl(result, strict=False)
+ yield result
  else:
  yield batch_dfs

@@ -648,6 +714,7 @@ def read_csv(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> (
  pl.DataFrame
@@ -716,6 +783,7 @@ def read_csv(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  return _read_csv(
@@ -725,12 +793,17 @@ def read_csv(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )


  def _read_parquet_file(
- path: str, self: AbstractFileSystem, include_file_path: bool = False, **kwargs: Any
+ path: str,
+ self: AbstractFileSystem,
+ include_file_path: bool = False,
+ opt_dtypes: bool = False,
+ **kwargs: Any,
  ) -> pa.Table:
  """Read a single Parquet file from any filesystem.

@@ -759,15 +832,21 @@ def _read_parquet_file(
  """
  table = pq.read_table(path, filesystem=self, **kwargs)
  if include_file_path:
- return table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
+ table = table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
+ if opt_dtypes:
+ table = opt_dtype_pa(table, strict=False)
  return table


  def read_parquet_file(
- self, path: str, include_file_path: bool = False, **kwargs
+ self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
  ) -> pa.Table:
  return _read_parquet_file(
- path=path, self=self, include_file_path=include_file_path, **kwargs
+ path=path,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )


@@ -778,6 +857,7 @@ def _read_parquet(
  use_threads: bool = True,
  concat: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs,
  ) -> pa.Table | list[pa.Table]:
  """
@@ -797,7 +877,8 @@ def _read_parquet(
  if not include_file_path and concat:
  if isinstance(path, str):
  path = path.replace("**", "").replace("*.parquet", "")
- return pq.read_table(path, filesystem=self, **kwargs)
+ table = _read_parquet_file(path, self=self, opt_dtypes=opt_dtypes, **kwargs)
+ return table
  else:
  if isinstance(path, str):
  path = path_to_glob(path, format="parquet")
@@ -805,30 +886,54 @@ def _read_parquet(

  if isinstance(path, list):
  if use_threads:
- table = run_parallel(
+ tables = run_parallel(
  _read_parquet_file,
  path,
  self=self,
  include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
  n_jobs=-1,
  backend="threading",
  verbose=verbose,
  **kwargs,
  )
  else:
- table = [
+ tables = [
  _read_parquet_file(
- p, self=self, include_file_path=include_file_path, **kwargs
+ p,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  for p in path
  ]
  else:
- table = _read_parquet_file(
- path=path, self=self, include_file_path=include_file_path, **kwargs
+ tables = _read_parquet_file(
+ path=path,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  if concat:
- return pa.concat_tables(table, promote_options="permissive")
- return table
+ # Unify schemas before concatenation if opt_dtypes or multiple tables
+ if isinstance(tables, list):
+ if len(tables) > 1:
+ schemas = [t.schema for t in tables]
+ unified_schema = unify_schemas_pa(schemas)
+ tables = [cast_schema(t, unified_schema) for t in tables]
+ result = pa.concat_tables(tables, promote_options="permissive")
+ # if opt_dtypes:
+ # result = opt_dtype_pa(result, strict=False)
+ return result
+ elif isinstance(tables, pa.Table):
+ # if opt_dtypes:
+ # tables = opt_dtype_pa(tables, strict=False)
+ return tables
+ else:
+ return pa.concat_tables(tables, promote_options="permissive")
+ return tables


  def _read_parquet_batches(
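The reworked concat branch above now aligns schemas before `pa.concat_tables`. Condensed into a standalone sketch, using the helper call signatures exactly as they appear in the diff (the helpers' internals are not shown here):

    import pyarrow as pa
    from flowerpower.plugins.io.helpers.pyarrow import cast_schema
    from flowerpower.plugins.io.helpers.pyarrow import unify_schemas as unify_schemas_pa

    def concat_unified(tables: list[pa.Table]) -> pa.Table:
        # Mirror of the new concat path: unify per-file schemas, cast each table
        # to the unified schema, then concatenate permissively.
        if len(tables) > 1:
            unified = unify_schemas_pa([t.schema for t in tables])
            tables = [cast_schema(t, unified) for t in tables]
        return pa.concat_tables(tables, promote_options="permissive")
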
@@ -839,6 +944,7 @@ def _read_parquet_batches(
  use_threads: bool = True,
  concat: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> Generator[pa.Table | list[pa.Table], None, None]:
  """Process Parquet files in batches with performance optimizations.
@@ -892,7 +998,10 @@ def _read_parquet_batches(
  if not include_file_path and concat and batch_size is None:
  if isinstance(path, str):
  path = path.replace("**", "").replace("*.parquet", "")
- yield pq.read_table(path, filesystem=self, **kwargs)
+ table = _read_parquet_file(
+ path=path, self=self, opt_dtypes=opt_dtypes, **kwargs
+ )
+ yield table
  return

  # Resolve path(s) to list
@@ -902,7 +1011,11 @@ def _read_parquet_batches(

  if not isinstance(path, list):
  yield _read_parquet_file(
- path=path, self=self, include_file_path=include_file_path, **kwargs
+ path=path,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  return

@@ -915,6 +1028,7 @@ def _read_parquet_batches(
  batch_paths,
  self=self,
  include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
  n_jobs=-1,
  backend="threading",
  verbose=verbose,
@@ -923,14 +1037,28 @@ def _read_parquet_batches(
  else:
  batch_tables = [
  _read_parquet_file(
- p, self=self, include_file_path=include_file_path, **kwargs
+ p,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  for p in batch_paths
  ]

  if concat and batch_tables:
- yield pa.concat_tables(batch_tables, promote_options="permissive")
+ # Unify schemas before concatenation
+ if len(batch_tables) > 1:
+ schemas = [t.schema for t in batch_tables]
+ unified_schema = unify_schemas_pa(schemas)
+ batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
+ result = pa.concat_tables(batch_tables, promote_options="permissive")
+ # if opt_dtypes:
+ # result = opt_dtype_pa(result, strict=False)
+ yield result
  else:
+ # if opt_dtypes and isinstance(batch_tables, list):
+ # batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
  yield batch_tables

@@ -942,6 +1070,7 @@ def read_parquet(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> pa.Table | list[pa.Table] | Generator[pa.Table | list[pa.Table], None, None]:
  """Read Parquet data with advanced features and optimizations.
@@ -969,6 +1098,7 @@ def read_parquet(
  concat: Combine multiple files/batches into single Table
  use_threads: Enable parallel file reading
  verbose: Print progress information
+ opt_dtypes: Optimize Table dtypes for performance
  **kwargs: Additional arguments passed to pq.read_table()

  Returns:
@@ -1011,6 +1141,7 @@ def read_parquet(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  return _read_parquet(
@@ -1020,6 +1151,7 @@ def read_parquet(
  use_threads=use_threads,
  concat=concat,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )

@@ -1034,6 +1166,7 @@ def read_files(
  jsonlines: bool = False,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> (
  pl.DataFrame
@@ -1067,6 +1200,7 @@ def read_files(
  jsonlines: For JSON format, whether to read as JSON Lines
  use_threads: Enable parallel file reading
  verbose: Print progress information
+ opt_dtypes: Optimize DataFrame/Arrow Table dtypes for performance
  **kwargs: Additional format-specific arguments

  Returns:
@@ -1116,6 +1250,7 @@ def read_files(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  return read_json(
@@ -1126,6 +1261,7 @@ def read_files(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  elif format == "csv":
@@ -1138,6 +1274,7 @@ def read_files(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  return read_csv(
@@ -1147,6 +1284,7 @@ def read_files(
  use_threads=use_threads,
  concat=concat,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  elif format == "parquet":
@@ -1159,6 +1297,7 @@ def read_files(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  return read_parquet(
@@ -1168,6 +1307,7 @@ def read_files(
  use_threads=use_threads,
  concat=concat,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )

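`read_files` forwards the new flag to whichever format-specific reader it dispatches to. A hedged usage sketch; the `path` and `format` parameters are assumed to keep their pre-existing names and positions (those signature lines are unchanged and therefore not shown in this diff):

    import fsspec
    from flowerpower.fs.ext import read_files

    fs = fsspec.filesystem("file")
    # Hypothetical glob; with concat=True a single pyarrow.Table is expected back.
    table = read_files(fs, "exports/*.parquet", format="parquet", concat=True, opt_dtypes=True)
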
@@ -1415,7 +1555,7 @@ def write_parquet(
  data = to_pyarrow_table(data, concat=False, unique=False)

  if schema is not None:
- data = data.cast(schema)
+ data = cast_schema(data, schema)
  metadata = []
  pq.write_table(data, path, filesystem=self, metadata_collector=metadata, **kwargs)
  metadata = metadata[0]
@@ -1469,7 +1609,9 @@ def write_json(
  data = data.collect()
  if isinstance(data, pl.DataFrame):
  data = data.to_arrow()
- data = data.cast(convert_large_types_to_standard(data.schema)).to_pydict()
+ data = cast_schema(
+ data, convert_large_types_to_standard(data.schema)
+ ).to_pydict()
  elif isinstance(data, pd.DataFrame):
  data = pa.Table.from_pandas(data, preserve_index=False).to_pydict()
  elif isinstance(data, pa.Table):
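Both writers now route schema casts through `cast_schema` instead of calling `pa.Table.cast` directly. The helper is not shown in this diff; a plausible motivation (an assumption) is that `Table.cast` is strict about field names and order, as the short PyArrow-only example below illustrates:

    import pyarrow as pa

    table = pa.table({"b": [1, 2], "a": ["x", "y"]})
    target = pa.schema([("a", pa.string()), ("b", pa.int64())])

    # Table.cast requires the target field names to match the table's order,
    # so this raises ValueError; cast_schema presumably handles such mismatches.
    try:
        table.cast(target)
    except ValueError as exc:
        print(exc)
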
@@ -81,7 +81,9 @@ class BasePipeline:
  """
  if self._fs.is_cache_fs:
  self._fs.sync_cache()
- modules_path = posixpath.join(self._fs._mapper.directory, self._fs.cache_path, self._pipelines_dir)
+ modules_path = posixpath.join(
+ self._fs._mapper.directory, self._fs.cache_path, self._pipelines_dir
+ )
  else:
  modules_path = posixpath.join(self._fs.path, self._pipelines_dir)
  if modules_path not in sys.path:
@@ -190,7 +190,9 @@ class PipelineRegistry:
  )

  # Sync filesystem if needed (using _fs)
- if hasattr(self._fs, "sync_cache") and callable(getattr(self._fs, "sync_cache")):
+ if hasattr(self._fs, "sync_cache") and callable(
+ getattr(self._fs, "sync_cache")
+ ):
  self._fs.sync_cache()

  def _get_files(self) -> list[str]:
@@ -447,14 +449,12 @@ class PipelineRegistry:
  logger.warning(f"Could not get size for {path}: {e}")
  size = "Error"

- pipeline_info.append(
- {
- "name": name,
- "path": path,
- "mod_time": mod_time,
- "size": size,
- }
- )
+ pipeline_info.append({
+ "name": name,
+ "path": path,
+ "mod_time": mod_time,
+ "size": size,
+ })

  if show:
  table = Table(title="Available Pipelines")