FlowerPower 0.11.6__py3-none-any.whl → 0.11.6.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/fs/ext.py +58 -26
- flowerpower/plugins/io/base.py +498 -80
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/METADATA +1 -1
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/RECORD +8 -8
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/top_level.txt +0 -0
flowerpower/fs/ext.py
CHANGED
@@ -193,6 +193,7 @@ def _read_json(
         as_dataframe: (bool, optional) If True, return a DataFrame. Defaults to True.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
         verbose: (bool, optional) If True, print verbose output. Defaults to False.
+        opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
         **kwargs: Additional keyword arguments.

     Returns:
@@ -247,8 +248,8 @@ def _read_json(
             data = [opt_dtype_pl(df, strict=False) for df in data]
     if concat:
         result = pl.concat(data, how="diagonal_relaxed")
-        if opt_dtypes:
-            result = opt_dtype_pl(result, strict=False)
+        # if opt_dtypes:
+        #     result = opt_dtype_pl(result, strict=False)
         return result
     return data

@@ -280,6 +281,7 @@ def _read_json_batches(
         concat: Combine files within each batch
         use_threads: Enable parallel file reading within batches
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments for DataFrame conversion

     Yields:
@@ -354,10 +356,16 @@ def _read_json_batches(
                 ][0]
                 for _data in batch_data
             ]
-
+            if opt_dtypes:
+                batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
             if concat and len(batch_dfs) > 1:
-
+                batch_df = pl.concat(batch_dfs, how="diagonal_relaxed")
+                # if opt_dtypes:
+                #     batch_df = opt_dtype_pl(batch_df, strict=False)
+                yield batch_df
             else:
+                # if opt_dtypes:
+                #     batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
                 yield batch_dfs
         else:
             yield batch_data
@@ -403,6 +411,7 @@ def read_json(
         concat: Combine multiple files/batches into single result
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes for performance
         **kwargs: Additional arguments passed to DataFrame conversion

     Returns:
@@ -486,6 +495,7 @@ def _read_csv_file(
         path: Path to CSV file
         self: Filesystem instance to use for reading
         include_file_path: Add source filepath as a column
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments passed to pl.read_csv()

     Returns:
@@ -544,6 +554,7 @@ def _read_csv(
         use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
         verbose: (bool, optional) If True, print verbose output. Defaults to False.
+        opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
         **kwargs: Additional keyword arguments.

     Returns:
@@ -587,8 +598,8 @@ def _read_csv(
         )
     if concat:
         result = pl.concat(dfs, how="diagonal_relaxed")
-        if opt_dtypes:
-            result = opt_dtype_pl(result, strict=False)
+        # if opt_dtypes:
+        #     result = opt_dtype_pl(result, strict=False)
         return result
     return dfs

@@ -616,6 +627,7 @@ def _read_csv_batches(
         concat: Combine files within each batch
         use_threads: Enable parallel file reading within batches
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments passed to pl.read_csv()

     Yields:
@@ -667,23 +679,28 @@ def _read_csv_batches(
                 n_jobs=-1,
                 backend="threading",
                 verbose=verbose,
+                opt_dtypes=opt_dtypes,
                 **kwargs,
             )
         else:
             batch_dfs = [
                 _read_csv_file(
-                    p,
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
                 )
                 for p in batch_paths
             ]

-        if opt_dtypes:
-            batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
+        # if opt_dtypes:
+        #     batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]

         if concat and len(batch_dfs) > 1:
             result = pl.concat(batch_dfs, how="diagonal_relaxed")
-            if opt_dtypes:
-                result = opt_dtype_pl(result, strict=False)
+            # if opt_dtypes:
+            #     result = opt_dtype_pl(result, strict=False)
             yield result
         else:
             yield batch_dfs
@@ -766,6 +783,7 @@ def read_csv(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_csv(
@@ -775,6 +793,7 @@ def read_csv(
         concat=concat,
         use_threads=use_threads,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )

@@ -858,9 +877,7 @@ def _read_parquet(
     if not include_file_path and concat:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-        table =
-        if opt_dtypes:
-            table = opt_dtype_pa(table, strict=False)
+        table = _read_parquet_file(path, self=self, opt_dtypes=opt_dtypes, **kwargs)
         return table
     else:
         if isinstance(path, str):
@@ -907,12 +924,12 @@ def _read_parquet(
             unified_schema = unify_schemas_pa(schemas)
             tables = [cast_schema(t, unified_schema) for t in tables]
             result = pa.concat_tables(tables, promote_options="permissive")
-            if opt_dtypes:
-                result = opt_dtype_pa(result, strict=False)
+            # if opt_dtypes:
+            #     result = opt_dtype_pa(result, strict=False)
             return result
         elif isinstance(tables, pa.Table):
-            if opt_dtypes:
-                tables = opt_dtype_pa(tables, strict=False)
+            # if opt_dtypes:
+            #     tables = opt_dtype_pa(tables, strict=False)
             return tables
         else:
             return pa.concat_tables(tables, promote_options="permissive")
@@ -981,9 +998,9 @@ def _read_parquet_batches(
     if not include_file_path and concat and batch_size is None:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-        table =
-        if opt_dtypes:
-            table = opt_dtype_pa(table, strict=False)
+        table = _read_parquet_file(
+            path=path, self=self, opt_dtypes=opt_dtypes, **kwargs
+        )
         yield table
         return

@@ -994,7 +1011,11 @@ def _read_parquet_batches(

     if not isinstance(path, list):
         yield _read_parquet_file(
-            path=path,
+            path=path,
+            self=self,
+            include_file_path=include_file_path,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )
         return

@@ -1032,12 +1053,12 @@ def _read_parquet_batches(
             unified_schema = unify_schemas_pa(schemas)
             batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
             result = pa.concat_tables(batch_tables, promote_options="permissive")
-            if opt_dtypes:
-                result = opt_dtype_pa(result, strict=False)
+            # if opt_dtypes:
+            #     result = opt_dtype_pa(result, strict=False)
             yield result
         else:
-            if opt_dtypes and isinstance(batch_tables, list):
-                batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
+            # if opt_dtypes and isinstance(batch_tables, list):
+            #     batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
             yield batch_tables


@@ -1077,6 +1098,7 @@ def read_parquet(
         concat: Combine multiple files/batches into single Table
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize Table dtypes for performance
         **kwargs: Additional arguments passed to pq.read_table()

     Returns:
@@ -1119,6 +1141,7 @@ def read_parquet(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_parquet(
@@ -1128,6 +1151,7 @@ def read_parquet(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )

@@ -1142,6 +1166,7 @@ def read_files(
     jsonlines: bool = False,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     pl.DataFrame
@@ -1175,6 +1200,7 @@ def read_files(
         jsonlines: For JSON format, whether to read as JSON Lines
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame/Arrow Table dtypes for performance
         **kwargs: Additional format-specific arguments

     Returns:
@@ -1224,6 +1250,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return read_json(
@@ -1234,6 +1261,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     elif format == "csv":
@@ -1246,6 +1274,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return read_csv(
@@ -1255,6 +1284,7 @@ def read_files(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
     elif format == "parquet":
@@ -1267,6 +1297,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return read_parquet(
@@ -1276,6 +1307,7 @@ def read_files(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )

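Net effect of the ext.py changes: opt_dtypes is now threaded from the public read_json / read_csv / read_parquet / read_files entry points down into the per-file helpers (_read_csv_file, _read_parquet_file, ...), while the older post-concat optimization passes are commented out. A minimal sketch of how a caller might exercise the new flag; the get_filesystem helper and the data path are illustrative assumptions, not taken from this diff:

    # Sketch only: get_filesystem and the data path are assumed for illustration.
    from flowerpower.fs import get_filesystem  # hypothetical import

    fs = get_filesystem(".")  # filesystem wrapper exposing the patched read_files

    # opt_dtypes=True asks each per-file reader to optimize column dtypes as
    # files are loaded (the 0.11.6.2 behavior), rather than after concatenation.
    table = fs.read_files(
        path="data/*.parquet",
        format="parquet",
        concat=True,       # merged via pa.concat_tables(..., promote_options="permissive")
        use_threads=True,  # parallel per-file reads
        opt_dtypes=True,   # new keyword in this release
    )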
flowerpower/plugins/io/base.py
CHANGED
@@ -185,74 +185,135 @@ class BaseFileReader(BaseFileIO, gc=False):
     include_file_path: bool = field(default=False)
     concat: bool = field(default=True)
     batch_size: int | None = field(default=None)
+    opt_dtypes: bool = field(default=True)
+    use_threads: bool = field(default=True)
     conn: duckdb.DuckDBPyConnection | None = field(default=None)
     ctx: datafusion.SessionContext | None = field(default=None)
     jsonlines: bool | None = field(default=None)
     partitioning: str | list[str] | pds.Partitioning | None = field(default=None)
+    verbose: bool | None = field(default=None)
     _data: Any | None = field(default=None)

-    def _load(
-
-
+    def _load(
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
+    ):
+        if batch_size is not None:
+            if self.batch_size != batch_size:
                 reload = True
-
-        else:
-            kwargs.pop("include_file_path")
+            self.batch_size = batch_size

-        if
-        if self.
+        if include_file_path is not None:
+            if self.include_file_path != include_file_path:
                 reload = True
-            self.
-        else:
-            kwargs.pop("concat")
+            self.include_file_path = include_file_path

-        if
-        if self.
+        if concat is not None:
+            if self.concat != concat:
                 reload = True
-            self.
-
-
+            self.concat = concat
+
+        if use_threads is not None:
+            if self.use_threads != use_threads:
+                reload = True
+            self.use_threads = use_threads
+
+        if verbose is not None:
+            if self.verbose != verbose:
+                reload = True
+            self.verbose = verbose
+
+        if opt_dtypes is not None:
+            if self.opt_dtypes != opt_dtypes:
+                reload = True
+            self.opt_dtypes = opt_dtypes

         if "partitioning" in kwargs:
             if self.partitioning != kwargs["partitioning"]:
                 reload = True
                 self.partitioning = kwargs.pop("partitioning")
-        else:
-            kwargs.pop("partitioning")

         if not hasattr(self, "_data") or self._data is None or reload:
             self._data = self.fs.read_files(
                 path=self._glob_path,
                 format=self.format,
-                include_file_path=True,
+                include_file_path=True if metadata or self.include_file_path else False,
                 concat=self.concat,
                 jsonlines=self.jsonlines or None,
                 batch_size=self.batch_size,
                 partitioning=self.partitioning,
+                opt_dtypes=self.opt_dtypes,
+                verbose=self.verbose,
+                use_threads=self.use_threads,
                 **kwargs,
             )
-            if
-            self.
-
-
-
-
-
-
-
-
-
+            if metadata:
+                if isinstance(self._data, tuple | list):
+                    self._metadata = [
+                        get_dataframe_metadata(
+                            df=df,
+                            path=self.path,
+                            format=self.format,
+                            num_files=pl.from_arrow(df.select(["file_path"])).select(
+                                pl.n_unique("file_path")
+                            )[0, 0]
+                            if isinstance(df, pa.Table)
+                            else df.select(pl.n_unique("file_path"))[0, 0],
+                        )
+                        for df in self._data
+                    ]
+                    if not self.include_file_path:
+                        self._data = [df.drop("file_path") for df in self._data]
+
+                elif isinstance(self._data, pa.Table):
+                    self._metadata = get_dataframe_metadata(
+                        df=self._data,
+                        path=self.path,
+                        format=self.format,
+                        num_files=pl.from_arrow(
+                            self._data.select(pl.n_unique("file_path"))
+                        )[0, 0],
+                    )
+                    if not self.include_file_path:
+                        self._data = self._data.drop("file_path")
+
+                elif isinstance(self._data, pl.DataFrame | pl.LazyFrame):
+                    self._metadata = get_dataframe_metadata(
+                        df=self._data,
+                        path=self.path,
+                        format=self.format,
+                        num_files=self._data.select(pl.n_unique("file_path"))[0, 0]
+                        if isinstance(self._data, pl.DataFrame)
+                        else self._data.select(pl.n_unique("file_path")).collect()[
+                            0, 0
+                        ],
+                    )
+
+                    if not self.include_file_path:
                         self._data = self._data.drop("file_path")
-
-
-                df.drop("file_path") if isinstance(df, pa.Table) else df
-                for df in self._data
-            ]
+                else:
+                    metadata = {}
         else:
             self._metadata = {}

     def to_pandas(
-        self,
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> (
         tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]]
         | pd.DataFrame
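The rewritten _load applies one idiom per option: compare the per-call value against the stored field, flip reload on any difference, then store the new value, so the cached _data is only re-read when an option actually changed. A condensed sketch of that idiom (a simplification for illustration, not the verbatim method body):

    # Condensed illustration of _load's change-detection idiom; field names
    # match the reader class, but this is not the verbatim method.
    def _load_sketch(self, reload: bool = False, **opts):
        for name in ("batch_size", "include_file_path", "concat",
                     "use_threads", "verbose", "opt_dtypes"):
            value = opts.pop(name, None)
            if value is not None:              # None means "keep current setting"
                if getattr(self, name) != value:
                    reload = True              # option changed -> cached data is stale
                setattr(self, name, value)
        if self._data is None or reload:
            self._data = self.fs.read_files(
                path=self._glob_path, format=self.format, **opts
            )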
@@ -263,12 +324,28 @@ class BaseFileReader(BaseFileIO, gc=False):
         Args:
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]] | pd.DataFrame | list[pd.DataFrame]: Pandas
                 DataFrame or list of DataFrames and optional metadata.
         """
-        self._load(
+        self._load(
+            reload=reload,
+            metadata=metadata,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list):
             df = [
                 df if isinstance(df, pd.DataFrame) else df.to_pandas()
@@ -282,26 +359,49 @@ class BaseFileReader(BaseFileIO, gc=False):
             else self._data.to_pandas()
         )
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df,
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df

     def iter_pandas(
-        self,
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pd.DataFrame, None, None]:
         """Iterate over Pandas DataFrames.

         Args:
             batch_size (int, optional): Batch size for iteration. Default is 1.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             Generator[pd.DataFrame, None, None]: Generator of Pandas DataFrames.
         """
-
-
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )

-        self._load(reload=reload, **kwargs)
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield df if isinstance(df, pd.DataFrame) else df.to_pandas()
@@ -313,13 +413,47 @@ class BaseFileReader(BaseFileIO, gc=False):
             )

     def _to_polars_dataframe(
-        self,
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> (
         tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]]
         | pl.DataFrame
         | list[pl.DataFrame]
     ):
-
+        """Convert data to Polars DataFrame(s).
+
+        Args:
+            metadata (bool, optional): Include metadata in the output. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]] | pl.DataFrame | list[pl.DataFrame]: Polars
+                DataFrame or list of DataFrames and optional metadata.
+        """
+        self._load(
+            metadata=metadata,
+            reload=reload,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list):
             df = [
                 df if isinstance(self._data, pl.DataFrame) else pl.from_arrow(df)
@@ -333,22 +467,48 @@ class BaseFileReader(BaseFileIO, gc=False):
             else pl.from_arrow(self._data)
         )
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df,
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df

     def _iter_polars_dataframe(
-        self,
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pl.DataFrame, None, None]:
         """Iterate over Polars DataFrames.

+        Args:
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
         Returns:
             Generator[pl.DataFrame, None, None]: Generator of Polars DataFrames.
         """
-
-
-
-
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield df if isinstance(df, pl.DataFrame) else pl.from_arrow(df)
@@ -360,38 +520,95 @@ class BaseFileReader(BaseFileIO, gc=False):
             )

     def _to_polars_lazyframe(
-        self,
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> (
         tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]]
         | pl.LazyFrame
         | list[pl.LazyFrame]
     ):
-
+        """Convert data to Polars LazyFrame(s).
+
+        Args:
+            metadata (bool, optional): Include metadata in the output. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]] | pl.LazyFrame | list[pl.LazyFrame]: Polars
+                LazyFrame or list of LazyFrames and optional metadata.
+        """
+        self._load(
+            metadata=metadata,
+            reload=reload,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if not self.concat:
             df = [df.lazy() for df in self._to_polars_dataframe()]

         else:
             df = self._to_polars_dataframe().lazy()
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df,
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df

     def _iter_polars_lazyframe(
-        self,
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pl.LazyFrame, None, None]:
         """Iterate over Polars LazyFrames.

         Args:
             batch_size (int, optional): Batch size for iteration. Default is 1.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             Generator[pl.LazyFrame, None, None]: Generator of Polars LazyFrames.
         """
-
-
-        self._load(
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield (
@@ -410,6 +627,12 @@ class BaseFileReader(BaseFileIO, gc=False):
         self,
         lazy: bool = False,
         metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> (
         pl.DataFrame
@@ -426,6 +649,14 @@ class BaseFileReader(BaseFileIO, gc=False):
         Args:
             lazy (bool, optional): Return a LazyFrame if True, else a DataFrame.
             metadata (bool, optional): Include metadata in the output. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             pl.DataFrame | pl.LazyFrame | list[pl.DataFrame] | list[pl.LazyFrame] | tuple[pl.DataFrame | pl.LazyFrame
@@ -433,32 +664,115 @@ class BaseFileReader(BaseFileIO, gc=False):
                 metadata.
         """
         if lazy:
-            return self._to_polars_lazyframe(
-
+            return self._to_polars_lazyframe(
+                metadata=metadata,
+                reload=reload,
+                batch_size=None,
+                include_file_path=include_file_path,
+                concat=concat,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            )
+        return self._to_polars_dataframe(
+            metadata=metadata,
+            reload=reload,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )

     def iter_polars(
         self,
         lazy: bool = False,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> Generator[pl.DataFrame | pl.LazyFrame, None, None]:
+        """Iterate over Polars DataFrames or LazyFrames.
+
+        Args:
+            lazy (bool, optional): Return a LazyFrame if True, else a DataFrame. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            Generator[pl.DataFrame | pl.LazyFrame, None, None]: Generator of Polars DataFrames or LazyFrames.
+        """
         if lazy:
-            yield from self._iter_polars_lazyframe(
-
+            yield from self._iter_polars_lazyframe(
+                reload=reload,
+                batch_size=batch_size,
+                include_file_path=include_file_path,
+                concat=concat,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            )
+        yield from self._iter_polars_dataframe(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )

     def to_pyarrow_table(
-        self,
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]:
         """Convert data to PyArrow Table(s).

         Args:
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]: PyArrow Table or list of
                 Tables and optional metadata.
         """
-        self._load(
+        self._load(
+            reload=reload,
+            metadata=metadata,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=None,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list):
             df = [
                 df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
@@ -472,22 +786,48 @@ class BaseFileReader(BaseFileIO, gc=False):
             else self._data
         )
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df,
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df

     def iter_pyarrow_table(
-        self,
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pa.Table, None, None]:
         """Iterate over PyArrow Tables.

+        Args:
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single Table. Default is True.
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
         Returns:
             Generator[pa.Table, None, None]: Generator of PyArrow Tables.
         """
-
-
-
-
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
@@ -503,6 +843,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         conn: duckdb.DuckDBPyConnection | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
         """Convert data to DuckDB relation.
@@ -511,6 +855,11 @@ class BaseFileReader(BaseFileIO, gc=False):
             conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]: DuckDB relation and optional
@@ -523,10 +872,27 @@ class BaseFileReader(BaseFileIO, gc=False):

         if metadata:
             return self._conn.from_arrow(
-                self.to_pyarrow_table(
+                self.to_pyarrow_table(
+                    metadata=metadata,
+                    reload=reload,
+                    batch_size=None,
+                    include_file_path=include_file_path,
+                    se_threads=use_threads,
+                    verbose=verbose,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
+                ),
             ), self._metadata
         return self._conn.from_arrow(
-            self.to_pyarrow_table(
+            self.to_pyarrow_table(
+                reload=reload,
+                batch_size=None,
+                include_file_path=include_file_path,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            )
         )

     def register_in_duckdb(
@@ -535,6 +901,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         name: str | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]:
         """Register data in DuckDB.
@@ -544,6 +914,11 @@ class BaseFileReader(BaseFileIO, gc=False):
             name (str, optional): Name for the DuckDB table.
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB connection instance
@@ -558,7 +933,16 @@ class BaseFileReader(BaseFileIO, gc=False):
             self._conn = conn

         self._conn.register(
-            name,
+            name,
+            self.to_pyarrow_table(
+                metadata=metadata,
+                reload=reload,
+                include_file_path=include_file_path,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            ),
         )
         if metadata:
             return self._conn, self._metadata
@@ -571,6 +955,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         name: str | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> (
         duckdb.DuckDBPyRelation
@@ -586,6 +974,10 @@ class BaseFileReader(BaseFileIO, gc=False):
             name (str, optional): Name for the DuckDB table.
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
             **kwargs: Additional keyword arguments.

         Returns:
@@ -596,10 +988,25 @@ class BaseFileReader(BaseFileIO, gc=False):
         """
         if as_relation:
             return self.to_duckdb_relation(
-                conn=conn,
+                conn=conn,
+                metadata=metadata,
+                reload=reload,
+                include_file_path=include_file_path,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
             )
         return self.register_in_duckdb(
-            conn=conn,
+            conn=conn,
+            name=name,
+            metadata=metadata,
+            reload=reload,
+            include_file_path=include_file_path,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )

     def register_in_datafusion(
@@ -608,6 +1015,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         name: str | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> datafusion.SessionContext | tuple[datafusion.SessionContext, dict[str, Any]]:
         """Register data in DataFusion.
@@ -632,11 +1043,18 @@ class BaseFileReader(BaseFileIO, gc=False):

         self._ctx.register_record_batches(
             name,
-            [
+            [
+                self.to_pyarrow_table(
+                    reload=reload,
+                    include_file_path=include_file_path,
+                    use_threads=use_threads,
+                    opt_dtypes=opt_dtypes**kwargs,
+                ).to_batches()
+            ],
         )
         if metadata:
             return self._ctx, self._metadata
-        return
+        return self._ctx

     def filter(
         self, filter_expr: str | pl.Expr | pa.compute.Expression
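Taken together, the base.py changes make every conversion entry point (to_pandas, to_polars, to_pyarrow_table, to_duckdb, register_in_datafusion, ...) forward the same option set into _load, and metadata is now served from the cached self._metadata instead of being recomputed per call. A usage sketch against a hypothetical BaseFileReader subclass (the ParquetFileReader name and constructor arguments are assumptions for illustration):

    # Illustrative only: ParquetFileReader stands in for any BaseFileReader
    # subclass from flowerpower.plugins.io; constructor arguments are assumed.
    reader = ParquetFileReader(path="data/", format="parquet")

    # Options are accepted per call; a value that differs from the stored
    # setting flips the internal reload flag and refreshes the cached data.
    df, meta = reader.to_pandas(metadata=True, opt_dtypes=True, use_threads=True)

    # Batched iteration passes batch_size through _load and yields per-batch frames.
    for frame in reader.iter_polars(lazy=False, batch_size=2, opt_dtypes=True):
        print(frame.shape)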
{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowerPower
-Version: 0.11.6
+Version: 0.11.6.2
 Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
 Author-email: "Volker L." <ligno.blades@gmail.com>
 Project-URL: Homepage, https://github.com/legout/flowerpower
{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/RECORD
CHANGED
@@ -18,7 +18,7 @@ flowerpower/cli/pipeline.py,sha256=60P6u_QOSgp0jJXEMxazEEo5Sh7-SWFo-Kkuaz21YuI,3
 flowerpower/cli/utils.py,sha256=nDSSj_1nlYlMmj252kRZeohhFqHv9yvdgDEduQCyWOc,5152
 flowerpower/fs/__init__.py,sha256=uZaPXErEfQqQRbKRIjkB9yiygd45X5_psYn9-VVrBTQ,910
 flowerpower/fs/base.py,sha256=TqgqBsaFj13O1NpAr8kHuGJ9CTlaSWViMB8Ai_iuCjs,22761
-flowerpower/fs/ext.py,sha256=
+flowerpower/fs/ext.py,sha256=2NmhSbCIL0qnONMRNPHcPUuR39bGjWpxJE4hNHU5Rvw,69044
 flowerpower/fs/storage_options.py,sha256=msq5TpxAU8tcE_Bxjw6SyxaFa75UjdYnR4-O9U2wmbk,48034
 flowerpower/job_queue/__init__.py,sha256=a25hIqv2xoFKb4JZlyUukS0ppZ9-2sJKH3XAvbk3rlk,10788
 flowerpower/job_queue/base.py,sha256=YwLunDQSyqkSU_vJ69C5SSybJeJP1bAiZ3teUtOchxA,13640
@@ -44,7 +44,7 @@ flowerpower/pipeline/manager.py,sha256=KVpOclUEUAETUNJamJJGuKt3oxCaLitQgxWxkE1q0
 flowerpower/pipeline/registry.py,sha256=6ngmHyKyQsxvIO4qRYxljedY0BE1wE3lpfksEGOzjNs,18963
 flowerpower/pipeline/runner.py,sha256=dsSVYixFXqlxFk8EJfT4wV_7IwgkXq0ErwH_yf_NGS8,25654
 flowerpower/pipeline/visualizer.py,sha256=amjMrl5NetErE198HzZBPWVZBi_t5jj9ydxWpuNLoTI,5013
-flowerpower/plugins/io/base.py,sha256=
+flowerpower/plugins/io/base.py,sha256=d3U5L--SpmowOpXyLTpnvbpaVCeyzoxyiqBbSk2h_K4,96685
 flowerpower/plugins/io/metadata.py,sha256=PCrepLilXRWKDsB5BKFF_-OFs712s1zBeitW-84lDLQ,7005
 flowerpower/plugins/io/helpers/datetime.py,sha256=1WBUg2ywcsodJQwoF6JiIGc9yhVobvE2IErWp4i95m4,10649
 flowerpower/plugins/io/helpers/polars.py,sha256=346DBHG-HvoGZWF-DWxgz7H3KlZu8bFylKIqMOnVJSk,27031
@@ -94,9 +94,9 @@ flowerpower/utils/monkey.py,sha256=VPl3yimoWhwD9kI05BFsjNvtyQiDyLfY4Q85Bb6Ma0w,2
 flowerpower/utils/open_telemetry.py,sha256=fQWJWbIQFtKIxMBjAWeF12NGnqT0isO3A3j-DSOv_vE,949
 flowerpower/utils/scheduler.py,sha256=2zJ_xmLXpvXUQNF1XS2Gqm3Ogo907ctZ50GtvQB_rhE,9354
 flowerpower/utils/templates.py,sha256=ouyEeSDqa9PjW8c32fGpcINlpC0WToawRFZkMPtwsLE,1591
-flowerpower-0.11.6.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
-flowerpower-0.11.6.dist-info/METADATA,sha256=
-flowerpower-0.11.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-flowerpower-0.11.6.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
-flowerpower-0.11.6.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
-flowerpower-0.11.6.dist-info/RECORD,,
+flowerpower-0.11.6.2.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
+flowerpower-0.11.6.2.dist-info/METADATA,sha256=ftcXBLIRI60sqhug1BnV6KRZY66H0_a65hlamW3COz0,21612
+flowerpower-0.11.6.2.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+flowerpower-0.11.6.2.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
+flowerpower-0.11.6.2.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
+flowerpower-0.11.6.2.dist-info/RECORD,,
{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/WHEEL
File without changes

{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/entry_points.txt
File without changes

{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/licenses/LICENSE
File without changes

{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.2.dist-info}/top_level.txt
File without changes