FlowerPower 0.11.6__py3-none-any.whl → 0.11.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/fs/ext.py +58 -26
- flowerpower/plugins/io/base.py +497 -80
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/METADATA +1 -1
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/RECORD +8 -8
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/top_level.txt +0 -0
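
Taken together, the changes below thread a new `opt_dtypes` option (plus `use_threads` and `verbose` pass-through) from the `BaseFileReader` API in `flowerpower/plugins/io/base.py` down through the JSON/CSV/Parquet helpers in `flowerpower/fs/ext.py`, move dtype optimization to the per-file read path (the old post-concat optimization blocks are commented out), and repair statements that shipped incomplete in 0.11.6 (for example the bare `table =` assignments in `_read_parquet`). A minimal sketch of how the flag surfaces; `get_filesystem` and the data path are assumptions, while the `read_files` keywords match the signatures in this diff:

```python
# Sketch only: get_filesystem and its import path are assumptions;
# the read_files keywords are the ones added in this release.
from flowerpower.fs import get_filesystem

fs = get_filesystem(".")  # hypothetical local filesystem instance

# opt_dtypes asks the readers to optimize column dtypes per file
# before any concatenation (see the ext.py hunks below).
data = fs.read_files(
    path="data/**/*.parquet",
    format="parquet",
    concat=True,
    use_threads=True,
    opt_dtypes=True,
)
```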
flowerpower/fs/ext.py
CHANGED
@@ -193,6 +193,7 @@ def _read_json(
         as_dataframe: (bool, optional) If True, return a DataFrame. Defaults to True.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
         verbose: (bool, optional) If True, print verbose output. Defaults to False.
+        opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
         **kwargs: Additional keyword arguments.
 
     Returns:
@@ -247,8 +248,8 @@ def _read_json(
             data = [opt_dtype_pl(df, strict=False) for df in data]
         if concat:
             result = pl.concat(data, how="diagonal_relaxed")
-            if opt_dtypes:
-                result = opt_dtype_pl(result, strict=False)
+            # if opt_dtypes:
+            #     result = opt_dtype_pl(result, strict=False)
             return result
         return data
@@ -280,6 +281,7 @@ def _read_json_batches(
         concat: Combine files within each batch
         use_threads: Enable parallel file reading within batches
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments for DataFrame conversion
 
     Yields:
@@ -354,10 +356,16 @@ def _read_json_batches(
                 ][0]
                 for _data in batch_data
             ]
-
+            if opt_dtypes:
+                batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
             if concat and len(batch_dfs) > 1:
-                yield pl.concat(batch_dfs, how="diagonal_relaxed")
+                batch_df = pl.concat(batch_dfs, how="diagonal_relaxed")
+                # if opt_dtypes:
+                #     batch_df = opt_dtype_pl(batch_df, strict=False)
+                yield batch_df
             else:
+                # if opt_dtypes:
+                #     batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
                 yield batch_dfs
         else:
             yield batch_data
@@ -403,6 +411,7 @@ def read_json(
         concat: Combine multiple files/batches into single result
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes for performance
         **kwargs: Additional arguments passed to DataFrame conversion
 
     Returns:
@@ -486,6 +495,7 @@ def _read_csv_file(
         path: Path to CSV file
         self: Filesystem instance to use for reading
         include_file_path: Add source filepath as a column
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments passed to pl.read_csv()
 
     Returns:
@@ -544,6 +554,7 @@ def _read_csv(
         use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
         verbose: (bool, optional) If True, print verbose output. Defaults to False.
+        opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
         **kwargs: Additional keyword arguments.
 
     Returns:
@@ -587,8 +598,8 @@ def _read_csv(
         )
         if concat:
             result = pl.concat(dfs, how="diagonal_relaxed")
-            if opt_dtypes:
-                result = opt_dtype_pl(result, strict=False)
+            # if opt_dtypes:
+            #     result = opt_dtype_pl(result, strict=False)
             return result
         return dfs
@@ -616,6 +627,7 @@ def _read_csv_batches(
         concat: Combine files within each batch
         use_threads: Enable parallel file reading within batches
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame dtypes
         **kwargs: Additional arguments passed to pl.read_csv()
 
     Yields:
@@ -667,23 +679,28 @@ def _read_csv_batches(
                 n_jobs=-1,
                 backend="threading",
                 verbose=verbose,
+                opt_dtypes=opt_dtypes,
                 **kwargs,
             )
         else:
             batch_dfs = [
                 _read_csv_file(
-                    p,
+                    p,
+                    self=self,
+                    include_file_path=include_file_path,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
                 )
                 for p in batch_paths
             ]
 
-            if opt_dtypes:
-                batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
+            # if opt_dtypes:
+            #     batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
 
         if concat and len(batch_dfs) > 1:
             result = pl.concat(batch_dfs, how="diagonal_relaxed")
-            if opt_dtypes:
-                result = opt_dtype_pl(result, strict=False)
+            # if opt_dtypes:
+            #     result = opt_dtype_pl(result, strict=False)
             yield result
         else:
             yield batch_dfs
@@ -766,6 +783,7 @@ def read_csv(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_csv(
@@ -775,6 +793,7 @@ def read_csv(
         concat=concat,
         use_threads=use_threads,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
@@ -858,9 +877,7 @@ def _read_parquet(
     if not include_file_path and concat:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-        table =
-        if opt_dtypes:
-            table = opt_dtype_pa(table, strict=False)
+        table = _read_parquet_file(path, self=self, opt_dtypes=opt_dtypes, **kwargs)
         return table
     else:
         if isinstance(path, str):
@@ -907,12 +924,12 @@ def _read_parquet(
             unified_schema = unify_schemas_pa(schemas)
             tables = [cast_schema(t, unified_schema) for t in tables]
             result = pa.concat_tables(tables, promote_options="permissive")
-            if opt_dtypes:
-                result = opt_dtype_pa(result, strict=False)
+            # if opt_dtypes:
+            #     result = opt_dtype_pa(result, strict=False)
             return result
         elif isinstance(tables, pa.Table):
-            if opt_dtypes:
-                tables = opt_dtype_pa(tables, strict=False)
+            # if opt_dtypes:
+            #     tables = opt_dtype_pa(tables, strict=False)
             return tables
         else:
             return pa.concat_tables(tables, promote_options="permissive")
@@ -981,9 +998,9 @@ def _read_parquet_batches(
     if not include_file_path and concat and batch_size is None:
         if isinstance(path, str):
             path = path.replace("**", "").replace("*.parquet", "")
-        table =
-        if opt_dtypes:
-            table = opt_dtype_pa(table, strict=False)
+        table = _read_parquet_file(
+            path=path, self=self, opt_dtypes=opt_dtypes, **kwargs
+        )
         yield table
         return
 
@@ -994,7 +1011,11 @@ def _read_parquet_batches(
 
     if not isinstance(path, list):
         yield _read_parquet_file(
-            path=path,
+            path=path,
+            self=self,
+            include_file_path=include_file_path,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )
         return
 
@@ -1032,12 +1053,12 @@ def _read_parquet_batches(
             unified_schema = unify_schemas_pa(schemas)
             batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
             result = pa.concat_tables(batch_tables, promote_options="permissive")
-            if opt_dtypes:
-                result = opt_dtype_pa(result, strict=False)
+            # if opt_dtypes:
+            #     result = opt_dtype_pa(result, strict=False)
             yield result
         else:
-            if opt_dtypes and isinstance(batch_tables, list):
-                batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
+            # if opt_dtypes and isinstance(batch_tables, list):
+            #     batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
             yield batch_tables
 
 
@@ -1077,6 +1098,7 @@ def read_parquet(
         concat: Combine multiple files/batches into single Table
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize Table dtypes for performance
         **kwargs: Additional arguments passed to pq.read_table()
 
     Returns:
@@ -1119,6 +1141,7 @@ def read_parquet(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     return _read_parquet(
@@ -1128,6 +1151,7 @@ def read_parquet(
         use_threads=use_threads,
         concat=concat,
         verbose=verbose,
+        opt_dtypes=opt_dtypes,
         **kwargs,
     )
 
@@ -1142,6 +1166,7 @@ def read_files(
     jsonlines: bool = False,
     use_threads: bool = True,
     verbose: bool = False,
+    opt_dtypes: bool = False,
     **kwargs: Any,
 ) -> (
     pl.DataFrame
@@ -1175,6 +1200,7 @@ def read_files(
         jsonlines: For JSON format, whether to read as JSON Lines
         use_threads: Enable parallel file reading
         verbose: Print progress information
+        opt_dtypes: Optimize DataFrame/Arrow Table dtypes for performance
         **kwargs: Additional format-specific arguments
 
     Returns:
@@ -1224,6 +1250,7 @@ def read_files(
                 concat=concat,
                 use_threads=use_threads,
                 verbose=verbose,
+                opt_dtypes=opt_dtypes,
                 **kwargs,
             )
         return read_json(
@@ -1234,6 +1261,7 @@ def read_files(
             concat=concat,
             use_threads=use_threads,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     elif format == "csv":
@@ -1246,6 +1274,7 @@ def read_files(
                 concat=concat,
                 use_threads=use_threads,
                 verbose=verbose,
+                opt_dtypes=opt_dtypes,
                 **kwargs,
             )
         return read_csv(
@@ -1255,6 +1284,7 @@ def read_files(
             use_threads=use_threads,
             concat=concat,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
     elif format == "parquet":
@@ -1267,6 +1297,7 @@ def read_files(
                 concat=concat,
                 use_threads=use_threads,
                 verbose=verbose,
+                opt_dtypes=opt_dtypes,
                 **kwargs,
             )
         return read_parquet(
@@ -1276,6 +1307,7 @@ def read_files(
             use_threads=use_threads,
             concat=concat,
             verbose=verbose,
+            opt_dtypes=opt_dtypes,
             **kwargs,
         )
 
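
The pattern repeated across `ext.py` above: `opt_dtype_pl`/`opt_dtype_pa` now run on each file's frame or table as it is read, and the former optimize-after-concat blocks are left commented out. A self-contained illustration of that ordering in plain Polars; `shrink_dtype` is a stand-in for flowerpower's own `opt_dtype_pl` helper, not the helper itself:

```python
import polars as pl

def optimize_dtypes(df: pl.DataFrame) -> pl.DataFrame:
    # Stand-in for opt_dtype_pl: downcast each column to the
    # smallest dtype that can hold its values.
    return df.select(pl.all().shrink_dtype())

frames = [
    pl.DataFrame({"a": [1, 2], "b": ["x", "y"]}),
    pl.DataFrame({"a": [3], "c": [1.5]}),  # differing columns are allowed
]

# Optimize per file first (the 0.11.6.1 ordering), then concatenate;
# "diagonal_relaxed" unions the columns and relaxes dtype mismatches.
frames = [optimize_dtypes(df) for df in frames]
out = pl.concat(frames, how="diagonal_relaxed")
print(out)
```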
flowerpower/plugins/io/base.py
CHANGED
@@ -185,74 +185,134 @@ class BaseFileReader(BaseFileIO, gc=False):
     include_file_path: bool = field(default=False)
     concat: bool = field(default=True)
     batch_size: int | None = field(default=None)
+    opt_dtypes: bool = field(default=True)
+    use_threads: bool = field(default=True)
     conn: duckdb.DuckDBPyConnection | None = field(default=None)
     ctx: datafusion.SessionContext | None = field(default=None)
     jsonlines: bool | None = field(default=None)
     partitioning: str | list[str] | pds.Partitioning | None = field(default=None)
     _data: Any | None = field(default=None)
 
-    def _load(
-
-
+    def _load(
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
+    ):
+        if batch_size is not None:
+            if self.batch_size != batch_size:
                 reload = True
-
-        else:
-            kwargs.pop("include_file_path")
+                self.batch_size = batch_size
 
-        if
-        if self.
+        if include_file_path is not None:
+            if self.include_file_path != include_file_path:
                 reload = True
-            self.
-        else:
-            kwargs.pop("concat")
+                self.include_file_path = include_file_path
 
-        if
-        if self.
+        if concat is not None:
+            if self.concat != concat:
                 reload = True
-            self.
-
-
+                self.concat = concat
+
+        if use_threads is not None:
+            if self.use_threads != use_threads:
+                reload = True
+                self.use_threads = use_threads
+
+        if verbose is not None:
+            if self.fs.verbose != verbose:
+                reload = True
+                self.fs.verbose = verbose
+
+        if opt_dtypes is not None:
+            if self.opt_dtypes != opt_dtypes:
+                reload = True
+                self.opt_dtypes = opt_dtypes
 
         if "partitioning" in kwargs:
             if self.partitioning != kwargs["partitioning"]:
                 reload = True
                 self.partitioning = kwargs.pop("partitioning")
-        else:
-            kwargs.pop("partitioning")
 
         if not hasattr(self, "_data") or self._data is None or reload:
             self._data = self.fs.read_files(
                 path=self._glob_path,
                 format=self.format,
-                include_file_path=True,
+                include_file_path=True if metadata or self.include_file_path else False,
                 concat=self.concat,
                 jsonlines=self.jsonlines or None,
                 batch_size=self.batch_size,
                 partitioning=self.partitioning,
+                opt_dtypes=self.opt_dtypes,
+                verbose=self.verbose,
+                use_threads=self.use_threads,
                 **kwargs,
             )
-            if
-            self.
-
-
-
-
-
-
-
-
+            if metadata:
+                if isinstance(self._data, tuple | list):
+                    self._metadata = [
+                        get_dataframe_metadata(
+                            df=df,
+                            path=self.path,
+                            format=self.format,
+                            num_files=pl.from_arrow(df.select(["file_path"])).select(
+                                pl.n_unique("file_path")
+                            )[0, 0]
+                            if isinstance(df, pa.Table)
+                            else df.select(pl.n_unique("file_path"))[0, 0],
+                        )
+                        for df in self._data
+                    ]
+                    if not self.include_file_path:
+                        self._data = [df.drop("file_path") for df in self._data]
+
+                elif isinstance(self._data, pa.Table):
+                    self._metadata = get_dataframe_metadata(
+                        df=self._data,
+                        path=self.path,
+                        format=self.format,
+                        num_files=pl.from_arrow(
+                            self._data.select(pl.n_unique("file_path"))
+                        )[0, 0],
+                    )
+                    if not self.include_file_path:
+                        self._data = self._data.drop("file_path")
+
+                elif isinstance(self._data, pl.DataFrame | pl.LazyFrame):
+                    self._metadata = get_dataframe_metadata(
+                        df=self._data,
+                        path=self.path,
+                        format=self.format,
+                        num_files=self._data.select(pl.n_unique("file_path"))[0, 0]
+                        if isinstance(self._data, pl.DataFrame)
+                        else self._data.select(pl.n_unique("file_path")).collect()[
+                            0, 0
+                        ],
+                    )
+
+                    if not self.include_file_path:
                         self._data = self._data.drop("file_path")
-
-
-                df.drop("file_path") if isinstance(df, pa.Table) else df
-                for df in self._data
-            ]
+                else:
+                    metadata = {}
             else:
                 self._metadata = {}
 
     def to_pandas(
-        self,
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> (
         tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]]
         | pd.DataFrame
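
The rewritten `_load` applies one rule per option: if the caller passes a value that differs from the stored one, flag the cache stale and store the new value; the filesystem read then happens only when `_data` is missing or `reload` is set. A reduced sketch of that invalidation pattern with a hypothetical reader class (the real method additionally handles metadata extraction, partitioning, and filesystem options):

```python
# Hypothetical reduced form of the BaseFileReader._load reload logic.
class CachedReader:
    def __init__(self, read_fn, batch_size=None, concat=True):
        self._read_fn = read_fn
        self.batch_size = batch_size
        self.concat = concat
        self._data = None

    def _load(self, reload=False, batch_size=None, concat=None, **kwargs):
        # Any option that changes invalidates the cached data.
        if batch_size is not None and self.batch_size != batch_size:
            reload = True
            self.batch_size = batch_size
        if concat is not None and self.concat != concat:
            reload = True
            self.concat = concat
        if self._data is None or reload:
            self._data = self._read_fn(
                batch_size=self.batch_size, concat=self.concat, **kwargs
            )
        return self._data
```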
@@ -263,12 +323,28 @@ class BaseFileReader(BaseFileIO, gc=False):
         Args:
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
 
         Returns:
             tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]] | pd.DataFrame | list[pd.DataFrame]: Pandas
                 DataFrame or list of DataFrames and optional metadata.
         """
-        self._load(
+        self._load(
+            reload=reload,
+            metadata=metadata,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list):
             df = [
                 df if isinstance(df, pd.DataFrame) else df.to_pandas()
@@ -282,26 +358,49 @@ class BaseFileReader(BaseFileIO, gc=False):
             else self._data.to_pandas()
         )
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df,
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df
 
     def iter_pandas(
-        self,
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pd.DataFrame, None, None]:
         """Iterate over Pandas DataFrames.
 
         Args:
             batch_size (int, optional): Batch size for iteration. Default is 1.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
 
         Returns:
             Generator[pd.DataFrame, None, None]: Generator of Pandas DataFrames.
         """
-
-
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
 
-        self._load(reload=reload, **kwargs)
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield df if isinstance(df, pd.DataFrame) else df.to_pandas()
@@ -313,13 +412,47 @@ class BaseFileReader(BaseFileIO, gc=False):
             )
 
     def _to_polars_dataframe(
-        self,
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> (
         tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]]
         | pl.DataFrame
         | list[pl.DataFrame]
     ):
-
+        """Convert data to Polars DataFrame(s).
+
+        Args:
+            metadata (bool, optional): Include metadata in the output. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]] | pl.DataFrame | list[pl.DataFrame]: Polars
+                DataFrame or list of DataFrames and optional metadata.
+        """
+        self._load(
+            metadata=metadata,
+            reload=reload,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list):
             df = [
                 df if isinstance(self._data, pl.DataFrame) else pl.from_arrow(df)
@@ -333,22 +466,48 @@ class BaseFileReader(BaseFileIO, gc=False):
             else pl.from_arrow(self._data)
         )
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df,
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df
 
     def _iter_polars_dataframe(
-        self,
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pl.DataFrame, None, None]:
         """Iterate over Polars DataFrames.
 
+        Args:
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
         Returns:
             Generator[pl.DataFrame, None, None]: Generator of Polars DataFrames.
         """
-
-
-
-
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield df if isinstance(df, pl.DataFrame) else pl.from_arrow(df)
@@ -360,38 +519,95 @@ class BaseFileReader(BaseFileIO, gc=False):
             )
 
     def _to_polars_lazyframe(
-        self,
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> (
         tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]]
         | pl.LazyFrame
         | list[pl.LazyFrame]
     ):
-
+        """Convert data to Polars LazyFrame(s).
+
+        Args:
+            metadata (bool, optional): Include metadata in the output. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]] | pl.LazyFrame | list[pl.LazyFrame]: Polars
+                LazyFrame or list of LazyFrames and optional metadata.
+        """
+        self._load(
+            metadata=metadata,
+            reload=reload,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if not self.concat:
             df = [df.lazy() for df in self._to_polars_dataframe()]
 
         else:
             df = self._to_polars_dataframe().lazy()
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df,
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df
 
     def _iter_polars_lazyframe(
-        self,
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pl.LazyFrame, None, None]:
         """Iterate over Polars LazyFrames.
 
         Args:
             batch_size (int, optional): Batch size for iteration. Default is 1.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
 
         Returns:
             Generator[pl.LazyFrame, None, None]: Generator of Polars LazyFrames.
         """
-
-
-        self._load(
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield (
@@ -410,6 +626,12 @@ class BaseFileReader(BaseFileIO, gc=False):
         self,
         lazy: bool = False,
         metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> (
         pl.DataFrame
@@ -426,6 +648,14 @@ class BaseFileReader(BaseFileIO, gc=False):
         Args:
             lazy (bool, optional): Return a LazyFrame if True, else a DataFrame.
             metadata (bool, optional): Include metadata in the output. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
 
         Returns:
             pl.DataFrame | pl.LazyFrame | list[pl.DataFrame] | list[pl.LazyFrame] | tuple[pl.DataFrame | pl.LazyFrame
@@ -433,32 +663,115 @@ class BaseFileReader(BaseFileIO, gc=False):
                 metadata.
         """
         if lazy:
-            return self._to_polars_lazyframe(
-
+            return self._to_polars_lazyframe(
+                metadata=metadata,
+                reload=reload,
+                batch_size=None,
+                include_file_path=include_file_path,
+                concat=concat,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            )
+        return self._to_polars_dataframe(
+            metadata=metadata,
+            reload=reload,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
 
     def iter_polars(
         self,
         lazy: bool = False,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> Generator[pl.DataFrame | pl.LazyFrame, None, None]:
+        """Iterate over Polars DataFrames or LazyFrames.
+
+        Args:
+            lazy (bool, optional): Return a LazyFrame if True, else a DataFrame. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            Generator[pl.DataFrame | pl.LazyFrame, None, None]: Generator of Polars DataFrames or LazyFrames.
+        """
         if lazy:
-            yield from self._iter_polars_lazyframe(
-
+            yield from self._iter_polars_lazyframe(
+                reload=reload,
+                batch_size=batch_size,
+                include_file_path=include_file_path,
+                concat=concat,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            )
+        yield from self._iter_polars_dataframe(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
 
     def to_pyarrow_table(
-        self,
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]:
         """Convert data to PyArrow Table(s).
 
         Args:
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
 
         Returns:
             pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]: PyArrow Table or list of
                 Tables and optional metadata.
         """
-        self._load(
+        self._load(
+            reload=reload,
+            metadata=metadata,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=None,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list):
             df = [
                 df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
@@ -472,22 +785,48 @@ class BaseFileReader(BaseFileIO, gc=False):
                 else self._data
             )
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df,
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df
 
     def iter_pyarrow_table(
-        self,
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pa.Table, None, None]:
         """Iterate over PyArrow Tables.
 
+        Args:
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single Table. Default is True.
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
         Returns:
             Generator[pa.Table, None, None]: Generator of PyArrow Tables.
         """
-
-
-
-
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
@@ -503,6 +842,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         conn: duckdb.DuckDBPyConnection | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
         """Convert data to DuckDB relation.
@@ -511,6 +854,11 @@ class BaseFileReader(BaseFileIO, gc=False):
             conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
 
         Returns:
             duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]: DuckDB relation and optional
@@ -523,10 +871,27 @@ class BaseFileReader(BaseFileIO, gc=False):
 
         if metadata:
             return self._conn.from_arrow(
-                self.to_pyarrow_table(
+                self.to_pyarrow_table(
+                    metadata=metadata,
+                    reload=reload,
+                    batch_size=None,
+                    include_file_path=include_file_path,
+                    se_threads=use_threads,
+                    verbose=verbose,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
+                ),
             ), self._metadata
         return self._conn.from_arrow(
-            self.to_pyarrow_table(
+            self.to_pyarrow_table(
+                reload=reload,
+                batch_size=None,
+                include_file_path=include_file_path,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            )
         )
 
     def register_in_duckdb(
@@ -535,6 +900,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         name: str | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]:
         """Register data in DuckDB.
@@ -544,6 +913,11 @@ class BaseFileReader(BaseFileIO, gc=False):
             name (str, optional): Name for the DuckDB table.
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
 
         Returns:
             duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB connection instance
@@ -558,7 +932,16 @@ class BaseFileReader(BaseFileIO, gc=False):
             self._conn = conn
 
         self._conn.register(
-            name,
+            name,
+            self.to_pyarrow_table(
+                metadata=metadata,
+                reload=reload,
+                include_file_path=include_file_path,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            ),
         )
         if metadata:
             return self._conn, self._metadata
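
Both DuckDB entry points funnel through `to_pyarrow_table` and hand the Arrow table to an existing or fresh connection. The underlying DuckDB calls they wrap, in a standalone sketch using plain duckdb and pyarrow rather than the flowerpower wrappers:

```python
import duckdb
import pyarrow as pa

table = pa.table({"id": [1, 2, 3], "value": [0.1, 0.2, 0.3]})
conn = duckdb.connect()

# to_duckdb_relation wraps this: an Arrow table as a DuckDB relation.
rel = conn.from_arrow(table)

# register_in_duckdb wraps this: expose the table under a SQL name.
conn.register("my_table", table)
print(conn.sql("SELECT count(*) FROM my_table").fetchall())
```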
@@ -571,6 +954,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         name: str | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> (
         duckdb.DuckDBPyRelation
@@ -586,6 +973,10 @@ class BaseFileReader(BaseFileIO, gc=False):
             name (str, optional): Name for the DuckDB table.
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
             **kwargs: Additional keyword arguments.
 
         Returns:
@@ -596,10 +987,25 @@ class BaseFileReader(BaseFileIO, gc=False):
         """
         if as_relation:
             return self.to_duckdb_relation(
-                conn=conn,
+                conn=conn,
+                metadata=metadata,
+                reload=reload,
+                include_file_path=include_file_path,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
             )
         return self.register_in_duckdb(
-            conn=conn,
+            conn=conn,
+            name=name,
+            metadata=metadata,
+            reload=reload,
+            include_file_path=include_file_path,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )
 
     def register_in_datafusion(
@@ -608,6 +1014,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         name: str | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> datafusion.SessionContext | tuple[datafusion.SessionContext, dict[str, Any]]:
         """Register data in DataFusion.
@@ -632,11 +1042,18 @@ class BaseFileReader(BaseFileIO, gc=False):
 
         self._ctx.register_record_batches(
             name,
-            [
+            [
+                self.to_pyarrow_table(
+                    reload=reload,
+                    include_file_path=include_file_path,
+                    use_threads=use_threads,
+                    opt_dtypes=opt_dtypes**kwargs,
+                ).to_batches()
+            ],
         )
         if metadata:
             return self._ctx, self._metadata
-        return
+        return self._ctx
 
     def filter(
         self, filter_expr: str | pl.Expr | pa.compute.Expression
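
`register_in_datafusion` follows the same shape for DataFusion: convert to an Arrow table, split it into record batches, and register them under a name. A sketch with plain datafusion and pyarrow; the list-of-lists argument (one partition of batches) mirrors the call in the hunk above:

```python
import datafusion
import pyarrow as pa

ctx = datafusion.SessionContext()
table = pa.table({"id": [1, 2, 3]})

# One partition containing the table's record batches, as in the diff.
ctx.register_record_batches("my_table", [table.to_batches()])
print(ctx.sql("SELECT count(*) FROM my_table").collect())
```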
{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: FlowerPower
-Version: 0.11.6
+Version: 0.11.6.1
 Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
 Author-email: "Volker L." <ligno.blades@gmail.com>
 Project-URL: Homepage, https://github.com/legout/flowerpower
{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/RECORD
CHANGED
@@ -18,7 +18,7 @@ flowerpower/cli/pipeline.py,sha256=60P6u_QOSgp0jJXEMxazEEo5Sh7-SWFo-Kkuaz21YuI,3
 flowerpower/cli/utils.py,sha256=nDSSj_1nlYlMmj252kRZeohhFqHv9yvdgDEduQCyWOc,5152
 flowerpower/fs/__init__.py,sha256=uZaPXErEfQqQRbKRIjkB9yiygd45X5_psYn9-VVrBTQ,910
 flowerpower/fs/base.py,sha256=TqgqBsaFj13O1NpAr8kHuGJ9CTlaSWViMB8Ai_iuCjs,22761
-flowerpower/fs/ext.py,sha256=
+flowerpower/fs/ext.py,sha256=2NmhSbCIL0qnONMRNPHcPUuR39bGjWpxJE4hNHU5Rvw,69044
 flowerpower/fs/storage_options.py,sha256=msq5TpxAU8tcE_Bxjw6SyxaFa75UjdYnR4-O9U2wmbk,48034
 flowerpower/job_queue/__init__.py,sha256=a25hIqv2xoFKb4JZlyUukS0ppZ9-2sJKH3XAvbk3rlk,10788
 flowerpower/job_queue/base.py,sha256=YwLunDQSyqkSU_vJ69C5SSybJeJP1bAiZ3teUtOchxA,13640
@@ -44,7 +44,7 @@ flowerpower/pipeline/manager.py,sha256=KVpOclUEUAETUNJamJJGuKt3oxCaLitQgxWxkE1q0
 flowerpower/pipeline/registry.py,sha256=6ngmHyKyQsxvIO4qRYxljedY0BE1wE3lpfksEGOzjNs,18963
 flowerpower/pipeline/runner.py,sha256=dsSVYixFXqlxFk8EJfT4wV_7IwgkXq0ErwH_yf_NGS8,25654
 flowerpower/pipeline/visualizer.py,sha256=amjMrl5NetErE198HzZBPWVZBi_t5jj9ydxWpuNLoTI,5013
-flowerpower/plugins/io/base.py,sha256
+flowerpower/plugins/io/base.py,sha256=-bZBTdFGUWm60JuFpBG_1TZO7D0hmjgSA3a8Prg1MnY,96644
 flowerpower/plugins/io/metadata.py,sha256=PCrepLilXRWKDsB5BKFF_-OFs712s1zBeitW-84lDLQ,7005
 flowerpower/plugins/io/helpers/datetime.py,sha256=1WBUg2ywcsodJQwoF6JiIGc9yhVobvE2IErWp4i95m4,10649
 flowerpower/plugins/io/helpers/polars.py,sha256=346DBHG-HvoGZWF-DWxgz7H3KlZu8bFylKIqMOnVJSk,27031
@@ -94,9 +94,9 @@ flowerpower/utils/monkey.py,sha256=VPl3yimoWhwD9kI05BFsjNvtyQiDyLfY4Q85Bb6Ma0w,2
 flowerpower/utils/open_telemetry.py,sha256=fQWJWbIQFtKIxMBjAWeF12NGnqT0isO3A3j-DSOv_vE,949
 flowerpower/utils/scheduler.py,sha256=2zJ_xmLXpvXUQNF1XS2Gqm3Ogo907ctZ50GtvQB_rhE,9354
 flowerpower/utils/templates.py,sha256=ouyEeSDqa9PjW8c32fGpcINlpC0WToawRFZkMPtwsLE,1591
-flowerpower-0.11.6.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
-flowerpower-0.11.6.dist-info/METADATA,sha256=
-flowerpower-0.11.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-flowerpower-0.11.6.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
-flowerpower-0.11.6.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
-flowerpower-0.11.6.dist-info/RECORD,,
+flowerpower-0.11.6.1.dist-info/licenses/LICENSE,sha256=9AkLexxrmr0aBgSHiqxpJk9wgazpP1CTJyiDyr56J9k,1063
+flowerpower-0.11.6.1.dist-info/METADATA,sha256=KOkDA61ZYzXs3vvwKIQciSsSl-OoniBSpdilRlYXU8g,21612
+flowerpower-0.11.6.1.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+flowerpower-0.11.6.1.dist-info/entry_points.txt,sha256=61X11i5a2IwC9LBiP20XCDl5zMOigGCjMCx17B7bDbQ,52
+flowerpower-0.11.6.1.dist-info/top_level.txt,sha256=VraH4WtEUfSxs5L-rXwDQhzQb9eLHTUtgvmFZ2dAYnA,12
+flowerpower-0.11.6.1.dist-info/RECORD,,
{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/WHEEL
File without changes
{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/entry_points.txt
File without changes
{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/licenses/LICENSE
File without changes
{flowerpower-0.11.6.dist-info → flowerpower-0.11.6.1.dist-info}/top_level.txt
File without changes