FlowerPower 0.11.5.8__py3-none-any.whl → 0.11.6.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/fs/ext.py +176 -34
- flowerpower/pipeline/base.py +3 -1
- flowerpower/pipeline/registry.py +9 -9
- flowerpower/plugins/io/base.py +501 -78
- flowerpower/plugins/io/helpers/polars.py +346 -124
- flowerpower/plugins/io/helpers/pyarrow.py +406 -0
- flowerpower/settings/general.py +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/METADATA +1 -1
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/RECORD +13 -12
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.5.8.dist-info → flowerpower-0.11.6.1.dist-info}/top_level.txt +0 -0
flowerpower/plugins/io/base.py
CHANGED
@@ -22,11 +22,11 @@ from sqlalchemy import create_engine, text
|
|
22
22
|
from ...fs import get_filesystem
|
23
23
|
from ...fs.ext import _dict_to_dataframe, path_to_glob
|
24
24
|
from ...fs.storage_options import (AwsStorageOptions, AzureStorageOptions,
|
25
|
-
|
26
|
-
|
27
|
-
StorageOptions)
|
25
|
+
GcsStorageOptions, GitHubStorageOptions,
|
26
|
+
GitLabStorageOptions, StorageOptions)
|
28
27
|
from ...utils.misc import convert_large_types_to_standard, to_pyarrow_table
|
29
28
|
from .helpers.polars import pl
|
29
|
+
from .helpers.pyarrow import opt_dtype
|
30
30
|
from .helpers.sql import sql2polars_filter, sql2pyarrow_filter
|
31
31
|
from .metadata import get_dataframe_metadata, get_pyarrow_dataset_metadata
|
32
32
|
|
@@ -185,68 +185,134 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
185
185
|
include_file_path: bool = field(default=False)
|
186
186
|
concat: bool = field(default=True)
|
187
187
|
batch_size: int | None = field(default=None)
|
188
|
+
opt_dtypes: bool = field(default=True)
|
189
|
+
use_threads: bool = field(default=True)
|
188
190
|
conn: duckdb.DuckDBPyConnection | None = field(default=None)
|
189
191
|
ctx: datafusion.SessionContext | None = field(default=None)
|
190
192
|
jsonlines: bool | None = field(default=None)
|
191
193
|
partitioning: str | list[str] | pds.Partitioning | None = field(default=None)
|
192
194
|
_data: Any | None = field(default=None)
|
193
195
|
|
194
|
-
def _load(
    self,
    metadata: bool = False,
    reload: bool = False,
    batch_size: int | None = None,
    include_file_path: bool = False,
    concat: bool | None = None,
    use_threads: bool | None = None,
    verbose: bool | None = None,
    opt_dtypes: bool | None = None,
    **kwargs,
) -> None:
    """Read the files behind this reader into ``self._data``.

    Every option that is not ``None`` overrides the value stored on the
    reader. When an override differs from the stored value the data is read
    again, because the cached result was produced with other settings.

    Args:
        metadata (bool, optional): Compute ``self._metadata`` for the loaded
            data. Default is False.
        reload (bool, optional): Read the files again even if data is already
            cached. Default is False.
        batch_size (int, optional): Batch size forwarded to ``read_files``.
        include_file_path (bool, optional): Keep the ``file_path`` column in
            the loaded data. Default is False.
        concat (bool, optional): Override for ``self.concat``.
        use_threads (bool, optional): Override for ``self.use_threads``.
        verbose (bool, optional): Override for ``self.fs.verbose``.
        opt_dtypes (bool, optional): Override for ``self.opt_dtypes``.
        **kwargs: Forwarded to ``self.fs.read_files``. A ``partitioning``
            entry is consumed here and stored on the reader instead.
    """

    def _count_files(frame) -> int:
        # Number of distinct source files, read from the ``file_path`` column.
        if isinstance(frame, pa.Table):
            # ``pa.Table.select`` takes column names, not Polars expressions,
            # so hand the single column to Polars first.
            frame = pl.from_arrow(frame.select(["file_path"]))
        counted = frame.select(pl.n_unique("file_path"))
        if isinstance(counted, pl.LazyFrame):
            counted = counted.collect()
        return counted[0, 0]

    overrides = {
        "batch_size": batch_size,
        "include_file_path": include_file_path,
        "concat": concat,
        "use_threads": use_threads,
        "opt_dtypes": opt_dtypes,
    }
    for attr, value in overrides.items():
        if value is not None and getattr(self, attr) != value:
            setattr(self, attr, value)
            reload = True

    # The verbosity flag is kept on the filesystem object, not on the reader.
    if verbose is not None and self.fs.verbose != verbose:
        self.fs.verbose = verbose
        reload = True

    if "partitioning" in kwargs:
        # Always remove it from kwargs: ``read_files`` below already receives
        # ``partitioning=self.partitioning`` and must not get it twice.
        partitioning = kwargs.pop("partitioning")
        if self.partitioning != partitioning:
            self.partitioning = partitioning
            reload = True

    # Metadata is derived from the ``file_path`` column, which only exists
    # right after a read. If it is requested but none is cached, read again.
    if metadata and not getattr(self, "_metadata", None):
        reload = True

    if getattr(self, "_data", None) is None or reload:
        self._data = self.fs.read_files(
            path=self._glob_path,
            format=self.format,
            # The column is needed to count files even if the caller does
            # not want it in the result; it is dropped again below.
            include_file_path=bool(metadata or self.include_file_path),
            concat=self.concat,
            jsonlines=self.jsonlines or None,
            batch_size=self.batch_size,
            partitioning=self.partitioning,
            opt_dtypes=self.opt_dtypes,
            # NOTE(review): the override above is written to
            # ``self.fs.verbose`` while ``self.verbose`` is passed here;
            # confirm both refer to the same setting.
            verbose=self.verbose,
            use_threads=self.use_threads,
            **kwargs,
        )
        if metadata:
            if isinstance(self._data, tuple | list):
                self._metadata = [
                    get_dataframe_metadata(
                        df=df,
                        path=self.path,
                        format=self.format,
                        num_files=_count_files(df),
                    )
                    for df in self._data
                ]
                if not self.include_file_path:
                    self._data = [df.drop("file_path") for df in self._data]

            elif isinstance(self._data, pa.Table | pl.DataFrame | pl.LazyFrame):
                self._metadata = get_dataframe_metadata(
                    df=self._data,
                    path=self.path,
                    format=self.format,
                    num_files=_count_files(self._data),
                )
                if not self.include_file_path:
                    self._data = self._data.drop("file_path")

            else:
                # Unsupported container (e.g. a batch generator): expose
                # empty metadata on the instance rather than a stale value.
                self._metadata = {}
        else:
            self._metadata = {}
|
247
305
|
|
248
306
|
def to_pandas(
|
249
|
-
self,
|
307
|
+
self,
|
308
|
+
metadata: bool = False,
|
309
|
+
reload: bool = False,
|
310
|
+
include_file_path: bool = False,
|
311
|
+
concat: bool | None = None,
|
312
|
+
use_threads: bool | None = None,
|
313
|
+
verbose: bool | None = None,
|
314
|
+
opt_dtypes: bool | None = None,
|
315
|
+
**kwargs,
|
250
316
|
) -> (
|
251
317
|
tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]]
|
252
318
|
| pd.DataFrame
|
@@ -257,12 +323,28 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
257
323
|
Args:
|
258
324
|
metadata (bool, optional): Include metadata in the output. Default is False.
|
259
325
|
reload (bool, optional): Reload data if True. Default is False.
|
326
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
327
|
+
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
328
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
329
|
+
verbose (bool, optional): Verbose output. Default is None.
|
330
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
331
|
+
kwargs: Additional keyword arguments.
|
260
332
|
|
261
333
|
Returns:
|
262
334
|
tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]] | pd.DataFrame | list[pd.DataFrame]: Pandas
|
263
335
|
DataFrame or list of DataFrames and optional metadata.
|
264
336
|
"""
|
265
|
-
self._load(
|
337
|
+
self._load(
|
338
|
+
reload=reload,
|
339
|
+
metadata=metadata,
|
340
|
+
batch_size=None,
|
341
|
+
include_file_path=include_file_path,
|
342
|
+
concat=concat,
|
343
|
+
use_threads=use_threads,
|
344
|
+
verbose=verbose,
|
345
|
+
opt_dtypes=opt_dtypes,
|
346
|
+
**kwargs,
|
347
|
+
)
|
266
348
|
if isinstance(self._data, list):
|
267
349
|
df = [
|
268
350
|
df if isinstance(df, pd.DataFrame) else df.to_pandas()
|
@@ -276,26 +358,49 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
276
358
|
else self._data.to_pandas()
|
277
359
|
)
|
278
360
|
if metadata:
|
279
|
-
metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
280
|
-
return df,
|
361
|
+
# metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
362
|
+
return df, self._metadata
|
281
363
|
return df
|
282
364
|
|
283
365
|
def iter_pandas(
|
284
|
-
self,
|
366
|
+
self,
|
367
|
+
reload: bool = False,
|
368
|
+
batch_size: int | None = None,
|
369
|
+
include_file_path: bool = False,
|
370
|
+
concat: bool | None = None,
|
371
|
+
use_threads: bool | None = None,
|
372
|
+
verbose: bool | None = None,
|
373
|
+
opt_dtypes: bool | None = None,
|
374
|
+
**kwargs,
|
285
375
|
) -> Generator[pd.DataFrame, None, None]:
|
286
376
|
"""Iterate over Pandas DataFrames.
|
287
377
|
|
288
378
|
Args:
|
289
379
|
batch_size (int, optional): Batch size for iteration. Default is 1.
|
290
380
|
reload (bool, optional): Reload data if True. Default is False.
|
381
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
382
|
+
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
383
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
384
|
+
verbose (bool, optional): Verbose output. Default is None.
|
385
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
386
|
+
kwargs: Additional keyword arguments.
|
291
387
|
|
292
388
|
Returns:
|
293
389
|
Generator[pd.DataFrame, None, None]: Generator of Pandas DataFrames.
|
294
390
|
"""
|
295
|
-
|
296
|
-
|
391
|
+
batch_size = batch_size or self.batch_size or 1
|
392
|
+
|
393
|
+
self._load(
|
394
|
+
reload=reload,
|
395
|
+
batch_size=batch_size,
|
396
|
+
include_file_path=include_file_path,
|
397
|
+
concat=concat,
|
398
|
+
use_threads=use_threads,
|
399
|
+
verbose=verbose,
|
400
|
+
opt_dtypes=opt_dtypes,
|
401
|
+
**kwargs,
|
402
|
+
)
|
297
403
|
|
298
|
-
self._load(reload=reload, **kwargs)
|
299
404
|
if isinstance(self._data, list | Generator):
|
300
405
|
for df in self._data:
|
301
406
|
yield df if isinstance(df, pd.DataFrame) else df.to_pandas()
|
@@ -307,13 +412,47 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
307
412
|
)
|
308
413
|
|
309
414
|
def _to_polars_dataframe(
|
310
|
-
self,
|
415
|
+
self,
|
416
|
+
metadata: bool = False,
|
417
|
+
reload: bool = False,
|
418
|
+
include_file_path: bool = False,
|
419
|
+
concat: bool | None = None,
|
420
|
+
use_threads: bool | None = None,
|
421
|
+
verbose: bool | None = None,
|
422
|
+
opt_dtypes: bool | None = None,
|
423
|
+
**kwargs,
|
311
424
|
) -> (
|
312
425
|
tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]]
|
313
426
|
| pl.DataFrame
|
314
427
|
| list[pl.DataFrame]
|
315
428
|
):
|
316
|
-
|
429
|
+
"""Convert data to Polars DataFrame(s).
|
430
|
+
|
431
|
+
Args:
|
432
|
+
metadata (bool, optional): Include metadata in the output. Default is False.
|
433
|
+
reload (bool, optional): Reload data if True. Default is False.
|
434
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
435
|
+
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
436
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
437
|
+
verbose (bool, optional): Verbose output. Default is None.
|
438
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
439
|
+
kwargs: Additional keyword arguments.
|
440
|
+
|
441
|
+
Returns:
|
442
|
+
tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]] | pl.DataFrame | list[pl.DataFrame]: Polars
|
443
|
+
DataFrame or list of DataFrames and optional metadata.
|
444
|
+
"""
|
445
|
+
self._load(
|
446
|
+
metadata=metadata,
|
447
|
+
reload=reload,
|
448
|
+
batch_size=None,
|
449
|
+
include_file_path=include_file_path,
|
450
|
+
concat=concat,
|
451
|
+
use_threads=use_threads,
|
452
|
+
verbose=verbose,
|
453
|
+
opt_dtypes=opt_dtypes,
|
454
|
+
**kwargs,
|
455
|
+
)
|
317
456
|
if isinstance(self._data, list):
|
318
457
|
df = [
|
319
458
|
df if isinstance(self._data, pl.DataFrame) else pl.from_arrow(df)
|
@@ -327,22 +466,48 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
327
466
|
else pl.from_arrow(self._data)
|
328
467
|
)
|
329
468
|
if metadata:
|
330
|
-
metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
331
|
-
return df,
|
469
|
+
# metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
470
|
+
return df, self._metadata
|
332
471
|
return df
|
333
472
|
|
334
473
|
def _iter_polars_dataframe(
|
335
|
-
self,
|
474
|
+
self,
|
475
|
+
reload: bool = False,
|
476
|
+
batch_size: int | None = None,
|
477
|
+
include_file_path: bool = False,
|
478
|
+
concat: bool | None = None,
|
479
|
+
use_threads: bool | None = None,
|
480
|
+
verbose: bool | None = None,
|
481
|
+
opt_dtypes: bool | None = None,
|
482
|
+
**kwargs,
|
336
483
|
) -> Generator[pl.DataFrame, None, None]:
|
337
484
|
"""Iterate over Polars DataFrames.
|
338
485
|
|
486
|
+
Args:
|
487
|
+
batch_size (int, optional): Batch size for iteration. Default is 1.
|
488
|
+
reload (bool, optional): Reload data if True. Default is False.
|
489
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
490
|
+
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
491
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
492
|
+
verbose (bool, optional): Verbose output. Default is None.
|
493
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
494
|
+
kwargs: Additional keyword arguments.
|
495
|
+
|
339
496
|
Returns:
|
340
497
|
Generator[pl.DataFrame, None, None]: Generator of Polars DataFrames.
|
341
498
|
"""
|
342
|
-
|
343
|
-
|
344
|
-
|
345
|
-
|
499
|
+
batch_size = batch_size or self.batch_size or 1
|
500
|
+
|
501
|
+
self._load(
|
502
|
+
reload=reload,
|
503
|
+
batch_size=batch_size,
|
504
|
+
include_file_path=include_file_path,
|
505
|
+
concat=concat,
|
506
|
+
use_threads=use_threads,
|
507
|
+
verbose=verbose,
|
508
|
+
opt_dtypes=opt_dtypes,
|
509
|
+
**kwargs,
|
510
|
+
)
|
346
511
|
if isinstance(self._data, list | Generator):
|
347
512
|
for df in self._data:
|
348
513
|
yield df if isinstance(df, pl.DataFrame) else pl.from_arrow(df)
|
@@ -354,38 +519,95 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
354
519
|
)
|
355
520
|
|
356
521
|
def _to_polars_lazyframe(
    self,
    metadata: bool = False,
    reload: bool = False,
    include_file_path: bool = False,
    concat: bool | None = None,
    use_threads: bool | None = None,
    verbose: bool | None = None,
    opt_dtypes: bool | None = None,
    **kwargs,
) -> (
    tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]]
    | pl.LazyFrame
    | list[pl.LazyFrame]
):
    """Convert data to Polars LazyFrame(s).

    The data is loaded once through ``_to_polars_dataframe`` with the same
    options, and the resulting frame(s) are then made lazy.

    Args:
        metadata (bool, optional): Include metadata in the output. Default is False.
        reload (bool, optional): Reload data if True. Default is False.
        include_file_path (bool, optional): Include file path in the output. Default is False.
        concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
        use_threads (bool, optional): Use threads for reading data. Default is True.
        verbose (bool, optional): Verbose output. Default is None.
        opt_dtypes (bool, optional): Optimize data types. Default is True.
        kwargs: Additional keyword arguments. A ``batch_size`` entry is
            ignored, since this method always loads the full data.

    Returns:
        tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]] | pl.LazyFrame | list[pl.LazyFrame]: Polars
            LazyFrame or list of LazyFrames and optional metadata.
    """
    # ``to_polars`` forwards ``batch_size=None``; drop it so it cannot clash
    # with the explicit ``batch_size=None`` used when loading.
    kwargs.pop("batch_size", None)

    # Delegate with the caller's options. Calling ``_to_polars_dataframe()``
    # with its defaults would reset ``include_file_path`` and trigger a
    # second read that discards the metadata.
    loaded = self._to_polars_dataframe(
        metadata=metadata,
        reload=reload,
        include_file_path=include_file_path,
        concat=concat,
        use_threads=use_threads,
        verbose=verbose,
        opt_dtypes=opt_dtypes,
        **kwargs,
    )
    if metadata:
        frames, meta = loaded
    else:
        frames, meta = loaded, None

    if isinstance(frames, list):
        lazy_frames = [frame.lazy() for frame in frames]
    else:
        lazy_frames = frames.lazy()

    if metadata:
        return lazy_frames, meta
    return lazy_frames
|
373
572
|
|
374
573
|
def _iter_polars_lazyframe(
|
375
|
-
self,
|
574
|
+
self,
|
575
|
+
reload: bool = False,
|
576
|
+
batch_size: int | None = None,
|
577
|
+
include_file_path: bool = False,
|
578
|
+
concat: bool | None = None,
|
579
|
+
use_threads: bool | None = None,
|
580
|
+
verbose: bool | None = None,
|
581
|
+
opt_dtypes: bool | None = None,
|
582
|
+
**kwargs,
|
376
583
|
) -> Generator[pl.LazyFrame, None, None]:
|
377
584
|
"""Iterate over Polars LazyFrames.
|
378
585
|
|
379
586
|
Args:
|
380
587
|
batch_size (int, optional): Batch size for iteration. Default is 1.
|
381
588
|
reload (bool, optional): Reload data if True. Default is False.
|
589
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
590
|
+
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
591
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
592
|
+
verbose (bool, optional): Verbose output. Default is None.
|
593
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
594
|
+
kwargs: Additional keyword arguments.
|
382
595
|
|
383
596
|
Returns:
|
384
597
|
Generator[pl.LazyFrame, None, None]: Generator of Polars LazyFrames.
|
385
598
|
"""
|
386
|
-
|
387
|
-
|
388
|
-
self._load(
|
599
|
+
batch_size = batch_size or self.batch_size or 1
|
600
|
+
|
601
|
+
self._load(
|
602
|
+
reload=reload,
|
603
|
+
batch_size=batch_size,
|
604
|
+
include_file_path=include_file_path,
|
605
|
+
concat=concat,
|
606
|
+
use_threads=use_threads,
|
607
|
+
verbose=verbose,
|
608
|
+
opt_dtypes=opt_dtypes,
|
609
|
+
**kwargs,
|
610
|
+
)
|
389
611
|
if isinstance(self._data, list | Generator):
|
390
612
|
for df in self._data:
|
391
613
|
yield (
|
@@ -404,6 +626,12 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
404
626
|
self,
|
405
627
|
lazy: bool = False,
|
406
628
|
metadata: bool = False,
|
629
|
+
reload: bool = False,
|
630
|
+
include_file_path: bool = False,
|
631
|
+
concat: bool | None = None,
|
632
|
+
use_threads: bool | None = None,
|
633
|
+
verbose: bool | None = None,
|
634
|
+
opt_dtypes: bool | None = None,
|
407
635
|
**kwargs,
|
408
636
|
) -> (
|
409
637
|
pl.DataFrame
|
@@ -420,6 +648,14 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
420
648
|
Args:
|
421
649
|
lazy (bool, optional): Return a LazyFrame if True, else a DataFrame.
|
422
650
|
metadata (bool, optional): Include metadata in the output. Default is False.
|
651
|
+
reload (bool, optional): Reload data if True. Default is False.
|
652
|
+
batch_size (int, optional): Batch size for iteration. Default is 1.
|
653
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
654
|
+
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
655
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
656
|
+
verbose (bool, optional): Verbose output. Default is None.
|
657
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
658
|
+
kwargs: Additional keyword arguments.
|
423
659
|
|
424
660
|
Returns:
|
425
661
|
pl.DataFrame | pl.LazyFrame | list[pl.DataFrame] | list[pl.LazyFrame] | tuple[pl.DataFrame | pl.LazyFrame
|
@@ -427,32 +663,115 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
427
663
|
metadata.
|
428
664
|
"""
|
429
665
|
if lazy:
|
430
|
-
return self._to_polars_lazyframe(
|
431
|
-
|
666
|
+
return self._to_polars_lazyframe(
|
667
|
+
metadata=metadata,
|
668
|
+
reload=reload,
|
669
|
+
batch_size=None,
|
670
|
+
include_file_path=include_file_path,
|
671
|
+
concat=concat,
|
672
|
+
use_threads=use_threads,
|
673
|
+
verbose=verbose,
|
674
|
+
opt_dtypes=opt_dtypes,
|
675
|
+
**kwargs,
|
676
|
+
)
|
677
|
+
return self._to_polars_dataframe(
|
678
|
+
metadata=metadata,
|
679
|
+
reload=reload,
|
680
|
+
batch_size=None,
|
681
|
+
include_file_path=include_file_path,
|
682
|
+
concat=concat,
|
683
|
+
use_threads=use_threads,
|
684
|
+
verbose=verbose,
|
685
|
+
opt_dtypes=opt_dtypes,
|
686
|
+
**kwargs,
|
687
|
+
)
|
432
688
|
|
433
689
|
def iter_polars(
    self,
    lazy: bool = False,
    reload: bool = False,
    batch_size: int | None = None,
    include_file_path: bool = False,
    concat: bool | None = None,
    use_threads: bool | None = None,
    verbose: bool | None = None,
    opt_dtypes: bool | None = None,
    **kwargs,
) -> Generator[pl.DataFrame | pl.LazyFrame, None, None]:
    """Iterate over Polars DataFrames or LazyFrames.

    Args:
        lazy (bool, optional): Yield LazyFrames if True, else DataFrames. Default is False.
        reload (bool, optional): Reload data if True. Default is False.
        batch_size (int, optional): Batch size for iteration. Default is 1.
        include_file_path (bool, optional): Include file path in the output. Default is False.
        concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
        use_threads (bool, optional): Use threads for reading data. Default is True.
        verbose (bool, optional): Verbose output. Default is None.
        opt_dtypes (bool, optional): Optimize data types. Default is True.
        kwargs: Additional keyword arguments.

    Returns:
        Generator[pl.DataFrame | pl.LazyFrame, None, None]: Generator of Polars DataFrames or LazyFrames.
    """
    # Pick exactly one iterator. Without this, a generator falls through
    # after the lazy branch and yields the eager frames as well.
    iterate = self._iter_polars_lazyframe if lazy else self._iter_polars_dataframe
    yield from iterate(
        reload=reload,
        batch_size=batch_size,
        include_file_path=include_file_path,
        concat=concat,
        use_threads=use_threads,
        verbose=verbose,
        opt_dtypes=opt_dtypes,
        **kwargs,
    )
|
441
738
|
|
442
739
|
def to_pyarrow_table(
|
443
|
-
self,
|
740
|
+
self,
|
741
|
+
metadata: bool = False,
|
742
|
+
reload: bool = False,
|
743
|
+
include_file_path: bool = False,
|
744
|
+
use_threads: bool | None = None,
|
745
|
+
verbose: bool | None = None,
|
746
|
+
opt_dtypes: bool | None = None,
|
747
|
+
**kwargs,
|
444
748
|
) -> pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]:
|
445
749
|
"""Convert data to PyArrow Table(s).
|
446
750
|
|
447
751
|
Args:
|
448
752
|
metadata (bool, optional): Include metadata in the output. Default is False.
|
449
753
|
reload (bool, optional): Reload data if True. Default is False.
|
754
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
755
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
756
|
+
verbose (bool, optional): Verbose output. Default is None.
|
757
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
758
|
+
kwargs: Additional keyword arguments.
|
450
759
|
|
451
760
|
Returns:
|
452
761
|
pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]: PyArrow Table or list of
|
453
762
|
Tables and optional metadata.
|
454
763
|
"""
|
455
|
-
self._load(
|
764
|
+
self._load(
|
765
|
+
reload=reload,
|
766
|
+
metadata=metadata,
|
767
|
+
batch_size=None,
|
768
|
+
include_file_path=include_file_path,
|
769
|
+
concat=None,
|
770
|
+
use_threads=use_threads,
|
771
|
+
verbose=verbose,
|
772
|
+
opt_dtypes=opt_dtypes,
|
773
|
+
**kwargs,
|
774
|
+
)
|
456
775
|
if isinstance(self._data, list):
|
457
776
|
df = [
|
458
777
|
df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
|
@@ -466,22 +785,48 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
466
785
|
else self._data
|
467
786
|
)
|
468
787
|
if metadata:
|
469
|
-
metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
470
|
-
return df,
|
788
|
+
# metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
789
|
+
return df, self._metadata
|
471
790
|
return df
|
472
791
|
|
473
792
|
def iter_pyarrow_table(
|
474
|
-
self,
|
793
|
+
self,
|
794
|
+
reload: bool = False,
|
795
|
+
batch_size: int | None = None,
|
796
|
+
include_file_path: bool = False,
|
797
|
+
concat: bool | None = None,
|
798
|
+
use_threads: bool | None = None,
|
799
|
+
verbose: bool | None = None,
|
800
|
+
opt_dtypes: bool | None = None,
|
801
|
+
**kwargs,
|
475
802
|
) -> Generator[pa.Table, None, None]:
|
476
803
|
"""Iterate over PyArrow Tables.
|
477
804
|
|
805
|
+
Args:
|
806
|
+
reload (bool, optional): Reload data if True. Default is False.
|
807
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
808
|
+
concat (bool, optional): Concatenate multiple files into a single Table. Default is True.
|
809
|
+
batch_size (int, optional): Batch size for iteration. Default is 1.
|
810
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
811
|
+
verbose (bool, optional): Verbose output. Default is None.
|
812
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
813
|
+
kwargs: Additional keyword arguments.
|
814
|
+
|
478
815
|
Returns:
|
479
816
|
Generator[pa.Table, None, None]: Generator of PyArrow Tables.
|
480
817
|
"""
|
481
|
-
|
482
|
-
|
483
|
-
|
484
|
-
|
818
|
+
batch_size = batch_size or self.batch_size or 1
|
819
|
+
|
820
|
+
self._load(
|
821
|
+
reload=reload,
|
822
|
+
batch_size=batch_size,
|
823
|
+
include_file_path=include_file_path,
|
824
|
+
concat=concat,
|
825
|
+
use_threads=use_threads,
|
826
|
+
verbose=verbose,
|
827
|
+
opt_dtypes=opt_dtypes,
|
828
|
+
**kwargs,
|
829
|
+
)
|
485
830
|
if isinstance(self._data, list | Generator):
|
486
831
|
for df in self._data:
|
487
832
|
yield df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
|
@@ -497,6 +842,10 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
497
842
|
conn: duckdb.DuckDBPyConnection | None = None,
|
498
843
|
metadata: bool = False,
|
499
844
|
reload: bool = False,
|
845
|
+
include_file_path: bool = False,
|
846
|
+
use_threads: bool | None = None,
|
847
|
+
verbose: bool | None = None,
|
848
|
+
opt_dtypes: bool | None = None,
|
500
849
|
**kwargs,
|
501
850
|
) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
|
502
851
|
"""Convert data to DuckDB relation.
|
@@ -505,6 +854,11 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
505
854
|
conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
|
506
855
|
metadata (bool, optional): Include metadata in the output. Default is False.
|
507
856
|
reload (bool, optional): Reload data if True. Default is False.
|
857
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
858
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
859
|
+
verbose (bool, optional): Verbose output. Default is None.
|
860
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
861
|
+
kwargs: Additional keyword arguments.
|
508
862
|
|
509
863
|
Returns:
|
510
864
|
duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]: DuckDB relation and optional
|
@@ -517,10 +871,27 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
517
871
|
|
518
872
|
if metadata:
|
519
873
|
return self._conn.from_arrow(
|
520
|
-
self.to_pyarrow_table(
|
874
|
+
self.to_pyarrow_table(
|
875
|
+
metadata=metadata,
|
876
|
+
reload=reload,
|
877
|
+
batch_size=None,
|
878
|
+
include_file_path=include_file_path,
|
879
|
+
se_threads=use_threads,
|
880
|
+
verbose=verbose,
|
881
|
+
opt_dtypes=opt_dtypes,
|
882
|
+
**kwargs,
|
883
|
+
),
|
521
884
|
), self._metadata
|
522
885
|
return self._conn.from_arrow(
|
523
|
-
self.to_pyarrow_table(
|
886
|
+
self.to_pyarrow_table(
|
887
|
+
reload=reload,
|
888
|
+
batch_size=None,
|
889
|
+
include_file_path=include_file_path,
|
890
|
+
use_threads=use_threads,
|
891
|
+
verbose=verbose,
|
892
|
+
opt_dtypes=opt_dtypes,
|
893
|
+
**kwargs,
|
894
|
+
)
|
524
895
|
)
|
525
896
|
|
526
897
|
def register_in_duckdb(
|
@@ -529,6 +900,10 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
529
900
|
name: str | None = None,
|
530
901
|
metadata: bool = False,
|
531
902
|
reload: bool = False,
|
903
|
+
include_file_path: bool = False,
|
904
|
+
use_threads: bool | None = None,
|
905
|
+
verbose: bool | None = None,
|
906
|
+
opt_dtypes: bool | None = None,
|
532
907
|
**kwargs,
|
533
908
|
) -> duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]:
|
534
909
|
"""Register data in DuckDB.
|
@@ -538,6 +913,11 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
538
913
|
name (str, optional): Name for the DuckDB table.
|
539
914
|
metadata (bool, optional): Include metadata in the output. Default is False.
|
540
915
|
reload (bool, optional): Reload data if True. Default is False.
|
916
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
917
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
918
|
+
verbose (bool, optional): Verbose output. Default is None.
|
919
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
920
|
+
kwargs: Additional keyword arguments.
|
541
921
|
|
542
922
|
Returns:
|
543
923
|
duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB connection instance
|
@@ -552,7 +932,16 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
552
932
|
self._conn = conn
|
553
933
|
|
554
934
|
self._conn.register(
|
555
|
-
name,
|
935
|
+
name,
|
936
|
+
self.to_pyarrow_table(
|
937
|
+
metadata=metadata,
|
938
|
+
reload=reload,
|
939
|
+
include_file_path=include_file_path,
|
940
|
+
use_threads=use_threads,
|
941
|
+
verbose=verbose,
|
942
|
+
opt_dtypes=opt_dtypes,
|
943
|
+
**kwargs,
|
944
|
+
),
|
556
945
|
)
|
557
946
|
if metadata:
|
558
947
|
return self._conn, self._metadata
|
@@ -565,6 +954,10 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
565
954
|
name: str | None = None,
|
566
955
|
metadata: bool = False,
|
567
956
|
reload: bool = False,
|
957
|
+
include_file_path: bool = False,
|
958
|
+
use_threads: bool | None = None,
|
959
|
+
verbose: bool | None = None,
|
960
|
+
opt_dtypes: bool | None = None,
|
568
961
|
**kwargs,
|
569
962
|
) -> (
|
570
963
|
duckdb.DuckDBPyRelation
|
@@ -580,6 +973,10 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
580
973
|
name (str, optional): Name for the DuckDB table.
|
581
974
|
metadata (bool, optional): Include metadata in the output. Default is False.
|
582
975
|
reload (bool, optional): Reload data if True. Default is False.
|
976
|
+
include_file_path (bool, optional): Include file path in the output. Default is False.
|
977
|
+
use_threads (bool, optional): Use threads for reading data. Default is True.
|
978
|
+
verbose (bool, optional): Verbose output. Default is None.
|
979
|
+
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
583
980
|
**kwargs: Additional keyword arguments.
|
584
981
|
|
585
982
|
Returns:
|
@@ -590,10 +987,25 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
590
987
|
"""
|
591
988
|
if as_relation:
|
592
989
|
return self.to_duckdb_relation(
|
593
|
-
conn=conn,
|
990
|
+
conn=conn,
|
991
|
+
metadata=metadata,
|
992
|
+
reload=reload,
|
993
|
+
include_file_path=include_file_path,
|
994
|
+
use_threads=use_threads,
|
995
|
+
verbose=verbose,
|
996
|
+
opt_dtypes=opt_dtypes,
|
997
|
+
**kwargs,
|
594
998
|
)
|
595
999
|
return self.register_in_duckdb(
|
596
|
-
conn=conn,
|
1000
|
+
conn=conn,
|
1001
|
+
name=name,
|
1002
|
+
metadata=metadata,
|
1003
|
+
reload=reload,
|
1004
|
+
include_file_path=include_file_path,
|
1005
|
+
use_threads=use_threads,
|
1006
|
+
verbose=verbose,
|
1007
|
+
opt_dtypes=opt_dtypes,
|
1008
|
+
**kwargs,
|
597
1009
|
)
|
598
1010
|
|
599
1011
|
def register_in_datafusion(
|
@@ -602,6 +1014,10 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
602
1014
|
name: str | None = None,
|
603
1015
|
metadata: bool = False,
|
604
1016
|
reload: bool = False,
|
1017
|
+
include_file_path: bool = False,
|
1018
|
+
use_threads: bool | None = None,
|
1019
|
+
verbose: bool | None = None,
|
1020
|
+
opt_dtypes: bool | None = None,
|
605
1021
|
**kwargs,
|
606
1022
|
) -> datafusion.SessionContext | tuple[datafusion.SessionContext, dict[str, Any]]:
|
607
1023
|
"""Register data in DataFusion.
|
@@ -626,11 +1042,18 @@ class BaseFileReader(BaseFileIO, gc=False):
|
|
626
1042
|
|
627
1043
|
self._ctx.register_record_batches(
|
628
1044
|
name,
|
629
|
-
[
|
1045
|
+
[
|
1046
|
+
self.to_pyarrow_table(
|
1047
|
+
reload=reload,
|
1048
|
+
include_file_path=include_file_path,
|
1049
|
+
use_threads=use_threads,
|
1050
|
+
opt_dtypes=opt_dtypes**kwargs,
|
1051
|
+
).to_batches()
|
1052
|
+
],
|
630
1053
|
)
|
631
1054
|
if metadata:
|
632
1055
|
return self._ctx, self._metadata
|
633
|
-
return
|
1056
|
+
return self._ctx
|
634
1057
|
|
635
1058
|
def filter(
|
636
1059
|
self, filter_expr: str | pl.Expr | pa.compute.Expression
|