FlowerPower 0.11.5.8__py3-none-any.whl → 0.11.6.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,11 +22,11 @@ from sqlalchemy import create_engine, text
 from ...fs import get_filesystem
 from ...fs.ext import _dict_to_dataframe, path_to_glob
 from ...fs.storage_options import (AwsStorageOptions, AzureStorageOptions,
-                                   BaseStorageOptions, GcsStorageOptions,
-                                   GitHubStorageOptions, GitLabStorageOptions,
-                                   StorageOptions)
+                                   GcsStorageOptions, GitHubStorageOptions,
+                                   GitLabStorageOptions, StorageOptions)
 from ...utils.misc import convert_large_types_to_standard, to_pyarrow_table
 from .helpers.polars import pl
+from .helpers.pyarrow import opt_dtype
 from .helpers.sql import sql2polars_filter, sql2pyarrow_filter
 from .metadata import get_dataframe_metadata, get_pyarrow_dataset_metadata

@@ -185,68 +185,134 @@ class BaseFileReader(BaseFileIO, gc=False):
     include_file_path: bool = field(default=False)
     concat: bool = field(default=True)
     batch_size: int | None = field(default=None)
+    opt_dtypes: bool = field(default=True)
+    use_threads: bool = field(default=True)
     conn: duckdb.DuckDBPyConnection | None = field(default=None)
     ctx: datafusion.SessionContext | None = field(default=None)
     jsonlines: bool | None = field(default=None)
     partitioning: str | list[str] | pds.Partitioning | None = field(default=None)
     _data: Any | None = field(default=None)

-    def _load(self, reload: bool = False, **kwargs):
-        if "include_file_path" in kwargs:
-            if self.include_file_path != kwargs["include_file_path"]:
+    def _load(
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
+    ):
+        if batch_size is not None:
+            if self.batch_size != batch_size:
                 reload = True
-                self.include_file_path = kwargs.pop("include_file_path")
-            else:
-                kwargs.pop("include_file_path")
+                self.batch_size = batch_size

-        if "concat" in kwargs:
-            if self.concat != kwargs["concat"]:
+        if include_file_path is not None:
+            if self.include_file_path != include_file_path:
                 reload = True
-                self.concat = kwargs.pop("concat")
-            else:
-                kwargs.pop("concat")
+                self.include_file_path = include_file_path

-        if "batch_size" in kwargs:
-            if self.batch_size != kwargs["batch_size"]:
+        if concat is not None:
+            if self.concat != concat:
                 reload = True
-                self.batch_size = kwargs.pop("batch_size")
-            else:
-                kwargs.pop("batch_size")
+                self.concat = concat
+
+        if use_threads is not None:
+            if self.use_threads != use_threads:
+                reload = True
+                self.use_threads = use_threads
+
+        if verbose is not None:
+            if self.fs.verbose != verbose:
+                reload = True
+                self.fs.verbose = verbose
+
+        if opt_dtypes is not None:
+            if self.opt_dtypes != opt_dtypes:
+                reload = True
+                self.opt_dtypes = opt_dtypes

         if "partitioning" in kwargs:
             if self.partitioning != kwargs["partitioning"]:
                 reload = True
                 self.partitioning = kwargs.pop("partitioning")
-            else:
-                kwargs.pop("partitioning")

         if not hasattr(self, "_data") or self._data is None or reload:
             self._data = self.fs.read_files(
                 path=self._glob_path,
                 format=self.format,
-                include_file_path=True,
+                include_file_path=True if metadata or self.include_file_path else False,
                 concat=self.concat,
                 jsonlines=self.jsonlines or None,
                 batch_size=self.batch_size,
                 partitioning=self.partitioning,
+                opt_dtypes=self.opt_dtypes,
+                verbose=self.verbose,
+                use_threads=self.use_threads,
                 **kwargs,
             )
-            if not isinstance(self._data, Generator):
-                self._metadata = get_dataframe_metadata(
-                    df=self._data,
-                    path=self.path,
-                    format=self.format,
-                    num_files=pl.from_arrow(self._data.select(["file_path"])).select(
-                        pl.n_unique("file_path")
-                    )[0, 0],
-                )
-                if not self.include_file_path:
-                    self._data = self._data.drop("file_path")
+            if metadata:
+                if isinstance(self._data, tuple | list):
+                    self._metadata = [
+                        get_dataframe_metadata(
+                            df=df,
+                            path=self.path,
+                            format=self.format,
+                            num_files=pl.from_arrow(df.select(["file_path"])).select(
+                                pl.n_unique("file_path")
+                            )[0, 0]
+                            if isinstance(df, pa.Table)
+                            else df.select(pl.n_unique("file_path"))[0, 0],
+                        )
+                        for df in self._data
+                    ]
+                    if not self.include_file_path:
+                        self._data = [df.drop("file_path") for df in self._data]
+
+                elif isinstance(self._data, pa.Table):
+                    self._metadata = get_dataframe_metadata(
+                        df=self._data,
+                        path=self.path,
+                        format=self.format,
+                        num_files=pl.from_arrow(
+                            self._data.select(pl.n_unique("file_path"))
+                        )[0, 0],
+                    )
+                    if not self.include_file_path:
+                        self._data = self._data.drop("file_path")
+
+                elif isinstance(self._data, pl.DataFrame | pl.LazyFrame):
+                    self._metadata = get_dataframe_metadata(
+                        df=self._data,
+                        path=self.path,
+                        format=self.format,
+                        num_files=self._data.select(pl.n_unique("file_path"))[0, 0]
+                        if isinstance(self._data, pl.DataFrame)
+                        else self._data.select(pl.n_unique("file_path")).collect()[
+                            0, 0
+                        ],
+                    )
+
+                    if not self.include_file_path:
+                        self._data = self._data.drop("file_path")
+                else:
+                    metadata = {}
         else:
             self._metadata = {}

     def to_pandas(
-        self, metadata: bool = False, reload: bool = False, **kwargs
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> (
         tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]]
         | pd.DataFrame
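
The rewritten `_load` replaces the old `kwargs.pop(...)` probing with explicit keyword arguments and a dirty flag: any value that differs from the cached attribute sets `reload = True`, and the files are re-read only on first access or when the flag is set. A minimal, self-contained sketch of that caching pattern (class and names are illustrative, not part of the package):

    class CachedLoader:
        """Toy model of the reload logic in _load above."""

        def __init__(self) -> None:
            self.batch_size: int | None = None
            self._data: list[str] | None = None

        def load(self, batch_size: int | None = None, reload: bool = False) -> list[str]:
            # A changed setting marks the cache dirty, mirroring _load's checks.
            if batch_size is not None and self.batch_size != batch_size:
                reload = True
                self.batch_size = batch_size
            # The expensive read runs only on first use or when dirty.
            if self._data is None or reload:
                self._data = [f"chunk(batch_size={self.batch_size})"]
            return self._data
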
@@ -257,12 +323,28 @@ class BaseFileReader(BaseFileIO, gc=False):
         Args:
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]] | pd.DataFrame | list[pd.DataFrame]: Pandas
                 DataFrame or list of DataFrames and optional metadata.
         """
-        self._load(reload=reload, **kwargs)
+        self._load(
+            reload=reload,
+            metadata=metadata,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list):
             df = [
                 df if isinstance(df, pd.DataFrame) else df.to_pandas()
@@ -276,26 +358,49 @@ class BaseFileReader(BaseFileIO, gc=False):
                 else self._data.to_pandas()
             )
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df, metadata
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df

     def iter_pandas(
-        self, reload: bool = False, **kwargs
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pd.DataFrame, None, None]:
         """Iterate over Pandas DataFrames.

         Args:
             batch_size (int, optional): Batch size for iteration. Default is 1.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             Generator[pd.DataFrame, None, None]: Generator of Pandas DataFrames.
         """
-        if self.batch_size is None and "batch_size" not in kwargs:
-            self.batch_size = 1
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )

-        self._load(reload=reload, **kwargs)
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield df if isinstance(df, pd.DataFrame) else df.to_pandas()
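
`to_pandas` and `iter_pandas` now surface the same knobs they forward to `_load`. A hedged usage sketch; the reader subclass and paths are assumed, not shown in this diff:

    # Hypothetical reader; any BaseFileReader subclass with path/format set.
    reader = ParquetFileReader(path="data/", format="parquet")

    df, meta = reader.to_pandas(metadata=True)      # metadata comes from the cached self._metadata
    df_raw = reader.to_pandas(opt_dtypes=False)     # changed knob, so the files are re-read

    for chunk in reader.iter_pandas(batch_size=2, include_file_path=True):
        print(chunk["file_path"].nunique())         # file provenance kept per batch
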
@@ -307,13 +412,47 @@ class BaseFileReader(BaseFileIO, gc=False):
             )

     def _to_polars_dataframe(
-        self, metadata: bool = False, reload: bool = False, **kwargs
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> (
         tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]]
         | pl.DataFrame
         | list[pl.DataFrame]
     ):
-        self._load(reload=reload, **kwargs)
+        """Convert data to Polars DataFrame(s).
+
+        Args:
+            metadata (bool, optional): Include metadata in the output. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]] | pl.DataFrame | list[pl.DataFrame]: Polars
+                DataFrame or list of DataFrames and optional metadata.
+        """
+        self._load(
+            metadata=metadata,
+            reload=reload,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list):
             df = [
                 df if isinstance(self._data, pl.DataFrame) else pl.from_arrow(df)
@@ -327,22 +466,48 @@ class BaseFileReader(BaseFileIO, gc=False):
                 else pl.from_arrow(self._data)
             )
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df, metadata
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df

     def _iter_polars_dataframe(
-        self, reload: bool = False, **kwargs
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pl.DataFrame, None, None]:
         """Iterate over Polars DataFrames.

+        Args:
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
         Returns:
             Generator[pl.DataFrame, None, None]: Generator of Polars DataFrames.
         """
-        if self.batch_size is None and "batch_size" not in kwargs:
-            self.batch_size = 1
-
-        self._load(reload=reload, **kwargs)
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield df if isinstance(df, pl.DataFrame) else pl.from_arrow(df)
@@ -354,38 +519,95 @@ class BaseFileReader(BaseFileIO, gc=False):
             )

     def _to_polars_lazyframe(
-        self, metadata: bool = False, reload: bool = False, **kwargs
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> (
         tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]]
         | pl.LazyFrame
         | list[pl.LazyFrame]
     ):
-        self._load(reload=reload, **kwargs)
+        """Convert data to Polars LazyFrame(s).
+
+        Args:
+            metadata (bool, optional): Include metadata in the output. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]] | pl.LazyFrame | list[pl.LazyFrame]: Polars
+                LazyFrame or list of LazyFrames and optional metadata.
+        """
+        self._load(
+            metadata=metadata,
+            reload=reload,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if not self.concat:
             df = [df.lazy() for df in self._to_polars_dataframe()]

         else:
             df = self._to_polars_dataframe().lazy()
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df, metadata
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df

     def _iter_polars_lazyframe(
-        self, reload: bool = False, **kwargs
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pl.LazyFrame, None, None]:
         """Iterate over Polars LazyFrames.

         Args:
             batch_size (int, optional): Batch size for iteration. Default is 1.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             Generator[pl.LazyFrame, None, None]: Generator of Polars LazyFrames.
         """
-        if self.batch_size is None and "batch_size" not in kwargs:
-            self.batch_size = 1
-        self._load(reload=reload, **kwargs)
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield (
@@ -404,6 +626,12 @@ class BaseFileReader(BaseFileIO, gc=False):
         self,
         lazy: bool = False,
         metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> (
         pl.DataFrame
@@ -420,6 +648,14 @@ class BaseFileReader(BaseFileIO, gc=False):
         Args:
             lazy (bool, optional): Return a LazyFrame if True, else a DataFrame.
             metadata (bool, optional): Include metadata in the output. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             pl.DataFrame | pl.LazyFrame | list[pl.DataFrame] | list[pl.LazyFrame] | tuple[pl.DataFrame | pl.LazyFrame
@@ -427,32 +663,115 @@ class BaseFileReader(BaseFileIO, gc=False):
                 metadata.
         """
         if lazy:
-            return self._to_polars_lazyframe(**kwargs)
-        return self._to_polars_dataframe(**kwargs)
+            return self._to_polars_lazyframe(
+                metadata=metadata,
+                reload=reload,
+                batch_size=None,
+                include_file_path=include_file_path,
+                concat=concat,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            )
+        return self._to_polars_dataframe(
+            metadata=metadata,
+            reload=reload,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )

     def iter_polars(
         self,
         lazy: bool = False,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> Generator[pl.DataFrame | pl.LazyFrame, None, None]:
+        """Iterate over Polars DataFrames or LazyFrames.
+
+        Args:
+            lazy (bool, optional): Return a LazyFrame if True, else a DataFrame. Default is False.
+            reload (bool, optional): Reload data if True. Default is False.
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
+        Returns:
+            Generator[pl.DataFrame | pl.LazyFrame, None, None]: Generator of Polars DataFrames or LazyFrames.
+        """
         if lazy:
-            yield from self._iter_polars_lazyframe(**kwargs)
-        yield from self._iter_polars_dataframe(**kwargs)
+            yield from self._iter_polars_lazyframe(
+                reload=reload,
+                batch_size=batch_size,
+                include_file_path=include_file_path,
+                concat=concat,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            )
+        yield from self._iter_polars_dataframe(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )

     def to_pyarrow_table(
-        self, metadata: bool = False, reload: bool = False, **kwargs
+        self,
+        metadata: bool = False,
+        reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]:
         """Convert data to PyArrow Table(s).

         Args:
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]: PyArrow Table or list of
                 Tables and optional metadata.
         """
-        self._load(reload=reload, **kwargs)
+        self._load(
+            reload=reload,
+            metadata=metadata,
+            batch_size=None,
+            include_file_path=include_file_path,
+            concat=None,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list):
             df = [
                 df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
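
`to_polars` dispatches on `lazy`, and `iter_polars` forwards the batching keywords to the private iterators. A hedged sketch with the same assumed reader:

    df = reader.to_polars()                  # eager pl.DataFrame, concat=True by default
    lf = reader.to_polars(lazy=True)         # pl.LazyFrame; call .collect() when needed

    for frame in reader.iter_polars(batch_size=1, opt_dtypes=True):
        print(frame.height)                  # one frame per batch of files
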
@@ -466,22 +785,48 @@ class BaseFileReader(BaseFileIO, gc=False):
                 else self._data
             )
         if metadata:
-            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
-            return df, metadata
+            # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
+            return df, self._metadata
         return df

     def iter_pyarrow_table(
-        self, reload: bool = False, **kwargs
+        self,
+        reload: bool = False,
+        batch_size: int | None = None,
+        include_file_path: bool = False,
+        concat: bool | None = None,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
+        **kwargs,
     ) -> Generator[pa.Table, None, None]:
         """Iterate over PyArrow Tables.

+        Args:
+            reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            concat (bool, optional): Concatenate multiple files into a single Table. Default is True.
+            batch_size (int, optional): Batch size for iteration. Default is 1.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.
+
         Returns:
             Generator[pa.Table, None, None]: Generator of PyArrow Tables.
         """
-        if self.batch_size is None and "batch_size" not in kwargs:
-            self.batch_size = 1
-
-        self._load(reload=reload, **kwargs)
+        batch_size = batch_size or self.batch_size or 1
+
+        self._load(
+            reload=reload,
+            batch_size=batch_size,
+            include_file_path=include_file_path,
+            concat=concat,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
+        )
         if isinstance(self._data, list | Generator):
             for df in self._data:
                 yield df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
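
The PyArrow accessors mirror the pandas and Polars pairs. A short sketch, same assumptions:

    table = reader.to_pyarrow_table()                    # single concatenated pa.Table
    for tbl in reader.iter_pyarrow_table(batch_size=1):  # one pa.Table per file batch
        print(tbl.num_rows)
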
@@ -497,6 +842,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         conn: duckdb.DuckDBPyConnection | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
         """Convert data to DuckDB relation.
@@ -505,6 +854,11 @@ class BaseFileReader(BaseFileIO, gc=False):
             conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]: DuckDB relation and optional
@@ -517,10 +871,27 @@ class BaseFileReader(BaseFileIO, gc=False):

         if metadata:
             return self._conn.from_arrow(
-                self.to_pyarrow_table(concat=True, reload=reload, **kwargs),
+                self.to_pyarrow_table(
+                    metadata=metadata,
+                    reload=reload,
+                    batch_size=None,
+                    include_file_path=include_file_path,
+                    use_threads=use_threads,
+                    verbose=verbose,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
+                ),
             ), self._metadata
         return self._conn.from_arrow(
-            self.to_pyarrow_table(concat=True, reload=reload, **kwargs)
+            self.to_pyarrow_table(
+                reload=reload,
+                batch_size=None,
+                include_file_path=include_file_path,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            )
         )

     def register_in_duckdb(
@@ -529,6 +900,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         name: str | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]:
         """Register data in DuckDB.
@@ -538,6 +913,11 @@ class BaseFileReader(BaseFileIO, gc=False):
             name (str, optional): Name for the DuckDB table.
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
+            kwargs: Additional keyword arguments.

         Returns:
             duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB connection instance
@@ -552,7 +932,16 @@ class BaseFileReader(BaseFileIO, gc=False):
             self._conn = conn

         self._conn.register(
-            name, self.to_pyarrow_table(concat=True, reload=reload, **kwargs)
+            name,
+            self.to_pyarrow_table(
+                metadata=metadata,
+                reload=reload,
+                include_file_path=include_file_path,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
+            ),
         )
         if metadata:
             return self._conn, self._metadata
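
Both DuckDB entry points materialize the data via `to_pyarrow_table` before handing the Arrow table to DuckDB. A hedged sketch (assumed reader; standard duckdb API):

    import duckdb

    conn = duckdb.connect()
    rel = reader.to_duckdb_relation(conn=conn)                 # DuckDBPyRelation over the Arrow table
    conn = reader.register_in_duckdb(conn=conn, name="files")  # named table on the connection
    conn.sql("SELECT COUNT(*) FROM files").show()
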
@@ -565,6 +954,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         name: str | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> (
         duckdb.DuckDBPyRelation
@@ -580,6 +973,10 @@ class BaseFileReader(BaseFileIO, gc=False):
             name (str, optional): Name for the DuckDB table.
             metadata (bool, optional): Include metadata in the output. Default is False.
             reload (bool, optional): Reload data if True. Default is False.
+            include_file_path (bool, optional): Include file path in the output. Default is False.
+            use_threads (bool, optional): Use threads for reading data. Default is True.
+            verbose (bool, optional): Verbose output. Default is None.
+            opt_dtypes (bool, optional): Optimize data types. Default is True.
             **kwargs: Additional keyword arguments.

         Returns:
@@ -590,10 +987,25 @@ class BaseFileReader(BaseFileIO, gc=False):
         """
         if as_relation:
             return self.to_duckdb_relation(
-                conn=conn, metadata=metadata, reload=reload, **kwargs
+                conn=conn,
+                metadata=metadata,
+                reload=reload,
+                include_file_path=include_file_path,
+                use_threads=use_threads,
+                verbose=verbose,
+                opt_dtypes=opt_dtypes,
+                **kwargs,
             )
         return self.register_in_duckdb(
-            conn=conn, name=name, metadata=metadata, reload=reload, **kwargs
+            conn=conn,
+            name=name,
+            metadata=metadata,
+            reload=reload,
+            include_file_path=include_file_path,
+            use_threads=use_threads,
+            verbose=verbose,
+            opt_dtypes=opt_dtypes,
+            **kwargs,
         )

     def register_in_datafusion(
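
`to_duckdb` is a thin dispatcher over the two methods above, selected by `as_relation`. Sketch, same assumptions:

    rel = reader.to_duckdb(as_relation=True, conn=conn)                  # relation path
    conn = reader.to_duckdb(as_relation=False, conn=conn, name="files")  # registration path
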
@@ -602,6 +1014,10 @@ class BaseFileReader(BaseFileIO, gc=False):
         name: str | None = None,
         metadata: bool = False,
         reload: bool = False,
+        include_file_path: bool = False,
+        use_threads: bool | None = None,
+        verbose: bool | None = None,
+        opt_dtypes: bool | None = None,
         **kwargs,
     ) -> datafusion.SessionContext | tuple[datafusion.SessionContext, dict[str, Any]]:
         """Register data in DataFusion.
@@ -626,11 +1042,19 @@ class BaseFileReader(BaseFileIO, gc=False):

         self._ctx.register_record_batches(
             name,
-            [self.to_pyarrow_table(concat=True, reload=reload, **kwargs).to_batches()],
+            [
+                self.to_pyarrow_table(
+                    reload=reload,
+                    include_file_path=include_file_path,
+                    use_threads=use_threads,
+                    opt_dtypes=opt_dtypes,
+                    **kwargs,
+                ).to_batches()
+            ],
         )
         if metadata:
             return self._ctx, self._metadata
-        return ctx
+        return self._ctx

     def filter(
         self, filter_expr: str | pl.Expr | pa.compute.Expression