FlowerPower 0.11.6__tar.gz → 0.11.6.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {flowerpower-0.11.6/src/FlowerPower.egg-info → flowerpower-0.11.6.1}/PKG-INFO +1 -1
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/pyproject.toml +1 -1
- {flowerpower-0.11.6 → flowerpower-0.11.6.1/src/FlowerPower.egg-info}/PKG-INFO +1 -1
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/fs/ext.py +58 -26
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/base.py +497 -80
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/LICENSE +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/README.md +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/setup.cfg +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/FlowerPower.egg-info/SOURCES.txt +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/FlowerPower.egg-info/dependency_links.txt +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/FlowerPower.egg-info/entry_points.txt +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/FlowerPower.egg-info/requires.txt +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/FlowerPower.egg-info/top_level.txt +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cfg/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cfg/base.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cfg/pipeline/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cfg/pipeline/adapter.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cfg/pipeline/run.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cfg/pipeline/schedule.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cfg/project/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cfg/project/adapter.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cfg/project/job_queue.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cli/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cli/cfg.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cli/job_queue.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cli/mqtt.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cli/pipeline.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/cli/utils.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/flowerpower.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/fs/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/fs/base.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/fs/storage_options.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/apscheduler/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/apscheduler/manager.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/apscheduler/setup.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/apscheduler/trigger.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/apscheduler/utils.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/base.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/rq/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/rq/_trigger.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/rq/concurrent_workers/gevent_worker.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/rq/manager.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/rq/setup.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/job_queue/rq/utils.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/mqtt.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/pipeline/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/pipeline/base.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/pipeline/io.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/pipeline/job_queue.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/pipeline/manager.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/pipeline/registry.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/pipeline/runner.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/pipeline/visualizer.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/helpers/datetime.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/helpers/polars.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/helpers/pyarrow.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/helpers/sql.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/csv.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/deltatable.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/duckdb.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/json.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/mqtt.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/mssql.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/mysql.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/oracle.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/parquet.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/postgres.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/pydala.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/loader/sqlite.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/metadata.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/csv.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/deltatable.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/duckdb.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/json.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/mqtt.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/mssql.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/mysql.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/oracle.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/parquet.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/postgres.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/pydala.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/io/saver/sqlite.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/mqtt/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/mqtt/cfg.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/plugins/mqtt/manager.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/settings/__init__.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/settings/backend.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/settings/executor.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/settings/general.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/settings/hamilton.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/settings/job_queue.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/settings/logging.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/settings/retry.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/utils/callback.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/utils/logging.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/utils/misc.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/utils/monkey.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/utils/open_telemetry.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/utils/scheduler.py +0 -0
- {flowerpower-0.11.6 → flowerpower-0.11.6.1}/src/flowerpower/utils/templates.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: FlowerPower
|
3
|
-
Version: 0.11.6
|
3
|
+
Version: 0.11.6.1
|
4
4
|
Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
|
5
5
|
Author-email: "Volker L." <ligno.blades@gmail.com>
|
6
6
|
Project-URL: Homepage, https://github.com/legout/flowerpower
|
@@ -4,7 +4,7 @@ description = "A simple workflow framework. Hamilton + APScheduler = FlowerPower
|
|
4
4
|
authors = [{ name = "Volker L.", email = "ligno.blades@gmail.com" }]
|
5
5
|
readme = "README.md"
|
6
6
|
requires-python = ">= 3.11"
|
7
|
-
version = "0.11.6"
|
7
|
+
version = "0.11.6.1"
|
8
8
|
keywords = [
|
9
9
|
"hamilton",
|
10
10
|
"workflow",
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: FlowerPower
|
3
|
-
Version: 0.11.6
|
3
|
+
Version: 0.11.6.1
|
4
4
|
Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
|
5
5
|
Author-email: "Volker L." <ligno.blades@gmail.com>
|
6
6
|
Project-URL: Homepage, https://github.com/legout/flowerpower
|
@@ -193,6 +193,7 @@ def _read_json(
|
|
193
193
|
as_dataframe: (bool, optional) If True, return a DataFrame. Defaults to True.
|
194
194
|
concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
|
195
195
|
verbose: (bool, optional) If True, print verbose output. Defaults to False.
|
196
|
+
opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
|
196
197
|
**kwargs: Additional keyword arguments.
|
197
198
|
|
198
199
|
Returns:
|
@@ -247,8 +248,8 @@ def _read_json(
|
|
247
248
|
data = [opt_dtype_pl(df, strict=False) for df in data]
|
248
249
|
if concat:
|
249
250
|
result = pl.concat(data, how="diagonal_relaxed")
|
250
|
-
if opt_dtypes:
|
251
|
-
|
251
|
+
# if opt_dtypes:
|
252
|
+
# result = opt_dtype_pl(result, strict=False)
|
252
253
|
return result
|
253
254
|
return data
|
254
255
|
|
@@ -280,6 +281,7 @@ def _read_json_batches(
|
|
280
281
|
concat: Combine files within each batch
|
281
282
|
use_threads: Enable parallel file reading within batches
|
282
283
|
verbose: Print progress information
|
284
|
+
opt_dtypes: Optimize DataFrame dtypes
|
283
285
|
**kwargs: Additional arguments for DataFrame conversion
|
284
286
|
|
285
287
|
Yields:
|
@@ -354,10 +356,16 @@ def _read_json_batches(
|
|
354
356
|
][0]
|
355
357
|
for _data in batch_data
|
356
358
|
]
|
357
|
-
|
359
|
+
if opt_dtypes:
|
360
|
+
batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
|
358
361
|
if concat and len(batch_dfs) > 1:
|
359
|
-
|
362
|
+
batch_df = pl.concat(batch_dfs, how="diagonal_relaxed")
|
363
|
+
# if opt_dtypes:
|
364
|
+
# batch_df = opt_dtype_pl(batch_df, strict=False)
|
365
|
+
yield batch_df
|
360
366
|
else:
|
367
|
+
# if opt_dtypes:
|
368
|
+
# batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
|
361
369
|
yield batch_dfs
|
362
370
|
else:
|
363
371
|
yield batch_data
|
@@ -403,6 +411,7 @@ def read_json(
|
|
403
411
|
concat: Combine multiple files/batches into single result
|
404
412
|
use_threads: Enable parallel file reading
|
405
413
|
verbose: Print progress information
|
414
|
+
opt_dtypes: Optimize DataFrame dtypes for performance
|
406
415
|
**kwargs: Additional arguments passed to DataFrame conversion
|
407
416
|
|
408
417
|
Returns:
|
@@ -486,6 +495,7 @@ def _read_csv_file(
|
|
486
495
|
path: Path to CSV file
|
487
496
|
self: Filesystem instance to use for reading
|
488
497
|
include_file_path: Add source filepath as a column
|
498
|
+
opt_dtypes: Optimize DataFrame dtypes
|
489
499
|
**kwargs: Additional arguments passed to pl.read_csv()
|
490
500
|
|
491
501
|
Returns:
|
@@ -544,6 +554,7 @@ def _read_csv(
|
|
544
554
|
use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
|
545
555
|
concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
|
546
556
|
verbose: (bool, optional) If True, print verbose output. Defaults to False.
|
557
|
+
opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
|
547
558
|
**kwargs: Additional keyword arguments.
|
548
559
|
|
549
560
|
Returns:
|
@@ -587,8 +598,8 @@ def _read_csv(
|
|
587
598
|
)
|
588
599
|
if concat:
|
589
600
|
result = pl.concat(dfs, how="diagonal_relaxed")
|
590
|
-
if opt_dtypes:
|
591
|
-
|
601
|
+
# if opt_dtypes:
|
602
|
+
# result = opt_dtype_pl(result, strict=False)
|
592
603
|
return result
|
593
604
|
return dfs
|
594
605
|
|
@@ -616,6 +627,7 @@ def _read_csv_batches(
|
|
616
627
|
concat: Combine files within each batch
|
617
628
|
use_threads: Enable parallel file reading within batches
|
618
629
|
verbose: Print progress information
|
630
|
+
opt_dtypes: Optimize DataFrame dtypes
|
619
631
|
**kwargs: Additional arguments passed to pl.read_csv()
|
620
632
|
|
621
633
|
Yields:
|
@@ -667,23 +679,28 @@ def _read_csv_batches(
|
|
667
679
|
n_jobs=-1,
|
668
680
|
backend="threading",
|
669
681
|
verbose=verbose,
|
682
|
+
opt_dtypes=opt_dtypes,
|
670
683
|
**kwargs,
|
671
684
|
)
|
672
685
|
else:
|
673
686
|
batch_dfs = [
|
674
687
|
_read_csv_file(
|
675
|
-
p,
|
688
|
+
p,
|
689
|
+
self=self,
|
690
|
+
include_file_path=include_file_path,
|
691
|
+
opt_dtypes=opt_dtypes,
|
692
|
+
**kwargs,
|
676
693
|
)
|
677
694
|
for p in batch_paths
|
678
695
|
]
|
679
696
|
|
680
|
-
if opt_dtypes:
|
681
|
-
|
697
|
+
# if opt_dtypes:
|
698
|
+
# batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
|
682
699
|
|
683
700
|
if concat and len(batch_dfs) > 1:
|
684
701
|
result = pl.concat(batch_dfs, how="diagonal_relaxed")
|
685
|
-
if opt_dtypes:
|
686
|
-
|
702
|
+
# if opt_dtypes:
|
703
|
+
# result = opt_dtype_pl(result, strict=False)
|
687
704
|
yield result
|
688
705
|
else:
|
689
706
|
yield batch_dfs
|
@@ -766,6 +783,7 @@ def read_csv(
|
|
766
783
|
concat=concat,
|
767
784
|
use_threads=use_threads,
|
768
785
|
verbose=verbose,
|
786
|
+
opt_dtypes=opt_dtypes,
|
769
787
|
**kwargs,
|
770
788
|
)
|
771
789
|
return _read_csv(
|
@@ -775,6 +793,7 @@ def read_csv(
|
|
775
793
|
concat=concat,
|
776
794
|
use_threads=use_threads,
|
777
795
|
verbose=verbose,
|
796
|
+
opt_dtypes=opt_dtypes,
|
778
797
|
**kwargs,
|
779
798
|
)
|
780
799
|
|
@@ -858,9 +877,7 @@ def _read_parquet(
|
|
858
877
|
if not include_file_path and concat:
|
859
878
|
if isinstance(path, str):
|
860
879
|
path = path.replace("**", "").replace("*.parquet", "")
|
861
|
-
table =
|
862
|
-
if opt_dtypes:
|
863
|
-
table = opt_dtype_pa(table, strict=False)
|
880
|
+
table = _read_parquet_file(path, self=self, opt_dtypes=opt_dtypes, **kwargs)
|
864
881
|
return table
|
865
882
|
else:
|
866
883
|
if isinstance(path, str):
|
@@ -907,12 +924,12 @@ def _read_parquet(
|
|
907
924
|
unified_schema = unify_schemas_pa(schemas)
|
908
925
|
tables = [cast_schema(t, unified_schema) for t in tables]
|
909
926
|
result = pa.concat_tables(tables, promote_options="permissive")
|
910
|
-
if opt_dtypes:
|
911
|
-
|
927
|
+
# if opt_dtypes:
|
928
|
+
# result = opt_dtype_pa(result, strict=False)
|
912
929
|
return result
|
913
930
|
elif isinstance(tables, pa.Table):
|
914
|
-
if opt_dtypes:
|
915
|
-
|
931
|
+
# if opt_dtypes:
|
932
|
+
# tables = opt_dtype_pa(tables, strict=False)
|
916
933
|
return tables
|
917
934
|
else:
|
918
935
|
return pa.concat_tables(tables, promote_options="permissive")
|
@@ -981,9 +998,9 @@ def _read_parquet_batches(
|
|
981
998
|
if not include_file_path and concat and batch_size is None:
|
982
999
|
if isinstance(path, str):
|
983
1000
|
path = path.replace("**", "").replace("*.parquet", "")
|
984
|
-
table =
|
985
|
-
|
986
|
-
|
1001
|
+
table = _read_parquet_file(
|
1002
|
+
path=path, self=self, opt_dtypes=opt_dtypes, **kwargs
|
1003
|
+
)
|
987
1004
|
yield table
|
988
1005
|
return
|
989
1006
|
|
@@ -994,7 +1011,11 @@ def _read_parquet_batches(
|
|
994
1011
|
|
995
1012
|
if not isinstance(path, list):
|
996
1013
|
yield _read_parquet_file(
|
997
|
-
path=path,
|
1014
|
+
path=path,
|
1015
|
+
self=self,
|
1016
|
+
include_file_path=include_file_path,
|
1017
|
+
opt_dtypes=opt_dtypes,
|
1018
|
+
**kwargs,
|
998
1019
|
)
|
999
1020
|
return
|
1000
1021
|
|
@@ -1032,12 +1053,12 @@ def _read_parquet_batches(
|
|
1032
1053
|
unified_schema = unify_schemas_pa(schemas)
|
1033
1054
|
batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
|
1034
1055
|
result = pa.concat_tables(batch_tables, promote_options="permissive")
|
1035
|
-
if opt_dtypes:
|
1036
|
-
|
1056
|
+
# if opt_dtypes:
|
1057
|
+
# result = opt_dtype_pa(result, strict=False)
|
1037
1058
|
yield result
|
1038
1059
|
else:
|
1039
|
-
if opt_dtypes and isinstance(batch_tables, list):
|
1040
|
-
|
1060
|
+
# if opt_dtypes and isinstance(batch_tables, list):
|
1061
|
+
# batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
|
1041
1062
|
yield batch_tables
|
1042
1063
|
|
1043
1064
|
|
@@ -1077,6 +1098,7 @@ def read_parquet(
|
|
1077
1098
|
concat: Combine multiple files/batches into single Table
|
1078
1099
|
use_threads: Enable parallel file reading
|
1079
1100
|
verbose: Print progress information
|
1101
|
+
opt_dtypes: Optimize Table dtypes for performance
|
1080
1102
|
**kwargs: Additional arguments passed to pq.read_table()
|
1081
1103
|
|
1082
1104
|
Returns:
|
@@ -1119,6 +1141,7 @@ def read_parquet(
|
|
1119
1141
|
concat=concat,
|
1120
1142
|
use_threads=use_threads,
|
1121
1143
|
verbose=verbose,
|
1144
|
+
opt_dtypes=opt_dtypes,
|
1122
1145
|
**kwargs,
|
1123
1146
|
)
|
1124
1147
|
return _read_parquet(
|
@@ -1128,6 +1151,7 @@ def read_parquet(
|
|
1128
1151
|
use_threads=use_threads,
|
1129
1152
|
concat=concat,
|
1130
1153
|
verbose=verbose,
|
1154
|
+
opt_dtypes=opt_dtypes,
|
1131
1155
|
**kwargs,
|
1132
1156
|
)
|
1133
1157
|
|
@@ -1142,6 +1166,7 @@ def read_files(
|
|
1142
1166
|
jsonlines: bool = False,
|
1143
1167
|
use_threads: bool = True,
|
1144
1168
|
verbose: bool = False,
|
1169
|
+
opt_dtypes: bool = False,
|
1145
1170
|
**kwargs: Any,
|
1146
1171
|
) -> (
|
1147
1172
|
pl.DataFrame
|
@@ -1175,6 +1200,7 @@ def read_files(
|
|
1175
1200
|
jsonlines: For JSON format, whether to read as JSON Lines
|
1176
1201
|
use_threads: Enable parallel file reading
|
1177
1202
|
verbose: Print progress information
|
1203
|
+
opt_dtypes: Optimize DataFrame/Arrow Table dtypes for performance
|
1178
1204
|
**kwargs: Additional format-specific arguments
|
1179
1205
|
|
1180
1206
|
Returns:
|
@@ -1224,6 +1250,7 @@ def read_files(
|
|
1224
1250
|
concat=concat,
|
1225
1251
|
use_threads=use_threads,
|
1226
1252
|
verbose=verbose,
|
1253
|
+
opt_dtypes=opt_dtypes,
|
1227
1254
|
**kwargs,
|
1228
1255
|
)
|
1229
1256
|
return read_json(
|
@@ -1234,6 +1261,7 @@ def read_files(
|
|
1234
1261
|
concat=concat,
|
1235
1262
|
use_threads=use_threads,
|
1236
1263
|
verbose=verbose,
|
1264
|
+
opt_dtypes=opt_dtypes,
|
1237
1265
|
**kwargs,
|
1238
1266
|
)
|
1239
1267
|
elif format == "csv":
|
@@ -1246,6 +1274,7 @@ def read_files(
|
|
1246
1274
|
concat=concat,
|
1247
1275
|
use_threads=use_threads,
|
1248
1276
|
verbose=verbose,
|
1277
|
+
opt_dtypes=opt_dtypes,
|
1249
1278
|
**kwargs,
|
1250
1279
|
)
|
1251
1280
|
return read_csv(
|
@@ -1255,6 +1284,7 @@ def read_files(
|
|
1255
1284
|
use_threads=use_threads,
|
1256
1285
|
concat=concat,
|
1257
1286
|
verbose=verbose,
|
1287
|
+
opt_dtypes=opt_dtypes,
|
1258
1288
|
**kwargs,
|
1259
1289
|
)
|
1260
1290
|
elif format == "parquet":
|
@@ -1267,6 +1297,7 @@ def read_files(
|
|
1267
1297
|
concat=concat,
|
1268
1298
|
use_threads=use_threads,
|
1269
1299
|
verbose=verbose,
|
1300
|
+
opt_dtypes=opt_dtypes,
|
1270
1301
|
**kwargs,
|
1271
1302
|
)
|
1272
1303
|
return read_parquet(
|
@@ -1276,6 +1307,7 @@ def read_files(
|
|
1276
1307
|
use_threads=use_threads,
|
1277
1308
|
concat=concat,
|
1278
1309
|
verbose=verbose,
|
1310
|
+
opt_dtypes=opt_dtypes,
|
1279
1311
|
**kwargs,
|
1280
1312
|
)
|
1281
1313
|
|