FlowerPower 0.11.5.7.tar.gz → 0.11.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (107)
  1. {flowerpower-0.11.5.7/src/FlowerPower.egg-info → flowerpower-0.11.6}/PKG-INFO +1 -1
  2. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/pyproject.toml +1 -1
  3. {flowerpower-0.11.5.7 → flowerpower-0.11.6/src/FlowerPower.egg-info}/PKG-INFO +1 -1
  4. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/FlowerPower.egg-info/SOURCES.txt +1 -0
  5. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/fs/ext.py +140 -30
  6. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/pipeline/base.py +3 -1
  7. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/pipeline/registry.py +9 -9
  8. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/base.py +14 -9
  9. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/helpers/polars.py +346 -124
  10. flowerpower-0.11.6/src/flowerpower/plugins/io/helpers/pyarrow.py +406 -0
  11. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/metadata.py +8 -11
  12. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/settings/general.py +1 -1
  13. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/LICENSE +0 -0
  14. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/README.md +0 -0
  15. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/setup.cfg +0 -0
  16. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/FlowerPower.egg-info/dependency_links.txt +0 -0
  17. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/FlowerPower.egg-info/entry_points.txt +0 -0
  18. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/FlowerPower.egg-info/requires.txt +0 -0
  19. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/FlowerPower.egg-info/top_level.txt +0 -0
  20. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/__init__.py +0 -0
  21. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cfg/__init__.py +0 -0
  22. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cfg/base.py +0 -0
  23. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cfg/pipeline/__init__.py +0 -0
  24. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cfg/pipeline/adapter.py +0 -0
  25. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cfg/pipeline/run.py +0 -0
  26. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cfg/pipeline/schedule.py +0 -0
  27. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cfg/project/__init__.py +0 -0
  28. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cfg/project/adapter.py +0 -0
  29. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cfg/project/job_queue.py +0 -0
  30. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cli/__init__.py +0 -0
  31. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cli/cfg.py +0 -0
  32. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cli/job_queue.py +0 -0
  33. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cli/mqtt.py +0 -0
  34. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cli/pipeline.py +0 -0
  35. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/cli/utils.py +0 -0
  36. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/flowerpower.py +0 -0
  37. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/fs/__init__.py +0 -0
  38. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/fs/base.py +0 -0
  39. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/fs/storage_options.py +0 -0
  40. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/__init__.py +0 -0
  41. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/apscheduler/__init__.py +0 -0
  42. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -0
  43. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -0
  44. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/apscheduler/manager.py +0 -0
  45. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/apscheduler/setup.py +0 -0
  46. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/apscheduler/trigger.py +0 -0
  47. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/apscheduler/utils.py +0 -0
  48. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/base.py +0 -0
  49. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/rq/__init__.py +0 -0
  50. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/rq/_trigger.py +0 -0
  51. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/rq/concurrent_workers/gevent_worker.py +0 -0
  52. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -0
  53. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/rq/manager.py +0 -0
  54. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/rq/setup.py +0 -0
  55. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/job_queue/rq/utils.py +0 -0
  56. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/mqtt.py +0 -0
  57. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/pipeline/__init__.py +0 -0
  58. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/pipeline/io.py +0 -0
  59. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/pipeline/job_queue.py +0 -0
  60. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/pipeline/manager.py +0 -0
  61. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/pipeline/runner.py +0 -0
  62. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/pipeline/visualizer.py +0 -0
  63. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/helpers/datetime.py +0 -0
  64. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/helpers/sql.py +0 -0
  65. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/__init__.py +0 -0
  66. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/csv.py +0 -0
  67. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/deltatable.py +0 -0
  68. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/duckdb.py +0 -0
  69. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/json.py +0 -0
  70. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/mqtt.py +0 -0
  71. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/mssql.py +0 -0
  72. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/mysql.py +0 -0
  73. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/oracle.py +0 -0
  74. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/parquet.py +0 -0
  75. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/postgres.py +0 -0
  76. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/pydala.py +0 -0
  77. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/loader/sqlite.py +0 -0
  78. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/__init__.py +0 -0
  79. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/csv.py +0 -0
  80. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/deltatable.py +0 -0
  81. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/duckdb.py +0 -0
  82. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/json.py +0 -0
  83. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/mqtt.py +0 -0
  84. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/mssql.py +0 -0
  85. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/mysql.py +0 -0
  86. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/oracle.py +0 -0
  87. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/parquet.py +0 -0
  88. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/postgres.py +0 -0
  89. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/pydala.py +0 -0
  90. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/io/saver/sqlite.py +0 -0
  91. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/mqtt/__init__.py +0 -0
  92. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/mqtt/cfg.py +0 -0
  93. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/plugins/mqtt/manager.py +0 -0
  94. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/settings/__init__.py +0 -0
  95. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/settings/backend.py +0 -0
  96. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/settings/executor.py +0 -0
  97. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/settings/hamilton.py +0 -0
  98. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/settings/job_queue.py +0 -0
  99. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/settings/logging.py +0 -0
  100. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/settings/retry.py +0 -0
  101. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/utils/callback.py +0 -0
  102. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/utils/logging.py +0 -0
  103. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/utils/misc.py +0 -0
  104. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/utils/monkey.py +0 -0
  105. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/utils/open_telemetry.py +0 -0
  106. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/utils/scheduler.py +0 -0
  107. {flowerpower-0.11.5.7 → flowerpower-0.11.6}/src/flowerpower/utils/templates.py +0 -0
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: FlowerPower
- Version: 0.11.5.7
+ Version: 0.11.6
  Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
  Author-email: "Volker L." <ligno.blades@gmail.com>
  Project-URL: Homepage, https://github.com/legout/flowerpower
@@ -4,7 +4,7 @@ description = "A simple workflow framework. Hamilton + APScheduler = FlowerPower
  authors = [{ name = "Volker L.", email = "ligno.blades@gmail.com" }]
  readme = "README.md"
  requires-python = ">= 3.11"
- version = "0.11.5.7"
+ version = "0.11.6"
  keywords = [
  "hamilton",
  "workflow",
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: FlowerPower
- Version: 0.11.5.7
+ Version: 0.11.6
  Summary: A simple workflow framework. Hamilton + APScheduler = FlowerPower
  Author-email: "Volker L." <ligno.blades@gmail.com>
  Project-URL: Homepage, https://github.com/legout/flowerpower
@@ -57,6 +57,7 @@ src/flowerpower/plugins/io/base.py
  src/flowerpower/plugins/io/metadata.py
  src/flowerpower/plugins/io/helpers/datetime.py
  src/flowerpower/plugins/io/helpers/polars.py
+ src/flowerpower/plugins/io/helpers/pyarrow.py
  src/flowerpower/plugins/io/helpers/sql.py
  src/flowerpower/plugins/io/loader/__init__.py
  src/flowerpower/plugins/io/loader/csv.py
@@ -10,13 +10,19 @@ else:
  raise ImportError("To use this module, please install `flowerpower[io]`.")

  import orjson
- import polars as pl
+ # import polars as pl
  import pyarrow as pa
  import pyarrow.dataset as pds
  import pyarrow.parquet as pq
  from fsspec import AbstractFileSystem
  from pydala.dataset import ParquetDataset

+ from ..plugins.io.helpers.polars import opt_dtype as opt_dtype_pl
+ from ..plugins.io.helpers.polars import pl
+ # from ..plugins.io.helpers.polars import unify_schemas as unfify_schemas_pl
+ from ..plugins.io.helpers.pyarrow import cast_schema
+ from ..plugins.io.helpers.pyarrow import opt_dtype as opt_dtype_pa
+ from ..plugins.io.helpers.pyarrow import unify_schemas as unify_schemas_pa
  from ..utils.misc import (_dict_to_dataframe, convert_large_types_to_standard,
  run_parallel, to_pyarrow_table)
@@ -172,6 +178,7 @@ def _read_json(
  as_dataframe: bool = True,
  concat: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs,
  ) -> dict | list[dict] | pl.DataFrame | list[pl.DataFrame]:
  """
@@ -236,8 +243,13 @@ def _read_json(
  ][0]
  for _data in data
  ]
+ if opt_dtypes:
+ data = [opt_dtype_pl(df, strict=False) for df in data]
  if concat:
- return pl.concat(data, how="diagonal_relaxed")
+ result = pl.concat(data, how="diagonal_relaxed")
+ if opt_dtypes:
+ result = opt_dtype_pl(result, strict=False)
+ return result
  return data
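
For context, the "diagonal_relaxed" strategy used here (and in the CSV readers below) unions columns across frames, fills missing values with nulls, and relaxes mismatched dtypes instead of raising. A minimal standalone polars sketch of that behavior, independent of FlowerPower:

import polars as pl

a = pl.DataFrame({"id": [1, 2], "value": [1.0, 2.0]})
b = pl.DataFrame({"id": [3], "note": ["late arrival"]})

# "diagonal_relaxed": union of columns, nulls for missing values,
# compatible dtypes widened rather than erroring.
df = pl.concat([a, b], how="diagonal_relaxed")
print(df)

When opt_dtypes=True, the changed code additionally passes each frame and the concatenated result through opt_dtype_pl(..., strict=False); that helper lives in helpers/polars.py, whose implementation is not part of this diff.
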
@@ -251,6 +263,7 @@ def _read_json_batches(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]:
  """Process JSON files in batches with optional parallel reading.
@@ -360,6 +373,7 @@ def read_json(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> (
  dict
@@ -439,6 +453,7 @@ def read_json(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )
  return _read_json(
@@ -450,12 +465,17 @@ def read_json(
  concat=concat,
  use_threads=use_threads,
  verbose=verbose,
+ opt_dtypes=opt_dtypes,
  **kwargs,
  )


  def _read_csv_file(
- path: str, self: AbstractFileSystem, include_file_path: bool = False, **kwargs: Any
+ path: str,
+ self: AbstractFileSystem,
+ include_file_path: bool = False,
+ opt_dtypes: bool = False,
+ **kwargs: Any,
  ) -> pl.DataFrame:
  """Read a single CSV file from any filesystem.
@@ -486,15 +506,21 @@ def _read_csv_file(
  with self.open(path) as f:
  df = pl.read_csv(f, **kwargs)
  if include_file_path:
- return df.with_columns(pl.lit(path).alias("file_path"))
+ df = df.with_columns(pl.lit(path).alias("file_path"))
+ if opt_dtypes:
+ df = opt_dtype_pl(df, strict=False)
  return df


  def read_csv_file(
- self, path: str, include_file_path: bool = False, **kwargs
+ self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
  ) -> pl.DataFrame:
  return _read_csv_file(
- path=path, self=self, include_file_path=include_file_path, **kwargs
+ path=path,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
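
read_csv_file now threads the new opt_dtypes flag through to the single-file reader. A hedged usage sketch (the sample path and local filesystem are assumptions; any fsspec filesystem object can be passed as the self argument):

from fsspec import filesystem
from flowerpower.fs.ext import read_csv_file

fs = filesystem("file")          # local filesystem; s3/gcs/etc. work the same way
df = read_csv_file(
    fs,                          # bound as `self`, matching the signature above
    "data/events/2024.csv",      # hypothetical sample path
    include_file_path=True,      # adds a "file_path" column with the source path
    opt_dtypes=True,             # new in 0.11.6: optimize column dtypes after reading
)
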
@@ -505,6 +531,7 @@ def _read_csv(
  use_threads: bool = True,
  concat: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs,
  ) -> pl.DataFrame | list[pl.DataFrame]:
  """
@@ -533,21 +560,36 @@ def _read_csv(
  path,
  self=self,
  include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
  n_jobs=-1,
  backend="threading",
  verbose=verbose,
  **kwargs,
  )
- dfs = [
- _read_csv_file(p, self=self, include_file_path=include_file_path, **kwargs)
- for p in path
- ]
+ else:
+ dfs = [
+ _read_csv_file(
+ p,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
+ )
+ for p in path
+ ]
  else:
  dfs = _read_csv_file(
- path, self=self, include_file_path=include_file_path, **kwargs
+ path,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  if concat:
- return pl.concat(dfs, how="diagonal_relaxed")
+ result = pl.concat(dfs, how="diagonal_relaxed")
+ if opt_dtypes:
+ result = opt_dtype_pl(result, strict=False)
+ return result
  return dfs

@@ -559,6 +601,7 @@ def _read_csv_batches(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> Generator[pl.DataFrame | list[pl.DataFrame], None, None]:
  """Process CSV files in batches with optional parallel reading.
@@ -634,8 +677,14 @@ def _read_csv_batches(
  for p in batch_paths
  ]

+ if opt_dtypes:
+ batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
+
  if concat and len(batch_dfs) > 1:
- yield pl.concat(batch_dfs, how="diagonal_relaxed")
+ result = pl.concat(batch_dfs, how="diagonal_relaxed")
+ if opt_dtypes:
+ result = opt_dtype_pl(result, strict=False)
+ yield result
  else:
  yield batch_dfs

@@ -648,6 +697,7 @@ def read_csv(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> (
  pl.DataFrame
@@ -730,7 +780,11 @@ def read_csv(


  def _read_parquet_file(
- path: str, self: AbstractFileSystem, include_file_path: bool = False, **kwargs: Any
+ path: str,
+ self: AbstractFileSystem,
+ include_file_path: bool = False,
+ opt_dtypes: bool = False,
+ **kwargs: Any,
  ) -> pa.Table:
  """Read a single Parquet file from any filesystem.
@@ -759,15 +813,21 @@ def _read_parquet_file(
  """
  table = pq.read_table(path, filesystem=self, **kwargs)
  if include_file_path:
- return table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
+ table = table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
+ if opt_dtypes:
+ table = opt_dtype_pa(table, strict=False)
  return table


  def read_parquet_file(
- self, path: str, include_file_path: bool = False, **kwargs
+ self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
  ) -> pa.Table:
  return _read_parquet_file(
- path=path, self=self, include_file_path=include_file_path, **kwargs
+ path=path,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )

@@ -778,6 +838,7 @@ def _read_parquet(
  use_threads: bool = True,
  concat: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs,
  ) -> pa.Table | list[pa.Table]:
  """
@@ -797,7 +858,10 @@ def _read_parquet(
  if not include_file_path and concat:
  if isinstance(path, str):
  path = path.replace("**", "").replace("*.parquet", "")
- return pq.read_table(path, filesystem=self, **kwargs)
+ table = pq.read_table(path, filesystem=self, **kwargs)
+ if opt_dtypes:
+ table = opt_dtype_pa(table, strict=False)
+ return table
  else:
  if isinstance(path, str):
  path = path_to_glob(path, format="parquet")
@@ -805,30 +869,54 @@ def _read_parquet(

  if isinstance(path, list):
  if use_threads:
- table = run_parallel(
+ tables = run_parallel(
  _read_parquet_file,
  path,
  self=self,
  include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
  n_jobs=-1,
  backend="threading",
  verbose=verbose,
  **kwargs,
  )
  else:
- table = [
+ tables = [
  _read_parquet_file(
- p, self=self, include_file_path=include_file_path, **kwargs
+ p,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  for p in path
  ]
  else:
- table = _read_parquet_file(
- path=path, self=self, include_file_path=include_file_path, **kwargs
+ tables = _read_parquet_file(
+ path=path,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  if concat:
- return pa.concat_tables(table, promote_options="permissive")
- return table
+ # Unify schemas before concatenation if opt_dtypes or multiple tables
+ if isinstance(tables, list):
+ if len(tables) > 1:
+ schemas = [t.schema for t in tables]
+ unified_schema = unify_schemas_pa(schemas)
+ tables = [cast_schema(t, unified_schema) for t in tables]
+ result = pa.concat_tables(tables, promote_options="permissive")
+ if opt_dtypes:
+ result = opt_dtype_pa(result, strict=False)
+ return result
+ elif isinstance(tables, pa.Table):
+ if opt_dtypes:
+ tables = opt_dtype_pa(tables, strict=False)
+ return tables
+ else:
+ return pa.concat_tables(tables, promote_options="permissive")
+ return tables


  def _read_parquet_batches(
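
The new concat path unifies the per-file schemas (unify_schemas_pa), casts every table to that schema (cast_schema), and only then concatenates. Those helpers live in the new helpers/pyarrow.py module, which is not shown in this diff; a rough approximation of the same pattern using only public pyarrow APIs:

import pyarrow as pa

t1 = pa.table({"id": pa.array([1, 2], type=pa.int32()), "x": [1.0, 2.0]})
t2 = pa.table({"id": pa.array([3], type=pa.int64()), "y": ["a"]})

# Find a common schema: int32/int64 are promoted, columns are unioned.
unified = pa.unify_schemas([t1.schema, t2.schema], promote_options="permissive")
print(unified)

# Permissive promotion fills missing columns with nulls and widens
# compatible types while concatenating, mirroring the branch above.
result = pa.concat_tables([t1, t2], promote_options="permissive")
print(result.schema)

Pre-casting each table to the unified schema, as the changed code does, keeps the result deterministic when input files disagree on column types instead of relying solely on concat_tables promotion.
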
@@ -839,6 +927,7 @@ def _read_parquet_batches(
  use_threads: bool = True,
  concat: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> Generator[pa.Table | list[pa.Table], None, None]:
  """Process Parquet files in batches with performance optimizations.
@@ -892,7 +981,10 @@ def _read_parquet_batches(
  if not include_file_path and concat and batch_size is None:
  if isinstance(path, str):
  path = path.replace("**", "").replace("*.parquet", "")
- yield pq.read_table(path, filesystem=self, **kwargs)
+ table = pq.read_table(path, filesystem=self, **kwargs)
+ if opt_dtypes:
+ table = opt_dtype_pa(table, strict=False)
+ yield table
  return

  # Resolve path(s) to list
@@ -915,6 +1007,7 @@ def _read_parquet_batches(
  batch_paths,
  self=self,
  include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
  n_jobs=-1,
  backend="threading",
  verbose=verbose,
@@ -923,14 +1016,28 @@ def _read_parquet_batches(
  else:
  batch_tables = [
  _read_parquet_file(
- p, self=self, include_file_path=include_file_path, **kwargs
+ p,
+ self=self,
+ include_file_path=include_file_path,
+ opt_dtypes=opt_dtypes,
+ **kwargs,
  )
  for p in batch_paths
  ]

  if concat and batch_tables:
- yield pa.concat_tables(batch_tables, promote_options="permissive")
+ # Unify schemas before concatenation
+ if len(batch_tables) > 1:
+ schemas = [t.schema for t in batch_tables]
+ unified_schema = unify_schemas_pa(schemas)
+ batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
+ result = pa.concat_tables(batch_tables, promote_options="permissive")
+ if opt_dtypes:
+ result = opt_dtype_pa(result, strict=False)
+ yield result
  else:
+ if opt_dtypes and isinstance(batch_tables, list):
+ batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
  yield batch_tables

@@ -942,6 +1049,7 @@ def read_parquet(
  concat: bool = True,
  use_threads: bool = True,
  verbose: bool = False,
+ opt_dtypes: bool = False,
  **kwargs: Any,
  ) -> pa.Table | list[pa.Table] | Generator[pa.Table | list[pa.Table], None, None]:
  """Read Parquet data with advanced features and optimizations.
@@ -1415,7 +1523,7 @@ def write_parquet(
  data = to_pyarrow_table(data, concat=False, unique=False)

  if schema is not None:
- data = data.cast(schema)
+ data = cast_schema(data, schema)
  metadata = []
  pq.write_table(data, path, filesystem=self, metadata_collector=metadata, **kwargs)
  metadata = metadata[0]
@@ -1469,7 +1577,9 @@ def write_json(
  data = data.collect()
  if isinstance(data, pl.DataFrame):
  data = data.to_arrow()
- data = data.cast(convert_large_types_to_standard(data.schema)).to_pydict()
+ data = cast_schema(
+ data, convert_large_types_to_standard(data.schema)
+ ).to_pydict()
  elif isinstance(data, pd.DataFrame):
  data = pa.Table.from_pandas(data, preserve_index=False).to_pydict()
  elif isinstance(data, pa.Table):
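
convert_large_types_to_standard (from utils/misc) is applied before .to_pydict() so large Arrow types do not leak into the JSON path; its implementation is not part of this diff, but a hypothetical approximation of such a schema mapping looks like this:

import pyarrow as pa

def to_standard_types(schema: pa.Schema) -> pa.Schema:
    # Hypothetical stand-in: map large_string/large_binary/large_list to
    # their standard counterparts, leaving all other types untouched.
    def fix(t: pa.DataType) -> pa.DataType:
        if pa.types.is_large_string(t):
            return pa.string()
        if pa.types.is_large_binary(t):
            return pa.binary()
        if pa.types.is_large_list(t):
            return pa.list_(fix(t.value_type))
        return t
    return pa.schema([pa.field(f.name, fix(f.type)) for f in schema])

tbl = pa.table({"s": pa.array(["a", "b"], type=pa.large_string())})
tbl = tbl.cast(to_standard_types(tbl.schema))
print(tbl.schema)  # s: string
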
@@ -81,7 +81,9 @@ class BasePipeline:
  """
  if self._fs.is_cache_fs:
  self._fs.sync_cache()
- modules_path = posixpath.join(self._fs._mapper.directory, self._fs.cache_path, self._pipelines_dir)
+ modules_path = posixpath.join(
+ self._fs._mapper.directory, self._fs.cache_path, self._pipelines_dir
+ )
  else:
  modules_path = posixpath.join(self._fs.path, self._pipelines_dir)
  if modules_path not in sys.path:
@@ -190,7 +190,9 @@ class PipelineRegistry:
  )

  # Sync filesystem if needed (using _fs)
- if hasattr(self._fs, "sync_cache") and callable(getattr(self._fs, "sync_cache")):
+ if hasattr(self._fs, "sync_cache") and callable(
+ getattr(self._fs, "sync_cache")
+ ):
  self._fs.sync_cache()

  def _get_files(self) -> list[str]:
@@ -447,14 +449,12 @@ class PipelineRegistry:
  logger.warning(f"Could not get size for {path}: {e}")
  size = "Error"

- pipeline_info.append(
- {
- "name": name,
- "path": path,
- "mod_time": mod_time,
- "size": size,
- }
- )
+ pipeline_info.append({
+ "name": name,
+ "path": path,
+ "mod_time": mod_time,
+ "size": size,
+ })

  if show:
  table = Table(title="Available Pipelines")
@@ -22,11 +22,11 @@ from sqlalchemy import create_engine, text
  from ...fs import get_filesystem
  from ...fs.ext import _dict_to_dataframe, path_to_glob
  from ...fs.storage_options import (AwsStorageOptions, AzureStorageOptions,
- BaseStorageOptions, GcsStorageOptions,
- GitHubStorageOptions, GitLabStorageOptions,
- StorageOptions)
+ GcsStorageOptions, GitHubStorageOptions,
+ GitLabStorageOptions, StorageOptions)
  from ...utils.misc import convert_large_types_to_standard, to_pyarrow_table
  from .helpers.polars import pl
+ from .helpers.pyarrow import opt_dtype
  from .helpers.sql import sql2polars_filter, sql2pyarrow_filter
  from .metadata import get_dataframe_metadata, get_pyarrow_dataset_metadata

@@ -236,12 +236,18 @@ class BaseFileReader(BaseFileIO, gc=False):
  df=self._data,
  path=self.path,
  format=self.format,
- num_files=pl.from_arrow(self._data.select(["file_path"])).select(
- pl.n_unique("file_path")
- )[0, 0],
+ # num_files=pl.from_arrow(self._data.select(["file_path"])).select(
+ # pl.n_unique("file_path")
+ # )[0, 0],
  )
  if not self.include_file_path:
- self._data = self._data.drop("file_path")
+ if isinstance(self._data, pa.Table):
+ self._data = self._data.drop("file_path")
+ elif isinstance(self._data, list | tuple):
+ self._data = [
+ df.drop("file_path") if isinstance(df, pa.Table) else df
+ for df in self._data
+ ]
  else:
  self._metadata = {}
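
BaseFileReader now tolerates self._data being either a single pa.Table or a list of tables (for example when concat=False), dropping the helper "file_path" column only where it applies. A small standalone sketch of that branch logic (the function name is illustrative):

import pyarrow as pa

def drop_file_path(data):
    # Mirror of the new branches above: handle one Table or a list/tuple of
    # Tables, and leave anything else untouched.
    if isinstance(data, pa.Table):
        return data.drop_columns(["file_path"]) if "file_path" in data.column_names else data
    if isinstance(data, (list, tuple)):
        return [drop_file_path(t) for t in data]
    return data

tables = [pa.table({"file_path": ["a.parquet"], "x": [1]}), pa.table({"x": [2]})]
print([t.column_names for t in drop_file_path(tables)])  # [['x'], ['x']]
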
@@ -365,7 +371,7 @@ class BaseFileReader(BaseFileIO, gc=False):
  df = [df.lazy() for df in self._to_polars_dataframe()]

  else:
- df = self._to_polars_dataframe.lazy()
+ df = self._to_polars_dataframe().lazy()
  if metadata:
  metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
  return df, metadata
@@ -1391,7 +1397,6 @@ class BaseDatasetWriter(BaseFileWriter, gc=False):
  mode=mode or self.mode,
  basename=basename or self.basename,
  schema=schema or self.schema_,
- partition_flavor=partitioning_flavor or self.partitioning_flavor,
  partition_by=partition_by or self.partition_by,
  compression=compression or self.compression,
  row_group_size=row_group_size or self.row_group_size,