FlowerPower 0.9.12.4__py3-none-any.whl → 1.0.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (81)
  1. flowerpower/__init__.py +17 -2
  2. flowerpower/cfg/__init__.py +201 -149
  3. flowerpower/cfg/base.py +122 -24
  4. flowerpower/cfg/pipeline/__init__.py +254 -0
  5. flowerpower/cfg/pipeline/adapter.py +66 -0
  6. flowerpower/cfg/pipeline/run.py +40 -11
  7. flowerpower/cfg/pipeline/schedule.py +69 -79
  8. flowerpower/cfg/project/__init__.py +149 -0
  9. flowerpower/cfg/project/adapter.py +57 -0
  10. flowerpower/cfg/project/job_queue.py +165 -0
  11. flowerpower/cli/__init__.py +92 -35
  12. flowerpower/cli/job_queue.py +878 -0
  13. flowerpower/cli/mqtt.py +49 -4
  14. flowerpower/cli/pipeline.py +576 -381
  15. flowerpower/cli/utils.py +55 -0
  16. flowerpower/flowerpower.py +12 -7
  17. flowerpower/fs/__init__.py +20 -2
  18. flowerpower/fs/base.py +350 -26
  19. flowerpower/fs/ext.py +797 -216
  20. flowerpower/fs/storage_options.py +1097 -55
  21. flowerpower/io/base.py +13 -18
  22. flowerpower/io/loader/__init__.py +28 -0
  23. flowerpower/io/loader/deltatable.py +7 -10
  24. flowerpower/io/metadata.py +1 -0
  25. flowerpower/io/saver/__init__.py +28 -0
  26. flowerpower/io/saver/deltatable.py +4 -3
  27. flowerpower/job_queue/__init__.py +252 -0
  28. flowerpower/job_queue/apscheduler/__init__.py +11 -0
  29. flowerpower/job_queue/apscheduler/_setup/datastore.py +110 -0
  30. flowerpower/job_queue/apscheduler/_setup/eventbroker.py +93 -0
  31. flowerpower/job_queue/apscheduler/manager.py +1063 -0
  32. flowerpower/job_queue/apscheduler/setup.py +524 -0
  33. flowerpower/job_queue/apscheduler/trigger.py +169 -0
  34. flowerpower/job_queue/apscheduler/utils.py +309 -0
  35. flowerpower/job_queue/base.py +382 -0
  36. flowerpower/job_queue/rq/__init__.py +10 -0
  37. flowerpower/job_queue/rq/_trigger.py +37 -0
  38. flowerpower/job_queue/rq/concurrent_workers/gevent_worker.py +226 -0
  39. flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +231 -0
  40. flowerpower/job_queue/rq/manager.py +1449 -0
  41. flowerpower/job_queue/rq/setup.py +150 -0
  42. flowerpower/job_queue/rq/utils.py +69 -0
  43. flowerpower/pipeline/__init__.py +5 -0
  44. flowerpower/pipeline/base.py +118 -0
  45. flowerpower/pipeline/io.py +407 -0
  46. flowerpower/pipeline/job_queue.py +505 -0
  47. flowerpower/pipeline/manager.py +1586 -0
  48. flowerpower/pipeline/registry.py +560 -0
  49. flowerpower/pipeline/runner.py +560 -0
  50. flowerpower/pipeline/visualizer.py +142 -0
  51. flowerpower/plugins/mqtt/__init__.py +12 -0
  52. flowerpower/plugins/mqtt/cfg.py +16 -0
  53. flowerpower/plugins/mqtt/manager.py +789 -0
  54. flowerpower/settings.py +110 -0
  55. flowerpower/utils/logging.py +21 -0
  56. flowerpower/utils/misc.py +57 -9
  57. flowerpower/utils/sql.py +122 -24
  58. flowerpower/utils/templates.py +18 -142
  59. flowerpower/web/app.py +0 -0
  60. flowerpower-1.0.0b1.dist-info/METADATA +324 -0
  61. flowerpower-1.0.0b1.dist-info/RECORD +94 -0
  62. {flowerpower-0.9.12.4.dist-info → flowerpower-1.0.0b1.dist-info}/WHEEL +1 -1
  63. flowerpower/cfg/pipeline/tracker.py +0 -14
  64. flowerpower/cfg/project/open_telemetry.py +0 -8
  65. flowerpower/cfg/project/tracker.py +0 -11
  66. flowerpower/cfg/project/worker.py +0 -19
  67. flowerpower/cli/scheduler.py +0 -309
  68. flowerpower/event_handler.py +0 -23
  69. flowerpower/mqtt.py +0 -525
  70. flowerpower/pipeline.py +0 -2419
  71. flowerpower/scheduler.py +0 -680
  72. flowerpower/tui.py +0 -79
  73. flowerpower/utils/datastore.py +0 -186
  74. flowerpower/utils/eventbroker.py +0 -127
  75. flowerpower/utils/executor.py +0 -58
  76. flowerpower/utils/trigger.py +0 -140
  77. flowerpower-0.9.12.4.dist-info/METADATA +0 -575
  78. flowerpower-0.9.12.4.dist-info/RECORD +0 -70
  79. /flowerpower/{cfg/pipeline/params.py → cli/worker.py} +0 -0
  80. {flowerpower-0.9.12.4.dist-info → flowerpower-1.0.0b1.dist-info}/entry_points.txt +0 -0
  81. {flowerpower-0.9.12.4.dist-info → flowerpower-1.0.0b1.dist-info}/top_level.txt +0 -0
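The list above reflects the main 1.0.0b1 restructuring: the monolithic flowerpower/pipeline.py, flowerpower/scheduler.py and flowerpower/mqtt.py modules are removed, and the flowerpower.pipeline, flowerpower.job_queue and flowerpower.plugins.mqtt packages take their place. As rough orientation only, the sketch below maps the removed modules to new import paths implied by the file names; the diff confirms only that these module files exist, not what they export or whether they import cleanly.

    # Orientation sketch inferred from the file list alone; nothing beyond the
    # existence of these module paths is confirmed by the diff.
    import flowerpower.pipeline.manager      # replaces the removed flowerpower/pipeline.py
    import flowerpower.job_queue             # replaces the removed flowerpower/scheduler.py
    import flowerpower.plugins.mqtt.manager  # replaces the removed flowerpower/mqtt.py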
flowerpower/fs/ext.py CHANGED
@@ -2,7 +2,7 @@ import datetime as dt
2
2
  import importlib
3
3
  import posixpath
4
4
  import uuid
5
- from typing import Generator
5
+ from typing import Any, Generator
6
6
 
7
7
  import orjson
8
8
  import pandas as pd
@@ -11,12 +11,8 @@ import pyarrow.dataset as pds
11
11
  import pyarrow.parquet as pq
12
12
  from fsspec import AbstractFileSystem
13
13
 
14
- from ..utils.misc import (
15
- _dict_to_dataframe,
16
- convert_large_types_to_standard,
17
- run_parallel,
18
- to_pyarrow_table,
19
- )
14
+ from ..utils.misc import (_dict_to_dataframe, convert_large_types_to_standard,
15
+ run_parallel, to_pyarrow_table)
20
16
  from ..utils.polars import pl
21
17
 
22
18
  if importlib.util.find_spec("duckdb") is not None:
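For readers unfamiliar with the guard on the context line above: the module tests for the optional duckdb dependency with importlib.util.find_spec before importing anything that needs it. A generic sketch of that pattern follows; the actual branch bodies in ext.py are not visible in this hunk and may import pydala/duckdb helpers instead.

    # Generic optional-dependency guard (sketch, not the module's real branches).
    import importlib.util

    if importlib.util.find_spec("duckdb") is not None:
        import duckdb  # optional dependency is available
    else:
        duckdb = None  # fall back; duckdb-backed features stay disabled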
@@ -31,6 +27,34 @@ else:
31
27
 
32
28
 
33
29
  def path_to_glob(path: str, format: str | None = None) -> str:
30
+ """Convert a path to a glob pattern for file matching.
31
+
32
+ Intelligently converts paths to glob patterns that match files of the specified
33
+ format, handling various directory and wildcard patterns.
34
+
35
+ Args:
36
+ path: Base path to convert. Can include wildcards (* or **).
37
+ Examples: "data/", "data/*.json", "data/**"
38
+ format: File format to match (without dot). If None, inferred from path.
39
+ Examples: "json", "csv", "parquet"
40
+
41
+ Returns:
42
+ str: Glob pattern that matches files of specified format.
43
+ Examples: "data/**/*.json", "data/*.csv"
44
+
45
+ Example:
46
+ >>> # Basic directory
47
+ >>> path_to_glob("data", "json")
48
+ 'data/**/*.json'
49
+ >>>
50
+ >>> # With wildcards
51
+ >>> path_to_glob("data/**", "csv")
52
+ 'data/**/*.csv'
53
+ >>>
54
+ >>> # Format inference
55
+ >>> path_to_glob("data/file.parquet")
56
+ 'data/file.parquet'
57
+ """
34
58
  path = path.rstrip("/")
35
59
  if format is None:
36
60
  if ".json" in path:
@@ -53,8 +77,42 @@ def path_to_glob(path: str, format: str | None = None) -> str:
53
77
 
54
78
 
55
79
  def _read_json_file(
56
- path, self, include_file_path: bool = False, jsonlines: bool = False
80
+ path: str,
81
+ self: AbstractFileSystem,
82
+ include_file_path: bool = False,
83
+ jsonlines: bool = False,
57
84
  ) -> dict | list[dict]:
85
+ """Read a JSON file from any filesystem.
86
+
87
+ Internal function that handles both regular JSON and JSON Lines formats.
88
+
89
+ Args:
90
+ path: Path to JSON file
91
+ self: Filesystem instance to use for reading
92
+ include_file_path: Whether to return dict with filepath as key
93
+ jsonlines: Whether to read as JSON Lines format
94
+
95
+ Returns:
96
+ dict | list[dict]: Parsed JSON data. If include_file_path=True,
97
+ returns {filepath: data}
98
+
99
+ Example:
100
+ >>> fs = LocalFileSystem()
101
+ >>> # Regular JSON
102
+ >>> data = _read_json_file("data.json", fs)
103
+ >>> print(type(data))
104
+ <class 'dict'>
105
+ >>>
106
+ >>> # JSON Lines with filepath
107
+ >>> data = _read_json_file(
108
+ ... "data.jsonl",
109
+ ... fs,
110
+ ... include_file_path=True,
111
+ ... jsonlines=True
112
+ ... )
113
+ >>> print(list(data.keys())[0])
114
+ 'data.jsonl'
115
+ """
58
116
  with self.open(path) as f:
59
117
  if jsonlines:
60
118
  data = [orjson.loads(line) for line in f.readlines()]
@@ -66,10 +124,47 @@ def _read_json_file(
66
124
 
67
125
 
68
126
  def read_json_file(
69
- self, path: str, include_file_path: bool = False, jsonlines: bool = False
127
+ self: AbstractFileSystem,
128
+ path: str,
129
+ include_file_path: bool = False,
130
+ jsonlines: bool = False,
70
131
  ) -> dict | list[dict]:
132
+ """Read a single JSON file from any filesystem.
133
+
134
+ A public wrapper around _read_json_file providing a clean interface for
135
+ reading individual JSON files.
136
+
137
+ Args:
138
+ path: Path to JSON file to read
139
+ include_file_path: Whether to return dict with filepath as key
140
+ jsonlines: Whether to read as JSON Lines format
141
+
142
+ Returns:
143
+ dict | list[dict]: Parsed JSON data. For regular JSON, returns a dict.
144
+ For JSON Lines, returns a list of dicts. If include_file_path=True,
145
+ returns {filepath: data}.
146
+
147
+ Example:
148
+ >>> fs = LocalFileSystem()
149
+ >>> # Read regular JSON
150
+ >>> data = fs.read_json_file("config.json")
151
+ >>> print(data["setting"])
152
+ 'value'
153
+ >>>
154
+ >>> # Read JSON Lines with filepath
155
+ >>> data = fs.read_json_file(
156
+ ... "logs.jsonl",
157
+ ... include_file_path=True,
158
+ ... jsonlines=True
159
+ ... )
160
+ >>> print(list(data.keys())[0])
161
+ 'logs.jsonl'
162
+ """
71
163
  return _read_json_file(
72
- path=path, self=self, include_file_path=include_file_path, jsonlines=jsonlines
164
+ path=path,
165
+ self=self,
166
+ include_file_path=include_file_path,
167
+ jsonlines=jsonlines,
73
168
  )
74
169
 
75
170
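Note that these functions take self: AbstractFileSystem as an explicit first parameter yet are called as methods in the docstring examples (fs.read_json_file(...)). The attachment mechanism sits outside this hunk; a common pattern, sketched here purely as an assumption, is plain attribute assignment onto AbstractFileSystem.

    # Assumed wiring -- not shown in this diff. Assigning the module-level
    # functions onto AbstractFileSystem exposes them as filesystem methods.
    from fsspec import AbstractFileSystem
    from fsspec.implementations.local import LocalFileSystem
    from flowerpower.fs.ext import read_json, read_json_file

    AbstractFileSystem.read_json_file = read_json_file
    AbstractFileSystem.read_json = read_json

    fs = LocalFileSystem()
    data = fs.read_json_file("config.json")  # matches the docstring examples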
 
@@ -152,7 +247,7 @@ def _read_json(
152
247
 
153
248
 
154
249
  def _read_json_batches(
155
- self,
250
+ self: AbstractFileSystem,
156
251
  path: str | list[str],
157
252
  batch_size: int | None = None,
158
253
  include_file_path: bool = False,
@@ -161,24 +256,49 @@ def _read_json_batches(
161
256
  concat: bool = True,
162
257
  use_threads: bool = True,
163
258
  verbose: bool = False,
164
- **kwargs,
259
+ **kwargs: Any,
165
260
  ) -> Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]:
166
- """
167
- Read JSON files in batches with optional parallel processing within batches.
261
+ """Process JSON files in batches with optional parallel reading.
262
+
263
+ Internal generator function that handles batched reading of JSON files
264
+ with support for parallel processing within each batch.
168
265
 
169
266
  Args:
170
- path: (str | list[str]) Path to the JSON file(s).
171
- batch_size: (int | None) Number of files to process in each batch. Defaults to None.
172
- include_file_path: (bool) If True, return with file path as key.
173
- jsonlines: (bool) If True, read JSON lines. Defaults to False.
174
- as_dataframe: (bool) If True, return DataFrame. Defaults to True.
175
- concat: (bool) If True, concatenate batch DataFrames. Defaults to True.
176
- use_threads: (bool) If True, use parallel processing within batches.
177
- verbose: (bool) If True, print verbose output.
178
- **kwargs: Additional keyword arguments.
267
+ path: Path(s) to JSON file(s). Glob patterns supported.
268
+ batch_size: Number of files to process in each batch
269
+ include_file_path: Include source filepath in output
270
+ jsonlines: Whether to read as JSON Lines format
271
+ as_dataframe: Convert output to Polars DataFrame(s)
272
+ concat: Combine files within each batch
273
+ use_threads: Enable parallel file reading within batches
274
+ verbose: Print progress information
275
+ **kwargs: Additional arguments for DataFrame conversion
179
276
 
180
277
  Yields:
181
- Data from num_batches files as dict/DataFrame based on parameters.
278
+ Each batch of data in requested format:
279
+ - dict | list[dict]: Raw JSON data
280
+ - pl.DataFrame: Single DataFrame if concat=True
281
+ - list[pl.DataFrame]: List of DataFrames if concat=False
282
+
283
+ Example:
284
+ >>> fs = LocalFileSystem()
285
+ >>> # Process large dataset in batches
286
+ >>> for batch in fs._read_json_batches(
287
+ ... "data/*.json",
288
+ ... batch_size=100,
289
+ ... as_dataframe=True,
290
+ ... verbose=True
291
+ ... ):
292
+ ... print(f"Batch shape: {batch.shape}")
293
+ >>>
294
+ >>> # Parallel batch processing with filepath tracking
295
+ >>> for batch in fs._read_json_batches(
296
+ ... ["logs1.jsonl", "logs2.jsonl"],
297
+ ... batch_size=1,
298
+ ... include_file_path=True,
299
+ ... use_threads=True
300
+ ... ):
301
+ ... print(f"Processing {batch['file_path'][0]}")
182
302
  """
183
303
  # Handle path resolution
184
304
  if isinstance(path, str):
@@ -218,10 +338,13 @@ def _read_json_batches(
218
338
  batch_dfs = [pl.DataFrame(d) for d in batch_data]
219
339
  else:
220
340
  batch_dfs = [
221
- pl.DataFrame(list(d.values())[0]).with_columns(
222
- pl.lit(list(d.keys())[0]).alias("file_path")
223
- )
224
- for d in batch_data
341
+ [
342
+ pl.DataFrame(_data[k]).with_columns(
343
+ pl.lit(k).alias("file_path")
344
+ )
345
+ for k in _data
346
+ ][0]
347
+ for _data in batch_data
225
348
  ]
226
349
 
227
350
  if concat and len(batch_dfs) > 1:
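The new nested comprehension above is denser than the code it replaces; an equivalent spelled-out version, assuming (as the old code did) that each element of batch_data is a single-entry dict mapping a file path to its records:

    # Equivalent loop for the comprehension above (assumption: each _data dict
    # has exactly one key, the source file path).
    import polars as pl

    batch_data = [{"a.json": [{"x": 1}]}, {"b.json": [{"x": 2}]}]  # example shape

    batch_dfs = []
    for _data in batch_data:
        file_path = next(iter(_data))  # the single key: the source path
        batch_dfs.append(
            pl.DataFrame(_data[file_path]).with_columns(pl.lit(file_path).alias("file_path"))
        )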
@@ -233,7 +356,7 @@ def _read_json_batches(
233
356
 
234
357
 
235
358
  def read_json(
236
- self,
359
+ self: AbstractFileSystem,
237
360
  path: str | list[str],
238
361
  batch_size: int | None = None,
239
362
  include_file_path: bool = False,
@@ -242,7 +365,7 @@ def read_json(
242
365
  concat: bool = True,
243
366
  use_threads: bool = True,
244
367
  verbose: bool = False,
245
- **kwargs,
368
+ **kwargs: Any,
246
369
  ) -> (
247
370
  dict
248
371
  | list[dict]
@@ -250,27 +373,65 @@ def read_json(
250
373
  | list[pl.DataFrame]
251
374
  | Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]
252
375
  ):
253
- """
254
- Read a JSON file or a list of JSON files. Optionally read in batches,
255
- returning a generator that sequentially yields data for specified number of files.
376
+ """Read JSON data from one or more files with powerful options.
377
+
378
+ Provides a flexible interface for reading JSON data with support for:
379
+ - Single file or multiple files
380
+ - Regular JSON or JSON Lines format
381
+ - Batch processing for large datasets
382
+ - Parallel processing
383
+ - DataFrame conversion
384
+ - File path tracking
256
385
 
257
386
  Args:
258
- path: (str | list[str]) Path to the JSON file(s).
259
- batch_size: (int | None) Number of files to process in each batch. Defaults to None.
260
- include_file_path: (bool) If True, return with file path as key.
261
- jsonlines: (bool) If True, read JSON lines. Defaults to False.
262
- as_dataframe: (bool) If True, return DataFrame. Defaults to True.
263
- concat: (bool) If True, concatenate the DataFrames. Defaults to True.
264
- use_threads: (bool) If True, use parallel processing within batches. Defaults to True.
265
- verbose: (bool) If True, print verbose output. Defaults to False.
266
- **kwargs: Additional keyword arguments.
387
+ path: Path(s) to JSON file(s). Can be:
388
+ - Single path string (globs supported)
389
+ - List of path strings
390
+ batch_size: If set, enables batch reading with this many files per batch
391
+ include_file_path: Include source filepath in output
392
+ jsonlines: Whether to read as JSON Lines format
393
+ as_dataframe: Convert output to Polars DataFrame(s)
394
+ concat: Combine multiple files/batches into single result
395
+ use_threads: Enable parallel file reading
396
+ verbose: Print progress information
397
+ **kwargs: Additional arguments passed to DataFrame conversion
267
398
 
268
399
  Returns:
269
- (dict | list[dict] | pl.DataFrame | list[pl.DataFrame]:
270
- Dictionary, list of dictionaries, DataFrame, list of DataFrames
271
- Yields:
272
- (dict | list[dict] | pl.DataFrame | list[pl.DataFrame]:
273
- Dictionary, list of dictionaries, DataFrame, list of DataFrames
400
+ Various types depending on arguments:
401
+ - dict: Single JSON file as dictionary
402
+ - list[dict]: Multiple JSON files as list of dictionaries
403
+ - pl.DataFrame: Single or concatenated DataFrame
404
+ - list[pl.DataFrame]: List of DataFrames (if concat=False)
405
+ - Generator: If batch_size set, yields batches of above types
406
+
407
+ Example:
408
+ >>> fs = LocalFileSystem()
409
+ >>> # Read all JSON files in directory
410
+ >>> df = fs.read_json(
411
+ ... "data/*.json",
412
+ ... as_dataframe=True,
413
+ ... concat=True
414
+ ... )
415
+ >>> print(df.shape)
416
+ (1000, 5) # Combined data from all files
417
+ >>>
418
+ >>> # Batch process large dataset
419
+ >>> for batch_df in fs.read_json(
420
+ ... "logs/*.jsonl",
421
+ ... batch_size=100,
422
+ ... jsonlines=True,
423
+ ... include_file_path=True
424
+ ... ):
425
+ ... print(f"Processing {len(batch_df)} records")
426
+ >>>
427
+ >>> # Parallel read with custom options
428
+ >>> dfs = fs.read_json(
429
+ ... ["file1.json", "file2.json"],
430
+ ... use_threads=True,
431
+ ... concat=False,
432
+ ... verbose=True
433
+ ... )
434
+ >>> print(f"Read {len(dfs)} files")
274
435
  """
275
436
  if batch_size is not None:
276
437
  return _read_json_batches(
@@ -299,9 +460,34 @@ def read_json(
299
460
 
300
461
 
301
462
  def _read_csv_file(
302
- path, self, include_file_path: bool = False, **kwargs
463
+ path: str, self: AbstractFileSystem, include_file_path: bool = False, **kwargs: Any
303
464
  ) -> pl.DataFrame:
304
- print(path)
465
+ """Read a single CSV file from any filesystem.
466
+
467
+ Internal function that handles reading individual CSV files and optionally
468
+ adds the source filepath as a column.
469
+
470
+ Args:
471
+ path: Path to CSV file
472
+ self: Filesystem instance to use for reading
473
+ include_file_path: Add source filepath as a column
474
+ **kwargs: Additional arguments passed to pl.read_csv()
475
+
476
+ Returns:
477
+ pl.DataFrame: DataFrame containing CSV data
478
+
479
+ Example:
480
+ >>> fs = LocalFileSystem()
481
+ >>> df = _read_csv_file(
482
+ ... "data.csv",
483
+ ... fs,
484
+ ... include_file_path=True,
485
+ ... delimiter="|"
486
+ ... )
487
+ >>> print("file_path" in df.columns)
488
+ True
489
+ """
490
+ print(path) # Debug info
305
491
  with self.open(path) as f:
306
492
  df = pl.read_csv(f, **kwargs)
307
493
  if include_file_path:
@@ -371,29 +557,54 @@ def _read_csv(
371
557
 
372
558
 
373
559
  def _read_csv_batches(
374
- self,
560
+ self: AbstractFileSystem,
375
561
  path: str | list[str],
376
562
  batch_size: int | None = None,
377
563
  include_file_path: bool = False,
378
564
  concat: bool = True,
379
565
  use_threads: bool = True,
380
566
  verbose: bool = False,
381
- **kwargs,
382
- ) -> Generator[pl.DataFrame, None, None]:
383
- """
384
- Read CSV files in batches with optional parallel processing within batches.
567
+ **kwargs: Any,
568
+ ) -> Generator[pl.DataFrame | list[pl.DataFrame], None, None]:
569
+ """Process CSV files in batches with optional parallel reading.
570
+
571
+ Internal generator function that handles batched reading of CSV files
572
+ with support for parallel processing within each batch.
385
573
 
386
574
  Args:
387
- path: (str | list[str]) Path to the CSV file(s).
388
- batch_size: (int | None) Number of files to process in each batch. Defaults to None.
389
- include_file_path: (bool) If True, include file_path column.
390
- concat: (bool) If True, concatenate batch DataFrames.
391
- use_threads: (bool) If True, use parallel processing within batches.
392
- verbose: (bool) If True, print verbose output.
393
- **kwargs: Additional keyword arguments.
575
+ path: Path(s) to CSV file(s). Glob patterns supported.
576
+ batch_size: Number of files to process in each batch
577
+ include_file_path: Add source filepath as a column
578
+ concat: Combine files within each batch
579
+ use_threads: Enable parallel file reading within batches
580
+ verbose: Print progress information
581
+ **kwargs: Additional arguments passed to pl.read_csv()
394
582
 
395
583
  Yields:
396
- pl.DataFrame: DataFrame containing data from num_batches files.
584
+ Each batch of data in requested format:
585
+ - pl.DataFrame: Single DataFrame if concat=True
586
+ - list[pl.DataFrame]: List of DataFrames if concat=False
587
+
588
+ Example:
589
+ >>> fs = LocalFileSystem()
590
+ >>> # Process large dataset in batches
591
+ >>> for batch in fs._read_csv_batches(
592
+ ... "data/*.csv",
593
+ ... batch_size=100,
594
+ ... include_file_path=True,
595
+ ... verbose=True
596
+ ... ):
597
+ ... print(f"Batch columns: {batch.columns}")
598
+ >>>
599
+ >>> # Parallel processing without concatenation
600
+ >>> for batch in fs._read_csv_batches(
601
+ ... ["file1.csv", "file2.csv"],
602
+ ... batch_size=1,
603
+ ... concat=False,
604
+ ... use_threads=True
605
+ ... ):
606
+ ... for df in batch:
607
+ ... print(f"DataFrame shape: {df.shape}")
397
608
  """
398
609
  # Handle path resolution
399
610
  if isinstance(path, str):
@@ -435,39 +646,71 @@ def _read_csv_batches(
435
646
 
436
647
 
437
648
  def read_csv(
438
- self,
649
+ self: AbstractFileSystem,
439
650
  path: str | list[str],
440
651
  batch_size: int | None = None,
441
652
  include_file_path: bool = False,
442
653
  concat: bool = True,
443
654
  use_threads: bool = True,
444
655
  verbose: bool = False,
445
- **kwargs,
656
+ **kwargs: Any,
446
657
  ) -> (
447
658
  pl.DataFrame
448
659
  | list[pl.DataFrame]
449
660
  | Generator[pl.DataFrame | list[pl.DataFrame], None, None]
450
661
  ):
451
- """
452
- Read a CSV file or a list of CSV files. Optionally read in batches,
453
- returning a generator that sequentially yields data for specified number of files.
662
+ """Read CSV data from one or more files with powerful options.
663
+
664
+ Provides a flexible interface for reading CSV files with support for:
665
+ - Single file or multiple files
666
+ - Batch processing for large datasets
667
+ - Parallel processing
668
+ - File path tracking
669
+ - Polars DataFrame output
454
670
 
455
671
  Args:
456
- path: (str | list[str]) Path to the CSV file(s).
457
- batch_size: (int | None) Number of files to process in each batch. Defaults to None.
458
- include_file_path: (bool, optional) If True, include 'file_path' column.
459
- concat: (bool, optional) If True, concatenate the batch DataFrames. Defaults to True.
460
- use_threads: (bool, optional) If True, use parallel processing within batches. Defaults to True.
461
- verbose: (bool, optional) If True, print verbose output. Defaults to False.
462
- **kwargs: Additional keyword arguments.
672
+ path: Path(s) to CSV file(s). Can be:
673
+ - Single path string (globs supported)
674
+ - List of path strings
675
+ batch_size: If set, enables batch reading with this many files per batch
676
+ include_file_path: Add source filepath as a column
677
+ concat: Combine multiple files/batches into single DataFrame
678
+ use_threads: Enable parallel file reading
679
+ verbose: Print progress information
680
+ **kwargs: Additional arguments passed to pl.read_csv()
463
681
 
464
682
  Returns:
465
- pl.DataFrame | list[pl.DataFrame]:
466
- DataFrame or list or DataFrames containing data from num_batches files.
467
-
468
- Yields:
469
- pl.DataFrame | list[pl.DataFrame]:
470
- DataFrame or list of DataFrames containing data from num_batches files.
683
+ Various types depending on arguments:
684
+ - pl.DataFrame: Single or concatenated DataFrame
685
+ - list[pl.DataFrame]: List of DataFrames (if concat=False)
686
+ - Generator: If batch_size set, yields batches of above types
687
+
688
+ Example:
689
+ >>> fs = LocalFileSystem()
690
+ >>> # Read all CSVs in directory
691
+ >>> df = fs.read_csv(
692
+ ... "data/*.csv",
693
+ ... include_file_path=True
694
+ ... )
695
+ >>> print(df.columns)
696
+ ['file_path', 'col1', 'col2', ...]
697
+ >>>
698
+ >>> # Batch process large dataset
699
+ >>> for batch_df in fs.read_csv(
700
+ ... "logs/*.csv",
701
+ ... batch_size=100,
702
+ ... use_threads=True,
703
+ ... verbose=True
704
+ ... ):
705
+ ... print(f"Processing {len(batch_df)} rows")
706
+ >>>
707
+ >>> # Multiple files without concatenation
708
+ >>> dfs = fs.read_csv(
709
+ ... ["file1.csv", "file2.csv"],
710
+ ... concat=False,
711
+ ... use_threads=True
712
+ ... )
713
+ >>> print(f"Read {len(dfs)} files")
471
714
  """
472
715
  if batch_size is not None:
473
716
  return _read_csv_batches(
@@ -492,8 +735,33 @@ def read_csv(
492
735
 
493
736
 
494
737
  def _read_parquet_file(
495
- path, self, include_file_path: bool = False, **kwargs
738
+ path: str, self: AbstractFileSystem, include_file_path: bool = False, **kwargs: Any
496
739
  ) -> pa.Table:
740
+ """Read a single Parquet file from any filesystem.
741
+
742
+ Internal function that handles reading individual Parquet files and
743
+ optionally adds the source filepath as a column.
744
+
745
+ Args:
746
+ path: Path to Parquet file
747
+ self: Filesystem instance to use for reading
748
+ include_file_path: Add source filepath as a column
749
+ **kwargs: Additional arguments passed to pq.read_table()
750
+
751
+ Returns:
752
+ pa.Table: PyArrow Table containing Parquet data
753
+
754
+ Example:
755
+ >>> fs = LocalFileSystem()
756
+ >>> table = _read_parquet_file(
757
+ ... "data.parquet",
758
+ ... fs,
759
+ ... include_file_path=True,
760
+ ... use_threads=True
761
+ ... )
762
+ >>> print("file_path" in table.column_names)
763
+ True
764
+ """
497
765
  table = pq.read_table(path, filesystem=self, **kwargs)
498
766
  if include_file_path:
499
767
  return table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
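The include_file_path branch above tags every row with its source path. For reference, a standalone equivalent using plain PyArrow types (a sketch only; the module's own code passes a Polars Series to add_column):

    import pyarrow as pa

    path = "data.parquet"
    table = pa.table({"x": [1, 2, 3]})
    # prepend a constant file_path column, one value per row
    table = table.add_column(0, "file_path", pa.array([path] * table.num_rows))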
@@ -569,29 +837,61 @@ def _read_parquet(
569
837
 
570
838
 
571
839
  def _read_parquet_batches(
572
- self,
840
+ self: AbstractFileSystem,
573
841
  path: str | list[str],
574
842
  batch_size: int | None = None,
575
843
  include_file_path: bool = False,
576
844
  use_threads: bool = True,
577
845
  concat: bool = True,
578
846
  verbose: bool = False,
579
- **kwargs,
847
+ **kwargs: Any,
580
848
  ) -> Generator[pa.Table | list[pa.Table], None, None]:
581
- """
582
- Read Parquet files in batches, yielding PyArrow Tables.
849
+ """Process Parquet files in batches with performance optimizations.
850
+
851
+ Internal generator function that handles batched reading of Parquet files
852
+ with support for:
853
+ - Parallel processing within batches
854
+ - Metadata-based optimizations
855
+ - Memory-efficient processing
856
+ - Progress tracking
857
+
858
+ Uses fast path for simple cases:
859
+ - Single directory with _metadata
860
+ - No need for filepath column
861
+ - Concatenated output
583
862
 
584
863
  Args:
585
- path: (str | list[str]) Path to the Parquet file(s).
586
- batch_size: (int | None) Number of files to process in each batch. Defaults to None.
587
- include_file_path: (bool) If True, return Tables with 'file_path' column. Defaults to False.
588
- use_threads: (bool) If True, read files in parallel within batches. Defaults to True.
589
- concat: (bool) If True, concatenate Tables within each batch. Defaults to True.
590
- verbose: (bool) If True, print progress information. Defaults to False.
591
- **kwargs: Additional keyword arguments.
864
+ path: Path(s) to Parquet file(s). Glob patterns supported.
865
+ batch_size: Number of files to process in each batch
866
+ include_file_path: Add source filepath as a column
867
+ use_threads: Enable parallel file reading within batches
868
+ concat: Combine files within each batch
869
+ verbose: Print progress information
870
+ **kwargs: Additional arguments passed to pq.read_table()
592
871
 
593
872
  Yields:
594
- pa.Table | list[pa.Table]: Table or list of Tables per batch.
873
+ Each batch of data in requested format:
874
+ - pa.Table: Single Table if concat=True
875
+ - list[pa.Table]: List of Tables if concat=False
876
+
877
+ Example:
878
+ >>> fs = LocalFileSystem()
879
+ >>> # Fast path for simple case
880
+ >>> next(_read_parquet_batches(
881
+ ... fs,
882
+ ... "data/", # Contains _metadata
883
+ ... batch_size=1000
884
+ ... ))
885
+ >>>
886
+ >>> # Parallel batch processing
887
+ >>> for batch in fs._read_parquet_batches(
888
+ ... fs,
889
+ ... ["file1.parquet", "file2.parquet"],
890
+ ... batch_size=1,
891
+ ... include_file_path=True,
892
+ ... use_threads=True
893
+ ... ):
894
+ ... print(f"Batch schema: {batch.schema}")
595
895
  """
596
896
  # Fast path for simple cases
597
897
  if not include_file_path and concat and batch_size is None:
@@ -612,7 +912,6 @@ def _read_parquet_batches(
612
912
  return
613
913
 
614
914
  # Process in batches
615
-
616
915
  for i in range(0, len(path), batch_size):
617
916
  batch_paths = path[i : i + batch_size]
618
917
  if use_threads and len(batch_paths) > 1:
@@ -641,34 +940,72 @@ def _read_parquet_batches(
641
940
 
642
941
 
643
942
  def read_parquet(
644
- self,
943
+ self: AbstractFileSystem,
645
944
  path: str | list[str],
646
945
  batch_size: int | None = None,
647
946
  include_file_path: bool = False,
648
947
  concat: bool = True,
649
948
  use_threads: bool = True,
650
949
  verbose: bool = False,
651
- **kwargs,
950
+ **kwargs: Any,
652
951
  ) -> pa.Table | list[pa.Table] | Generator[pa.Table | list[pa.Table], None, None]:
653
- """
654
- Read a Parquet file or a list of Parquet files. Optionally read in batches,
655
- returning a generator that sequentially yields data for specified number of files.
952
+ """Read Parquet data with advanced features and optimizations.
953
+
954
+ Provides a high-performance interface for reading Parquet files with support for:
955
+ - Single file or multiple files
956
+ - Batch processing for large datasets
957
+ - Parallel processing
958
+ - File path tracking
959
+ - Automatic concatenation
960
+ - PyArrow Table output
961
+
962
+ The function automatically uses optimal reading strategies:
963
+ - Direct dataset reading for simple cases
964
+ - Parallel processing for multiple files
965
+ - Batched reading for memory efficiency
656
966
 
657
967
  Args:
658
- path: (str | list[str]) Path to the Parquet file(s).
659
- batch_size: (int | None) Number of files to process in each batch. Defaults to None.
660
- include_file_path: (bool, optional) If True, include 'file_path' column.
661
- concat: (bool, optional) If True, concatenate the batch Tables. Defaults to True.
662
- use_threads: (bool, optional) If True, use parallel processing within batches. Defaults to True.
663
- verbose: (bool, optional) If True, print verbose output. Defaults to False.
664
- **kwargs: Additional keyword arguments.
968
+ path: Path(s) to Parquet file(s). Can be:
969
+ - Single path string (globs supported)
970
+ - List of path strings
971
+ - Directory containing _metadata file
972
+ batch_size: If set, enables batch reading with this many files per batch
973
+ include_file_path: Add source filepath as a column
974
+ concat: Combine multiple files/batches into single Table
975
+ use_threads: Enable parallel file reading
976
+ verbose: Print progress information
977
+ **kwargs: Additional arguments passed to pq.read_table()
665
978
 
666
979
  Returns:
667
- pa.Table | list[pa.Table]:
668
- PyArrow Table or list of PyArrow Tables containing data from num_batches files.
669
-
670
- Yields:
671
- pa.Table | list[pa.Table]: PyArrow Table or list of PyArrow Tables containing data from num_batches files.
980
+ Various types depending on arguments:
981
+ - pa.Table: Single or concatenated Table
982
+ - list[pa.Table]: List of Tables (if concat=False)
983
+ - Generator: If batch_size set, yields batches of above types
984
+
985
+ Example:
986
+ >>> fs = LocalFileSystem()
987
+ >>> # Read all Parquet files in directory
988
+ >>> table = fs.read_parquet(
989
+ ... "data/*.parquet",
990
+ ... include_file_path=True
991
+ ... )
992
+ >>> print(table.column_names)
993
+ ['file_path', 'col1', 'col2', ...]
994
+ >>>
995
+ >>> # Batch process large dataset
996
+ >>> for batch in fs.read_parquet(
997
+ ... "data/*.parquet",
998
+ ... batch_size=100,
999
+ ... use_threads=True
1000
+ ... ):
1001
+ ... print(f"Processing {batch.num_rows} rows")
1002
+ >>>
1003
+ >>> # Read from directory with metadata
1004
+ >>> table = fs.read_parquet(
1005
+ ... "data/", # Contains _metadata
1006
+ ... use_threads=True
1007
+ ... )
1008
+ >>> print(f"Total rows: {table.num_rows}")
672
1009
  """
673
1010
  if batch_size is not None:
674
1011
  return _read_parquet_batches(
@@ -693,7 +1030,7 @@ def read_parquet(
693
1030
 
694
1031
 
695
1032
  def read_files(
696
- self,
1033
+ self: AbstractFileSystem,
697
1034
  path: str | list[str],
698
1035
  format: str,
699
1036
  batch_size: int | None = None,
@@ -702,38 +1039,76 @@ def read_files(
702
1039
  jsonlines: bool = False,
703
1040
  use_threads: bool = True,
704
1041
  verbose: bool = False,
705
- **kwargs,
1042
+ **kwargs: Any,
706
1043
  ) -> (
707
1044
  pl.DataFrame
708
1045
  | pa.Table
709
1046
  | list[pl.DataFrame]
710
1047
  | list[pa.Table]
711
1048
  | Generator[
712
- pl.DataFrame | pa.Table | list[pa.Table] | list[pl.DataFrame], None, None
1049
+ pl.DataFrame | pa.Table | list[pl.DataFrame] | list[pa.Table], None, None
713
1050
  ]
714
1051
  ):
715
- """
716
- Read a file or a list of files of the given format.
1052
+ """Universal interface for reading data files of any supported format.
1053
+
1054
+ A unified API that automatically delegates to the appropriate reading function
1055
+ based on file format, while preserving all advanced features like:
1056
+ - Batch processing
1057
+ - Parallel reading
1058
+ - File path tracking
1059
+ - Format-specific optimizations
717
1060
 
718
1061
  Args:
719
- path: (str | list[str]) Path to the file(s).
720
- format: (str) Format of the file.
721
- batch_size: (int | None) Number of files to process in each batch. Defaults to None.
722
- include_file_path: (bool, optional) If True, return a DataFrame with a 'file_path' column.
723
- Defaults to False.
724
- concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
725
- jsonlines: (bool, optional) If True, read JSON lines. Defaults to False.
726
- use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
727
- verbose: (bool, optional) If True, print verbose output. Defaults to False.
728
- **kwargs: Additional keyword arguments.
1062
+ path: Path(s) to data file(s). Can be:
1063
+ - Single path string (globs supported)
1064
+ - List of path strings
1065
+ format: File format to read. Supported values:
1066
+ - "json": Regular JSON or JSON Lines
1067
+ - "csv": CSV files
1068
+ - "parquet": Parquet files
1069
+ batch_size: If set, enables batch reading with this many files per batch
1070
+ include_file_path: Add source filepath as column/field
1071
+ concat: Combine multiple files/batches into single result
1072
+ jsonlines: For JSON format, whether to read as JSON Lines
1073
+ use_threads: Enable parallel file reading
1074
+ verbose: Print progress information
1075
+ **kwargs: Additional format-specific arguments
729
1076
 
730
1077
  Returns:
731
- (pl.DataFrame | pa.Table | list[pl.DataFrame] | list[pa.Table]):
732
- Polars DataFrame, Pyarrow Table or list of DataFrames, LazyFrames or Tables.
733
-
734
- Yields:
735
- (pl.DataFrame | pa.Table):
736
- Polars DataFrame, Pyarrow Table or list of DataFrames, LazyFrames or Tables.
1078
+ Various types depending on format and arguments:
1079
+ - pl.DataFrame: For CSV and optionally JSON
1080
+ - pa.Table: For Parquet
1081
+ - list[pl.DataFrame | pa.Table]: Without concatenation
1082
+ - Generator: If batch_size set, yields batches
1083
+
1084
+ Example:
1085
+ >>> fs = LocalFileSystem()
1086
+ >>> # Read CSV files
1087
+ >>> df = fs.read_files(
1088
+ ... "data/*.csv",
1089
+ ... format="csv",
1090
+ ... include_file_path=True
1091
+ ... )
1092
+ >>> print(type(df))
1093
+ <class 'polars.DataFrame'>
1094
+ >>>
1095
+ >>> # Batch process Parquet files
1096
+ >>> for batch in fs.read_files(
1097
+ ... "data/*.parquet",
1098
+ ... format="parquet",
1099
+ ... batch_size=100,
1100
+ ... use_threads=True
1101
+ ... ):
1102
+ ... print(f"Batch type: {type(batch)}")
1103
+ >>>
1104
+ >>> # Read JSON Lines
1105
+ >>> df = fs.read_files(
1106
+ ... "logs/*.jsonl",
1107
+ ... format="json",
1108
+ ... jsonlines=True,
1109
+ ... concat=True
1110
+ ... )
1111
+ >>> print(df.columns)
737
1112
  """
738
1113
  if format == "json":
739
1114
  if batch_size is not None:
@@ -749,8 +1124,8 @@ def read_files(
749
1124
  **kwargs,
750
1125
  )
751
1126
  return read_json(
752
- self,
753
- path,
1127
+ self=self,
1128
+ path=path,
754
1129
  include_file_path=include_file_path,
755
1130
  jsonlines=jsonlines,
756
1131
  concat=concat,
@@ -771,8 +1146,8 @@ def read_files(
771
1146
  **kwargs,
772
1147
  )
773
1148
  return read_csv(
774
- self,
775
- path,
1149
+ self=self,
1150
+ path=path,
776
1151
  include_file_path=include_file_path,
777
1152
  use_threads=use_threads,
778
1153
  concat=concat,
@@ -792,8 +1167,8 @@ def read_files(
792
1167
  **kwargs,
793
1168
  )
794
1169
  return read_parquet(
795
- self,
796
- path,
1170
+ self=self,
1171
+ path=path,
797
1172
  include_file_path=include_file_path,
798
1173
  use_threads=use_threads,
799
1174
  concat=concat,
@@ -803,26 +1178,64 @@ def read_files(
803
1178
 
804
1179
 
805
1180
  def pyarrow_dataset(
806
- self,
1181
+ self: AbstractFileSystem,
807
1182
  path: str,
808
- format="parquet",
1183
+ format: str = "parquet",
809
1184
  schema: pa.Schema | None = None,
810
1185
  partitioning: str | list[str] | pds.Partitioning = None,
811
- **kwargs,
1186
+ **kwargs: Any,
812
1187
  ) -> pds.Dataset:
813
- """
814
- Create a pyarrow dataset.
1188
+ """Create a PyArrow dataset from files in any supported format.
1189
+
1190
+ Creates a dataset that provides optimized reading and querying capabilities
1191
+ including:
1192
+ - Schema inference and enforcement
1193
+ - Partition discovery and pruning
1194
+ - Predicate pushdown
1195
+ - Column projection
815
1196
 
816
1197
  Args:
817
- path: (str) Path to the dataset.
818
- format: (str, optional) Format of the dataset. Defaults to 'parquet'.
819
- schema: (pa.Schema, optional) Schema of the dataset. Defaults to None.
820
- partitioning: (str | list[str] | pds.Partitioning, optional) Partitioning of the dataset.
821
- Defaults to None.
822
- **kwargs: Additional keyword arguments.
1198
+ path: Base path to dataset files
1199
+ format: File format. Currently supports:
1200
+ - "parquet" (default)
1201
+ - "csv"
1202
+ - "json" (experimental)
1203
+ schema: Optional schema to enforce. If None, inferred from data.
1204
+ partitioning: How the dataset is partitioned. Can be:
1205
+ - str: Single partition field
1206
+ - list[str]: Multiple partition fields
1207
+ - pds.Partitioning: Custom partitioning scheme
1208
+ **kwargs: Additional arguments for dataset creation
823
1209
 
824
1210
  Returns:
825
- (pds.Dataset): Pyarrow dataset.
1211
+ pds.Dataset: PyArrow dataset instance
1212
+
1213
+ Example:
1214
+ >>> fs = LocalFileSystem()
1215
+ >>> # Simple Parquet dataset
1216
+ >>> ds = fs.pyarrow_dataset("data/")
1217
+ >>> print(ds.schema)
1218
+ >>>
1219
+ >>> # Partitioned dataset
1220
+ >>> ds = fs.pyarrow_dataset(
1221
+ ... "events/",
1222
+ ... partitioning=["year", "month"]
1223
+ ... )
1224
+ >>> # Query with partition pruning
1225
+ >>> table = ds.to_table(
1226
+ ... filter=(ds.field("year") == 2024)
1227
+ ... )
1228
+ >>>
1229
+ >>> # CSV with schema
1230
+ >>> ds = fs.pyarrow_dataset(
1231
+ ... "logs/",
1232
+ ... format="csv",
1233
+ ... schema=pa.schema([
1234
+ ... ("timestamp", pa.timestamp("s")),
1235
+ ... ("level", pa.string()),
1236
+ ... ("message", pa.string())
1237
+ ... ])
1238
+ ... )
826
1239
  """
827
1240
  return pds.dataset(
828
1241
  path,
@@ -835,24 +1248,52 @@ def pyarrow_dataset(
835
1248
 
836
1249
 
837
1250
  def pyarrow_parquet_dataset(
838
- self,
1251
+ self: AbstractFileSystem,
839
1252
  path: str,
840
1253
  schema: pa.Schema | None = None,
841
1254
  partitioning: str | list[str] | pds.Partitioning = None,
842
- **kwargs,
1255
+ **kwargs: Any,
843
1256
  ) -> pds.Dataset:
844
- """
845
- Create a pyarrow dataset from a parquet_metadata file.
1257
+ """Create a PyArrow dataset optimized for Parquet files.
1258
+
1259
+ Creates a dataset specifically for Parquet data, automatically handling
1260
+ _metadata files for optimized reading.
1261
+
1262
+ This function is particularly useful for:
1263
+ - Datasets with existing _metadata files
1264
+ - Multi-file datasets that should be treated as one
1265
+ - Partitioned Parquet datasets
846
1266
 
847
1267
  Args:
848
- path: (str) Path to the dataset.
849
- schema: (pa.Schema, optional) Schema of the dataset. Defaults to None.
850
- partitioning: (str | list[str] | pds.Partitioning, optional) Partitioning of the dataset.
851
- Defaults to None.
852
- **kwargs: Additional keyword arguments.
1268
+ path: Path to dataset directory or _metadata file
1269
+ schema: Optional schema to enforce. If None, inferred from data.
1270
+ partitioning: How the dataset is partitioned. Can be:
1271
+ - str: Single partition field
1272
+ - list[str]: Multiple partition fields
1273
+ - pds.Partitioning: Custom partitioning scheme
1274
+ **kwargs: Additional dataset arguments
853
1275
 
854
1276
  Returns:
855
- (pds.Dataset): Pyarrow dataset.
1277
+ pds.Dataset: PyArrow dataset instance
1278
+
1279
+ Example:
1280
+ >>> fs = LocalFileSystem()
1281
+ >>> # Dataset with _metadata
1282
+ >>> ds = fs.pyarrow_parquet_dataset("data/_metadata")
1283
+ >>> print(ds.files) # Shows all data files
1284
+ >>>
1285
+ >>> # Partitioned dataset directory
1286
+ >>> ds = fs.pyarrow_parquet_dataset(
1287
+ ... "sales/",
1288
+ ... partitioning=["year", "region"]
1289
+ ... )
1290
+ >>> # Query with partition pruning
1291
+ >>> table = ds.to_table(
1292
+ ... filter=(
1293
+ ... (ds.field("year") == 2024) &
1294
+ ... (ds.field("region") == "EMEA")
1295
+ ... )
1296
+ ... )
856
1297
  """
857
1298
  if not self.is_file(path):
858
1299
  path = posixpath.join(path, "_metadata")
@@ -866,22 +1307,49 @@ def pyarrow_parquet_dataset(
866
1307
 
867
1308
 
868
1309
  def pydala_dataset(
869
- self,
1310
+ self: AbstractFileSystem,
870
1311
  path: str,
871
1312
  partitioning: str | list[str] | pds.Partitioning = None,
872
- **kwargs,
1313
+ **kwargs: Any,
873
1314
  ) -> ParquetDataset: # type: ignore
874
- """
875
- Create a pydala dataset.
1315
+ """Create a Pydala dataset for advanced Parquet operations.
1316
+
1317
+ Creates a dataset with additional features beyond PyArrow including:
1318
+ - Delta table support
1319
+ - Schema evolution
1320
+ - Advanced partitioning
1321
+ - Metadata management
1322
+ - Sort key optimization
876
1323
 
877
1324
  Args:
878
- path: (str) Path to the dataset.
879
- partitioning: (str | list[str] | pds.Partitioning, optional) Partitioning of the dataset.
880
- Defaults to None.
881
- **kwargs: Additional keyword arguments.
1325
+ path: Path to dataset directory
1326
+ partitioning: How the dataset is partitioned. Can be:
1327
+ - str: Single partition field
1328
+ - list[str]: Multiple partition fields
1329
+ - pds.Partitioning: Custom partitioning scheme
1330
+ **kwargs: Additional dataset configuration
882
1331
 
883
1332
  Returns:
884
- (ParquetDataset): Pydala dataset.
1333
+ ParquetDataset: Pydala dataset instance
1334
+
1335
+ Example:
1336
+ >>> fs = LocalFileSystem()
1337
+ >>> # Create dataset
1338
+ >>> ds = fs.pydala_dataset(
1339
+ ... "data/",
1340
+ ... partitioning=["date"]
1341
+ ... )
1342
+ >>>
1343
+ >>> # Write with delta support
1344
+ >>> ds.write_to_dataset(
1345
+ ... new_data,
1346
+ ... mode="delta",
1347
+ ... delta_subset=["id"]
1348
+ ... )
1349
+ >>>
1350
+ >>> # Read with metadata
1351
+ >>> df = ds.to_polars()
1352
+ >>> print(df.columns)
885
1353
  """
886
1354
  return ParquetDataset(
887
1355
  path,
@@ -892,23 +1360,62 @@ def pydala_dataset(
892
1360
 
893
1361
 
894
1362
  def write_parquet(
895
- self,
1363
+ self: AbstractFileSystem,
896
1364
  data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict],
897
1365
  path: str,
898
1366
  schema: pa.Schema | None = None,
899
- **kwargs,
1367
+ **kwargs: Any,
900
1368
  ) -> pq.FileMetaData:
901
- """
902
- Write a DataFrame to a Parquet file.
1369
+ """Write data to a Parquet file with automatic format conversion.
1370
+
1371
+ Handles writing data from multiple input formats to Parquet with:
1372
+ - Automatic conversion to PyArrow
1373
+ - Schema validation/coercion
1374
+ - Metadata collection
1375
+ - Compression and encoding options
903
1376
 
904
1377
  Args:
905
- data: (pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame) Data to write.
906
- path: (str) Path to write the data.
907
- schema: (pa.Schema, optional) Schema of the data. Defaults to None.
908
- **kwargs: Additional keyword arguments for `pq.write_table`.
1378
+ data: Input data in various formats:
1379
+ - Polars DataFrame/LazyFrame
1380
+ - PyArrow Table
1381
+ - Pandas DataFrame
1382
+ - Dict or list of dicts
1383
+ path: Output Parquet file path
1384
+ schema: Optional schema to enforce on write
1385
+ **kwargs: Additional arguments for pq.write_table()
909
1386
 
910
1387
  Returns:
911
- (pq.FileMetaData): Parquet file metadata.
1388
+ pq.FileMetaData: Metadata of written Parquet file
1389
+
1390
+ Raises:
1391
+ SchemaError: If data doesn't match schema
1392
+ ValueError: If data cannot be converted
1393
+
1394
+ Example:
1395
+ >>> fs = LocalFileSystem()
1396
+ >>> # Write Polars DataFrame
1397
+ >>> df = pl.DataFrame({
1398
+ ... "id": range(1000),
1399
+ ... "value": pl.Series(np.random.randn(1000))
1400
+ ... })
1401
+ >>> metadata = fs.write_parquet(
1402
+ ... df,
1403
+ ... "data.parquet",
1404
+ ... compression="zstd",
1405
+ ... compression_level=3
1406
+ ... )
1407
+ >>> print(f"Rows: {metadata.num_rows}")
1408
+ >>>
1409
+ >>> # Write with schema
1410
+ >>> schema = pa.schema([
1411
+ ... ("id", pa.int64()),
1412
+ ... ("value", pa.float64())
1413
+ ... ])
1414
+ >>> metadata = fs.write_parquet(
1415
+ ... {"id": [1, 2], "value": [0.1, 0.2]},
1416
+ ... "data.parquet",
1417
+ ... schema=schema
1418
+ ... )
912
1419
  """
913
1420
  data = to_pyarrow_table(data, concat=False, unique=False)
914
1421
 
@@ -922,23 +1429,46 @@ def write_parquet(
922
1429
 
923
1430
 
924
1431
  def write_json(
925
- self,
926
- data: (
927
- dict | pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict]
928
- ),
1432
+ self: AbstractFileSystem,
1433
+ data: dict
1434
+ | pl.DataFrame
1435
+ | pl.LazyFrame
1436
+ | pa.Table
1437
+ | pd.DataFrame
1438
+ | dict
1439
+ | list[dict],
929
1440
  path: str,
930
1441
  append: bool = False,
931
1442
  ) -> None:
932
- """
933
- Write a dictionary, DataFrame or Table to a JSON file.
1443
+ """Write data to a JSON file with flexible input support.
934
1444
 
935
- Args:
936
- data: (dict | pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame) Data to write.
937
- path: (str) Path to write the data.
938
- append: (bool, optional) If True, append to the file. Defaults to False.
1445
+ Handles writing data in various formats to JSON or JSON Lines,
1446
+ with optional appending for streaming writes.
939
1447
 
940
- Returns:
941
- None
1448
+ Args:
1449
+ data: Input data in various formats:
1450
+ - Dict or list of dicts
1451
+ - Polars DataFrame/LazyFrame
1452
+ - PyArrow Table
1453
+ - Pandas DataFrame
1454
+ path: Output JSON file path
1455
+ append: Whether to append to existing file (JSON Lines mode)
1456
+
1457
+ Example:
1458
+ >>> fs = LocalFileSystem()
1459
+ >>> # Write dictionary
1460
+ >>> data = {"name": "test", "values": [1, 2, 3]}
1461
+ >>> fs.write_json(data, "config.json")
1462
+ >>>
1463
+ >>> # Stream records
1464
+ >>> df1 = pl.DataFrame({"id": [1], "value": ["first"]})
1465
+ >>> df2 = pl.DataFrame({"id": [2], "value": ["second"]})
1466
+ >>> fs.write_json(df1, "stream.jsonl", append=False)
1467
+ >>> fs.write_json(df2, "stream.jsonl", append=True)
1468
+ >>>
1469
+ >>> # Convert PyArrow
1470
+ >>> table = pa.table({"a": [1, 2], "b": ["x", "y"]})
1471
+ >>> fs.write_json(table, "data.json")
942
1472
  """
943
1473
  if isinstance(data, pl.LazyFrame):
944
1474
  data = data.collect()
@@ -951,46 +1481,97 @@ def write_json(
951
1481
  data = data.to_pydict()
952
1482
  if append:
953
1483
  with self.open(path, "ab") as f:
954
- f.write(orjson.dumps(data))
955
- f.write(b"\n")
1484
+ if isinstance(data, dict):
1485
+ f.write(orjson.dumps(data) + b"\n")
1486
+ else:
1487
+ for record in data:
1488
+ f.write(orjson.dumps(record) + b"\n")
956
1489
  else:
957
1490
  with self.open(path, "wb") as f:
958
1491
  f.write(orjson.dumps(data))
959
1492
 
960
1493
 
961
1494
  def write_csv(
962
- self,
963
- data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict,
1495
+ self: AbstractFileSystem,
1496
+ data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict],
964
1497
  path: str,
965
- **kwargs,
1498
+ append: bool = False,
1499
+ **kwargs: Any,
966
1500
  ) -> None:
967
- """
968
- Write a DataFrame to a CSV file.
1501
+ """Write data to a CSV file with flexible input support.
969
1502
 
970
- Args:
971
- data: (pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame) Data to write.
972
- path: (str) Path to write the data.
973
- **kwargs: Additional keyword arguments for `pl.DataFrame.write_csv`.
1503
+ Handles writing data from multiple formats to CSV with options for:
1504
+ - Appending to existing files
1505
+ - Custom delimiters and formatting
1506
+ - Automatic type conversion
1507
+ - Header handling
974
1508
 
975
- Returns:
976
- None
1509
+ Args:
1510
+ data: Input data in various formats:
1511
+ - Polars DataFrame/LazyFrame
1512
+ - PyArrow Table
1513
+ - Pandas DataFrame
1514
+ - Dict or list of dicts
1515
+ path: Output CSV file path
1516
+ append: Whether to append to existing file
1517
+ **kwargs: Additional arguments for CSV writing:
1518
+ - delimiter: Field separator (default ",")
1519
+ - header: Whether to write header row
1520
+ - quote_char: Character for quoting fields
1521
+ - date_format: Format for date/time fields
1522
+ - float_precision: Decimal places for floats
1523
+
1524
+ Example:
1525
+ >>> fs = LocalFileSystem()
1526
+ >>> # Write Polars DataFrame
1527
+ >>> df = pl.DataFrame({
1528
+ ... "id": range(100),
1529
+ ... "name": ["item_" + str(i) for i in range(100)]
1530
+ ... })
1531
+ >>> fs.write_csv(df, "items.csv")
1532
+ >>>
1533
+ >>> # Append records
1534
+ >>> new_items = pl.DataFrame({
1535
+ ... "id": range(100, 200),
1536
+ ... "name": ["item_" + str(i) for i in range(100, 200)]
1537
+ ... })
1538
+ >>> fs.write_csv(
1539
+ ... new_items,
1540
+ ... "items.csv",
1541
+ ... append=True,
1542
+ ... header=False
1543
+ ... )
1544
+ >>>
1545
+ >>> # Custom formatting
1546
+ >>> data = pa.table({
1547
+ ... "date": [datetime.now()],
1548
+ ... "value": [123.456]
1549
+ ... })
1550
+ >>> fs.write_csv(
1551
+ ... data,
1552
+ ... "formatted.csv",
1553
+ ... date_format="%Y-%m-%d",
1554
+ ... float_precision=2
1555
+ ... )
977
1556
  """
978
- if isinstance(data, dict | list):
979
- data = _dict_to_dataframe(data)
980
- elif isinstance(data, pl.LazyFrame):
1557
+ if isinstance(data, pl.LazyFrame):
981
1558
  data = data.collect()
982
- elif isinstance(data, pa.Table):
983
- data = pl.from_arrow(data)
984
- elif isinstance(data, pd.DataFrame):
985
- data = pl.from_pandas(data)
986
-
987
- with self.open(path, "w") as f:
988
- data.write_csv(f, **kwargs)
1559
+ if isinstance(data, pl.DataFrame):
1560
+ if append:
1561
+ with self.open(path, "ab") as f:
1562
+ data.write_csv(f, has_header=not append, **kwargs)
1563
+ else:
1564
+ with self.open(path, "wb") as f:
1565
+ data.write_csv(f, **kwargs)
1566
+ elif isinstance(data, (pa.Table, pd.DataFrame)):
1567
+ pl.from_arrow(pa.table(data)).write_csv(path, **kwargs)
1568
+ else:
1569
+ pl.DataFrame(data).write_csv(path, **kwargs)
989
1570
 
990
1571
 
991
1572
  def write_file(
992
1573
  self,
993
- data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict],
1574
+ data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict,
994
1575
  path: str,
995
1576
  format: str,
996
1577
  **kwargs,
@@ -1054,7 +1635,7 @@ def write_files(
1054
1635
  basename: (str, optional) Basename of the files. Defaults to None.
1055
1636
  format: (str, optional) Format of the data. Defaults to None.
1056
1637
  concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
1057
- unique: (bool | list[str] | str, optional) If True, remove duplicates. Defaults to False.
1638
+ unique: (bool, optional) If True, remove duplicates. Defaults to False.
1058
1639
  mode: (str, optional) Write mode. Defaults to 'append'. Options: 'append', 'overwrite', 'delete_matching',
1059
1640
  'error_if_exists'.
1060
1641
  use_threads: (bool, optional) If True, use parallel processing. Defaults to True.
@@ -1202,7 +1783,7 @@ def write_pyarrow_dataset(
1202
1783
  max_rows_per_file: (int, optional) Maximum number of rows per file. Defaults to 2_500_000.
1203
1784
  row_group_size: (int, optional) Row group size. Defaults to 250_000.
1204
1785
  concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
1205
- unique: (bool | list[str] | str, optional) If True, remove duplicates. Defaults to False.
1786
+ unique: (bool | str | list[str], optional) If True, remove duplicates. Defaults to False.
1206
1787
  **kwargs: Additional keyword arguments for `pds.write_dataset`.
1207
1788
 
1208
1789
  Returns: