FlowerPower 0.9.13.1__py3-none-any.whl → 1.0.0b1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/__init__.py +17 -2
- flowerpower/cfg/__init__.py +201 -149
- flowerpower/cfg/base.py +122 -24
- flowerpower/cfg/pipeline/__init__.py +254 -0
- flowerpower/cfg/pipeline/adapter.py +66 -0
- flowerpower/cfg/pipeline/run.py +40 -11
- flowerpower/cfg/pipeline/schedule.py +69 -79
- flowerpower/cfg/project/__init__.py +149 -0
- flowerpower/cfg/project/adapter.py +57 -0
- flowerpower/cfg/project/job_queue.py +165 -0
- flowerpower/cli/__init__.py +92 -37
- flowerpower/cli/job_queue.py +878 -0
- flowerpower/cli/mqtt.py +32 -1
- flowerpower/cli/pipeline.py +559 -406
- flowerpower/cli/utils.py +29 -18
- flowerpower/flowerpower.py +12 -8
- flowerpower/fs/__init__.py +20 -2
- flowerpower/fs/base.py +350 -26
- flowerpower/fs/ext.py +797 -216
- flowerpower/fs/storage_options.py +1097 -55
- flowerpower/io/base.py +13 -18
- flowerpower/io/loader/__init__.py +28 -0
- flowerpower/io/loader/deltatable.py +7 -10
- flowerpower/io/metadata.py +1 -0
- flowerpower/io/saver/__init__.py +28 -0
- flowerpower/io/saver/deltatable.py +4 -3
- flowerpower/job_queue/__init__.py +252 -0
- flowerpower/job_queue/apscheduler/__init__.py +11 -0
- flowerpower/job_queue/apscheduler/_setup/datastore.py +110 -0
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +93 -0
- flowerpower/job_queue/apscheduler/manager.py +1063 -0
- flowerpower/job_queue/apscheduler/setup.py +524 -0
- flowerpower/job_queue/apscheduler/trigger.py +169 -0
- flowerpower/job_queue/apscheduler/utils.py +309 -0
- flowerpower/job_queue/base.py +382 -0
- flowerpower/job_queue/rq/__init__.py +10 -0
- flowerpower/job_queue/rq/_trigger.py +37 -0
- flowerpower/job_queue/rq/concurrent_workers/gevent_worker.py +226 -0
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +231 -0
- flowerpower/job_queue/rq/manager.py +1449 -0
- flowerpower/job_queue/rq/setup.py +150 -0
- flowerpower/job_queue/rq/utils.py +69 -0
- flowerpower/pipeline/__init__.py +5 -0
- flowerpower/pipeline/base.py +118 -0
- flowerpower/pipeline/io.py +407 -0
- flowerpower/pipeline/job_queue.py +505 -0
- flowerpower/pipeline/manager.py +1586 -0
- flowerpower/pipeline/registry.py +560 -0
- flowerpower/pipeline/runner.py +560 -0
- flowerpower/pipeline/visualizer.py +142 -0
- flowerpower/plugins/mqtt/__init__.py +12 -0
- flowerpower/plugins/mqtt/cfg.py +16 -0
- flowerpower/plugins/mqtt/manager.py +789 -0
- flowerpower/settings.py +110 -0
- flowerpower/utils/logging.py +21 -0
- flowerpower/utils/misc.py +57 -9
- flowerpower/utils/sql.py +122 -24
- flowerpower/utils/templates.py +2 -142
- flowerpower-1.0.0b1.dist-info/METADATA +324 -0
- flowerpower-1.0.0b1.dist-info/RECORD +94 -0
- flowerpower/_web/__init__.py +0 -61
- flowerpower/_web/routes/config.py +0 -103
- flowerpower/_web/routes/pipelines.py +0 -173
- flowerpower/_web/routes/scheduler.py +0 -136
- flowerpower/cfg/pipeline/tracker.py +0 -14
- flowerpower/cfg/project/open_telemetry.py +0 -8
- flowerpower/cfg/project/tracker.py +0 -11
- flowerpower/cfg/project/worker.py +0 -19
- flowerpower/cli/scheduler.py +0 -309
- flowerpower/cli/web.py +0 -44
- flowerpower/event_handler.py +0 -23
- flowerpower/mqtt.py +0 -609
- flowerpower/pipeline.py +0 -2499
- flowerpower/scheduler.py +0 -680
- flowerpower/tui.py +0 -79
- flowerpower/utils/datastore.py +0 -186
- flowerpower/utils/eventbroker.py +0 -127
- flowerpower/utils/executor.py +0 -58
- flowerpower/utils/trigger.py +0 -140
- flowerpower-0.9.13.1.dist-info/METADATA +0 -586
- flowerpower-0.9.13.1.dist-info/RECORD +0 -76
- /flowerpower/{cfg/pipeline/params.py → cli/worker.py} +0 -0
- {flowerpower-0.9.13.1.dist-info → flowerpower-1.0.0b1.dist-info}/WHEEL +0 -0
- {flowerpower-0.9.13.1.dist-info → flowerpower-1.0.0b1.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.9.13.1.dist-info → flowerpower-1.0.0b1.dist-info}/top_level.txt +0 -0
flowerpower/fs/ext.py
CHANGED
@@ -2,7 +2,7 @@ import datetime as dt
 import importlib
 import posixpath
 import uuid
-from typing import Generator
+from typing import Any, Generator
 
 import orjson
 import pandas as pd
@@ -11,12 +11,8 @@ import pyarrow.dataset as pds
 import pyarrow.parquet as pq
 from fsspec import AbstractFileSystem
 
-from ..utils.misc import (
-
-    convert_large_types_to_standard,
-    run_parallel,
-    to_pyarrow_table,
-)
+from ..utils.misc import (_dict_to_dataframe, convert_large_types_to_standard,
+                          run_parallel, to_pyarrow_table)
 from ..utils.polars import pl
 
 if importlib.util.find_spec("duckdb") is not None:
@@ -31,6 +27,34 @@ else:
 
 
 def path_to_glob(path: str, format: str | None = None) -> str:
+    """Convert a path to a glob pattern for file matching.
+
+    Intelligently converts paths to glob patterns that match files of the specified
+    format, handling various directory and wildcard patterns.
+
+    Args:
+        path: Base path to convert. Can include wildcards (* or **).
+            Examples: "data/", "data/*.json", "data/**"
+        format: File format to match (without dot). If None, inferred from path.
+            Examples: "json", "csv", "parquet"
+
+    Returns:
+        str: Glob pattern that matches files of specified format.
+            Examples: "data/**/*.json", "data/*.csv"
+
+    Example:
+        >>> # Basic directory
+        >>> path_to_glob("data", "json")
+        'data/**/*.json'
+        >>>
+        >>> # With wildcards
+        >>> path_to_glob("data/**", "csv")
+        'data/**/*.csv'
+        >>>
+        >>> # Format inference
+        >>> path_to_glob("data/file.parquet")
+        'data/file.parquet'
+    """
     path = path.rstrip("/")
     if format is None:
         if ".json" in path:
@@ -53,8 +77,42 @@ def path_to_glob(path: str, format: str | None = None) -> str:
 
 
 def _read_json_file(
-    path
+    path: str,
+    self: AbstractFileSystem,
+    include_file_path: bool = False,
+    jsonlines: bool = False,
 ) -> dict | list[dict]:
+    """Read a JSON file from any filesystem.
+
+    Internal function that handles both regular JSON and JSON Lines formats.
+
+    Args:
+        path: Path to JSON file
+        self: Filesystem instance to use for reading
+        include_file_path: Whether to return dict with filepath as key
+        jsonlines: Whether to read as JSON Lines format
+
+    Returns:
+        dict | list[dict]: Parsed JSON data. If include_file_path=True,
+            returns {filepath: data}
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Regular JSON
+        >>> data = _read_json_file("data.json", fs)
+        >>> print(type(data))
+        <class 'dict'>
+        >>>
+        >>> # JSON Lines with filepath
+        >>> data = _read_json_file(
+        ...     "data.jsonl",
+        ...     fs,
+        ...     include_file_path=True,
+        ...     jsonlines=True
+        ... )
+        >>> print(list(data.keys())[0])
+        'data.jsonl'
+    """
     with self.open(path) as f:
         if jsonlines:
             data = [orjson.loads(line) for line in f.readlines()]
@@ -66,10 +124,47 @@ def _read_json_file(
 
 
 def read_json_file(
-    self
+    self: AbstractFileSystem,
+    path: str,
+    include_file_path: bool = False,
+    jsonlines: bool = False,
 ) -> dict | list[dict]:
+    """Read a single JSON file from any filesystem.
+
+    A public wrapper around _read_json_file providing a clean interface for
+    reading individual JSON files.
+
+    Args:
+        path: Path to JSON file to read
+        include_file_path: Whether to return dict with filepath as key
+        jsonlines: Whether to read as JSON Lines format
+
+    Returns:
+        dict | list[dict]: Parsed JSON data. For regular JSON, returns a dict.
+            For JSON Lines, returns a list of dicts. If include_file_path=True,
+            returns {filepath: data}.
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Read regular JSON
+        >>> data = fs.read_json_file("config.json")
+        >>> print(data["setting"])
+        'value'
+        >>>
+        >>> # Read JSON Lines with filepath
+        >>> data = fs.read_json_file(
+        ...     "logs.jsonl",
+        ...     include_file_path=True,
+        ...     jsonlines=True
+        ... )
+        >>> print(list(data.keys())[0])
+        'logs.jsonl'
+    """
     return _read_json_file(
-        path=path,
+        path=path,
+        self=self,
+        include_file_path=include_file_path,
+        jsonlines=jsonlines,
     )
 
 
@@ -152,7 +247,7 @@ def _read_json(
 
 
 def _read_json_batches(
-    self,
+    self: AbstractFileSystem,
     path: str | list[str],
     batch_size: int | None = None,
     include_file_path: bool = False,
@@ -161,24 +256,49 @@ def _read_json_batches(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
-    **kwargs,
+    **kwargs: Any,
 ) -> Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]:
-    """
-
+    """Process JSON files in batches with optional parallel reading.
+
+    Internal generator function that handles batched reading of JSON files
+    with support for parallel processing within each batch.
 
     Args:
-        path: (
-        batch_size:
-        include_file_path:
-        jsonlines:
-        as_dataframe:
-        concat:
-        use_threads:
-        verbose:
-        **kwargs: Additional
+        path: Path(s) to JSON file(s). Glob patterns supported.
+        batch_size: Number of files to process in each batch
+        include_file_path: Include source filepath in output
+        jsonlines: Whether to read as JSON Lines format
+        as_dataframe: Convert output to Polars DataFrame(s)
+        concat: Combine files within each batch
+        use_threads: Enable parallel file reading within batches
+        verbose: Print progress information
+        **kwargs: Additional arguments for DataFrame conversion
 
     Yields:
-
+        Each batch of data in requested format:
+        - dict | list[dict]: Raw JSON data
+        - pl.DataFrame: Single DataFrame if concat=True
+        - list[pl.DataFrame]: List of DataFrames if concat=False
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Process large dataset in batches
+        >>> for batch in fs._read_json_batches(
+        ...     "data/*.json",
+        ...     batch_size=100,
+        ...     as_dataframe=True,
+        ...     verbose=True
+        ... ):
+        ...     print(f"Batch shape: {batch.shape}")
+        >>>
+        >>> # Parallel batch processing with filepath tracking
+        >>> for batch in fs._read_json_batches(
+        ...     ["logs1.jsonl", "logs2.jsonl"],
+        ...     batch_size=1,
+        ...     include_file_path=True,
+        ...     use_threads=True
+        ... ):
+        ...     print(f"Processing {batch['file_path'][0]}")
     """
     # Handle path resolution
     if isinstance(path, str):
@@ -218,10 +338,13 @@ def _read_json_batches(
                 batch_dfs = [pl.DataFrame(d) for d in batch_data]
             else:
                 batch_dfs = [
-
-                    pl.
-
-
+                    [
+                        pl.DataFrame(_data[k]).with_columns(
+                            pl.lit(k).alias("file_path")
+                        )
+                        for k in _data
+                    ][0]
+                    for _data in batch_data
                 ]
 
             if concat and len(batch_dfs) > 1:
@@ -233,7 +356,7 @@ def _read_json_batches(
 
 
 def read_json(
-    self,
+    self: AbstractFileSystem,
     path: str | list[str],
     batch_size: int | None = None,
     include_file_path: bool = False,
@@ -242,7 +365,7 @@ def read_json(
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
-    **kwargs,
+    **kwargs: Any,
 ) -> (
     dict
     | list[dict]
@@ -250,27 +373,65 @@ def read_json(
     | list[pl.DataFrame]
     | Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]
 ):
-    """
-
-
+    """Read JSON data from one or more files with powerful options.
+
+    Provides a flexible interface for reading JSON data with support for:
+    - Single file or multiple files
+    - Regular JSON or JSON Lines format
+    - Batch processing for large datasets
+    - Parallel processing
+    - DataFrame conversion
+    - File path tracking
 
     Args:
-        path: (
-
-
-
-
-
-
-
-
+        path: Path(s) to JSON file(s). Can be:
+            - Single path string (globs supported)
+            - List of path strings
+        batch_size: If set, enables batch reading with this many files per batch
+        include_file_path: Include source filepath in output
+        jsonlines: Whether to read as JSON Lines format
+        as_dataframe: Convert output to Polars DataFrame(s)
+        concat: Combine multiple files/batches into single result
+        use_threads: Enable parallel file reading
+        verbose: Print progress information
+        **kwargs: Additional arguments passed to DataFrame conversion
 
     Returns:
-
-
-
-
-
+        Various types depending on arguments:
+        - dict: Single JSON file as dictionary
+        - list[dict]: Multiple JSON files as list of dictionaries
+        - pl.DataFrame: Single or concatenated DataFrame
+        - list[pl.DataFrame]: List of DataFrames (if concat=False)
+        - Generator: If batch_size set, yields batches of above types
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Read all JSON files in directory
+        >>> df = fs.read_json(
+        ...     "data/*.json",
+        ...     as_dataframe=True,
+        ...     concat=True
+        ... )
+        >>> print(df.shape)
+        (1000, 5)  # Combined data from all files
+        >>>
+        >>> # Batch process large dataset
+        >>> for batch_df in fs.read_json(
+        ...     "logs/*.jsonl",
+        ...     batch_size=100,
+        ...     jsonlines=True,
+        ...     include_file_path=True
+        ... ):
+        ...     print(f"Processing {len(batch_df)} records")
+        >>>
+        >>> # Parallel read with custom options
+        >>> dfs = fs.read_json(
+        ...     ["file1.json", "file2.json"],
+        ...     use_threads=True,
+        ...     concat=False,
+        ...     verbose=True
+        ... )
+        >>> print(f"Read {len(dfs)} files")
     """
     if batch_size is not None:
         return _read_json_batches(
@@ -299,9 +460,34 @@ def read_json(
 
 
 def _read_csv_file(
-    path, self, include_file_path: bool = False, **kwargs
+    path: str, self: AbstractFileSystem, include_file_path: bool = False, **kwargs: Any
 ) -> pl.DataFrame:
-
+    """Read a single CSV file from any filesystem.
+
+    Internal function that handles reading individual CSV files and optionally
+    adds the source filepath as a column.
+
+    Args:
+        path: Path to CSV file
+        self: Filesystem instance to use for reading
+        include_file_path: Add source filepath as a column
+        **kwargs: Additional arguments passed to pl.read_csv()
+
+    Returns:
+        pl.DataFrame: DataFrame containing CSV data
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> df = _read_csv_file(
+        ...     "data.csv",
+        ...     fs,
+        ...     include_file_path=True,
+        ...     delimiter="|"
+        ... )
+        >>> print("file_path" in df.columns)
+        True
+    """
+    print(path) # Debug info
     with self.open(path) as f:
         df = pl.read_csv(f, **kwargs)
     if include_file_path:
@@ -371,29 +557,54 @@ def _read_csv(
 
 
 def _read_csv_batches(
-    self,
+    self: AbstractFileSystem,
     path: str | list[str],
     batch_size: int | None = None,
     include_file_path: bool = False,
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
-    **kwargs,
-) -> Generator[pl.DataFrame, None, None]:
-    """
-
+    **kwargs: Any,
+) -> Generator[pl.DataFrame | list[pl.DataFrame], None, None]:
+    """Process CSV files in batches with optional parallel reading.
+
+    Internal generator function that handles batched reading of CSV files
+    with support for parallel processing within each batch.
 
     Args:
-        path: (
-        batch_size:
-        include_file_path:
-        concat:
-        use_threads:
-        verbose:
-        **kwargs: Additional
+        path: Path(s) to CSV file(s). Glob patterns supported.
+        batch_size: Number of files to process in each batch
+        include_file_path: Add source filepath as a column
+        concat: Combine files within each batch
+        use_threads: Enable parallel file reading within batches
+        verbose: Print progress information
+        **kwargs: Additional arguments passed to pl.read_csv()
 
     Yields:
-
+        Each batch of data in requested format:
+        - pl.DataFrame: Single DataFrame if concat=True
+        - list[pl.DataFrame]: List of DataFrames if concat=False
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Process large dataset in batches
+        >>> for batch in fs._read_csv_batches(
+        ...     "data/*.csv",
+        ...     batch_size=100,
+        ...     include_file_path=True,
+        ...     verbose=True
+        ... ):
+        ...     print(f"Batch columns: {batch.columns}")
+        >>>
+        >>> # Parallel processing without concatenation
+        >>> for batch in fs._read_csv_batches(
+        ...     ["file1.csv", "file2.csv"],
+        ...     batch_size=1,
+        ...     concat=False,
+        ...     use_threads=True
+        ... ):
+        ...     for df in batch:
+        ...         print(f"DataFrame shape: {df.shape}")
     """
     # Handle path resolution
     if isinstance(path, str):
@@ -435,39 +646,71 @@ def _read_csv_batches(
 
 
 def read_csv(
-    self,
+    self: AbstractFileSystem,
     path: str | list[str],
     batch_size: int | None = None,
     include_file_path: bool = False,
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
-    **kwargs,
+    **kwargs: Any,
 ) -> (
     pl.DataFrame
     | list[pl.DataFrame]
    | Generator[pl.DataFrame | list[pl.DataFrame], None, None]
 ):
-    """
-
-
+    """Read CSV data from one or more files with powerful options.
+
+    Provides a flexible interface for reading CSV files with support for:
+    - Single file or multiple files
+    - Batch processing for large datasets
+    - Parallel processing
+    - File path tracking
+    - Polars DataFrame output
 
     Args:
-        path: (
-
-
-
-
-
-
+        path: Path(s) to CSV file(s). Can be:
+            - Single path string (globs supported)
+            - List of path strings
+        batch_size: If set, enables batch reading with this many files per batch
+        include_file_path: Add source filepath as a column
+        concat: Combine multiple files/batches into single DataFrame
+        use_threads: Enable parallel file reading
+        verbose: Print progress information
+        **kwargs: Additional arguments passed to pl.read_csv()
 
     Returns:
-
-
-
-
-
-
+        Various types depending on arguments:
+        - pl.DataFrame: Single or concatenated DataFrame
+        - list[pl.DataFrame]: List of DataFrames (if concat=False)
+        - Generator: If batch_size set, yields batches of above types
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Read all CSVs in directory
+        >>> df = fs.read_csv(
+        ...     "data/*.csv",
+        ...     include_file_path=True
+        ... )
+        >>> print(df.columns)
+        ['file_path', 'col1', 'col2', ...]
+        >>>
+        >>> # Batch process large dataset
+        >>> for batch_df in fs.read_csv(
+        ...     "logs/*.csv",
+        ...     batch_size=100,
+        ...     use_threads=True,
+        ...     verbose=True
+        ... ):
+        ...     print(f"Processing {len(batch_df)} rows")
+        >>>
+        >>> # Multiple files without concatenation
+        >>> dfs = fs.read_csv(
+        ...     ["file1.csv", "file2.csv"],
+        ...     concat=False,
+        ...     use_threads=True
+        ... )
+        >>> print(f"Read {len(dfs)} files")
     """
     if batch_size is not None:
         return _read_csv_batches(
@@ -492,8 +735,33 @@ def read_csv(
 
 
 def _read_parquet_file(
-    path, self, include_file_path: bool = False, **kwargs
+    path: str, self: AbstractFileSystem, include_file_path: bool = False, **kwargs: Any
 ) -> pa.Table:
+    """Read a single Parquet file from any filesystem.
+
+    Internal function that handles reading individual Parquet files and
+    optionally adds the source filepath as a column.
+
+    Args:
+        path: Path to Parquet file
+        self: Filesystem instance to use for reading
+        include_file_path: Add source filepath as a column
+        **kwargs: Additional arguments passed to pq.read_table()
+
+    Returns:
+        pa.Table: PyArrow Table containing Parquet data
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> table = _read_parquet_file(
+        ...     "data.parquet",
+        ...     fs,
+        ...     include_file_path=True,
+        ...     use_threads=True
+        ... )
+        >>> print("file_path" in table.column_names)
+        True
+    """
     table = pq.read_table(path, filesystem=self, **kwargs)
     if include_file_path:
         return table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
@@ -569,29 +837,61 @@ def _read_parquet(
 
 
 def _read_parquet_batches(
-    self,
+    self: AbstractFileSystem,
     path: str | list[str],
     batch_size: int | None = None,
     include_file_path: bool = False,
     use_threads: bool = True,
     concat: bool = True,
     verbose: bool = False,
-    **kwargs,
+    **kwargs: Any,
 ) -> Generator[pa.Table | list[pa.Table], None, None]:
-    """
-
+    """Process Parquet files in batches with performance optimizations.
+
+    Internal generator function that handles batched reading of Parquet files
+    with support for:
+    - Parallel processing within batches
+    - Metadata-based optimizations
+    - Memory-efficient processing
+    - Progress tracking
+
+    Uses fast path for simple cases:
+    - Single directory with _metadata
+    - No need for filepath column
+    - Concatenated output
 
     Args:
-        path: (
-        batch_size:
-        include_file_path:
-        use_threads:
-        concat:
-        verbose:
-        **kwargs: Additional
+        path: Path(s) to Parquet file(s). Glob patterns supported.
+        batch_size: Number of files to process in each batch
+        include_file_path: Add source filepath as a column
+        use_threads: Enable parallel file reading within batches
+        concat: Combine files within each batch
+        verbose: Print progress information
+        **kwargs: Additional arguments passed to pq.read_table()
 
     Yields:
-
+        Each batch of data in requested format:
+        - pa.Table: Single Table if concat=True
+        - list[pa.Table]: List of Tables if concat=False
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Fast path for simple case
+        >>> next(_read_parquet_batches(
+        ...     fs,
+        ...     "data/",  # Contains _metadata
+        ...     batch_size=1000
+        ... ))
+        >>>
+        >>> # Parallel batch processing
+        >>> for batch in fs._read_parquet_batches(
+        ...     fs,
+        ...     ["file1.parquet", "file2.parquet"],
+        ...     batch_size=1,
+        ...     include_file_path=True,
+        ...     use_threads=True
+        ... ):
+        ...     print(f"Batch schema: {batch.schema}")
     """
     # Fast path for simple cases
     if not include_file_path and concat and batch_size is None:
@@ -612,7 +912,6 @@ def _read_parquet_batches(
         return
 
     # Process in batches
-
    for i in range(0, len(path), batch_size):
        batch_paths = path[i : i + batch_size]
        if use_threads and len(batch_paths) > 1:
@@ -641,34 +940,72 @@ def _read_parquet_batches(
 
 
 def read_parquet(
-    self,
+    self: AbstractFileSystem,
     path: str | list[str],
     batch_size: int | None = None,
     include_file_path: bool = False,
     concat: bool = True,
     use_threads: bool = True,
     verbose: bool = False,
-    **kwargs,
+    **kwargs: Any,
 ) -> pa.Table | list[pa.Table] | Generator[pa.Table | list[pa.Table], None, None]:
-    """
-
-
+    """Read Parquet data with advanced features and optimizations.
+
+    Provides a high-performance interface for reading Parquet files with support for:
+    - Single file or multiple files
+    - Batch processing for large datasets
+    - Parallel processing
+    - File path tracking
+    - Automatic concatenation
+    - PyArrow Table output
+
+    The function automatically uses optimal reading strategies:
+    - Direct dataset reading for simple cases
+    - Parallel processing for multiple files
+    - Batched reading for memory efficiency
 
     Args:
-        path: (
-
-
-
-
-
-
+        path: Path(s) to Parquet file(s). Can be:
+            - Single path string (globs supported)
+            - List of path strings
+            - Directory containing _metadata file
+        batch_size: If set, enables batch reading with this many files per batch
+        include_file_path: Add source filepath as a column
+        concat: Combine multiple files/batches into single Table
+        use_threads: Enable parallel file reading
+        verbose: Print progress information
+        **kwargs: Additional arguments passed to pq.read_table()
 
     Returns:
-
-
-
-
-
+        Various types depending on arguments:
+        - pa.Table: Single or concatenated Table
+        - list[pa.Table]: List of Tables (if concat=False)
+        - Generator: If batch_size set, yields batches of above types
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Read all Parquet files in directory
+        >>> table = fs.read_parquet(
+        ...     "data/*.parquet",
+        ...     include_file_path=True
+        ... )
+        >>> print(table.column_names)
+        ['file_path', 'col1', 'col2', ...]
+        >>>
+        >>> # Batch process large dataset
+        >>> for batch in fs.read_parquet(
+        ...     "data/*.parquet",
+        ...     batch_size=100,
+        ...     use_threads=True
+        ... ):
+        ...     print(f"Processing {batch.num_rows} rows")
+        >>>
+        >>> # Read from directory with metadata
+        >>> table = fs.read_parquet(
+        ...     "data/",  # Contains _metadata
+        ...     use_threads=True
+        ... )
+        >>> print(f"Total rows: {table.num_rows}")
     """
     if batch_size is not None:
         return _read_parquet_batches(
@@ -693,7 +1030,7 @@ def read_parquet(
 
 
 def read_files(
-    self,
+    self: AbstractFileSystem,
     path: str | list[str],
     format: str,
     batch_size: int | None = None,
@@ -702,38 +1039,76 @@ def read_files(
     jsonlines: bool = False,
     use_threads: bool = True,
     verbose: bool = False,
-    **kwargs,
+    **kwargs: Any,
 ) -> (
     pl.DataFrame
     | pa.Table
     | list[pl.DataFrame]
     | list[pa.Table]
     | Generator[
-        pl.DataFrame | pa.Table | list[
+        pl.DataFrame | pa.Table | list[pl.DataFrame] | list[pa.Table], None, None
     ]
 ):
-    """
-
+    """Universal interface for reading data files of any supported format.
+
+    A unified API that automatically delegates to the appropriate reading function
+    based on file format, while preserving all advanced features like:
+    - Batch processing
+    - Parallel reading
+    - File path tracking
+    - Format-specific optimizations
 
     Args:
-        path: (
-
-
-
-
-
-
-
-
-
+        path: Path(s) to data file(s). Can be:
+            - Single path string (globs supported)
+            - List of path strings
+        format: File format to read. Supported values:
+            - "json": Regular JSON or JSON Lines
+            - "csv": CSV files
+            - "parquet": Parquet files
+        batch_size: If set, enables batch reading with this many files per batch
+        include_file_path: Add source filepath as column/field
+        concat: Combine multiple files/batches into single result
+        jsonlines: For JSON format, whether to read as JSON Lines
+        use_threads: Enable parallel file reading
+        verbose: Print progress information
+        **kwargs: Additional format-specific arguments
 
     Returns:
-
-
-
-
-
-
+        Various types depending on format and arguments:
+        - pl.DataFrame: For CSV and optionally JSON
+        - pa.Table: For Parquet
+        - list[pl.DataFrame | pa.Table]: Without concatenation
+        - Generator: If batch_size set, yields batches
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Read CSV files
+        >>> df = fs.read_files(
+        ...     "data/*.csv",
+        ...     format="csv",
+        ...     include_file_path=True
+        ... )
+        >>> print(type(df))
+        <class 'polars.DataFrame'>
+        >>>
+        >>> # Batch process Parquet files
+        >>> for batch in fs.read_files(
+        ...     "data/*.parquet",
+        ...     format="parquet",
+        ...     batch_size=100,
+        ...     use_threads=True
+        ... ):
+        ...     print(f"Batch type: {type(batch)}")
+        >>>
+        >>> # Read JSON Lines
+        >>> df = fs.read_files(
+        ...     "logs/*.jsonl",
+        ...     format="json",
+        ...     jsonlines=True,
+        ...     concat=True
+        ... )
+        >>> print(df.columns)
     """
     if format == "json":
         if batch_size is not None:
@@ -749,8 +1124,8 @@ def read_files(
                 **kwargs,
             )
         return read_json(
-            self,
-            path,
+            self=self,
+            path=path,
             include_file_path=include_file_path,
             jsonlines=jsonlines,
             concat=concat,
@@ -771,8 +1146,8 @@ def read_files(
                 **kwargs,
             )
         return read_csv(
-            self,
-            path,
+            self=self,
+            path=path,
             include_file_path=include_file_path,
             use_threads=use_threads,
             concat=concat,
@@ -792,8 +1167,8 @@ def read_files(
                 **kwargs,
            )
        return read_parquet(
-            self,
-            path,
+            self=self,
+            path=path,
            include_file_path=include_file_path,
            use_threads=use_threads,
            concat=concat,
@@ -803,26 +1178,64 @@ def read_files(
 
 
 def pyarrow_dataset(
-    self,
+    self: AbstractFileSystem,
     path: str,
-    format="parquet",
+    format: str = "parquet",
     schema: pa.Schema | None = None,
     partitioning: str | list[str] | pds.Partitioning = None,
-    **kwargs,
+    **kwargs: Any,
 ) -> pds.Dataset:
-    """
-
+    """Create a PyArrow dataset from files in any supported format.
+
+    Creates a dataset that provides optimized reading and querying capabilities
+    including:
+    - Schema inference and enforcement
+    - Partition discovery and pruning
+    - Predicate pushdown
+    - Column projection
 
     Args:
-        path:
-        format:
-
-
-
-
+        path: Base path to dataset files
+        format: File format. Currently supports:
+            - "parquet" (default)
+            - "csv"
+            - "json" (experimental)
+        schema: Optional schema to enforce. If None, inferred from data.
+        partitioning: How the dataset is partitioned. Can be:
+            - str: Single partition field
+            - list[str]: Multiple partition fields
+            - pds.Partitioning: Custom partitioning scheme
+        **kwargs: Additional arguments for dataset creation
 
     Returns:
-
+        pds.Dataset: PyArrow dataset instance
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Simple Parquet dataset
+        >>> ds = fs.pyarrow_dataset("data/")
+        >>> print(ds.schema)
+        >>>
+        >>> # Partitioned dataset
+        >>> ds = fs.pyarrow_dataset(
+        ...     "events/",
+        ...     partitioning=["year", "month"]
+        ... )
+        >>> # Query with partition pruning
+        >>> table = ds.to_table(
+        ...     filter=(ds.field("year") == 2024)
+        ... )
+        >>>
+        >>> # CSV with schema
+        >>> ds = fs.pyarrow_dataset(
+        ...     "logs/",
+        ...     format="csv",
+        ...     schema=pa.schema([
+        ...         ("timestamp", pa.timestamp("s")),
+        ...         ("level", pa.string()),
+        ...         ("message", pa.string())
+        ...     ])
+        ... )
     """
     return pds.dataset(
         path,
@@ -835,24 +1248,52 @@ def pyarrow_dataset(
 
 
 def pyarrow_parquet_dataset(
-    self,
+    self: AbstractFileSystem,
     path: str,
     schema: pa.Schema | None = None,
     partitioning: str | list[str] | pds.Partitioning = None,
-    **kwargs,
+    **kwargs: Any,
 ) -> pds.Dataset:
-    """
-
+    """Create a PyArrow dataset optimized for Parquet files.
+
+    Creates a dataset specifically for Parquet data, automatically handling
+    _metadata files for optimized reading.
+
+    This function is particularly useful for:
+    - Datasets with existing _metadata files
+    - Multi-file datasets that should be treated as one
+    - Partitioned Parquet datasets
 
     Args:
-        path:
-        schema:
-        partitioning:
-
-
+        path: Path to dataset directory or _metadata file
+        schema: Optional schema to enforce. If None, inferred from data.
+        partitioning: How the dataset is partitioned. Can be:
+            - str: Single partition field
+            - list[str]: Multiple partition fields
+            - pds.Partitioning: Custom partitioning scheme
+        **kwargs: Additional dataset arguments
 
     Returns:
-
+        pds.Dataset: PyArrow dataset instance
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Dataset with _metadata
+        >>> ds = fs.pyarrow_parquet_dataset("data/_metadata")
+        >>> print(ds.files)  # Shows all data files
+        >>>
+        >>> # Partitioned dataset directory
+        >>> ds = fs.pyarrow_parquet_dataset(
+        ...     "sales/",
+        ...     partitioning=["year", "region"]
+        ... )
+        >>> # Query with partition pruning
+        >>> table = ds.to_table(
+        ...     filter=(
+        ...         (ds.field("year") == 2024) &
+        ...         (ds.field("region") == "EMEA")
+        ...     )
        ... )
     """
     if not self.is_file(path):
         path = posixpath.join(path, "_metadata")
@@ -866,22 +1307,49 @@ def pyarrow_parquet_dataset(
 
 
 def pydala_dataset(
-    self,
+    self: AbstractFileSystem,
     path: str,
     partitioning: str | list[str] | pds.Partitioning = None,
-    **kwargs,
+    **kwargs: Any,
 ) -> ParquetDataset:  # type: ignore
-    """
-
+    """Create a Pydala dataset for advanced Parquet operations.
+
+    Creates a dataset with additional features beyond PyArrow including:
+    - Delta table support
+    - Schema evolution
+    - Advanced partitioning
+    - Metadata management
+    - Sort key optimization
 
     Args:
-        path:
-        partitioning:
-
-
+        path: Path to dataset directory
+        partitioning: How the dataset is partitioned. Can be:
+            - str: Single partition field
+            - list[str]: Multiple partition fields
+            - pds.Partitioning: Custom partitioning scheme
+        **kwargs: Additional dataset configuration
 
     Returns:
-
+        ParquetDataset: Pydala dataset instance
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Create dataset
+        >>> ds = fs.pydala_dataset(
+        ...     "data/",
+        ...     partitioning=["date"]
+        ... )
+        >>>
+        >>> # Write with delta support
+        >>> ds.write_to_dataset(
+        ...     new_data,
+        ...     mode="delta",
+        ...     delta_subset=["id"]
+        ... )
+        >>>
+        >>> # Read with metadata
+        >>> df = ds.to_polars()
+        >>> print(df.columns)
     """
     return ParquetDataset(
         path,
@@ -892,23 +1360,62 @@ def pydala_dataset(
 
 
 def write_parquet(
-    self,
+    self: AbstractFileSystem,
     data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict],
     path: str,
     schema: pa.Schema | None = None,
-    **kwargs,
+    **kwargs: Any,
 ) -> pq.FileMetaData:
-    """
-
+    """Write data to a Parquet file with automatic format conversion.
+
+    Handles writing data from multiple input formats to Parquet with:
+    - Automatic conversion to PyArrow
+    - Schema validation/coercion
+    - Metadata collection
+    - Compression and encoding options
 
     Args:
-        data:
-
-
-
+        data: Input data in various formats:
+            - Polars DataFrame/LazyFrame
+            - PyArrow Table
+            - Pandas DataFrame
+            - Dict or list of dicts
+        path: Output Parquet file path
+        schema: Optional schema to enforce on write
+        **kwargs: Additional arguments for pq.write_table()
 
     Returns:
-
+        pq.FileMetaData: Metadata of written Parquet file
+
+    Raises:
+        SchemaError: If data doesn't match schema
+        ValueError: If data cannot be converted
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Write Polars DataFrame
+        >>> df = pl.DataFrame({
+        ...     "id": range(1000),
+        ...     "value": pl.Series(np.random.randn(1000))
+        ... })
+        >>> metadata = fs.write_parquet(
+        ...     df,
+        ...     "data.parquet",
+        ...     compression="zstd",
+        ...     compression_level=3
+        ... )
+        >>> print(f"Rows: {metadata.num_rows}")
+        >>>
+        >>> # Write with schema
+        >>> schema = pa.schema([
+        ...     ("id", pa.int64()),
+        ...     ("value", pa.float64())
+        ... ])
+        >>> metadata = fs.write_parquet(
+        ...     {"id": [1, 2], "value": [0.1, 0.2]},
+        ...     "data.parquet",
+        ...     schema=schema
+        ... )
     """
     data = to_pyarrow_table(data, concat=False, unique=False)
 
@@ -922,23 +1429,46 @@ def write_parquet(
 
 
 def write_json(
-    self,
-    data:
-
-
+    self: AbstractFileSystem,
+    data: dict
+    | pl.DataFrame
+    | pl.LazyFrame
+    | pa.Table
+    | pd.DataFrame
+    | dict
+    | list[dict],
     path: str,
     append: bool = False,
 ) -> None:
-    """
-    Write a dictionary, DataFrame or Table to a JSON file.
+    """Write data to a JSON file with flexible input support.
 
-
-
-    path: (str) Path to write the data.
-    append: (bool, optional) If True, append to the file. Defaults to False.
+    Handles writing data in various formats to JSON or JSON Lines,
+    with optional appending for streaming writes.
 
-
-
+    Args:
+        data: Input data in various formats:
+            - Dict or list of dicts
+            - Polars DataFrame/LazyFrame
+            - PyArrow Table
+            - Pandas DataFrame
+        path: Output JSON file path
+        append: Whether to append to existing file (JSON Lines mode)
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Write dictionary
+        >>> data = {"name": "test", "values": [1, 2, 3]}
+        >>> fs.write_json(data, "config.json")
+        >>>
+        >>> # Stream records
+        >>> df1 = pl.DataFrame({"id": [1], "value": ["first"]})
+        >>> df2 = pl.DataFrame({"id": [2], "value": ["second"]})
+        >>> fs.write_json(df1, "stream.jsonl", append=False)
+        >>> fs.write_json(df2, "stream.jsonl", append=True)
+        >>>
+        >>> # Convert PyArrow
+        >>> table = pa.table({"a": [1, 2], "b": ["x", "y"]})
+        >>> fs.write_json(table, "data.json")
     """
     if isinstance(data, pl.LazyFrame):
         data = data.collect()
@@ -951,46 +1481,97 @@ def write_json(
         data = data.to_pydict()
     if append:
         with self.open(path, "ab") as f:
-
-
+            if isinstance(data, dict):
+                f.write(orjson.dumps(data) + b"\n")
+            else:
+                for record in data:
+                    f.write(orjson.dumps(record) + b"\n")
     else:
         with self.open(path, "wb") as f:
             f.write(orjson.dumps(data))
 
 
 def write_csv(
-    self,
-    data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict,
+    self: AbstractFileSystem,
+    data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict],
     path: str,
-
+    append: bool = False,
+    **kwargs: Any,
 ) -> None:
-    """
-    Write a DataFrame to a CSV file.
+    """Write data to a CSV file with flexible input support.
 
-
-
-
-
+    Handles writing data from multiple formats to CSV with options for:
+    - Appending to existing files
+    - Custom delimiters and formatting
+    - Automatic type conversion
+    - Header handling
 
-
-
+    Args:
+        data: Input data in various formats:
+            - Polars DataFrame/LazyFrame
+            - PyArrow Table
+            - Pandas DataFrame
+            - Dict or list of dicts
+        path: Output CSV file path
+        append: Whether to append to existing file
+        **kwargs: Additional arguments for CSV writing:
+            - delimiter: Field separator (default ",")
+            - header: Whether to write header row
+            - quote_char: Character for quoting fields
+            - date_format: Format for date/time fields
+            - float_precision: Decimal places for floats
+
+    Example:
+        >>> fs = LocalFileSystem()
+        >>> # Write Polars DataFrame
+        >>> df = pl.DataFrame({
+        ...     "id": range(100),
+        ...     "name": ["item_" + str(i) for i in range(100)]
+        ... })
+        >>> fs.write_csv(df, "items.csv")
+        >>>
+        >>> # Append records
+        >>> new_items = pl.DataFrame({
+        ...     "id": range(100, 200),
+        ...     "name": ["item_" + str(i) for i in range(100, 200)]
+        ... })
+        >>> fs.write_csv(
+        ...     new_items,
+        ...     "items.csv",
+        ...     append=True,
+        ...     header=False
+        ... )
+        >>>
+        >>> # Custom formatting
+        >>> data = pa.table({
+        ...     "date": [datetime.now()],
+        ...     "value": [123.456]
+        ... })
+        >>> fs.write_csv(
+        ...     data,
+        ...     "formatted.csv",
+        ...     date_format="%Y-%m-%d",
+        ...     float_precision=2
+        ... )
     """
-    if isinstance(data,
-        data = _dict_to_dataframe(data)
-    elif isinstance(data, pl.LazyFrame):
+    if isinstance(data, pl.LazyFrame):
         data = data.collect()
-
-
-
-
-
-
-
+    if isinstance(data, pl.DataFrame):
+        if append:
+            with self.open(path, "ab") as f:
+                data.write_csv(f, has_header=not append, **kwargs)
+        else:
+            with self.open(path, "wb") as f:
+                data.write_csv(f, **kwargs)
+    elif isinstance(data, (pa.Table, pd.DataFrame)):
+        pl.from_arrow(pa.table(data)).write_csv(path, **kwargs)
+    else:
+        pl.DataFrame(data).write_csv(path, **kwargs)
 
 
 def write_file(
     self,
-    data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict
+    data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict,
     path: str,
     format: str,
     **kwargs,
@@ -1054,7 +1635,7 @@ def write_files(
         basename: (str, optional) Basename of the files. Defaults to None.
         format: (str, optional) Format of the data. Defaults to None.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
-        unique: (bool
+        unique: (bool, optional) If True, remove duplicates. Defaults to False.
        mode: (str, optional) Write mode. Defaults to 'append'. Options: 'append', 'overwrite', 'delete_matching',
            'error_if_exists'.
        use_threads: (bool, optional) If True, use parallel processing. Defaults to True.
@@ -1202,7 +1783,7 @@ def write_pyarrow_dataset(
         max_rows_per_file: (int, optional) Maximum number of rows per file. Defaults to 2_500_000.
         row_group_size: (int, optional) Row group size. Defaults to 250_000.
         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
-        unique: (bool |
+        unique: (bool | str | list[str], optional) If True, remove duplicates. Defaults to False.
        **kwargs: Additional keyword arguments for `pds.write_dataset`.

    Returns:
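
For orientation, a minimal usage sketch of the helpers documented above. The module-level functions in flowerpower/fs/ext.py take the filesystem as their first (`self`) argument; the docstrings also show them called as methods on a filesystem object (`fs.read_csv(...)`), which is assumed to be wired up elsewhere in `flowerpower.fs`. Here the module-level functions are called directly:

```python
# Sketch only, based on the signatures and docstrings in the diff above.
from fsspec import filesystem

from flowerpower.fs.ext import read_csv, write_parquet

fs = filesystem("file")  # any fsspec filesystem should behave the same way

# Read every CSV under data/ into one Polars DataFrame, tagging each row
# with its source file path (mirrors the read_csv docstring).
df = read_csv(fs, "data/*.csv", include_file_path=True, concat=True)

# Write the combined frame back out as a single Parquet file; the helper
# returns the written file's pq.FileMetaData (mirrors the write_parquet
# docstring; compression is forwarded to pq.write_table).
metadata = write_parquet(fs, df, "data/combined.parquet", compression="zstd")
print(metadata.num_rows)
```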
|