FlowerPower 0.11.6.20__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/cfg/__init__.py +3 -3
- flowerpower/cfg/pipeline/__init__.py +5 -3
- flowerpower/cfg/project/__init__.py +3 -3
- flowerpower/cfg/project/job_queue.py +1 -128
- flowerpower/cli/__init__.py +5 -5
- flowerpower/cli/cfg.py +0 -3
- flowerpower/cli/job_queue.py +400 -132
- flowerpower/cli/pipeline.py +14 -413
- flowerpower/cli/utils.py +0 -1
- flowerpower/flowerpower.py +537 -28
- flowerpower/job_queue/__init__.py +5 -94
- flowerpower/job_queue/base.py +201 -3
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
- flowerpower/job_queue/rq/manager.py +388 -77
- flowerpower/pipeline/__init__.py +2 -0
- flowerpower/pipeline/base.py +2 -2
- flowerpower/pipeline/io.py +14 -16
- flowerpower/pipeline/manager.py +21 -642
- flowerpower/pipeline/pipeline.py +571 -0
- flowerpower/pipeline/registry.py +242 -10
- flowerpower/pipeline/visualizer.py +1 -2
- flowerpower/plugins/_io/__init__.py +8 -0
- flowerpower/plugins/mqtt/manager.py +6 -6
- flowerpower/settings/backend.py +0 -2
- flowerpower/settings/job_queue.py +1 -57
- flowerpower/utils/misc.py +0 -256
- flowerpower/utils/monkey.py +1 -83
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
- flowerpower-0.20.0.dist-info/RECORD +58 -0
- flowerpower/fs/__init__.py +0 -29
- flowerpower/fs/base.py +0 -662
- flowerpower/fs/ext.py +0 -2143
- flowerpower/fs/storage_options.py +0 -1420
- flowerpower/job_queue/apscheduler/__init__.py +0 -11
- flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
- flowerpower/job_queue/apscheduler/manager.py +0 -1051
- flowerpower/job_queue/apscheduler/setup.py +0 -554
- flowerpower/job_queue/apscheduler/trigger.py +0 -169
- flowerpower/job_queue/apscheduler/utils.py +0 -311
- flowerpower/pipeline/job_queue.py +0 -583
- flowerpower/pipeline/runner.py +0 -603
- flowerpower/plugins/io/base.py +0 -2520
- flowerpower/plugins/io/helpers/datetime.py +0 -298
- flowerpower/plugins/io/helpers/polars.py +0 -875
- flowerpower/plugins/io/helpers/pyarrow.py +0 -570
- flowerpower/plugins/io/helpers/sql.py +0 -202
- flowerpower/plugins/io/loader/__init__.py +0 -28
- flowerpower/plugins/io/loader/csv.py +0 -37
- flowerpower/plugins/io/loader/deltatable.py +0 -190
- flowerpower/plugins/io/loader/duckdb.py +0 -19
- flowerpower/plugins/io/loader/json.py +0 -37
- flowerpower/plugins/io/loader/mqtt.py +0 -159
- flowerpower/plugins/io/loader/mssql.py +0 -26
- flowerpower/plugins/io/loader/mysql.py +0 -26
- flowerpower/plugins/io/loader/oracle.py +0 -26
- flowerpower/plugins/io/loader/parquet.py +0 -35
- flowerpower/plugins/io/loader/postgres.py +0 -26
- flowerpower/plugins/io/loader/pydala.py +0 -19
- flowerpower/plugins/io/loader/sqlite.py +0 -23
- flowerpower/plugins/io/metadata.py +0 -244
- flowerpower/plugins/io/saver/__init__.py +0 -28
- flowerpower/plugins/io/saver/csv.py +0 -36
- flowerpower/plugins/io/saver/deltatable.py +0 -186
- flowerpower/plugins/io/saver/duckdb.py +0 -19
- flowerpower/plugins/io/saver/json.py +0 -36
- flowerpower/plugins/io/saver/mqtt.py +0 -28
- flowerpower/plugins/io/saver/mssql.py +0 -26
- flowerpower/plugins/io/saver/mysql.py +0 -26
- flowerpower/plugins/io/saver/oracle.py +0 -26
- flowerpower/plugins/io/saver/parquet.py +0 -36
- flowerpower/plugins/io/saver/postgres.py +0 -26
- flowerpower/plugins/io/saver/pydala.py +0 -20
- flowerpower/plugins/io/saver/sqlite.py +0 -24
- flowerpower/utils/scheduler.py +0 -311
- flowerpower-0.11.6.20.dist-info/RECORD +0 -102
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
flowerpower/fs/ext.py
DELETED
@@ -1,2143 +0,0 @@
|
|
1
|
-
import datetime as dt
|
2
|
-
import importlib
|
3
|
-
import posixpath
|
4
|
-
import uuid
|
5
|
-
from typing import Any, Generator
|
6
|
-
|
7
|
-
if importlib.util.find_spec("pandas") is not None:
|
8
|
-
import pandas as pd
|
9
|
-
else:
|
10
|
-
raise ImportError("To use this module, please install `flowerpower[io]`.")
|
11
|
-
|
12
|
-
import orjson
|
13
|
-
# import polars as pl
|
14
|
-
import pyarrow as pa
|
15
|
-
import pyarrow.dataset as pds
|
16
|
-
import pyarrow.parquet as pq
|
17
|
-
from fsspec import AbstractFileSystem
|
18
|
-
from pydala.dataset import ParquetDataset
|
19
|
-
|
20
|
-
from ..plugins.io.helpers.polars import opt_dtype as opt_dtype_pl
|
21
|
-
from ..plugins.io.helpers.polars import pl
|
22
|
-
# from ..plugins.io.helpers.polars import unify_schemas as unfify_schemas_pl
|
23
|
-
from ..plugins.io.helpers.pyarrow import cast_schema
|
24
|
-
from ..plugins.io.helpers.pyarrow import opt_dtype as opt_dtype_pa
|
25
|
-
from ..plugins.io.helpers.pyarrow import unify_schemas as unify_schemas_pa
|
26
|
-
from ..utils.misc import (_dict_to_dataframe, convert_large_types_to_standard,
|
27
|
-
run_parallel, to_pyarrow_table)
|
28
|
-
|
29
|
-
|
30
|
-
def path_to_glob(path: str, format: str | None = None) -> str:
|
31
|
-
"""Convert a path to a glob pattern for file matching.
|
32
|
-
|
33
|
-
Intelligently converts paths to glob patterns that match files of the specified
|
34
|
-
format, handling various directory and wildcard patterns.
|
35
|
-
|
36
|
-
Args:
|
37
|
-
path: Base path to convert. Can include wildcards (* or **).
|
38
|
-
Examples: "data/", "data/*.json", "data/**"
|
39
|
-
format: File format to match (without dot). If None, inferred from path.
|
40
|
-
Examples: "json", "csv", "parquet"
|
41
|
-
|
42
|
-
Returns:
|
43
|
-
str: Glob pattern that matches files of specified format.
|
44
|
-
Examples: "data/**/*.json", "data/*.csv"
|
45
|
-
|
46
|
-
Example:
|
47
|
-
>>> # Basic directory
|
48
|
-
>>> path_to_glob("data", "json")
|
49
|
-
'data/**/*.json'
|
50
|
-
>>>
|
51
|
-
>>> # With wildcards
|
52
|
-
>>> path_to_glob("data/**", "csv")
|
53
|
-
'data/**/*.csv'
|
54
|
-
>>>
|
55
|
-
>>> # Format inference
|
56
|
-
>>> path_to_glob("data/file.parquet")
|
57
|
-
'data/file.parquet'
|
58
|
-
"""
|
59
|
-
path = path.rstrip("/")
|
60
|
-
if format is None:
|
61
|
-
if ".json" in path:
|
62
|
-
format = "json"
|
63
|
-
elif ".csv" in path:
|
64
|
-
format = "csv"
|
65
|
-
elif ".parquet" in path:
|
66
|
-
format = "parquet"
|
67
|
-
|
68
|
-
if format in path:
|
69
|
-
return path
|
70
|
-
else:
|
71
|
-
if path.endswith("**"):
|
72
|
-
return posixpath.join(path, f"*.{format}")
|
73
|
-
elif path.endswith("*"):
|
74
|
-
if path.endswith("*/*"):
|
75
|
-
return path + f".{format}"
|
76
|
-
return posixpath.join(path.rstrip("/*"), f"*.{format}")
|
77
|
-
return posixpath.join(path, f"**/*.{format}")
|
78
|
-
|
79
|
-
|
80
|
-
def _read_json_file(
|
81
|
-
path: str,
|
82
|
-
self: AbstractFileSystem,
|
83
|
-
include_file_path: bool = False,
|
84
|
-
jsonlines: bool = False,
|
85
|
-
) -> dict | list[dict]:
|
86
|
-
"""Read a JSON file from any filesystem.
|
87
|
-
|
88
|
-
Internal function that handles both regular JSON and JSON Lines formats.
|
89
|
-
|
90
|
-
Args:
|
91
|
-
path: Path to JSON file
|
92
|
-
self: Filesystem instance to use for reading
|
93
|
-
include_file_path: Whether to return dict with filepath as key
|
94
|
-
jsonlines: Whether to read as JSON Lines format
|
95
|
-
|
96
|
-
Returns:
|
97
|
-
dict | list[dict]: Parsed JSON data. If include_file_path=True,
|
98
|
-
returns {filepath: data}
|
99
|
-
|
100
|
-
Example:
|
101
|
-
>>> fs = LocalFileSystem()
|
102
|
-
>>> # Regular JSON
|
103
|
-
>>> data = _read_json_file("data.json", fs)
|
104
|
-
>>> print(type(data))
|
105
|
-
<class 'dict'>
|
106
|
-
>>>
|
107
|
-
>>> # JSON Lines with filepath
|
108
|
-
>>> data = _read_json_file(
|
109
|
-
... "data.jsonl",
|
110
|
-
... fs,
|
111
|
-
... include_file_path=True,
|
112
|
-
... jsonlines=True
|
113
|
-
... )
|
114
|
-
>>> print(list(data.keys())[0])
|
115
|
-
'data.jsonl'
|
116
|
-
"""
|
117
|
-
with self.open(path) as f:
|
118
|
-
if jsonlines:
|
119
|
-
data = [orjson.loads(line) for line in f.readlines()]
|
120
|
-
else:
|
121
|
-
data = orjson.loads(f.read())
|
122
|
-
if include_file_path:
|
123
|
-
return {path: data}
|
124
|
-
return data
|
125
|
-
|
126
|
-
|
127
|
-
def read_json_file(
|
128
|
-
self: AbstractFileSystem,
|
129
|
-
path: str,
|
130
|
-
include_file_path: bool = False,
|
131
|
-
jsonlines: bool = False,
|
132
|
-
) -> dict | list[dict]:
|
133
|
-
"""Read a single JSON file from any filesystem.
|
134
|
-
|
135
|
-
A public wrapper around _read_json_file providing a clean interface for
|
136
|
-
reading individual JSON files.
|
137
|
-
|
138
|
-
Args:
|
139
|
-
path: Path to JSON file to read
|
140
|
-
include_file_path: Whether to return dict with filepath as key
|
141
|
-
jsonlines: Whether to read as JSON Lines format
|
142
|
-
|
143
|
-
Returns:
|
144
|
-
dict | list[dict]: Parsed JSON data. For regular JSON, returns a dict.
|
145
|
-
For JSON Lines, returns a list of dicts. If include_file_path=True,
|
146
|
-
returns {filepath: data}.
|
147
|
-
|
148
|
-
Example:
|
149
|
-
>>> fs = LocalFileSystem()
|
150
|
-
>>> # Read regular JSON
|
151
|
-
>>> data = fs.read_json_file("config.json")
|
152
|
-
>>> print(data["setting"])
|
153
|
-
'value'
|
154
|
-
>>>
|
155
|
-
>>> # Read JSON Lines with filepath
|
156
|
-
>>> data = fs.read_json_file(
|
157
|
-
... "logs.jsonl",
|
158
|
-
... include_file_path=True,
|
159
|
-
... jsonlines=True
|
160
|
-
... )
|
161
|
-
>>> print(list(data.keys())[0])
|
162
|
-
'logs.jsonl'
|
163
|
-
"""
|
164
|
-
return _read_json_file(
|
165
|
-
path=path,
|
166
|
-
self=self,
|
167
|
-
include_file_path=include_file_path,
|
168
|
-
jsonlines=jsonlines,
|
169
|
-
)
|
170
|
-
|
171
|
-
|
172
|
-
def _read_json(
|
173
|
-
self,
|
174
|
-
path: str | list[str],
|
175
|
-
include_file_path: bool = False,
|
176
|
-
use_threads: bool = True,
|
177
|
-
jsonlines: bool = False,
|
178
|
-
as_dataframe: bool = True,
|
179
|
-
concat: bool = True,
|
180
|
-
verbose: bool = False,
|
181
|
-
opt_dtypes: bool = False,
|
182
|
-
**kwargs,
|
183
|
-
) -> dict | list[dict] | pl.DataFrame | list[pl.DataFrame]:
|
184
|
-
"""
|
185
|
-
Read a JSON file or a list of JSON files.
|
186
|
-
|
187
|
-
Args:
|
188
|
-
path: (str | list[str]) Path to the JSON file(s).
|
189
|
-
include_file_path: (bool, optional) If True, return a dictionary with the file path as key.
|
190
|
-
Defaults to False.
|
191
|
-
use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
|
192
|
-
jsonlines: (bool, optional) If True, read JSON lines. Defaults to False.
|
193
|
-
as_dataframe: (bool, optional) If True, return a DataFrame. Defaults to True.
|
194
|
-
concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
|
195
|
-
verbose: (bool, optional) If True, print verbose output. Defaults to False.
|
196
|
-
opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
|
197
|
-
**kwargs: Additional keyword arguments.
|
198
|
-
|
199
|
-
Returns:
|
200
|
-
(dict | list[dict] | pl.DataFrame | list[pl.DataFrame]):
|
201
|
-
Dictionary, list of dictionaries, DataFrame or list of DataFrames.
|
202
|
-
"""
|
203
|
-
if isinstance(path, str):
|
204
|
-
path = path_to_glob(path, format="json")
|
205
|
-
path = self.glob(path)
|
206
|
-
|
207
|
-
if isinstance(path, list):
|
208
|
-
if use_threads:
|
209
|
-
data = run_parallel(
|
210
|
-
_read_json_file,
|
211
|
-
path,
|
212
|
-
self=self,
|
213
|
-
include_file_path=include_file_path,
|
214
|
-
jsonlines=jsonlines,
|
215
|
-
n_jobs=-1,
|
216
|
-
backend="threading",
|
217
|
-
verbose=verbose,
|
218
|
-
**kwargs,
|
219
|
-
)
|
220
|
-
data = [
|
221
|
-
_read_json_file(
|
222
|
-
path=p,
|
223
|
-
self=self,
|
224
|
-
include_file_path=include_file_path,
|
225
|
-
jsonlines=jsonlines,
|
226
|
-
)
|
227
|
-
for p in path
|
228
|
-
]
|
229
|
-
else:
|
230
|
-
data = _read_json_file(
|
231
|
-
path=path,
|
232
|
-
self=self,
|
233
|
-
include_file_path=include_file_path,
|
234
|
-
jsonlines=jsonlines,
|
235
|
-
)
|
236
|
-
if as_dataframe:
|
237
|
-
if not include_file_path:
|
238
|
-
data = [pl.DataFrame(d) for d in data]
|
239
|
-
else:
|
240
|
-
data = [
|
241
|
-
[
|
242
|
-
pl.DataFrame(_data[k]).with_columns(pl.lit(k).alias("file_path"))
|
243
|
-
for k in _data
|
244
|
-
][0]
|
245
|
-
for _data in data
|
246
|
-
]
|
247
|
-
if opt_dtypes:
|
248
|
-
data = [opt_dtype_pl(df, strict=False) for df in data]
|
249
|
-
if concat:
|
250
|
-
result = pl.concat(data, how="diagonal_relaxed")
|
251
|
-
# if opt_dtypes:
|
252
|
-
# result = opt_dtype_pl(result, strict=False)
|
253
|
-
return result
|
254
|
-
return data
|
255
|
-
|
256
|
-
|
257
|
-
def _read_json_batches(
|
258
|
-
self: AbstractFileSystem,
|
259
|
-
path: str | list[str],
|
260
|
-
batch_size: int | None = None,
|
261
|
-
include_file_path: bool = False,
|
262
|
-
jsonlines: bool = False,
|
263
|
-
as_dataframe: bool = True,
|
264
|
-
concat: bool = True,
|
265
|
-
use_threads: bool = True,
|
266
|
-
verbose: bool = False,
|
267
|
-
opt_dtypes: bool = False,
|
268
|
-
**kwargs: Any,
|
269
|
-
) -> Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]:
|
270
|
-
"""Process JSON files in batches with optional parallel reading.
|
271
|
-
|
272
|
-
Internal generator function that handles batched reading of JSON files
|
273
|
-
with support for parallel processing within each batch.
|
274
|
-
|
275
|
-
Args:
|
276
|
-
path: Path(s) to JSON file(s). Glob patterns supported.
|
277
|
-
batch_size: Number of files to process in each batch
|
278
|
-
include_file_path: Include source filepath in output
|
279
|
-
jsonlines: Whether to read as JSON Lines format
|
280
|
-
as_dataframe: Convert output to Polars DataFrame(s)
|
281
|
-
concat: Combine files within each batch
|
282
|
-
use_threads: Enable parallel file reading within batches
|
283
|
-
verbose: Print progress information
|
284
|
-
opt_dtypes: Optimize DataFrame dtypes
|
285
|
-
**kwargs: Additional arguments for DataFrame conversion
|
286
|
-
|
287
|
-
Yields:
|
288
|
-
Each batch of data in requested format:
|
289
|
-
- dict | list[dict]: Raw JSON data
|
290
|
-
- pl.DataFrame: Single DataFrame if concat=True
|
291
|
-
- list[pl.DataFrame]: List of DataFrames if concat=False
|
292
|
-
|
293
|
-
Example:
|
294
|
-
>>> fs = LocalFileSystem()
|
295
|
-
>>> # Process large dataset in batches
|
296
|
-
>>> for batch in fs._read_json_batches(
|
297
|
-
... "data/*.json",
|
298
|
-
... batch_size=100,
|
299
|
-
... as_dataframe=True,
|
300
|
-
... verbose=True
|
301
|
-
... ):
|
302
|
-
... print(f"Batch shape: {batch.shape}")
|
303
|
-
>>>
|
304
|
-
>>> # Parallel batch processing with filepath tracking
|
305
|
-
>>> for batch in fs._read_json_batches(
|
306
|
-
... ["logs1.jsonl", "logs2.jsonl"],
|
307
|
-
... batch_size=1,
|
308
|
-
... include_file_path=True,
|
309
|
-
... use_threads=True
|
310
|
-
... ):
|
311
|
-
... print(f"Processing {batch['file_path'][0]}")
|
312
|
-
"""
|
313
|
-
# Handle path resolution
|
314
|
-
if isinstance(path, str):
|
315
|
-
path = path_to_glob(path, format="json")
|
316
|
-
path = self.glob(path)
|
317
|
-
|
318
|
-
# Process files in batches
|
319
|
-
for i in range(0, len(path), batch_size):
|
320
|
-
batch_paths = path[i : i + batch_size]
|
321
|
-
|
322
|
-
# Read batch with optional parallelization
|
323
|
-
if use_threads and len(batch_paths) > 1:
|
324
|
-
batch_data = run_parallel(
|
325
|
-
_read_json_file,
|
326
|
-
batch_paths,
|
327
|
-
self=self,
|
328
|
-
include_file_path=include_file_path,
|
329
|
-
jsonlines=jsonlines,
|
330
|
-
n_jobs=-1,
|
331
|
-
backend="threading",
|
332
|
-
verbose=verbose,
|
333
|
-
**kwargs,
|
334
|
-
)
|
335
|
-
else:
|
336
|
-
batch_data = [
|
337
|
-
_read_json_file(
|
338
|
-
path=p,
|
339
|
-
self=self,
|
340
|
-
include_file_path=include_file_path,
|
341
|
-
jsonlines=jsonlines,
|
342
|
-
)
|
343
|
-
for p in batch_paths
|
344
|
-
]
|
345
|
-
|
346
|
-
if as_dataframe:
|
347
|
-
if not include_file_path:
|
348
|
-
batch_dfs = [pl.DataFrame(d) for d in batch_data]
|
349
|
-
else:
|
350
|
-
batch_dfs = [
|
351
|
-
[
|
352
|
-
pl.DataFrame(_data[k]).with_columns(
|
353
|
-
pl.lit(k).alias("file_path")
|
354
|
-
)
|
355
|
-
for k in _data
|
356
|
-
][0]
|
357
|
-
for _data in batch_data
|
358
|
-
]
|
359
|
-
if opt_dtypes:
|
360
|
-
batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
|
361
|
-
if concat and len(batch_dfs) > 1:
|
362
|
-
batch_df = pl.concat(batch_dfs, how="diagonal_relaxed")
|
363
|
-
# if opt_dtypes:
|
364
|
-
# batch_df = opt_dtype_pl(batch_df, strict=False)
|
365
|
-
yield batch_df
|
366
|
-
else:
|
367
|
-
# if opt_dtypes:
|
368
|
-
# batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
|
369
|
-
yield batch_dfs
|
370
|
-
else:
|
371
|
-
yield batch_data
|
372
|
-
|
373
|
-
|
374
|
-
def read_json(
|
375
|
-
self: AbstractFileSystem,
|
376
|
-
path: str | list[str],
|
377
|
-
batch_size: int | None = None,
|
378
|
-
include_file_path: bool = False,
|
379
|
-
jsonlines: bool = False,
|
380
|
-
as_dataframe: bool = True,
|
381
|
-
concat: bool = True,
|
382
|
-
use_threads: bool = True,
|
383
|
-
verbose: bool = False,
|
384
|
-
opt_dtypes: bool = False,
|
385
|
-
**kwargs: Any,
|
386
|
-
) -> (
|
387
|
-
dict
|
388
|
-
| list[dict]
|
389
|
-
| pl.DataFrame
|
390
|
-
| list[pl.DataFrame]
|
391
|
-
| Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]
|
392
|
-
):
|
393
|
-
"""Read JSON data from one or more files with powerful options.
|
394
|
-
|
395
|
-
Provides a flexible interface for reading JSON data with support for:
|
396
|
-
- Single file or multiple files
|
397
|
-
- Regular JSON or JSON Lines format
|
398
|
-
- Batch processing for large datasets
|
399
|
-
- Parallel processing
|
400
|
-
- DataFrame conversion
|
401
|
-
- File path tracking
|
402
|
-
|
403
|
-
Args:
|
404
|
-
path: Path(s) to JSON file(s). Can be:
|
405
|
-
- Single path string (globs supported)
|
406
|
-
- List of path strings
|
407
|
-
batch_size: If set, enables batch reading with this many files per batch
|
408
|
-
include_file_path: Include source filepath in output
|
409
|
-
jsonlines: Whether to read as JSON Lines format
|
410
|
-
as_dataframe: Convert output to Polars DataFrame(s)
|
411
|
-
concat: Combine multiple files/batches into single result
|
412
|
-
use_threads: Enable parallel file reading
|
413
|
-
verbose: Print progress information
|
414
|
-
opt_dtypes: Optimize DataFrame dtypes for performance
|
415
|
-
**kwargs: Additional arguments passed to DataFrame conversion
|
416
|
-
|
417
|
-
Returns:
|
418
|
-
Various types depending on arguments:
|
419
|
-
- dict: Single JSON file as dictionary
|
420
|
-
- list[dict]: Multiple JSON files as list of dictionaries
|
421
|
-
- pl.DataFrame: Single or concatenated DataFrame
|
422
|
-
- list[pl.DataFrame]: List of DataFrames (if concat=False)
|
423
|
-
- Generator: If batch_size set, yields batches of above types
|
424
|
-
|
425
|
-
Example:
|
426
|
-
>>> fs = LocalFileSystem()
|
427
|
-
>>> # Read all JSON files in directory
|
428
|
-
>>> df = fs.read_json(
|
429
|
-
... "data/*.json",
|
430
|
-
... as_dataframe=True,
|
431
|
-
... concat=True
|
432
|
-
... )
|
433
|
-
>>> print(df.shape)
|
434
|
-
(1000, 5) # Combined data from all files
|
435
|
-
>>>
|
436
|
-
>>> # Batch process large dataset
|
437
|
-
>>> for batch_df in fs.read_json(
|
438
|
-
... "logs/*.jsonl",
|
439
|
-
... batch_size=100,
|
440
|
-
... jsonlines=True,
|
441
|
-
... include_file_path=True
|
442
|
-
... ):
|
443
|
-
... print(f"Processing {len(batch_df)} records")
|
444
|
-
>>>
|
445
|
-
>>> # Parallel read with custom options
|
446
|
-
>>> dfs = fs.read_json(
|
447
|
-
... ["file1.json", "file2.json"],
|
448
|
-
... use_threads=True,
|
449
|
-
... concat=False,
|
450
|
-
... verbose=True
|
451
|
-
... )
|
452
|
-
>>> print(f"Read {len(dfs)} files")
|
453
|
-
"""
|
454
|
-
if batch_size is not None:
|
455
|
-
return _read_json_batches(
|
456
|
-
self=self,
|
457
|
-
path=path,
|
458
|
-
batch_size=batch_size,
|
459
|
-
include_file_path=include_file_path,
|
460
|
-
jsonlines=jsonlines,
|
461
|
-
as_dataframe=as_dataframe,
|
462
|
-
concat=concat,
|
463
|
-
use_threads=use_threads,
|
464
|
-
verbose=verbose,
|
465
|
-
opt_dtypes=opt_dtypes,
|
466
|
-
**kwargs,
|
467
|
-
)
|
468
|
-
return _read_json(
|
469
|
-
self=self,
|
470
|
-
path=path,
|
471
|
-
include_file_path=include_file_path,
|
472
|
-
jsonlines=jsonlines,
|
473
|
-
as_dataframe=as_dataframe,
|
474
|
-
concat=concat,
|
475
|
-
use_threads=use_threads,
|
476
|
-
verbose=verbose,
|
477
|
-
opt_dtypes=opt_dtypes,
|
478
|
-
**kwargs,
|
479
|
-
)
|
480
|
-
|
481
|
-
|
482
|
-
def _read_csv_file(
|
483
|
-
path: str,
|
484
|
-
self: AbstractFileSystem,
|
485
|
-
include_file_path: bool = False,
|
486
|
-
opt_dtypes: bool = False,
|
487
|
-
**kwargs: Any,
|
488
|
-
) -> pl.DataFrame:
|
489
|
-
"""Read a single CSV file from any filesystem.
|
490
|
-
|
491
|
-
Internal function that handles reading individual CSV files and optionally
|
492
|
-
adds the source filepath as a column.
|
493
|
-
|
494
|
-
Args:
|
495
|
-
path: Path to CSV file
|
496
|
-
self: Filesystem instance to use for reading
|
497
|
-
include_file_path: Add source filepath as a column
|
498
|
-
opt_dtypes: Optimize DataFrame dtypes
|
499
|
-
**kwargs: Additional arguments passed to pl.read_csv()
|
500
|
-
|
501
|
-
Returns:
|
502
|
-
pl.DataFrame: DataFrame containing CSV data
|
503
|
-
|
504
|
-
Example:
|
505
|
-
>>> fs = LocalFileSystem()
|
506
|
-
>>> df = _read_csv_file(
|
507
|
-
... "data.csv",
|
508
|
-
... fs,
|
509
|
-
... include_file_path=True,
|
510
|
-
... delimiter="|"
|
511
|
-
... )
|
512
|
-
>>> print("file_path" in df.columns)
|
513
|
-
True
|
514
|
-
"""
|
515
|
-
print(path) # Debug info
|
516
|
-
with self.open(path) as f:
|
517
|
-
df = pl.read_csv(f, **kwargs)
|
518
|
-
if include_file_path:
|
519
|
-
df = df.with_columns(pl.lit(path).alias("file_path"))
|
520
|
-
if opt_dtypes:
|
521
|
-
df = opt_dtype_pl(df, strict=False)
|
522
|
-
return df
|
523
|
-
|
524
|
-
|
525
|
-
def read_csv_file(
|
526
|
-
self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
|
527
|
-
) -> pl.DataFrame:
|
528
|
-
return _read_csv_file(
|
529
|
-
path=path,
|
530
|
-
self=self,
|
531
|
-
include_file_path=include_file_path,
|
532
|
-
opt_dtypes=opt_dtypes,
|
533
|
-
**kwargs,
|
534
|
-
)
|
535
|
-
|
536
|
-
|
537
|
-
def _read_csv(
|
538
|
-
self,
|
539
|
-
path: str | list[str],
|
540
|
-
include_file_path: bool = False,
|
541
|
-
use_threads: bool = True,
|
542
|
-
concat: bool = True,
|
543
|
-
verbose: bool = False,
|
544
|
-
opt_dtypes: bool = False,
|
545
|
-
**kwargs,
|
546
|
-
) -> pl.DataFrame | list[pl.DataFrame]:
|
547
|
-
"""
|
548
|
-
Read a CSV file or a list of CSV files into a polars DataFrame.
|
549
|
-
|
550
|
-
Args:
|
551
|
-
path: (str | list[str]) Path to the CSV file(s).
|
552
|
-
include_file_path: (bool, optional) If True, return a DataFrame with a 'file_path' column.
|
553
|
-
Defaults to False.
|
554
|
-
use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
|
555
|
-
concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
|
556
|
-
verbose: (bool, optional) If True, print verbose output. Defaults to False.
|
557
|
-
opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
|
558
|
-
**kwargs: Additional keyword arguments.
|
559
|
-
|
560
|
-
Returns:
|
561
|
-
(pl.DataFrame | list[pl.DataFrame]): Polars DataFrame or list of DataFrames.
|
562
|
-
"""
|
563
|
-
if isinstance(path, str):
|
564
|
-
path = path_to_glob(path, format="csv")
|
565
|
-
path = self.glob(path)
|
566
|
-
|
567
|
-
if isinstance(path, list):
|
568
|
-
if use_threads:
|
569
|
-
dfs = run_parallel(
|
570
|
-
_read_csv_file,
|
571
|
-
path,
|
572
|
-
self=self,
|
573
|
-
include_file_path=include_file_path,
|
574
|
-
opt_dtypes=opt_dtypes,
|
575
|
-
n_jobs=-1,
|
576
|
-
backend="threading",
|
577
|
-
verbose=verbose,
|
578
|
-
**kwargs,
|
579
|
-
)
|
580
|
-
else:
|
581
|
-
dfs = [
|
582
|
-
_read_csv_file(
|
583
|
-
p,
|
584
|
-
self=self,
|
585
|
-
include_file_path=include_file_path,
|
586
|
-
opt_dtypes=opt_dtypes,
|
587
|
-
**kwargs,
|
588
|
-
)
|
589
|
-
for p in path
|
590
|
-
]
|
591
|
-
else:
|
592
|
-
dfs = _read_csv_file(
|
593
|
-
path,
|
594
|
-
self=self,
|
595
|
-
include_file_path=include_file_path,
|
596
|
-
opt_dtypes=opt_dtypes,
|
597
|
-
**kwargs,
|
598
|
-
)
|
599
|
-
if concat:
|
600
|
-
result = pl.concat(dfs, how="diagonal_relaxed")
|
601
|
-
# if opt_dtypes:
|
602
|
-
# result = opt_dtype_pl(result, strict=False)
|
603
|
-
return result
|
604
|
-
return dfs
|
605
|
-
|
606
|
-
|
607
|
-
def _read_csv_batches(
|
608
|
-
self: AbstractFileSystem,
|
609
|
-
path: str | list[str],
|
610
|
-
batch_size: int | None = None,
|
611
|
-
include_file_path: bool = False,
|
612
|
-
concat: bool = True,
|
613
|
-
use_threads: bool = True,
|
614
|
-
verbose: bool = False,
|
615
|
-
opt_dtypes: bool = False,
|
616
|
-
**kwargs: Any,
|
617
|
-
) -> Generator[pl.DataFrame | list[pl.DataFrame], None, None]:
|
618
|
-
"""Process CSV files in batches with optional parallel reading.
|
619
|
-
|
620
|
-
Internal generator function that handles batched reading of CSV files
|
621
|
-
with support for parallel processing within each batch.
|
622
|
-
|
623
|
-
Args:
|
624
|
-
path: Path(s) to CSV file(s). Glob patterns supported.
|
625
|
-
batch_size: Number of files to process in each batch
|
626
|
-
include_file_path: Add source filepath as a column
|
627
|
-
concat: Combine files within each batch
|
628
|
-
use_threads: Enable parallel file reading within batches
|
629
|
-
verbose: Print progress information
|
630
|
-
opt_dtypes: Optimize DataFrame dtypes
|
631
|
-
**kwargs: Additional arguments passed to pl.read_csv()
|
632
|
-
|
633
|
-
Yields:
|
634
|
-
Each batch of data in requested format:
|
635
|
-
- pl.DataFrame: Single DataFrame if concat=True
|
636
|
-
- list[pl.DataFrame]: List of DataFrames if concat=False
|
637
|
-
|
638
|
-
Example:
|
639
|
-
>>> fs = LocalFileSystem()
|
640
|
-
>>> # Process large dataset in batches
|
641
|
-
>>> for batch in fs._read_csv_batches(
|
642
|
-
... "data/*.csv",
|
643
|
-
... batch_size=100,
|
644
|
-
... include_file_path=True,
|
645
|
-
... verbose=True
|
646
|
-
... ):
|
647
|
-
... print(f"Batch columns: {batch.columns}")
|
648
|
-
>>>
|
649
|
-
>>> # Parallel processing without concatenation
|
650
|
-
>>> for batch in fs._read_csv_batches(
|
651
|
-
... ["file1.csv", "file2.csv"],
|
652
|
-
... batch_size=1,
|
653
|
-
... concat=False,
|
654
|
-
... use_threads=True
|
655
|
-
... ):
|
656
|
-
... for df in batch:
|
657
|
-
... print(f"DataFrame shape: {df.shape}")
|
658
|
-
"""
|
659
|
-
# Handle path resolution
|
660
|
-
if isinstance(path, str):
|
661
|
-
path = path_to_glob(path, format="csv")
|
662
|
-
path = self.glob(path)
|
663
|
-
|
664
|
-
# Ensure path is a list
|
665
|
-
if isinstance(path, str):
|
666
|
-
path = [path]
|
667
|
-
|
668
|
-
# Process files in batches
|
669
|
-
for i in range(0, len(path), batch_size):
|
670
|
-
batch_paths = path[i : i + batch_size]
|
671
|
-
|
672
|
-
# Read batch with optional parallelization
|
673
|
-
if use_threads and len(batch_paths) > 1:
|
674
|
-
batch_dfs = run_parallel(
|
675
|
-
_read_csv_file,
|
676
|
-
batch_paths,
|
677
|
-
self=self,
|
678
|
-
include_file_path=include_file_path,
|
679
|
-
n_jobs=-1,
|
680
|
-
backend="threading",
|
681
|
-
verbose=verbose,
|
682
|
-
opt_dtypes=opt_dtypes,
|
683
|
-
**kwargs,
|
684
|
-
)
|
685
|
-
else:
|
686
|
-
batch_dfs = [
|
687
|
-
_read_csv_file(
|
688
|
-
p,
|
689
|
-
self=self,
|
690
|
-
include_file_path=include_file_path,
|
691
|
-
opt_dtypes=opt_dtypes,
|
692
|
-
**kwargs,
|
693
|
-
)
|
694
|
-
for p in batch_paths
|
695
|
-
]
|
696
|
-
|
697
|
-
# if opt_dtypes:
|
698
|
-
# batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
|
699
|
-
|
700
|
-
if concat and len(batch_dfs) > 1:
|
701
|
-
result = pl.concat(batch_dfs, how="diagonal_relaxed")
|
702
|
-
# if opt_dtypes:
|
703
|
-
# result = opt_dtype_pl(result, strict=False)
|
704
|
-
yield result
|
705
|
-
else:
|
706
|
-
yield batch_dfs
|
707
|
-
|
708
|
-
|
709
|
-
def read_csv(
|
710
|
-
self: AbstractFileSystem,
|
711
|
-
path: str | list[str],
|
712
|
-
batch_size: int | None = None,
|
713
|
-
include_file_path: bool = False,
|
714
|
-
concat: bool = True,
|
715
|
-
use_threads: bool = True,
|
716
|
-
verbose: bool = False,
|
717
|
-
opt_dtypes: bool = False,
|
718
|
-
**kwargs: Any,
|
719
|
-
) -> (
|
720
|
-
pl.DataFrame
|
721
|
-
| list[pl.DataFrame]
|
722
|
-
| Generator[pl.DataFrame | list[pl.DataFrame], None, None]
|
723
|
-
):
|
724
|
-
"""Read CSV data from one or more files with powerful options.
|
725
|
-
|
726
|
-
Provides a flexible interface for reading CSV files with support for:
|
727
|
-
- Single file or multiple files
|
728
|
-
- Batch processing for large datasets
|
729
|
-
- Parallel processing
|
730
|
-
- File path tracking
|
731
|
-
- Polars DataFrame output
|
732
|
-
|
733
|
-
Args:
|
734
|
-
path: Path(s) to CSV file(s). Can be:
|
735
|
-
- Single path string (globs supported)
|
736
|
-
- List of path strings
|
737
|
-
batch_size: If set, enables batch reading with this many files per batch
|
738
|
-
include_file_path: Add source filepath as a column
|
739
|
-
concat: Combine multiple files/batches into single DataFrame
|
740
|
-
use_threads: Enable parallel file reading
|
741
|
-
verbose: Print progress information
|
742
|
-
**kwargs: Additional arguments passed to pl.read_csv()
|
743
|
-
|
744
|
-
Returns:
|
745
|
-
Various types depending on arguments:
|
746
|
-
- pl.DataFrame: Single or concatenated DataFrame
|
747
|
-
- list[pl.DataFrame]: List of DataFrames (if concat=False)
|
748
|
-
- Generator: If batch_size set, yields batches of above types
|
749
|
-
|
750
|
-
Example:
|
751
|
-
>>> fs = LocalFileSystem()
|
752
|
-
>>> # Read all CSVs in directory
|
753
|
-
>>> df = fs.read_csv(
|
754
|
-
... "data/*.csv",
|
755
|
-
... include_file_path=True
|
756
|
-
... )
|
757
|
-
>>> print(df.columns)
|
758
|
-
['file_path', 'col1', 'col2', ...]
|
759
|
-
>>>
|
760
|
-
>>> # Batch process large dataset
|
761
|
-
>>> for batch_df in fs.read_csv(
|
762
|
-
... "logs/*.csv",
|
763
|
-
... batch_size=100,
|
764
|
-
... use_threads=True,
|
765
|
-
... verbose=True
|
766
|
-
... ):
|
767
|
-
... print(f"Processing {len(batch_df)} rows")
|
768
|
-
>>>
|
769
|
-
>>> # Multiple files without concatenation
|
770
|
-
>>> dfs = fs.read_csv(
|
771
|
-
... ["file1.csv", "file2.csv"],
|
772
|
-
... concat=False,
|
773
|
-
... use_threads=True
|
774
|
-
... )
|
775
|
-
>>> print(f"Read {len(dfs)} files")
|
776
|
-
"""
|
777
|
-
if batch_size is not None:
|
778
|
-
return _read_csv_batches(
|
779
|
-
self=self,
|
780
|
-
path=path,
|
781
|
-
batch_size=batch_size,
|
782
|
-
include_file_path=include_file_path,
|
783
|
-
concat=concat,
|
784
|
-
use_threads=use_threads,
|
785
|
-
verbose=verbose,
|
786
|
-
opt_dtypes=opt_dtypes,
|
787
|
-
**kwargs,
|
788
|
-
)
|
789
|
-
return _read_csv(
|
790
|
-
self=self,
|
791
|
-
path=path,
|
792
|
-
include_file_path=include_file_path,
|
793
|
-
concat=concat,
|
794
|
-
use_threads=use_threads,
|
795
|
-
verbose=verbose,
|
796
|
-
opt_dtypes=opt_dtypes,
|
797
|
-
**kwargs,
|
798
|
-
)
|
799
|
-
|
800
|
-
|
801
|
-
def _read_parquet_file(
|
802
|
-
path: str,
|
803
|
-
self: AbstractFileSystem,
|
804
|
-
include_file_path: bool = False,
|
805
|
-
opt_dtypes: bool = False,
|
806
|
-
**kwargs: Any,
|
807
|
-
) -> pa.Table:
|
808
|
-
"""Read a single Parquet file from any filesystem.
|
809
|
-
|
810
|
-
Internal function that handles reading individual Parquet files and
|
811
|
-
optionally adds the source filepath as a column.
|
812
|
-
|
813
|
-
Args:
|
814
|
-
path: Path to Parquet file
|
815
|
-
self: Filesystem instance to use for reading
|
816
|
-
include_file_path: Add source filepath as a column
|
817
|
-
opt_dtypes: Optimize DataFrame dtypes
|
818
|
-
**kwargs: Additional arguments passed to pq.read_table()
|
819
|
-
|
820
|
-
Returns:
|
821
|
-
pa.Table: PyArrow Table containing Parquet data
|
822
|
-
|
823
|
-
Example:
|
824
|
-
>>> fs = LocalFileSystem()
|
825
|
-
>>> table = _read_parquet_file(
|
826
|
-
... "data.parquet",
|
827
|
-
... fs,
|
828
|
-
... include_file_path=True,
|
829
|
-
... use_threads=True
|
830
|
-
... )
|
831
|
-
>>> print("file_path" in table.column_names)
|
832
|
-
True
|
833
|
-
"""
|
834
|
-
if not path.endswith(".parquet"):
|
835
|
-
raise ValueError(
|
836
|
-
f"Path '{path}' does not point to a Parquet file. "
|
837
|
-
"Ensure the path ends with '.parquet'."
|
838
|
-
)
|
839
|
-
table = pq.read_table(path, filesystem=self, **kwargs)
|
840
|
-
if include_file_path:
|
841
|
-
table = table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
|
842
|
-
if opt_dtypes:
|
843
|
-
table = opt_dtype_pa(table, strict=False)
|
844
|
-
return table
|
845
|
-
|
846
|
-
|
847
|
-
def read_parquet_file(
|
848
|
-
self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
|
849
|
-
) -> pa.Table:
|
850
|
-
"""Read a single Parquet file from any filesystem.
|
851
|
-
|
852
|
-
Internal function that handles reading individual Parquet files and
|
853
|
-
optionally adds the source filepath as a column.
|
854
|
-
|
855
|
-
Args:
|
856
|
-
path: Path to Parquet file
|
857
|
-
include_file_path: Add source filepath as a column
|
858
|
-
opt_dtypes: Optimize DataFrame dtypes
|
859
|
-
**kwargs: Additional arguments passed to pq.read_table()
|
860
|
-
|
861
|
-
Returns:
|
862
|
-
pa.Table: PyArrow Table containing Parquet data
|
863
|
-
|
864
|
-
Example:
|
865
|
-
>>> fs = LocalFileSystem()
|
866
|
-
>>> table = fs.read_parquet_file(
|
867
|
-
... "data.parquet",
|
868
|
-
... include_file_path=True,
|
869
|
-
... use_threads=True
|
870
|
-
... )
|
871
|
-
>>> print("file_path" in table.column_names)
|
872
|
-
True
|
873
|
-
"""
|
874
|
-
return _read_parquet_file(
|
875
|
-
path=path,
|
876
|
-
self=self,
|
877
|
-
include_file_path=include_file_path,
|
878
|
-
opt_dtypes=opt_dtypes,
|
879
|
-
**kwargs,
|
880
|
-
)
|
881
|
-
|
882
|
-
|
883
|
-
def _read_parquet(
|
884
|
-
self,
|
885
|
-
path: str | list[str],
|
886
|
-
include_file_path: bool = False,
|
887
|
-
use_threads: bool = True,
|
888
|
-
concat: bool = True,
|
889
|
-
verbose: bool = False,
|
890
|
-
opt_dtypes: bool = False,
|
891
|
-
**kwargs,
|
892
|
-
) -> pa.Table | list[pa.Table]:
|
893
|
-
"""
|
894
|
-
Read a Parquet file or a list of Parquet files into a pyarrow Table.
|
895
|
-
|
896
|
-
Args:
|
897
|
-
path: (str | list[str]) Path to the Parquet file(s).
|
898
|
-
include_file_path: (bool, optional) If True, return a Table with a 'file_path' column.
|
899
|
-
Defaults to False.
|
900
|
-
use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
|
901
|
-
concat: (bool, optional) If True, concatenate the Tables. Defaults to True.
|
902
|
-
**kwargs: Additional keyword arguments.
|
903
|
-
|
904
|
-
Returns:
|
905
|
-
(pa.Table | list[pa.Table]): Pyarrow Table or list of Pyarrow Tables.
|
906
|
-
"""
|
907
|
-
# if not include_file_path and concat:
|
908
|
-
# if isinstance(path, str):
|
909
|
-
# path = path.replace("**", "").replace("*.parquet", "")
|
910
|
-
# table = _read_parquet_file(path, self=self, opt_dtypes=opt_dtypes, **kwargs)
|
911
|
-
# return table
|
912
|
-
# else:
|
913
|
-
if isinstance(path, str):
|
914
|
-
path = path_to_glob(path, format="parquet")
|
915
|
-
path = self.glob(path)
|
916
|
-
|
917
|
-
if isinstance(path, list):
|
918
|
-
if use_threads:
|
919
|
-
tables = run_parallel(
|
920
|
-
_read_parquet_file,
|
921
|
-
path,
|
922
|
-
self=self,
|
923
|
-
include_file_path=include_file_path,
|
924
|
-
opt_dtypes=opt_dtypes,
|
925
|
-
n_jobs=-1,
|
926
|
-
backend="threading",
|
927
|
-
verbose=verbose,
|
928
|
-
**kwargs,
|
929
|
-
)
|
930
|
-
else:
|
931
|
-
tables = [
|
932
|
-
_read_parquet_file(
|
933
|
-
p,
|
934
|
-
self=self,
|
935
|
-
include_file_path=include_file_path,
|
936
|
-
opt_dtypes=opt_dtypes,
|
937
|
-
**kwargs,
|
938
|
-
)
|
939
|
-
for p in path
|
940
|
-
]
|
941
|
-
else:
|
942
|
-
tables = _read_parquet_file(
|
943
|
-
path=path,
|
944
|
-
self=self,
|
945
|
-
include_file_path=include_file_path,
|
946
|
-
opt_dtypes=opt_dtypes,
|
947
|
-
**kwargs,
|
948
|
-
)
|
949
|
-
if concat:
|
950
|
-
# Unify schemas before concatenation if opt_dtypes or multiple tables
|
951
|
-
if isinstance(tables, list):
|
952
|
-
if len(tables) > 0:
|
953
|
-
schemas = [t.schema for t in tables]
|
954
|
-
unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
|
955
|
-
tables = [cast_schema(t, unified_schema) for t in tables]
|
956
|
-
|
957
|
-
tables = [table for table in tables if table.num_rows > 0]
|
958
|
-
if not tables:
|
959
|
-
return unified_schema.empty_table()
|
960
|
-
|
961
|
-
result = pa.concat_tables(
|
962
|
-
tables,
|
963
|
-
promote_options="permissive",
|
964
|
-
)
|
965
|
-
# if opt_dtypes:
|
966
|
-
# result = opt_dtype_pa(result, strict=False)
|
967
|
-
return result
|
968
|
-
elif isinstance(tables, pa.Table):
|
969
|
-
# if opt_dtypes:
|
970
|
-
# tables = opt_dtype_pa(tables, strict=False)
|
971
|
-
return tables
|
972
|
-
else:
|
973
|
-
tables = [table for table in tables if table.num_rows > 0]
|
974
|
-
if not tables:
|
975
|
-
return unified_schema.empty_table()
|
976
|
-
|
977
|
-
result = pa.concat_tables(
|
978
|
-
tables,
|
979
|
-
promote_options="permissive",
|
980
|
-
)
|
981
|
-
return tables
|
982
|
-
|
983
|
-
|
984
|
-
def _read_parquet_batches(
|
985
|
-
self: AbstractFileSystem,
|
986
|
-
path: str | list[str],
|
987
|
-
batch_size: int | None = None,
|
988
|
-
include_file_path: bool = False,
|
989
|
-
use_threads: bool = True,
|
990
|
-
concat: bool = True,
|
991
|
-
verbose: bool = False,
|
992
|
-
opt_dtypes: bool = False,
|
993
|
-
**kwargs: Any,
|
994
|
-
) -> Generator[pa.Table | list[pa.Table], None, None]:
|
995
|
-
"""Process Parquet files in batches with performance optimizations.
|
996
|
-
|
997
|
-
Internal generator function that handles batched reading of Parquet files
|
998
|
-
with support for:
|
999
|
-
- Parallel processing within batches
|
1000
|
-
- Metadata-based optimizations
|
1001
|
-
- Memory-efficient processing
|
1002
|
-
- Progress tracking
|
1003
|
-
|
1004
|
-
Uses fast path for simple cases:
|
1005
|
-
- Single directory with _metadata
|
1006
|
-
- No need for filepath column
|
1007
|
-
- Concatenated output
|
1008
|
-
|
1009
|
-
Args:
|
1010
|
-
path: Path(s) to Parquet file(s). Glob patterns supported.
|
1011
|
-
batch_size: Number of files to process in each batch
|
1012
|
-
include_file_path: Add source filepath as a column
|
1013
|
-
use_threads: Enable parallel file reading within batches
|
1014
|
-
concat: Combine files within each batch
|
1015
|
-
verbose: Print progress information
|
1016
|
-
**kwargs: Additional arguments passed to pq.read_table()
|
1017
|
-
|
1018
|
-
Yields:
|
1019
|
-
Each batch of data in requested format:
|
1020
|
-
- pa.Table: Single Table if concat=True
|
1021
|
-
- list[pa.Table]: List of Tables if concat=False
|
1022
|
-
|
1023
|
-
Example:
|
1024
|
-
>>> fs = LocalFileSystem()
|
1025
|
-
>>> # Fast path for simple case
|
1026
|
-
>>> next(_read_parquet_batches(
|
1027
|
-
... fs,
|
1028
|
-
... "data/", # Contains _metadata
|
1029
|
-
... batch_size=1000
|
1030
|
-
... ))
|
1031
|
-
>>>
|
1032
|
-
>>> # Parallel batch processing
|
1033
|
-
>>> for batch in fs._read_parquet_batches(
|
1034
|
-
... fs,
|
1035
|
-
... ["file1.parquet", "file2.parquet"],
|
1036
|
-
... batch_size=1,
|
1037
|
-
... include_file_path=True,
|
1038
|
-
... use_threads=True
|
1039
|
-
... ):
|
1040
|
-
... print(f"Batch schema: {batch.schema}")
|
1041
|
-
"""
|
1042
|
-
# Fast path for simple cases
|
1043
|
-
# if not include_file_path and concat and batch_size is None:
|
1044
|
-
# if isinstance(path, str):
|
1045
|
-
# path = path.replace("**", "").replace("*.parquet", "")
|
1046
|
-
# table = _read_parquet_file(
|
1047
|
-
# path=path, self=self, opt_dtypes=opt_dtypes, **kwargs
|
1048
|
-
# )
|
1049
|
-
# yield table
|
1050
|
-
# return
|
1051
|
-
|
1052
|
-
# Resolve path(s) to list
|
1053
|
-
if isinstance(path, str):
|
1054
|
-
path = path_to_glob(path, format="parquet")
|
1055
|
-
path = self.glob(path)
|
1056
|
-
|
1057
|
-
if not isinstance(path, list):
|
1058
|
-
yield _read_parquet_file(
|
1059
|
-
path=path,
|
1060
|
-
self=self,
|
1061
|
-
include_file_path=include_file_path,
|
1062
|
-
opt_dtypes=opt_dtypes,
|
1063
|
-
**kwargs,
|
1064
|
-
)
|
1065
|
-
return
|
1066
|
-
|
1067
|
-
# Process in batches
|
1068
|
-
for i in range(0, len(path), batch_size):
|
1069
|
-
batch_paths = path[i : i + batch_size]
|
1070
|
-
if use_threads and len(batch_paths) > 1:
|
1071
|
-
batch_tables = run_parallel(
|
1072
|
-
_read_parquet_file,
|
1073
|
-
batch_paths,
|
1074
|
-
self=self,
|
1075
|
-
include_file_path=include_file_path,
|
1076
|
-
opt_dtypes=opt_dtypes,
|
1077
|
-
n_jobs=-1,
|
1078
|
-
backend="threading",
|
1079
|
-
verbose=verbose,
|
1080
|
-
**kwargs,
|
1081
|
-
)
|
1082
|
-
else:
|
1083
|
-
batch_tables = [
|
1084
|
-
_read_parquet_file(
|
1085
|
-
p,
|
1086
|
-
self=self,
|
1087
|
-
include_file_path=include_file_path,
|
1088
|
-
opt_dtypes=opt_dtypes,
|
1089
|
-
**kwargs,
|
1090
|
-
)
|
1091
|
-
for p in batch_paths
|
1092
|
-
]
|
1093
|
-
|
1094
|
-
if concat and batch_tables:
|
1095
|
-
# Unify schemas before concatenation
|
1096
|
-
if len(batch_tables) > 1:
|
1097
|
-
schemas = [t.schema for t in batch_tables]
|
1098
|
-
unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
|
1099
|
-
batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
|
1100
|
-
batch_tables = [table for table in batch_tables if table.num_rows > 0]
|
1101
|
-
if not batch_tables:
|
1102
|
-
yield unified_schema.empty_table()
|
1103
|
-
batch_table = pa.concat_tables(
|
1104
|
-
batch_tables,
|
1105
|
-
promote_options="permissive",
|
1106
|
-
)
|
1107
|
-
# if opt_dtypes:
|
1108
|
-
# result = opt_dtype_pa(result, strict=False)
|
1109
|
-
yield batch_table
|
1110
|
-
else:
|
1111
|
-
# if opt_dtypes and isinstance(batch_tables, list):
|
1112
|
-
# batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
|
1113
|
-
yield batch_tables
|
1114
|
-
|
1115
|
-
|
1116
|
-
def read_parquet(
|
1117
|
-
self: AbstractFileSystem,
|
1118
|
-
path: str | list[str],
|
1119
|
-
batch_size: int | None = None,
|
1120
|
-
include_file_path: bool = False,
|
1121
|
-
concat: bool = True,
|
1122
|
-
use_threads: bool = True,
|
1123
|
-
verbose: bool = False,
|
1124
|
-
opt_dtypes: bool = False,
|
1125
|
-
**kwargs: Any,
|
1126
|
-
) -> pa.Table | list[pa.Table] | Generator[pa.Table | list[pa.Table], None, None]:
|
1127
|
-
"""Read Parquet data with advanced features and optimizations.
|
1128
|
-
|
1129
|
-
Provides a high-performance interface for reading Parquet files with support for:
|
1130
|
-
- Single file or multiple files
|
1131
|
-
- Batch processing for large datasets
|
1132
|
-
- Parallel processing
|
1133
|
-
- File path tracking
|
1134
|
-
- Automatic concatenation
|
1135
|
-
- PyArrow Table output
|
1136
|
-
|
1137
|
-
The function automatically uses optimal reading strategies:
|
1138
|
-
- Direct dataset reading for simple cases
|
1139
|
-
- Parallel processing for multiple files
|
1140
|
-
- Batched reading for memory efficiency
|
1141
|
-
|
1142
|
-
Args:
|
1143
|
-
path: Path(s) to Parquet file(s). Can be:
|
1144
|
-
- Single path string (globs supported)
|
1145
|
-
- List of path strings
|
1146
|
-
- Directory containing _metadata file
|
1147
|
-
batch_size: If set, enables batch reading with this many files per batch
|
1148
|
-
include_file_path: Add source filepath as a column
|
1149
|
-
concat: Combine multiple files/batches into single Table
|
1150
|
-
use_threads: Enable parallel file reading
|
1151
|
-
verbose: Print progress information
|
1152
|
-
opt_dtypes: Optimize Table dtypes for performance
|
1153
|
-
**kwargs: Additional arguments passed to pq.read_table()
|
1154
|
-
|
1155
|
-
Returns:
|
1156
|
-
Various types depending on arguments:
|
1157
|
-
- pa.Table: Single or concatenated Table
|
1158
|
-
- list[pa.Table]: List of Tables (if concat=False)
|
1159
|
-
- Generator: If batch_size set, yields batches of above types
|
1160
|
-
|
1161
|
-
Example:
|
1162
|
-
>>> fs = LocalFileSystem()
|
1163
|
-
>>> # Read all Parquet files in directory
|
1164
|
-
>>> table = fs.read_parquet(
|
1165
|
-
... "data/*.parquet",
|
1166
|
-
... include_file_path=True
|
1167
|
-
... )
|
1168
|
-
>>> print(table.column_names)
|
1169
|
-
['file_path', 'col1', 'col2', ...]
|
1170
|
-
>>>
|
1171
|
-
>>> # Batch process large dataset
|
1172
|
-
>>> for batch in fs.read_parquet(
|
1173
|
-
... "data/*.parquet",
|
1174
|
-
... batch_size=100,
|
1175
|
-
... use_threads=True
|
1176
|
-
... ):
|
1177
|
-
... print(f"Processing {batch.num_rows} rows")
|
1178
|
-
>>>
|
1179
|
-
>>> # Read from directory with metadata
|
1180
|
-
>>> table = fs.read_parquet(
|
1181
|
-
... "data/", # Contains _metadata
|
1182
|
-
... use_threads=True
|
1183
|
-
... )
|
1184
|
-
>>> print(f"Total rows: {table.num_rows}")
|
1185
|
-
"""
|
1186
|
-
if batch_size is not None:
|
1187
|
-
return _read_parquet_batches(
|
1188
|
-
self=self,
|
1189
|
-
path=path,
|
1190
|
-
batch_size=batch_size,
|
1191
|
-
include_file_path=include_file_path,
|
1192
|
-
concat=concat,
|
1193
|
-
use_threads=use_threads,
|
1194
|
-
verbose=verbose,
|
1195
|
-
opt_dtypes=opt_dtypes,
|
1196
|
-
**kwargs,
|
1197
|
-
)
|
1198
|
-
return _read_parquet(
|
1199
|
-
self=self,
|
1200
|
-
path=path,
|
1201
|
-
include_file_path=include_file_path,
|
1202
|
-
use_threads=use_threads,
|
1203
|
-
concat=concat,
|
1204
|
-
verbose=verbose,
|
1205
|
-
opt_dtypes=opt_dtypes,
|
1206
|
-
**kwargs,
|
1207
|
-
)
|
1208
|
-
|
1209
|
-
|
1210
|
-
def read_files(
|
1211
|
-
self: AbstractFileSystem,
|
1212
|
-
path: str | list[str],
|
1213
|
-
format: str,
|
1214
|
-
batch_size: int | None = None,
|
1215
|
-
include_file_path: bool = False,
|
1216
|
-
concat: bool = True,
|
1217
|
-
jsonlines: bool = False,
|
1218
|
-
use_threads: bool = True,
|
1219
|
-
verbose: bool = False,
|
1220
|
-
opt_dtypes: bool = False,
|
1221
|
-
**kwargs: Any,
|
1222
|
-
) -> (
|
1223
|
-
pl.DataFrame
|
1224
|
-
| pa.Table
|
1225
|
-
| list[pl.DataFrame]
|
1226
|
-
| list[pa.Table]
|
1227
|
-
| Generator[
|
1228
|
-
pl.DataFrame | pa.Table | list[pl.DataFrame] | list[pa.Table], None, None
|
1229
|
-
]
|
1230
|
-
):
|
1231
|
-
"""Universal interface for reading data files of any supported format.
|
1232
|
-
|
1233
|
-
A unified API that automatically delegates to the appropriate reading function
|
1234
|
-
based on file format, while preserving all advanced features like:
|
1235
|
-
- Batch processing
|
1236
|
-
- Parallel reading
|
1237
|
-
- File path tracking
|
1238
|
-
- Format-specific optimizations
|
1239
|
-
|
1240
|
-
Args:
|
1241
|
-
path: Path(s) to data file(s). Can be:
|
1242
|
-
- Single path string (globs supported)
|
1243
|
-
- List of path strings
|
1244
|
-
format: File format to read. Supported values:
|
1245
|
-
- "json": Regular JSON or JSON Lines
|
1246
|
-
- "csv": CSV files
|
1247
|
-
- "parquet": Parquet files
|
1248
|
-
batch_size: If set, enables batch reading with this many files per batch
|
1249
|
-
include_file_path: Add source filepath as column/field
|
1250
|
-
concat: Combine multiple files/batches into single result
|
1251
|
-
jsonlines: For JSON format, whether to read as JSON Lines
|
1252
|
-
use_threads: Enable parallel file reading
|
1253
|
-
verbose: Print progress information
|
1254
|
-
opt_dtypes: Optimize DataFrame/Arrow Table dtypes for performance
|
1255
|
-
**kwargs: Additional format-specific arguments
|
1256
|
-
|
1257
|
-
Returns:
|
1258
|
-
Various types depending on format and arguments:
|
1259
|
-
- pl.DataFrame: For CSV and optionally JSON
|
1260
|
-
- pa.Table: For Parquet
|
1261
|
-
- list[pl.DataFrame | pa.Table]: Without concatenation
|
1262
|
-
- Generator: If batch_size set, yields batches
|
1263
|
-
|
1264
|
-
Example:
|
1265
|
-
>>> fs = LocalFileSystem()
|
1266
|
-
>>> # Read CSV files
|
1267
|
-
>>> df = fs.read_files(
|
1268
|
-
... "data/*.csv",
|
1269
|
-
... format="csv",
|
1270
|
-
... include_file_path=True
|
1271
|
-
... )
|
1272
|
-
>>> print(type(df))
|
1273
|
-
<class 'polars.DataFrame'>
|
1274
|
-
>>>
|
1275
|
-
>>> # Batch process Parquet files
|
1276
|
-
>>> for batch in fs.read_files(
|
1277
|
-
... "data/*.parquet",
|
1278
|
-
... format="parquet",
|
1279
|
-
... batch_size=100,
|
1280
|
-
... use_threads=True
|
1281
|
-
... ):
|
1282
|
-
... print(f"Batch type: {type(batch)}")
|
1283
|
-
>>>
|
1284
|
-
>>> # Read JSON Lines
|
1285
|
-
>>> df = fs.read_files(
|
1286
|
-
... "logs/*.jsonl",
|
1287
|
-
... format="json",
|
1288
|
-
... jsonlines=True,
|
1289
|
-
... concat=True
|
1290
|
-
... )
|
1291
|
-
>>> print(df.columns)
|
1292
|
-
"""
|
1293
|
-
if format == "json":
|
1294
|
-
if batch_size is not None:
|
1295
|
-
return read_json(
|
1296
|
-
self=self,
|
1297
|
-
path=path,
|
1298
|
-
batch_size=batch_size,
|
1299
|
-
include_file_path=include_file_path,
|
1300
|
-
jsonlines=jsonlines,
|
1301
|
-
concat=concat,
|
1302
|
-
use_threads=use_threads,
|
1303
|
-
verbose=verbose,
|
1304
|
-
opt_dtypes=opt_dtypes,
|
1305
|
-
**kwargs,
|
1306
|
-
)
|
1307
|
-
return read_json(
|
1308
|
-
self=self,
|
1309
|
-
path=path,
|
1310
|
-
include_file_path=include_file_path,
|
1311
|
-
jsonlines=jsonlines,
|
1312
|
-
concat=concat,
|
1313
|
-
use_threads=use_threads,
|
1314
|
-
verbose=verbose,
|
1315
|
-
opt_dtypes=opt_dtypes,
|
1316
|
-
**kwargs,
|
1317
|
-
)
|
1318
|
-
elif format == "csv":
|
1319
|
-
if batch_size is not None:
|
1320
|
-
return read_csv(
|
1321
|
-
self=self,
|
1322
|
-
path=path,
|
1323
|
-
batch_size=batch_size,
|
1324
|
-
include_file_path=include_file_path,
|
1325
|
-
concat=concat,
|
1326
|
-
use_threads=use_threads,
|
1327
|
-
verbose=verbose,
|
1328
|
-
opt_dtypes=opt_dtypes,
|
1329
|
-
**kwargs,
|
1330
|
-
)
|
1331
|
-
return read_csv(
|
1332
|
-
self=self,
|
1333
|
-
path=path,
|
1334
|
-
include_file_path=include_file_path,
|
1335
|
-
use_threads=use_threads,
|
1336
|
-
concat=concat,
|
1337
|
-
verbose=verbose,
|
1338
|
-
opt_dtypes=opt_dtypes,
|
1339
|
-
**kwargs,
|
1340
|
-
)
|
1341
|
-
elif format == "parquet":
|
1342
|
-
if batch_size is not None:
|
1343
|
-
return read_parquet(
|
1344
|
-
self=self,
|
1345
|
-
path=path,
|
1346
|
-
batch_size=batch_size,
|
1347
|
-
include_file_path=include_file_path,
|
1348
|
-
concat=concat,
|
1349
|
-
use_threads=use_threads,
|
1350
|
-
verbose=verbose,
|
1351
|
-
opt_dtypes=opt_dtypes,
|
1352
|
-
**kwargs,
|
1353
|
-
)
|
1354
|
-
return read_parquet(
|
1355
|
-
self=self,
|
1356
|
-
path=path,
|
1357
|
-
include_file_path=include_file_path,
|
1358
|
-
use_threads=use_threads,
|
1359
|
-
concat=concat,
|
1360
|
-
verbose=verbose,
|
1361
|
-
opt_dtypes=opt_dtypes,
|
1362
|
-
**kwargs,
|
1363
|
-
)
|
1364
|
-
|
1365
|
-
|
1366
|
-
def pyarrow_dataset(
|
1367
|
-
self: AbstractFileSystem,
|
1368
|
-
path: str,
|
1369
|
-
format: str = "parquet",
|
1370
|
-
schema: pa.Schema | None = None,
|
1371
|
-
partitioning: str | list[str] | pds.Partitioning = None,
|
1372
|
-
**kwargs: Any,
|
1373
|
-
) -> pds.Dataset:
|
1374
|
-
"""Create a PyArrow dataset from files in any supported format.
|
1375
|
-
|
1376
|
-
Creates a dataset that provides optimized reading and querying capabilities
|
1377
|
-
including:
|
1378
|
-
- Schema inference and enforcement
|
1379
|
-
- Partition discovery and pruning
|
1380
|
-
- Predicate pushdown
|
1381
|
-
- Column projection
|
1382
|
-
|
1383
|
-
Args:
|
1384
|
-
path: Base path to dataset files
|
1385
|
-
format: File format. Currently supports:
|
1386
|
-
- "parquet" (default)
|
1387
|
-
- "csv"
|
1388
|
-
- "json" (experimental)
|
1389
|
-
schema: Optional schema to enforce. If None, inferred from data.
|
1390
|
-
partitioning: How the dataset is partitioned. Can be:
|
1391
|
-
- str: Single partition field
|
1392
|
-
- list[str]: Multiple partition fields
|
1393
|
-
- pds.Partitioning: Custom partitioning scheme
|
1394
|
-
**kwargs: Additional arguments for dataset creation
|
1395
|
-
|
1396
|
-
Returns:
|
1397
|
-
pds.Dataset: PyArrow dataset instance
|
1398
|
-
|
1399
|
-
Example:
|
1400
|
-
>>> fs = LocalFileSystem()
|
1401
|
-
>>> # Simple Parquet dataset
|
1402
|
-
>>> ds = fs.pyarrow_dataset("data/")
|
1403
|
-
>>> print(ds.schema)
|
1404
|
-
>>>
|
1405
|
-
>>> # Partitioned dataset
|
1406
|
-
>>> ds = fs.pyarrow_dataset(
|
1407
|
-
... "events/",
|
1408
|
-
... partitioning=["year", "month"]
|
1409
|
-
... )
|
1410
|
-
>>> # Query with partition pruning
|
1411
|
-
>>> table = ds.to_table(
|
1412
|
-
... filter=(ds.field("year") == 2024)
|
1413
|
-
... )
|
1414
|
-
>>>
|
1415
|
-
>>> # CSV with schema
|
1416
|
-
>>> ds = fs.pyarrow_dataset(
|
1417
|
-
... "logs/",
|
1418
|
-
... format="csv",
|
1419
|
-
... schema=pa.schema([
|
1420
|
-
... ("timestamp", pa.timestamp("s")),
|
1421
|
-
... ("level", pa.string()),
|
1422
|
-
... ("message", pa.string())
|
1423
|
-
... ])
|
1424
|
-
... )
|
1425
|
-
"""
|
1426
|
-
return pds.dataset(
|
1427
|
-
path,
|
1428
|
-
filesystem=self,
|
1429
|
-
partitioning=partitioning,
|
1430
|
-
schema=schema,
|
1431
|
-
format=format,
|
1432
|
-
**kwargs,
|
1433
|
-
)
|
1434
|
-
|
1435
|
-
|
1436
|
-
def pyarrow_parquet_dataset(
|
1437
|
-
self: AbstractFileSystem,
|
1438
|
-
path: str,
|
1439
|
-
schema: pa.Schema | None = None,
|
1440
|
-
partitioning: str | list[str] | pds.Partitioning = None,
|
1441
|
-
**kwargs: Any,
|
1442
|
-
) -> pds.Dataset:
|
1443
|
-
"""Create a PyArrow dataset optimized for Parquet files.
|
1444
|
-
|
1445
|
-
Creates a dataset specifically for Parquet data, automatically handling
|
1446
|
-
_metadata files for optimized reading.
|
1447
|
-
|
1448
|
-
This function is particularly useful for:
|
1449
|
-
- Datasets with existing _metadata files
|
1450
|
-
- Multi-file datasets that should be treated as one
|
1451
|
-
- Partitioned Parquet datasets
|
1452
|
-
|
1453
|
-
Args:
|
1454
|
-
path: Path to dataset directory or _metadata file
|
1455
|
-
schema: Optional schema to enforce. If None, inferred from data.
|
1456
|
-
partitioning: How the dataset is partitioned. Can be:
|
1457
|
-
- str: Single partition field
|
1458
|
-
- list[str]: Multiple partition fields
|
1459
|
-
- pds.Partitioning: Custom partitioning scheme
|
1460
|
-
**kwargs: Additional dataset arguments
|
1461
|
-
|
1462
|
-
Returns:
|
1463
|
-
pds.Dataset: PyArrow dataset instance
|
1464
|
-
|
1465
|
-
Example:
|
1466
|
-
>>> fs = LocalFileSystem()
|
1467
|
-
>>> # Dataset with _metadata
|
1468
|
-
>>> ds = fs.pyarrow_parquet_dataset("data/_metadata")
|
1469
|
-
>>> print(ds.files) # Shows all data files
|
1470
|
-
>>>
|
1471
|
-
>>> # Partitioned dataset directory
|
1472
|
-
>>> ds = fs.pyarrow_parquet_dataset(
|
1473
|
-
... "sales/",
|
1474
|
-
... partitioning=["year", "region"]
|
1475
|
-
... )
|
1476
|
-
>>> # Query with partition pruning
|
1477
|
-
>>> table = ds.to_table(
|
1478
|
-
... filter=(
|
1479
|
-
... (ds.field("year") == 2024) &
|
1480
|
-
... (ds.field("region") == "EMEA")
|
1481
|
-
... )
|
1482
|
-
... )
|
1483
|
-
"""
|
1484
|
-
if not self.is_file(path):
|
1485
|
-
path = posixpath.join(path, "_metadata")
|
1486
|
-
return pds.dataset(
|
1487
|
-
path,
|
1488
|
-
filesystem=self,
|
1489
|
-
partitioning=partitioning,
|
1490
|
-
schema=schema,
|
1491
|
-
**kwargs,
|
1492
|
-
)
|
1493
|
-
|
1494
|
-
|
-def pydala_dataset(
-    self: AbstractFileSystem,
-    path: str,
-    partitioning: str | list[str] | pds.Partitioning = None,
-    **kwargs: Any,
-) -> ParquetDataset: # type: ignore
-    """Create a Pydala dataset for advanced Parquet operations.
-
-    Creates a dataset with additional features beyond PyArrow including:
-    - Delta table support
-    - Schema evolution
-    - Advanced partitioning
-    - Metadata management
-    - Sort key optimization
-
-    Args:
-        path: Path to dataset directory
-        partitioning: How the dataset is partitioned. Can be:
-            - str: Single partition field
-            - list[str]: Multiple partition fields
-            - pds.Partitioning: Custom partitioning scheme
-        **kwargs: Additional dataset configuration
-
-    Returns:
-        ParquetDataset: Pydala dataset instance
-
-    Example:
-        >>> fs = LocalFileSystem()
-        >>> # Create dataset
-        >>> ds = fs.pydala_dataset(
-        ... "data/",
-        ... partitioning=["date"]
-        ... )
-        >>>
-        >>> # Write with delta support
-        >>> ds.write_to_dataset(
-        ... new_data,
-        ... mode="delta",
-        ... delta_subset=["id"]
-        ... )
-        >>>
-        >>> # Read with metadata
-        >>> df = ds.to_polars()
-        >>> print(df.columns)
-    """
-    return ParquetDataset(
-        path,
-        filesystem=self,
-        partitioning=partitioning,
-        **kwargs,
-    )
-
-
-def write_parquet(
-    self: AbstractFileSystem,
-    data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict],
-    path: str,
-    schema: pa.Schema | None = None,
-    **kwargs: Any,
-) -> pq.FileMetaData:
-    """Write data to a Parquet file with automatic format conversion.
-
-    Handles writing data from multiple input formats to Parquet with:
-    - Automatic conversion to PyArrow
-    - Schema validation/coercion
-    - Metadata collection
-    - Compression and encoding options
-
-    Args:
-        data: Input data in various formats:
-            - Polars DataFrame/LazyFrame
-            - PyArrow Table
-            - Pandas DataFrame
-            - Dict or list of dicts
-        path: Output Parquet file path
-        schema: Optional schema to enforce on write
-        **kwargs: Additional arguments for pq.write_table()
-
-    Returns:
-        pq.FileMetaData: Metadata of written Parquet file
-
-    Raises:
-        SchemaError: If data doesn't match schema
-        ValueError: If data cannot be converted
-
-    Example:
-        >>> fs = LocalFileSystem()
-        >>> # Write Polars DataFrame
-        >>> df = pl.DataFrame({
-        ... "id": range(1000),
-        ... "value": pl.Series(np.random.randn(1000))
-        ... })
-        >>> metadata = fs.write_parquet(
-        ... df,
-        ... "data.parquet",
-        ... compression="zstd",
-        ... compression_level=3
-        ... )
-        >>> print(f"Rows: {metadata.num_rows}")
-        >>>
-        >>> # Write with schema
-        >>> schema = pa.schema([
-        ... ("id", pa.int64()),
-        ... ("value", pa.float64())
-        ... ])
-        >>> metadata = fs.write_parquet(
-        ... {"id": [1, 2], "value": [0.1, 0.2]},
-        ... "data.parquet",
-        ... schema=schema
-        ... )
-    """
-    data = to_pyarrow_table(data, concat=False, unique=False)
-
-    if schema is not None:
-        data = cast_schema(data, schema)
-    metadata = []
-    pq.write_table(data, path, filesystem=self, metadata_collector=metadata, **kwargs)
-    metadata = metadata[0]
-    metadata.set_file_path(path)
-    return metadata
-
-
-def write_json(
-    self: AbstractFileSystem,
-    data: dict
-    | pl.DataFrame
-    | pl.LazyFrame
-    | pa.Table
-    | pd.DataFrame
-    | dict
-    | list[dict],
-    path: str,
-    append: bool = False,
-) -> None:
-    """Write data to a JSON file with flexible input support.
-
-    Handles writing data in various formats to JSON or JSON Lines,
-    with optional appending for streaming writes.
-
-    Args:
-        data: Input data in various formats:
-            - Dict or list of dicts
-            - Polars DataFrame/LazyFrame
-            - PyArrow Table
-            - Pandas DataFrame
-        path: Output JSON file path
-        append: Whether to append to existing file (JSON Lines mode)
-
-    Example:
-        >>> fs = LocalFileSystem()
-        >>> # Write dictionary
-        >>> data = {"name": "test", "values": [1, 2, 3]}
-        >>> fs.write_json(data, "config.json")
-        >>>
-        >>> # Stream records
-        >>> df1 = pl.DataFrame({"id": [1], "value": ["first"]})
-        >>> df2 = pl.DataFrame({"id": [2], "value": ["second"]})
-        >>> fs.write_json(df1, "stream.jsonl", append=False)
-        >>> fs.write_json(df2, "stream.jsonl", append=True)
-        >>>
-        >>> # Convert PyArrow
-        >>> table = pa.table({"a": [1, 2], "b": ["x", "y"]})
-        >>> fs.write_json(table, "data.json")
-    """
-    if isinstance(data, pl.LazyFrame):
-        data = data.collect()
-    if isinstance(data, pl.DataFrame):
-        data = data.to_arrow()
-        data = cast_schema(
-            data, convert_large_types_to_standard(data.schema)
-        ).to_pydict()
-    elif isinstance(data, pd.DataFrame):
-        data = pa.Table.from_pandas(data, preserve_index=False).to_pydict()
-    elif isinstance(data, pa.Table):
-        data = data.to_pydict()
-    if append:
-        with self.open(path, "ab") as f:
-            if isinstance(data, dict):
-                f.write(orjson.dumps(data) + b"\n")
-            else:
-                for record in data:
-                    f.write(orjson.dumps(record) + b"\n")
-    else:
-        with self.open(path, "wb") as f:
-            f.write(orjson.dumps(data))
-
-
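
As a rough illustration of the JSON Lines branch above, assuming the extension methods defined in this module are attached to `AbstractFileSystem` as done at the end of the file (the `/tmp/stream.jsonl` path and the records are illustrative):

```python
import orjson
from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem()

# append=True opens the file in "ab" mode and writes one JSON object per line.
fs.write_json({"id": 1, "value": "first"}, "/tmp/stream.jsonl", append=True)
fs.write_json({"id": 2, "value": "second"}, "/tmp/stream.jsonl", append=True)

# Reading the records back, one orjson document per line.
with fs.open("/tmp/stream.jsonl", "rb") as f:
    records = [orjson.loads(line) for line in f]
print(records)
```
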
-def write_csv(
-    self: AbstractFileSystem,
-    data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict],
-    path: str,
-    append: bool = False,
-    **kwargs: Any,
-) -> None:
-    """Write data to a CSV file with flexible input support.
-
-    Handles writing data from multiple formats to CSV with options for:
-    - Appending to existing files
-    - Custom delimiters and formatting
-    - Automatic type conversion
-    - Header handling
-
-    Args:
-        data: Input data in various formats:
-            - Polars DataFrame/LazyFrame
-            - PyArrow Table
-            - Pandas DataFrame
-            - Dict or list of dicts
-        path: Output CSV file path
-        append: Whether to append to existing file
-        **kwargs: Additional arguments for CSV writing:
-            - delimiter: Field separator (default ",")
-            - header: Whether to write header row
-            - quote_char: Character for quoting fields
-            - date_format: Format for date/time fields
-            - float_precision: Decimal places for floats
-
-    Example:
-        >>> fs = LocalFileSystem()
-        >>> # Write Polars DataFrame
-        >>> df = pl.DataFrame({
-        ... "id": range(100),
-        ... "name": ["item_" + str(i) for i in range(100)]
-        ... })
-        >>> fs.write_csv(df, "items.csv")
-        >>>
-        >>> # Append records
-        >>> new_items = pl.DataFrame({
-        ... "id": range(100, 200),
-        ... "name": ["item_" + str(i) for i in range(100, 200)]
-        ... })
-        >>> fs.write_csv(
-        ... new_items,
-        ... "items.csv",
-        ... append=True,
-        ... header=False
-        ... )
-        >>>
-        >>> # Custom formatting
-        >>> data = pa.table({
-        ... "date": [datetime.now()],
-        ... "value": [123.456]
-        ... })
-        >>> fs.write_csv(
-        ... data,
-        ... "formatted.csv",
-        ... date_format="%Y-%m-%d",
-        ... float_precision=2
-        ... )
-    """
-    if isinstance(data, pl.LazyFrame):
-        data = data.collect()
-    if isinstance(data, pl.DataFrame):
-        if append:
-            with self.open(path, "ab") as f:
-                data.write_csv(f, has_header=not append, **kwargs)
-        else:
-            with self.open(path, "wb") as f:
-                data.write_csv(f, **kwargs)
-    elif isinstance(data, (pa.Table, pd.DataFrame)):
-        pl.from_arrow(pa.table(data)).write_csv(path, **kwargs)
-    else:
-        pl.DataFrame(data).write_csv(path, **kwargs)
-
-
-def write_file(
-    self,
-    data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict,
-    path: str,
-    format: str,
-    **kwargs,
-) -> None:
-    """
-    Write a DataFrame to a file in the given format.
-
-    Args:
-        data: (pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame) Data to write.
-        path (str): Path to write the data.
-        format (str): Format of the file.
-        **kwargs: Additional keyword arguments.
-
-    Returns:
-        None
-    """
-    if format == "json":
-        write_json(self, data, path, **kwargs)
-    elif format == "csv":
-        write_csv(self, data, path, **kwargs)
-    elif format == "parquet":
-        write_parquet(self, data, path, **kwargs)
-
-
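
A minimal usage sketch of the `write_file` dispatcher, assuming the extension methods in this module have been attached to `AbstractFileSystem` as shown at the end of the file (the path and frame contents are illustrative):

```python
import polars as pl
from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem()
df = pl.DataFrame({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# The `format` argument selects the writer: "json", "csv" or "parquet".
fs.write_file(df, "/tmp/items.csv", format="csv")
```
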
-def write_files(
-    self,
-    data: (
-        pl.DataFrame
-        | pl.LazyFrame
-        | pa.Table
-        | pa.RecordBatch
-        | pa.RecordBatchReader
-        | pd.DataFrame
-        | dict
-        | list[
-            pl.DataFrame
-            | pl.LazyFrame
-            | pa.Table
-            | pa.RecordBatch
-            | pa.RecordBatchReader
-            | pd.DataFrame
-            | dict
-        ]
-    ),
-    path: str | list[str],
-    basename: str = None,
-    format: str = None,
-    concat: bool = True,
-    unique: bool | list[str] | str = False,
-    mode: str = "append", # append, overwrite, delete_matching, error_if_exists
-    use_threads: bool = True,
-    verbose: bool = False,
-    **kwargs,
-) -> None:
-    """Write a DataFrame or a list of DataFrames to a file or a list of files.
-
-    Args:
-        data: (pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[pl.DataFrame | pl.LazyFrame |
-            pa.Table | pd.DataFrame | dict]) Data to write.
-        path: (str | list[str]) Path to write the data.
-        basename: (str, optional) Basename of the files. Defaults to None.
-        format: (str, optional) Format of the data. Defaults to None.
-        concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
-        unique: (bool, optional) If True, remove duplicates. Defaults to False.
-        mode: (str, optional) Write mode. Defaults to 'append'. Options: 'append', 'overwrite', 'delete_matching',
-            'error_if_exists'.
-        use_threads: (bool, optional) If True, use parallel processing. Defaults to True.
-        verbose: (bool, optional) If True, print verbose output. Defaults to False.
-        **kwargs: Additional keyword arguments.
-
-    Returns:
-        None
-
-    Raises:
-        FileExistsError: If file already exists and mode is 'error_if_exists'.
-    """
-    if not isinstance(data, list):
-        data = [data]
-
-    if concat:
-        if isinstance(data[0], dict):
-            data = _dict_to_dataframe(data)
-        if isinstance(data[0], pl.LazyFrame):
-            data = pl.concat([d.collect() for d in data], how="diagonal_relaxed")
-
-        if isinstance(
-            data[0], pa.Table | pa.RecordBatch | pa.RecordBatchReader | Generator
-        ):
-            data = pl.concat([pl.from_arrow(d) for d in data], how="diagonal_relaxed")
-        elif isinstance(data[0], pd.DataFrame):
-            data = pl.concat([pl.from_pandas(d) for d in data], how="diagonal_relaxed")
-
-        if unique:
-            data = data.unique(
-                subset=None if not isinstance(unique, str | list) else unique,
-                maintain_order=True,
-            )
-
-        data = [data]
-
-    if format is None:
-        format = (
-            path[0].split(".")[-1]
-            if isinstance(path, list) and "." in path[0]
-            else path.split(".")[-1]
-            if "." in path
-            else "parquet"
-        )
-
-    def _write(d, p, basename, i):
-        if f".{format}" not in p:
-            if not basename:
-                basename = f"data-{dt.datetime.now().strftime('%Y%m%d_%H%M%S%f')[:-3]}-{uuid.uuid4().hex[:16]}"
-            p = f"{p}/{basename}-{i}.{format}"
-
-        if mode == "delete_matching":
-            write_file(self, d, p, format, **kwargs)
-        elif mode == "overwrite":
-            if self.exists(p):
-                self.fs.rm(p, recursive=True)
-            write_file(self, d, p, format, **kwargs)
-        elif mode == "append":
-            if not self.exists(p):
-                write_file(self, d, p, format, **kwargs)
-            else:
-                p = p.replace(f".{format}", f"-{i}.{format}")
-                write_file(self, d, p, format, **kwargs)
-        elif mode == "error_if_exists":
-            if self.exists(p):
-                raise FileExistsError(f"File already exists: {p}")
-            else:
-                write_file(self, d, p, format, **kwargs)
-
-    if mode == "overwrite":
-        if isinstance(path, list):
-            for p in path:
-                # Remove existing files
-                if self.exists(p):
-                    self.rm(p, recursive=True)
-        else:
-            # Remove existing files
-            if self.exists(path):
-                self.rm(path, recursive=True)
-
-    if use_threads:
-        run_parallel(
-            _write,
-            d=data,
-            p=path,
-            basename=basename,
-            i=list(range(len(data))),
-            verbose=verbose,
-        )
-    else:
-        paths = path if isinstance(path, list) else [path] * len(data)
-        for i, (d, p) in enumerate(zip(data, paths)):
-            _write(d, p, basename, i)
-
-
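
A sketch of the common single-target case, assuming the non-threaded branch pairs frames with target paths positionally as in the loop above; the path and frame contents are illustrative:

```python
import polars as pl
from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem()
frames = [
    pl.DataFrame({"id": [1], "value": [0.5]}),
    pl.DataFrame({"id": [2], "value": [1.5]}),
]

# concat=True merges both frames before writing; the format is inferred from
# the ".parquet" suffix, and mode="overwrite" removes any existing target first.
fs.write_files(
    frames,
    "/tmp/frames.parquet",
    concat=True,
    mode="overwrite",
    use_threads=False,
)
```
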
-def write_pyarrow_dataset(
-    self,
-    data: (
-        pl.DataFrame
-        | pl.LazyFrame
-        | pa.Table
-        | pa.RecordBatch
-        | pa.RecordBatchReader
-        | pd.DataFrame
-        | dict
-        | list[
-            pl.DataFrame
-            | pl.LazyFrame
-            | pa.Table
-            | pa.RecordBatch
-            | pa.RecordBatchReader
-            | pd.DataFrame
-            | dict
-        ]
-    ),
-    path: str,
-    basename: str | None = None,
-    schema: pa.Schema | None = None,
-    partition_by: str | list[str] | pds.Partitioning | None = None,
-    partitioning_flavor: str = "hive",
-    mode: str = "append",
-    format: str | None = "parquet",
-    compression: str = "zstd",
-    max_rows_per_file: int | None = 2_500_000,
-    row_group_size: int | None = 250_000,
-    concat: bool = True,
-    unique: bool | list[str] | str = False,
-    **kwargs,
-) -> list[pq.FileMetaData] | None:
-    """
-    Write tabular data to a PyArrow dataset.
-
-    Args:
-        data: (pl.DataFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader |
-            pd.DataFrame | list[pl.DataFrame] | list[pa.Table] | list[pa.RecordBatch] |
-            list[pa.RecordBatchReader] | list[pd.DataFrame]) Data to write.
-        path: (str) Path to write the data.
-        basename: (str, optional) Basename of the files. Defaults to None.
-        schema: (pa.Schema, optional) Schema of the data. Defaults to None.
-        partition_by: (str | list[str] | pds.Partitioning, optional) Partitioning of the data.
-            Defaults to None.
-        partitioning_flavor: (str, optional) Partitioning flavor. Defaults to 'hive'.
-        mode: (str, optional) Write mode. Defaults to 'append'.
-        format: (str, optional) Format of the data. Defaults to 'parquet'.
-        compression: (str, optional) Compression algorithm. Defaults to 'zstd'.
-        max_rows_per_file: (int, optional) Maximum number of rows per file. Defaults to 2_500_000.
-        row_group_size: (int, optional) Row group size. Defaults to 250_000.
-        concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
-        unique: (bool | str | list[str], optional) If True, remove duplicates. Defaults to False.
-        **kwargs: Additional keyword arguments for `pds.write_dataset`.
-
-    Returns:
-        (list[pq.FileMetaData] | None): List of Parquet file metadata or None.
-    """
-    data = to_pyarrow_table(data, concat=concat, unique=unique)
-
-    if mode == "delete_matching":
-        existing_data_behavior = "delete_matching"
-    elif mode == "append":
-        existing_data_behavior = "overwrite_or_ignore"
-    elif mode == "overwrite":
-        self.rm(path, recursive=True)
-        existing_data_behavior = "overwrite_or_ignore"
-    else:
-        existing_data_behavior = mode
-
-    if basename is None:
-        basename_template = (
-            "data-"
-            f"{dt.datetime.now().strftime('%Y%m%d_%H%M%S%f')[:-3]}-{uuid.uuid4().hex[:16]}-{{i}}.parquet"
-        )
-    else:
-        basename_template = f"{basename}-{{i}}.parquet"
-
-    file_options = pds.ParquetFileFormat().make_write_options(compression=compression)
-
-    create_dir: bool = False
-
-    if hasattr(self, "fs"):
-        if "local" in self.fs.protocol:
-            create_dir = True
-    else:
-        if "local" in self.protocol:
-            create_dir = True
-
-    if format == "parquet":
-        metadata = []
-
-        def file_visitor(written_file):
-            file_metadata = written_file.metadata
-            file_metadata.set_file_path(written_file.path)
-            metadata.append(file_metadata)
-
-    pds.write_dataset(
-        data=data,
-        base_dir=path,
-        basename_template=basename_template,
-        partitioning=partition_by,
-        partitioning_flavor=partitioning_flavor,
-        filesystem=self,
-        existing_data_behavior=existing_data_behavior,
-        min_rows_per_group=row_group_size,
-        max_rows_per_group=row_group_size,
-        max_rows_per_file=max_rows_per_file,
-        schema=schema,
-        format=format,
-        create_dir=create_dir,
-        file_options=file_options,
-        file_visitor=file_visitor if format == "parquet" else None,
-        **kwargs,
-    )
-    if format == "parquet":
-        return metadata
-
-
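
A hedged sketch of a hive-partitioned write through the extension above; the dataset path and values are illustrative, and the returned list holds one `pq.FileMetaData` per written file when `format="parquet"`:

```python
import polars as pl
from fsspec.implementations.local import LocalFileSystem

fs = LocalFileSystem()
df = pl.DataFrame({"year": [2023, 2024], "value": [1.0, 2.0]})

metadata = fs.write_pyarrow_dataset(
    df,
    "/tmp/events",
    partition_by=["year"],  # rendered as year=2023/, year=2024/ with the hive flavor
    mode="append",
)
print([m.num_rows for m in metadata])
```
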
-def write_pydala_dataset(
-    self,
-    data: (
-        pl.DataFrame
-        | pl.LazyFrame
-        | pa.Table
-        | pa.RecordBatch
-        | pa.RecordBatchReader
-        | pd.DataFrame
-        | dict
-        | list[
-            pl.DataFrame
-            | pl.LazyFrame
-            | pa.Table
-            | pa.RecordBatch
-            | pa.RecordBatchReader
-            | pd.DataFrame
-            | dict
-        ]
-    ),
-    path: str,
-    mode: str = "append", # "delta", "overwrite"
-    basename: str | None = None,
-    partition_by: str | list[str] | None = None,
-    partitioning_flavor: str = "hive",
-    max_rows_per_file: int | None = 2_500_000,
-    row_group_size: int | None = 250_000,
-    compression: str = "zstd",
-    concat: bool = True,
-    sort_by: str | list[str] | list[tuple[str, str]] | None = None,
-    unique: bool | str | list[str] = False,
-    delta_subset: str | list[str] | None = None,
-    update_metadata: bool = True,
-    alter_schema: bool = False,
-    timestamp_column: str | None = None,
-    verbose: bool = False,
-    **kwargs,
-) -> None:
-    """Write tabular data to a Pydala dataset.
-
-    Args:
-        data: (pl.DataFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader |
-            pd.DataFrame | list[pl.DataFrame] | list[pa.Table] | list[pa.RecordBatch] |
-            list[pa.RecordBatchReader] | list[pd.DataFrame]) Data to write.
-        path: (str) Path to write the data.
-        mode: (str, optional) Write mode. Defaults to 'append'. Options: 'delta', 'overwrite'.
-        basename: (str, optional) Basename of the files. Defaults to None.
-        partition_by: (str | list[str], optional) Partitioning of the data. Defaults to None.
-        partitioning_flavor: (str, optional) Partitioning flavor. Defaults to 'hive'.
-        max_rows_per_file: (int, optional) Maximum number of rows per file. Defaults to 2_500_000.
-        row_group_size: (int, optional) Row group size. Defaults to 250_000.
-        compression: (str, optional) Compression algorithm. Defaults to 'zstd'.
-        sort_by: (str | list[str] | list[tuple[str, str]], optional) Columns to sort by. Defaults to None.
-        unique: (bool | str | list[str], optional) If True, ensure unique values. Defaults to False.
-        delta_subset: (str | list[str], optional) Subset of columns to include in delta table. Defaults to None.
-        update_metadata: (bool, optional) If True, update metadata. Defaults to True.
-        alter_schema: (bool, optional) If True, alter schema. Defaults to False.
-        timestamp_column: (str, optional) Timestamp column. Defaults to None.
-        verbose: (bool, optional) If True, print verbose output. Defaults to False.
-        **kwargs: Additional keyword arguments for `ParquetDataset.write_to_dataset`.
-
-    Returns:
-        None
-    """
-    data = to_pyarrow_table(data, concat=concat, unique=unique)
-
-    ds = pydala_dataset(self=self, path=path, partitioning=partitioning_flavor)
-    ds.write_to_dataset(
-        data=data,
-        mode=mode,
-        basename=basename,
-        partition_by=partition_by,
-        max_rows_per_file=max_rows_per_file,
-        row_group_size=row_group_size,
-        compression=compression,
-        sort_by=sort_by,
-        unique=unique,
-        delta_subset=delta_subset,
-        update_metadata=update_metadata,
-        alter_schema=alter_schema,
-        timestamp_column=timestamp_column,
-        verbose=verbose,
-        **kwargs,
-    )
-
-
-AbstractFileSystem.read_json_file = read_json_file
-AbstractFileSystem.read_json = read_json
-AbstractFileSystem.read_csv_file = read_csv_file
-AbstractFileSystem.read_csv = read_csv
-AbstractFileSystem.read_parquet_file = read_parquet_file
-AbstractFileSystem.read_parquet = read_parquet
-AbstractFileSystem.read_files = read_files
-AbstractFileSystem.pyarrow_dataset = pyarrow_dataset
-AbstractFileSystem.pydala_dataset = pydala_dataset
-AbstractFileSystem.pyarrow_parquet_dataset = pyarrow_parquet_dataset
-AbstractFileSystem.write_parquet = write_parquet
-AbstractFileSystem.write_json = write_json
-AbstractFileSystem.write_csv = write_csv
-AbstractFileSystem.write_file = write_file
-AbstractFileSystem.write_files = write_files
-AbstractFileSystem.write_pyarrow_dataset = write_pyarrow_dataset
-AbstractFileSystem.write_pydala_dataset = write_pydala_dataset
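
Once these assignments have run (i.e. the module has been imported), every fsspec filesystem instance exposes the helpers. A small sketch against the in-memory filesystem, with an illustrative path:

```python
from fsspec import filesystem

memfs = filesystem("memory")

# write_json comes from the module-level function attached above.
memfs.write_json({"name": "test", "values": [1, 2, 3]}, "/config.json")
print(memfs.cat_file("/config.json"))
```
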