FlowerPower 0.11.6.20__py3-none-any.whl → 0.20.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. flowerpower/cfg/__init__.py +3 -3
  2. flowerpower/cfg/pipeline/__init__.py +5 -3
  3. flowerpower/cfg/project/__init__.py +3 -3
  4. flowerpower/cfg/project/job_queue.py +1 -128
  5. flowerpower/cli/__init__.py +5 -5
  6. flowerpower/cli/cfg.py +0 -3
  7. flowerpower/cli/job_queue.py +400 -132
  8. flowerpower/cli/pipeline.py +14 -413
  9. flowerpower/cli/utils.py +0 -1
  10. flowerpower/flowerpower.py +537 -28
  11. flowerpower/job_queue/__init__.py +5 -94
  12. flowerpower/job_queue/base.py +201 -3
  13. flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
  14. flowerpower/job_queue/rq/manager.py +388 -77
  15. flowerpower/pipeline/__init__.py +2 -0
  16. flowerpower/pipeline/base.py +2 -2
  17. flowerpower/pipeline/io.py +14 -16
  18. flowerpower/pipeline/manager.py +21 -642
  19. flowerpower/pipeline/pipeline.py +571 -0
  20. flowerpower/pipeline/registry.py +242 -10
  21. flowerpower/pipeline/visualizer.py +1 -2
  22. flowerpower/plugins/_io/__init__.py +8 -0
  23. flowerpower/plugins/mqtt/manager.py +6 -6
  24. flowerpower/settings/backend.py +0 -2
  25. flowerpower/settings/job_queue.py +1 -57
  26. flowerpower/utils/misc.py +0 -256
  27. flowerpower/utils/monkey.py +1 -83
  28. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
  29. flowerpower-0.20.0.dist-info/RECORD +58 -0
  30. flowerpower/fs/__init__.py +0 -29
  31. flowerpower/fs/base.py +0 -662
  32. flowerpower/fs/ext.py +0 -2143
  33. flowerpower/fs/storage_options.py +0 -1420
  34. flowerpower/job_queue/apscheduler/__init__.py +0 -11
  35. flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
  36. flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
  37. flowerpower/job_queue/apscheduler/manager.py +0 -1051
  38. flowerpower/job_queue/apscheduler/setup.py +0 -554
  39. flowerpower/job_queue/apscheduler/trigger.py +0 -169
  40. flowerpower/job_queue/apscheduler/utils.py +0 -311
  41. flowerpower/pipeline/job_queue.py +0 -583
  42. flowerpower/pipeline/runner.py +0 -603
  43. flowerpower/plugins/io/base.py +0 -2520
  44. flowerpower/plugins/io/helpers/datetime.py +0 -298
  45. flowerpower/plugins/io/helpers/polars.py +0 -875
  46. flowerpower/plugins/io/helpers/pyarrow.py +0 -570
  47. flowerpower/plugins/io/helpers/sql.py +0 -202
  48. flowerpower/plugins/io/loader/__init__.py +0 -28
  49. flowerpower/plugins/io/loader/csv.py +0 -37
  50. flowerpower/plugins/io/loader/deltatable.py +0 -190
  51. flowerpower/plugins/io/loader/duckdb.py +0 -19
  52. flowerpower/plugins/io/loader/json.py +0 -37
  53. flowerpower/plugins/io/loader/mqtt.py +0 -159
  54. flowerpower/plugins/io/loader/mssql.py +0 -26
  55. flowerpower/plugins/io/loader/mysql.py +0 -26
  56. flowerpower/plugins/io/loader/oracle.py +0 -26
  57. flowerpower/plugins/io/loader/parquet.py +0 -35
  58. flowerpower/plugins/io/loader/postgres.py +0 -26
  59. flowerpower/plugins/io/loader/pydala.py +0 -19
  60. flowerpower/plugins/io/loader/sqlite.py +0 -23
  61. flowerpower/plugins/io/metadata.py +0 -244
  62. flowerpower/plugins/io/saver/__init__.py +0 -28
  63. flowerpower/plugins/io/saver/csv.py +0 -36
  64. flowerpower/plugins/io/saver/deltatable.py +0 -186
  65. flowerpower/plugins/io/saver/duckdb.py +0 -19
  66. flowerpower/plugins/io/saver/json.py +0 -36
  67. flowerpower/plugins/io/saver/mqtt.py +0 -28
  68. flowerpower/plugins/io/saver/mssql.py +0 -26
  69. flowerpower/plugins/io/saver/mysql.py +0 -26
  70. flowerpower/plugins/io/saver/oracle.py +0 -26
  71. flowerpower/plugins/io/saver/parquet.py +0 -36
  72. flowerpower/plugins/io/saver/postgres.py +0 -26
  73. flowerpower/plugins/io/saver/pydala.py +0 -20
  74. flowerpower/plugins/io/saver/sqlite.py +0 -24
  75. flowerpower/utils/scheduler.py +0 -311
  76. flowerpower-0.11.6.20.dist-info/RECORD +0 -102
  77. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
  78. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
  79. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
  80. {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
flowerpower/fs/ext.py DELETED
@@ -1,2143 +0,0 @@
1
- import datetime as dt
2
- import importlib
3
- import posixpath
4
- import uuid
5
- from typing import Any, Generator
6
-
7
- if importlib.util.find_spec("pandas") is not None:
8
- import pandas as pd
9
- else:
10
- raise ImportError("To use this module, please install `flowerpower[io]`.")
11
-
12
- import orjson
13
- # import polars as pl
14
- import pyarrow as pa
15
- import pyarrow.dataset as pds
16
- import pyarrow.parquet as pq
17
- from fsspec import AbstractFileSystem
18
- from pydala.dataset import ParquetDataset
19
-
20
- from ..plugins.io.helpers.polars import opt_dtype as opt_dtype_pl
21
- from ..plugins.io.helpers.polars import pl
22
- # from ..plugins.io.helpers.polars import unify_schemas as unfify_schemas_pl
23
- from ..plugins.io.helpers.pyarrow import cast_schema
24
- from ..plugins.io.helpers.pyarrow import opt_dtype as opt_dtype_pa
25
- from ..plugins.io.helpers.pyarrow import unify_schemas as unify_schemas_pa
26
- from ..utils.misc import (_dict_to_dataframe, convert_large_types_to_standard,
27
- run_parallel, to_pyarrow_table)
28
-
29
-
30
- def path_to_glob(path: str, format: str | None = None) -> str:
31
- """Convert a path to a glob pattern for file matching.
32
-
33
- Intelligently converts paths to glob patterns that match files of the specified
34
- format, handling various directory and wildcard patterns.
35
-
36
- Args:
37
- path: Base path to convert. Can include wildcards (* or **).
38
- Examples: "data/", "data/*.json", "data/**"
39
- format: File format to match (without dot). If None, inferred from path.
40
- Examples: "json", "csv", "parquet"
41
-
42
- Returns:
43
- str: Glob pattern that matches files of specified format.
44
- Examples: "data/**/*.json", "data/*.csv"
45
-
46
- Example:
47
- >>> # Basic directory
48
- >>> path_to_glob("data", "json")
49
- 'data/**/*.json'
50
- >>>
51
- >>> # With wildcards
52
- >>> path_to_glob("data/**", "csv")
53
- 'data/**/*.csv'
54
- >>>
55
- >>> # Format inference
56
- >>> path_to_glob("data/file.parquet")
57
- 'data/file.parquet'
58
- """
59
- path = path.rstrip("/")
60
- if format is None:
61
- if ".json" in path:
62
- format = "json"
63
- elif ".csv" in path:
64
- format = "csv"
65
- elif ".parquet" in path:
66
- format = "parquet"
67
-
68
- if format in path:
69
- return path
70
- else:
71
- if path.endswith("**"):
72
- return posixpath.join(path, f"*.{format}")
73
- elif path.endswith("*"):
74
- if path.endswith("*/*"):
75
- return path + f".{format}"
76
- return posixpath.join(path.rstrip("/*"), f"*.{format}")
77
- return posixpath.join(path, f"**/*.{format}")
78
-
79
-
80
- def _read_json_file(
81
- path: str,
82
- self: AbstractFileSystem,
83
- include_file_path: bool = False,
84
- jsonlines: bool = False,
85
- ) -> dict | list[dict]:
86
- """Read a JSON file from any filesystem.
87
-
88
- Internal function that handles both regular JSON and JSON Lines formats.
89
-
90
- Args:
91
- path: Path to JSON file
92
- self: Filesystem instance to use for reading
93
- include_file_path: Whether to return dict with filepath as key
94
- jsonlines: Whether to read as JSON Lines format
95
-
96
- Returns:
97
- dict | list[dict]: Parsed JSON data. If include_file_path=True,
98
- returns {filepath: data}
99
-
100
- Example:
101
- >>> fs = LocalFileSystem()
102
- >>> # Regular JSON
103
- >>> data = _read_json_file("data.json", fs)
104
- >>> print(type(data))
105
- <class 'dict'>
106
- >>>
107
- >>> # JSON Lines with filepath
108
- >>> data = _read_json_file(
109
- ... "data.jsonl",
110
- ... fs,
111
- ... include_file_path=True,
112
- ... jsonlines=True
113
- ... )
114
- >>> print(list(data.keys())[0])
115
- 'data.jsonl'
116
- """
117
- with self.open(path) as f:
118
- if jsonlines:
119
- data = [orjson.loads(line) for line in f.readlines()]
120
- else:
121
- data = orjson.loads(f.read())
122
- if include_file_path:
123
- return {path: data}
124
- return data
125
-
126
-
127
- def read_json_file(
128
- self: AbstractFileSystem,
129
- path: str,
130
- include_file_path: bool = False,
131
- jsonlines: bool = False,
132
- ) -> dict | list[dict]:
133
- """Read a single JSON file from any filesystem.
134
-
135
- A public wrapper around _read_json_file providing a clean interface for
136
- reading individual JSON files.
137
-
138
- Args:
139
- path: Path to JSON file to read
140
- include_file_path: Whether to return dict with filepath as key
141
- jsonlines: Whether to read as JSON Lines format
142
-
143
- Returns:
144
- dict | list[dict]: Parsed JSON data. For regular JSON, returns a dict.
145
- For JSON Lines, returns a list of dicts. If include_file_path=True,
146
- returns {filepath: data}.
147
-
148
- Example:
149
- >>> fs = LocalFileSystem()
150
- >>> # Read regular JSON
151
- >>> data = fs.read_json_file("config.json")
152
- >>> print(data["setting"])
153
- 'value'
154
- >>>
155
- >>> # Read JSON Lines with filepath
156
- >>> data = fs.read_json_file(
157
- ... "logs.jsonl",
158
- ... include_file_path=True,
159
- ... jsonlines=True
160
- ... )
161
- >>> print(list(data.keys())[0])
162
- 'logs.jsonl'
163
- """
164
- return _read_json_file(
165
- path=path,
166
- self=self,
167
- include_file_path=include_file_path,
168
- jsonlines=jsonlines,
169
- )
170
-
171
-
172
- def _read_json(
173
- self,
174
- path: str | list[str],
175
- include_file_path: bool = False,
176
- use_threads: bool = True,
177
- jsonlines: bool = False,
178
- as_dataframe: bool = True,
179
- concat: bool = True,
180
- verbose: bool = False,
181
- opt_dtypes: bool = False,
182
- **kwargs,
183
- ) -> dict | list[dict] | pl.DataFrame | list[pl.DataFrame]:
184
- """
185
- Read a JSON file or a list of JSON files.
186
-
187
- Args:
188
- path: (str | list[str]) Path to the JSON file(s).
189
- include_file_path: (bool, optional) If True, return a dictionary with the file path as key.
190
- Defaults to False.
191
- use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
192
- jsonlines: (bool, optional) If True, read JSON lines. Defaults to False.
193
- as_dataframe: (bool, optional) If True, return a DataFrame. Defaults to True.
194
- concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
195
- verbose: (bool, optional) If True, print verbose output. Defaults to False.
196
- opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
197
- **kwargs: Additional keyword arguments.
198
-
199
- Returns:
200
- (dict | list[dict] | pl.DataFrame | list[pl.DataFrame]):
201
- Dictionary, list of dictionaries, DataFrame or list of DataFrames.
202
- """
203
- if isinstance(path, str):
204
- path = path_to_glob(path, format="json")
205
- path = self.glob(path)
206
-
207
- if isinstance(path, list):
208
- if use_threads:
209
- data = run_parallel(
210
- _read_json_file,
211
- path,
212
- self=self,
213
- include_file_path=include_file_path,
214
- jsonlines=jsonlines,
215
- n_jobs=-1,
216
- backend="threading",
217
- verbose=verbose,
218
- **kwargs,
219
- )
220
- data = [
221
- _read_json_file(
222
- path=p,
223
- self=self,
224
- include_file_path=include_file_path,
225
- jsonlines=jsonlines,
226
- )
227
- for p in path
228
- ]
229
- else:
230
- data = _read_json_file(
231
- path=path,
232
- self=self,
233
- include_file_path=include_file_path,
234
- jsonlines=jsonlines,
235
- )
236
- if as_dataframe:
237
- if not include_file_path:
238
- data = [pl.DataFrame(d) for d in data]
239
- else:
240
- data = [
241
- [
242
- pl.DataFrame(_data[k]).with_columns(pl.lit(k).alias("file_path"))
243
- for k in _data
244
- ][0]
245
- for _data in data
246
- ]
247
- if opt_dtypes:
248
- data = [opt_dtype_pl(df, strict=False) for df in data]
249
- if concat:
250
- result = pl.concat(data, how="diagonal_relaxed")
251
- # if opt_dtypes:
252
- # result = opt_dtype_pl(result, strict=False)
253
- return result
254
- return data
255
-
256
-
257
- def _read_json_batches(
258
- self: AbstractFileSystem,
259
- path: str | list[str],
260
- batch_size: int | None = None,
261
- include_file_path: bool = False,
262
- jsonlines: bool = False,
263
- as_dataframe: bool = True,
264
- concat: bool = True,
265
- use_threads: bool = True,
266
- verbose: bool = False,
267
- opt_dtypes: bool = False,
268
- **kwargs: Any,
269
- ) -> Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]:
270
- """Process JSON files in batches with optional parallel reading.
271
-
272
- Internal generator function that handles batched reading of JSON files
273
- with support for parallel processing within each batch.
274
-
275
- Args:
276
- path: Path(s) to JSON file(s). Glob patterns supported.
277
- batch_size: Number of files to process in each batch
278
- include_file_path: Include source filepath in output
279
- jsonlines: Whether to read as JSON Lines format
280
- as_dataframe: Convert output to Polars DataFrame(s)
281
- concat: Combine files within each batch
282
- use_threads: Enable parallel file reading within batches
283
- verbose: Print progress information
284
- opt_dtypes: Optimize DataFrame dtypes
285
- **kwargs: Additional arguments for DataFrame conversion
286
-
287
- Yields:
288
- Each batch of data in requested format:
289
- - dict | list[dict]: Raw JSON data
290
- - pl.DataFrame: Single DataFrame if concat=True
291
- - list[pl.DataFrame]: List of DataFrames if concat=False
292
-
293
- Example:
294
- >>> fs = LocalFileSystem()
295
- >>> # Process large dataset in batches
296
- >>> for batch in fs._read_json_batches(
297
- ... "data/*.json",
298
- ... batch_size=100,
299
- ... as_dataframe=True,
300
- ... verbose=True
301
- ... ):
302
- ... print(f"Batch shape: {batch.shape}")
303
- >>>
304
- >>> # Parallel batch processing with filepath tracking
305
- >>> for batch in fs._read_json_batches(
306
- ... ["logs1.jsonl", "logs2.jsonl"],
307
- ... batch_size=1,
308
- ... include_file_path=True,
309
- ... use_threads=True
310
- ... ):
311
- ... print(f"Processing {batch['file_path'][0]}")
312
- """
313
- # Handle path resolution
314
- if isinstance(path, str):
315
- path = path_to_glob(path, format="json")
316
- path = self.glob(path)
317
-
318
- # Process files in batches
319
- for i in range(0, len(path), batch_size):
320
- batch_paths = path[i : i + batch_size]
321
-
322
- # Read batch with optional parallelization
323
- if use_threads and len(batch_paths) > 1:
324
- batch_data = run_parallel(
325
- _read_json_file,
326
- batch_paths,
327
- self=self,
328
- include_file_path=include_file_path,
329
- jsonlines=jsonlines,
330
- n_jobs=-1,
331
- backend="threading",
332
- verbose=verbose,
333
- **kwargs,
334
- )
335
- else:
336
- batch_data = [
337
- _read_json_file(
338
- path=p,
339
- self=self,
340
- include_file_path=include_file_path,
341
- jsonlines=jsonlines,
342
- )
343
- for p in batch_paths
344
- ]
345
-
346
- if as_dataframe:
347
- if not include_file_path:
348
- batch_dfs = [pl.DataFrame(d) for d in batch_data]
349
- else:
350
- batch_dfs = [
351
- [
352
- pl.DataFrame(_data[k]).with_columns(
353
- pl.lit(k).alias("file_path")
354
- )
355
- for k in _data
356
- ][0]
357
- for _data in batch_data
358
- ]
359
- if opt_dtypes:
360
- batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
361
- if concat and len(batch_dfs) > 1:
362
- batch_df = pl.concat(batch_dfs, how="diagonal_relaxed")
363
- # if opt_dtypes:
364
- # batch_df = opt_dtype_pl(batch_df, strict=False)
365
- yield batch_df
366
- else:
367
- # if opt_dtypes:
368
- # batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
369
- yield batch_dfs
370
- else:
371
- yield batch_data
372
-
373
-
374
- def read_json(
375
- self: AbstractFileSystem,
376
- path: str | list[str],
377
- batch_size: int | None = None,
378
- include_file_path: bool = False,
379
- jsonlines: bool = False,
380
- as_dataframe: bool = True,
381
- concat: bool = True,
382
- use_threads: bool = True,
383
- verbose: bool = False,
384
- opt_dtypes: bool = False,
385
- **kwargs: Any,
386
- ) -> (
387
- dict
388
- | list[dict]
389
- | pl.DataFrame
390
- | list[pl.DataFrame]
391
- | Generator[dict | list[dict] | pl.DataFrame | list[pl.DataFrame], None, None]
392
- ):
393
- """Read JSON data from one or more files with powerful options.
394
-
395
- Provides a flexible interface for reading JSON data with support for:
396
- - Single file or multiple files
397
- - Regular JSON or JSON Lines format
398
- - Batch processing for large datasets
399
- - Parallel processing
400
- - DataFrame conversion
401
- - File path tracking
402
-
403
- Args:
404
- path: Path(s) to JSON file(s). Can be:
405
- - Single path string (globs supported)
406
- - List of path strings
407
- batch_size: If set, enables batch reading with this many files per batch
408
- include_file_path: Include source filepath in output
409
- jsonlines: Whether to read as JSON Lines format
410
- as_dataframe: Convert output to Polars DataFrame(s)
411
- concat: Combine multiple files/batches into single result
412
- use_threads: Enable parallel file reading
413
- verbose: Print progress information
414
- opt_dtypes: Optimize DataFrame dtypes for performance
415
- **kwargs: Additional arguments passed to DataFrame conversion
416
-
417
- Returns:
418
- Various types depending on arguments:
419
- - dict: Single JSON file as dictionary
420
- - list[dict]: Multiple JSON files as list of dictionaries
421
- - pl.DataFrame: Single or concatenated DataFrame
422
- - list[pl.DataFrame]: List of DataFrames (if concat=False)
423
- - Generator: If batch_size set, yields batches of above types
424
-
425
- Example:
426
- >>> fs = LocalFileSystem()
427
- >>> # Read all JSON files in directory
428
- >>> df = fs.read_json(
429
- ... "data/*.json",
430
- ... as_dataframe=True,
431
- ... concat=True
432
- ... )
433
- >>> print(df.shape)
434
- (1000, 5) # Combined data from all files
435
- >>>
436
- >>> # Batch process large dataset
437
- >>> for batch_df in fs.read_json(
438
- ... "logs/*.jsonl",
439
- ... batch_size=100,
440
- ... jsonlines=True,
441
- ... include_file_path=True
442
- ... ):
443
- ... print(f"Processing {len(batch_df)} records")
444
- >>>
445
- >>> # Parallel read with custom options
446
- >>> dfs = fs.read_json(
447
- ... ["file1.json", "file2.json"],
448
- ... use_threads=True,
449
- ... concat=False,
450
- ... verbose=True
451
- ... )
452
- >>> print(f"Read {len(dfs)} files")
453
- """
454
- if batch_size is not None:
455
- return _read_json_batches(
456
- self=self,
457
- path=path,
458
- batch_size=batch_size,
459
- include_file_path=include_file_path,
460
- jsonlines=jsonlines,
461
- as_dataframe=as_dataframe,
462
- concat=concat,
463
- use_threads=use_threads,
464
- verbose=verbose,
465
- opt_dtypes=opt_dtypes,
466
- **kwargs,
467
- )
468
- return _read_json(
469
- self=self,
470
- path=path,
471
- include_file_path=include_file_path,
472
- jsonlines=jsonlines,
473
- as_dataframe=as_dataframe,
474
- concat=concat,
475
- use_threads=use_threads,
476
- verbose=verbose,
477
- opt_dtypes=opt_dtypes,
478
- **kwargs,
479
- )
480
-
481
-
482
- def _read_csv_file(
483
- path: str,
484
- self: AbstractFileSystem,
485
- include_file_path: bool = False,
486
- opt_dtypes: bool = False,
487
- **kwargs: Any,
488
- ) -> pl.DataFrame:
489
- """Read a single CSV file from any filesystem.
490
-
491
- Internal function that handles reading individual CSV files and optionally
492
- adds the source filepath as a column.
493
-
494
- Args:
495
- path: Path to CSV file
496
- self: Filesystem instance to use for reading
497
- include_file_path: Add source filepath as a column
498
- opt_dtypes: Optimize DataFrame dtypes
499
- **kwargs: Additional arguments passed to pl.read_csv()
500
-
501
- Returns:
502
- pl.DataFrame: DataFrame containing CSV data
503
-
504
- Example:
505
- >>> fs = LocalFileSystem()
506
- >>> df = _read_csv_file(
507
- ... "data.csv",
508
- ... fs,
509
- ... include_file_path=True,
510
- ... delimiter="|"
511
- ... )
512
- >>> print("file_path" in df.columns)
513
- True
514
- """
515
- print(path) # Debug info
516
- with self.open(path) as f:
517
- df = pl.read_csv(f, **kwargs)
518
- if include_file_path:
519
- df = df.with_columns(pl.lit(path).alias("file_path"))
520
- if opt_dtypes:
521
- df = opt_dtype_pl(df, strict=False)
522
- return df
523
-
524
-
525
- def read_csv_file(
526
- self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
527
- ) -> pl.DataFrame:
528
- return _read_csv_file(
529
- path=path,
530
- self=self,
531
- include_file_path=include_file_path,
532
- opt_dtypes=opt_dtypes,
533
- **kwargs,
534
- )
535
-
536
-
537
- def _read_csv(
538
- self,
539
- path: str | list[str],
540
- include_file_path: bool = False,
541
- use_threads: bool = True,
542
- concat: bool = True,
543
- verbose: bool = False,
544
- opt_dtypes: bool = False,
545
- **kwargs,
546
- ) -> pl.DataFrame | list[pl.DataFrame]:
547
- """
548
- Read a CSV file or a list of CSV files into a polars DataFrame.
549
-
550
- Args:
551
- path: (str | list[str]) Path to the CSV file(s).
552
- include_file_path: (bool, optional) If True, return a DataFrame with a 'file_path' column.
553
- Defaults to False.
554
- use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
555
- concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
556
- verbose: (bool, optional) If True, print verbose output. Defaults to False.
557
- opt_dtypes: (bool, optional) If True, optimize DataFrame dtypes. Defaults to False.
558
- **kwargs: Additional keyword arguments.
559
-
560
- Returns:
561
- (pl.DataFrame | list[pl.DataFrame]): Polars DataFrame or list of DataFrames.
562
- """
563
- if isinstance(path, str):
564
- path = path_to_glob(path, format="csv")
565
- path = self.glob(path)
566
-
567
- if isinstance(path, list):
568
- if use_threads:
569
- dfs = run_parallel(
570
- _read_csv_file,
571
- path,
572
- self=self,
573
- include_file_path=include_file_path,
574
- opt_dtypes=opt_dtypes,
575
- n_jobs=-1,
576
- backend="threading",
577
- verbose=verbose,
578
- **kwargs,
579
- )
580
- else:
581
- dfs = [
582
- _read_csv_file(
583
- p,
584
- self=self,
585
- include_file_path=include_file_path,
586
- opt_dtypes=opt_dtypes,
587
- **kwargs,
588
- )
589
- for p in path
590
- ]
591
- else:
592
- dfs = _read_csv_file(
593
- path,
594
- self=self,
595
- include_file_path=include_file_path,
596
- opt_dtypes=opt_dtypes,
597
- **kwargs,
598
- )
599
- if concat:
600
- result = pl.concat(dfs, how="diagonal_relaxed")
601
- # if opt_dtypes:
602
- # result = opt_dtype_pl(result, strict=False)
603
- return result
604
- return dfs
605
-
606
-
607
- def _read_csv_batches(
608
- self: AbstractFileSystem,
609
- path: str | list[str],
610
- batch_size: int | None = None,
611
- include_file_path: bool = False,
612
- concat: bool = True,
613
- use_threads: bool = True,
614
- verbose: bool = False,
615
- opt_dtypes: bool = False,
616
- **kwargs: Any,
617
- ) -> Generator[pl.DataFrame | list[pl.DataFrame], None, None]:
618
- """Process CSV files in batches with optional parallel reading.
619
-
620
- Internal generator function that handles batched reading of CSV files
621
- with support for parallel processing within each batch.
622
-
623
- Args:
624
- path: Path(s) to CSV file(s). Glob patterns supported.
625
- batch_size: Number of files to process in each batch
626
- include_file_path: Add source filepath as a column
627
- concat: Combine files within each batch
628
- use_threads: Enable parallel file reading within batches
629
- verbose: Print progress information
630
- opt_dtypes: Optimize DataFrame dtypes
631
- **kwargs: Additional arguments passed to pl.read_csv()
632
-
633
- Yields:
634
- Each batch of data in requested format:
635
- - pl.DataFrame: Single DataFrame if concat=True
636
- - list[pl.DataFrame]: List of DataFrames if concat=False
637
-
638
- Example:
639
- >>> fs = LocalFileSystem()
640
- >>> # Process large dataset in batches
641
- >>> for batch in fs._read_csv_batches(
642
- ... "data/*.csv",
643
- ... batch_size=100,
644
- ... include_file_path=True,
645
- ... verbose=True
646
- ... ):
647
- ... print(f"Batch columns: {batch.columns}")
648
- >>>
649
- >>> # Parallel processing without concatenation
650
- >>> for batch in fs._read_csv_batches(
651
- ... ["file1.csv", "file2.csv"],
652
- ... batch_size=1,
653
- ... concat=False,
654
- ... use_threads=True
655
- ... ):
656
- ... for df in batch:
657
- ... print(f"DataFrame shape: {df.shape}")
658
- """
659
- # Handle path resolution
660
- if isinstance(path, str):
661
- path = path_to_glob(path, format="csv")
662
- path = self.glob(path)
663
-
664
- # Ensure path is a list
665
- if isinstance(path, str):
666
- path = [path]
667
-
668
- # Process files in batches
669
- for i in range(0, len(path), batch_size):
670
- batch_paths = path[i : i + batch_size]
671
-
672
- # Read batch with optional parallelization
673
- if use_threads and len(batch_paths) > 1:
674
- batch_dfs = run_parallel(
675
- _read_csv_file,
676
- batch_paths,
677
- self=self,
678
- include_file_path=include_file_path,
679
- n_jobs=-1,
680
- backend="threading",
681
- verbose=verbose,
682
- opt_dtypes=opt_dtypes,
683
- **kwargs,
684
- )
685
- else:
686
- batch_dfs = [
687
- _read_csv_file(
688
- p,
689
- self=self,
690
- include_file_path=include_file_path,
691
- opt_dtypes=opt_dtypes,
692
- **kwargs,
693
- )
694
- for p in batch_paths
695
- ]
696
-
697
- # if opt_dtypes:
698
- # batch_dfs = [opt_dtype_pl(df, strict=False) for df in batch_dfs]
699
-
700
- if concat and len(batch_dfs) > 1:
701
- result = pl.concat(batch_dfs, how="diagonal_relaxed")
702
- # if opt_dtypes:
703
- # result = opt_dtype_pl(result, strict=False)
704
- yield result
705
- else:
706
- yield batch_dfs
707
-
708
-
709
- def read_csv(
710
- self: AbstractFileSystem,
711
- path: str | list[str],
712
- batch_size: int | None = None,
713
- include_file_path: bool = False,
714
- concat: bool = True,
715
- use_threads: bool = True,
716
- verbose: bool = False,
717
- opt_dtypes: bool = False,
718
- **kwargs: Any,
719
- ) -> (
720
- pl.DataFrame
721
- | list[pl.DataFrame]
722
- | Generator[pl.DataFrame | list[pl.DataFrame], None, None]
723
- ):
724
- """Read CSV data from one or more files with powerful options.
725
-
726
- Provides a flexible interface for reading CSV files with support for:
727
- - Single file or multiple files
728
- - Batch processing for large datasets
729
- - Parallel processing
730
- - File path tracking
731
- - Polars DataFrame output
732
-
733
- Args:
734
- path: Path(s) to CSV file(s). Can be:
735
- - Single path string (globs supported)
736
- - List of path strings
737
- batch_size: If set, enables batch reading with this many files per batch
738
- include_file_path: Add source filepath as a column
739
- concat: Combine multiple files/batches into single DataFrame
740
- use_threads: Enable parallel file reading
741
- verbose: Print progress information
742
- **kwargs: Additional arguments passed to pl.read_csv()
743
-
744
- Returns:
745
- Various types depending on arguments:
746
- - pl.DataFrame: Single or concatenated DataFrame
747
- - list[pl.DataFrame]: List of DataFrames (if concat=False)
748
- - Generator: If batch_size set, yields batches of above types
749
-
750
- Example:
751
- >>> fs = LocalFileSystem()
752
- >>> # Read all CSVs in directory
753
- >>> df = fs.read_csv(
754
- ... "data/*.csv",
755
- ... include_file_path=True
756
- ... )
757
- >>> print(df.columns)
758
- ['file_path', 'col1', 'col2', ...]
759
- >>>
760
- >>> # Batch process large dataset
761
- >>> for batch_df in fs.read_csv(
762
- ... "logs/*.csv",
763
- ... batch_size=100,
764
- ... use_threads=True,
765
- ... verbose=True
766
- ... ):
767
- ... print(f"Processing {len(batch_df)} rows")
768
- >>>
769
- >>> # Multiple files without concatenation
770
- >>> dfs = fs.read_csv(
771
- ... ["file1.csv", "file2.csv"],
772
- ... concat=False,
773
- ... use_threads=True
774
- ... )
775
- >>> print(f"Read {len(dfs)} files")
776
- """
777
- if batch_size is not None:
778
- return _read_csv_batches(
779
- self=self,
780
- path=path,
781
- batch_size=batch_size,
782
- include_file_path=include_file_path,
783
- concat=concat,
784
- use_threads=use_threads,
785
- verbose=verbose,
786
- opt_dtypes=opt_dtypes,
787
- **kwargs,
788
- )
789
- return _read_csv(
790
- self=self,
791
- path=path,
792
- include_file_path=include_file_path,
793
- concat=concat,
794
- use_threads=use_threads,
795
- verbose=verbose,
796
- opt_dtypes=opt_dtypes,
797
- **kwargs,
798
- )
799
-
800
-
801
- def _read_parquet_file(
802
- path: str,
803
- self: AbstractFileSystem,
804
- include_file_path: bool = False,
805
- opt_dtypes: bool = False,
806
- **kwargs: Any,
807
- ) -> pa.Table:
808
- """Read a single Parquet file from any filesystem.
809
-
810
- Internal function that handles reading individual Parquet files and
811
- optionally adds the source filepath as a column.
812
-
813
- Args:
814
- path: Path to Parquet file
815
- self: Filesystem instance to use for reading
816
- include_file_path: Add source filepath as a column
817
- opt_dtypes: Optimize DataFrame dtypes
818
- **kwargs: Additional arguments passed to pq.read_table()
819
-
820
- Returns:
821
- pa.Table: PyArrow Table containing Parquet data
822
-
823
- Example:
824
- >>> fs = LocalFileSystem()
825
- >>> table = _read_parquet_file(
826
- ... "data.parquet",
827
- ... fs,
828
- ... include_file_path=True,
829
- ... use_threads=True
830
- ... )
831
- >>> print("file_path" in table.column_names)
832
- True
833
- """
834
- if not path.endswith(".parquet"):
835
- raise ValueError(
836
- f"Path '{path}' does not point to a Parquet file. "
837
- "Ensure the path ends with '.parquet'."
838
- )
839
- table = pq.read_table(path, filesystem=self, **kwargs)
840
- if include_file_path:
841
- table = table.add_column(0, "file_path", pl.Series([path] * table.num_rows))
842
- if opt_dtypes:
843
- table = opt_dtype_pa(table, strict=False)
844
- return table
845
-
846
-
847
- def read_parquet_file(
848
- self, path: str, include_file_path: bool = False, opt_dtypes: bool = False, **kwargs
849
- ) -> pa.Table:
850
- """Read a single Parquet file from any filesystem.
851
-
852
- Internal function that handles reading individual Parquet files and
853
- optionally adds the source filepath as a column.
854
-
855
- Args:
856
- path: Path to Parquet file
857
- include_file_path: Add source filepath as a column
858
- opt_dtypes: Optimize DataFrame dtypes
859
- **kwargs: Additional arguments passed to pq.read_table()
860
-
861
- Returns:
862
- pa.Table: PyArrow Table containing Parquet data
863
-
864
- Example:
865
- >>> fs = LocalFileSystem()
866
- >>> table = fs.read_parquet_file(
867
- ... "data.parquet",
868
- ... include_file_path=True,
869
- ... use_threads=True
870
- ... )
871
- >>> print("file_path" in table.column_names)
872
- True
873
- """
874
- return _read_parquet_file(
875
- path=path,
876
- self=self,
877
- include_file_path=include_file_path,
878
- opt_dtypes=opt_dtypes,
879
- **kwargs,
880
- )
881
-
882
-
883
- def _read_parquet(
884
- self,
885
- path: str | list[str],
886
- include_file_path: bool = False,
887
- use_threads: bool = True,
888
- concat: bool = True,
889
- verbose: bool = False,
890
- opt_dtypes: bool = False,
891
- **kwargs,
892
- ) -> pa.Table | list[pa.Table]:
893
- """
894
- Read a Parquet file or a list of Parquet files into a pyarrow Table.
895
-
896
- Args:
897
- path: (str | list[str]) Path to the Parquet file(s).
898
- include_file_path: (bool, optional) If True, return a Table with a 'file_path' column.
899
- Defaults to False.
900
- use_threads: (bool, optional) If True, read files in parallel. Defaults to True.
901
- concat: (bool, optional) If True, concatenate the Tables. Defaults to True.
902
- **kwargs: Additional keyword arguments.
903
-
904
- Returns:
905
- (pa.Table | list[pa.Table]): Pyarrow Table or list of Pyarrow Tables.
906
- """
907
- # if not include_file_path and concat:
908
- # if isinstance(path, str):
909
- # path = path.replace("**", "").replace("*.parquet", "")
910
- # table = _read_parquet_file(path, self=self, opt_dtypes=opt_dtypes, **kwargs)
911
- # return table
912
- # else:
913
- if isinstance(path, str):
914
- path = path_to_glob(path, format="parquet")
915
- path = self.glob(path)
916
-
917
- if isinstance(path, list):
918
- if use_threads:
919
- tables = run_parallel(
920
- _read_parquet_file,
921
- path,
922
- self=self,
923
- include_file_path=include_file_path,
924
- opt_dtypes=opt_dtypes,
925
- n_jobs=-1,
926
- backend="threading",
927
- verbose=verbose,
928
- **kwargs,
929
- )
930
- else:
931
- tables = [
932
- _read_parquet_file(
933
- p,
934
- self=self,
935
- include_file_path=include_file_path,
936
- opt_dtypes=opt_dtypes,
937
- **kwargs,
938
- )
939
- for p in path
940
- ]
941
- else:
942
- tables = _read_parquet_file(
943
- path=path,
944
- self=self,
945
- include_file_path=include_file_path,
946
- opt_dtypes=opt_dtypes,
947
- **kwargs,
948
- )
949
- if concat:
950
- # Unify schemas before concatenation if opt_dtypes or multiple tables
951
- if isinstance(tables, list):
952
- if len(tables) > 0:
953
- schemas = [t.schema for t in tables]
954
- unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
955
- tables = [cast_schema(t, unified_schema) for t in tables]
956
-
957
- tables = [table for table in tables if table.num_rows > 0]
958
- if not tables:
959
- return unified_schema.empty_table()
960
-
961
- result = pa.concat_tables(
962
- tables,
963
- promote_options="permissive",
964
- )
965
- # if opt_dtypes:
966
- # result = opt_dtype_pa(result, strict=False)
967
- return result
968
- elif isinstance(tables, pa.Table):
969
- # if opt_dtypes:
970
- # tables = opt_dtype_pa(tables, strict=False)
971
- return tables
972
- else:
973
- tables = [table for table in tables if table.num_rows > 0]
974
- if not tables:
975
- return unified_schema.empty_table()
976
-
977
- result = pa.concat_tables(
978
- tables,
979
- promote_options="permissive",
980
- )
981
- return tables
982
-
983
-
984
- def _read_parquet_batches(
985
- self: AbstractFileSystem,
986
- path: str | list[str],
987
- batch_size: int | None = None,
988
- include_file_path: bool = False,
989
- use_threads: bool = True,
990
- concat: bool = True,
991
- verbose: bool = False,
992
- opt_dtypes: bool = False,
993
- **kwargs: Any,
994
- ) -> Generator[pa.Table | list[pa.Table], None, None]:
995
- """Process Parquet files in batches with performance optimizations.
996
-
997
- Internal generator function that handles batched reading of Parquet files
998
- with support for:
999
- - Parallel processing within batches
1000
- - Metadata-based optimizations
1001
- - Memory-efficient processing
1002
- - Progress tracking
1003
-
1004
- Uses fast path for simple cases:
1005
- - Single directory with _metadata
1006
- - No need for filepath column
1007
- - Concatenated output
1008
-
1009
- Args:
1010
- path: Path(s) to Parquet file(s). Glob patterns supported.
1011
- batch_size: Number of files to process in each batch
1012
- include_file_path: Add source filepath as a column
1013
- use_threads: Enable parallel file reading within batches
1014
- concat: Combine files within each batch
1015
- verbose: Print progress information
1016
- **kwargs: Additional arguments passed to pq.read_table()
1017
-
1018
- Yields:
1019
- Each batch of data in requested format:
1020
- - pa.Table: Single Table if concat=True
1021
- - list[pa.Table]: List of Tables if concat=False
1022
-
1023
- Example:
1024
- >>> fs = LocalFileSystem()
1025
- >>> # Fast path for simple case
1026
- >>> next(_read_parquet_batches(
1027
- ... fs,
1028
- ... "data/", # Contains _metadata
1029
- ... batch_size=1000
1030
- ... ))
1031
- >>>
1032
- >>> # Parallel batch processing
1033
- >>> for batch in fs._read_parquet_batches(
1034
- ... fs,
1035
- ... ["file1.parquet", "file2.parquet"],
1036
- ... batch_size=1,
1037
- ... include_file_path=True,
1038
- ... use_threads=True
1039
- ... ):
1040
- ... print(f"Batch schema: {batch.schema}")
1041
- """
1042
- # Fast path for simple cases
1043
- # if not include_file_path and concat and batch_size is None:
1044
- # if isinstance(path, str):
1045
- # path = path.replace("**", "").replace("*.parquet", "")
1046
- # table = _read_parquet_file(
1047
- # path=path, self=self, opt_dtypes=opt_dtypes, **kwargs
1048
- # )
1049
- # yield table
1050
- # return
1051
-
1052
- # Resolve path(s) to list
1053
- if isinstance(path, str):
1054
- path = path_to_glob(path, format="parquet")
1055
- path = self.glob(path)
1056
-
1057
- if not isinstance(path, list):
1058
- yield _read_parquet_file(
1059
- path=path,
1060
- self=self,
1061
- include_file_path=include_file_path,
1062
- opt_dtypes=opt_dtypes,
1063
- **kwargs,
1064
- )
1065
- return
1066
-
1067
- # Process in batches
1068
- for i in range(0, len(path), batch_size):
1069
- batch_paths = path[i : i + batch_size]
1070
- if use_threads and len(batch_paths) > 1:
1071
- batch_tables = run_parallel(
1072
- _read_parquet_file,
1073
- batch_paths,
1074
- self=self,
1075
- include_file_path=include_file_path,
1076
- opt_dtypes=opt_dtypes,
1077
- n_jobs=-1,
1078
- backend="threading",
1079
- verbose=verbose,
1080
- **kwargs,
1081
- )
1082
- else:
1083
- batch_tables = [
1084
- _read_parquet_file(
1085
- p,
1086
- self=self,
1087
- include_file_path=include_file_path,
1088
- opt_dtypes=opt_dtypes,
1089
- **kwargs,
1090
- )
1091
- for p in batch_paths
1092
- ]
1093
-
1094
- if concat and batch_tables:
1095
- # Unify schemas before concatenation
1096
- if len(batch_tables) > 1:
1097
- schemas = [t.schema for t in batch_tables]
1098
- unified_schema = unify_schemas_pa(schemas, standardize_timezones=True)
1099
- batch_tables = [cast_schema(t, unified_schema) for t in batch_tables]
1100
- batch_tables = [table for table in batch_tables if table.num_rows > 0]
1101
- if not batch_tables:
1102
- yield unified_schema.empty_table()
1103
- batch_table = pa.concat_tables(
1104
- batch_tables,
1105
- promote_options="permissive",
1106
- )
1107
- # if opt_dtypes:
1108
- # result = opt_dtype_pa(result, strict=False)
1109
- yield batch_table
1110
- else:
1111
- # if opt_dtypes and isinstance(batch_tables, list):
1112
- # batch_tables = [opt_dtype_pa(t, strict=False) for t in batch_tables]
1113
- yield batch_tables
1114
-
1115
-
1116
- def read_parquet(
1117
- self: AbstractFileSystem,
1118
- path: str | list[str],
1119
- batch_size: int | None = None,
1120
- include_file_path: bool = False,
1121
- concat: bool = True,
1122
- use_threads: bool = True,
1123
- verbose: bool = False,
1124
- opt_dtypes: bool = False,
1125
- **kwargs: Any,
1126
- ) -> pa.Table | list[pa.Table] | Generator[pa.Table | list[pa.Table], None, None]:
1127
- """Read Parquet data with advanced features and optimizations.
1128
-
1129
- Provides a high-performance interface for reading Parquet files with support for:
1130
- - Single file or multiple files
1131
- - Batch processing for large datasets
1132
- - Parallel processing
1133
- - File path tracking
1134
- - Automatic concatenation
1135
- - PyArrow Table output
1136
-
1137
- The function automatically uses optimal reading strategies:
1138
- - Direct dataset reading for simple cases
1139
- - Parallel processing for multiple files
1140
- - Batched reading for memory efficiency
1141
-
1142
- Args:
1143
- path: Path(s) to Parquet file(s). Can be:
1144
- - Single path string (globs supported)
1145
- - List of path strings
1146
- - Directory containing _metadata file
1147
- batch_size: If set, enables batch reading with this many files per batch
1148
- include_file_path: Add source filepath as a column
1149
- concat: Combine multiple files/batches into single Table
1150
- use_threads: Enable parallel file reading
1151
- verbose: Print progress information
1152
- opt_dtypes: Optimize Table dtypes for performance
1153
- **kwargs: Additional arguments passed to pq.read_table()
1154
-
1155
- Returns:
1156
- Various types depending on arguments:
1157
- - pa.Table: Single or concatenated Table
1158
- - list[pa.Table]: List of Tables (if concat=False)
1159
- - Generator: If batch_size set, yields batches of above types
1160
-
1161
- Example:
1162
- >>> fs = LocalFileSystem()
1163
- >>> # Read all Parquet files in directory
1164
- >>> table = fs.read_parquet(
1165
- ... "data/*.parquet",
1166
- ... include_file_path=True
1167
- ... )
1168
- >>> print(table.column_names)
1169
- ['file_path', 'col1', 'col2', ...]
1170
- >>>
1171
- >>> # Batch process large dataset
1172
- >>> for batch in fs.read_parquet(
1173
- ... "data/*.parquet",
1174
- ... batch_size=100,
1175
- ... use_threads=True
1176
- ... ):
1177
- ... print(f"Processing {batch.num_rows} rows")
1178
- >>>
1179
- >>> # Read from directory with metadata
1180
- >>> table = fs.read_parquet(
1181
- ... "data/", # Contains _metadata
1182
- ... use_threads=True
1183
- ... )
1184
- >>> print(f"Total rows: {table.num_rows}")
1185
- """
1186
- if batch_size is not None:
1187
- return _read_parquet_batches(
1188
- self=self,
1189
- path=path,
1190
- batch_size=batch_size,
1191
- include_file_path=include_file_path,
1192
- concat=concat,
1193
- use_threads=use_threads,
1194
- verbose=verbose,
1195
- opt_dtypes=opt_dtypes,
1196
- **kwargs,
1197
- )
1198
- return _read_parquet(
1199
- self=self,
1200
- path=path,
1201
- include_file_path=include_file_path,
1202
- use_threads=use_threads,
1203
- concat=concat,
1204
- verbose=verbose,
1205
- opt_dtypes=opt_dtypes,
1206
- **kwargs,
1207
- )
1208
-
1209
-
1210
- def read_files(
1211
- self: AbstractFileSystem,
1212
- path: str | list[str],
1213
- format: str,
1214
- batch_size: int | None = None,
1215
- include_file_path: bool = False,
1216
- concat: bool = True,
1217
- jsonlines: bool = False,
1218
- use_threads: bool = True,
1219
- verbose: bool = False,
1220
- opt_dtypes: bool = False,
1221
- **kwargs: Any,
1222
- ) -> (
1223
- pl.DataFrame
1224
- | pa.Table
1225
- | list[pl.DataFrame]
1226
- | list[pa.Table]
1227
- | Generator[
1228
- pl.DataFrame | pa.Table | list[pl.DataFrame] | list[pa.Table], None, None
1229
- ]
1230
- ):
1231
- """Universal interface for reading data files of any supported format.
1232
-
1233
- A unified API that automatically delegates to the appropriate reading function
1234
- based on file format, while preserving all advanced features like:
1235
- - Batch processing
1236
- - Parallel reading
1237
- - File path tracking
1238
- - Format-specific optimizations
1239
-
1240
- Args:
1241
- path: Path(s) to data file(s). Can be:
1242
- - Single path string (globs supported)
1243
- - List of path strings
1244
- format: File format to read. Supported values:
1245
- - "json": Regular JSON or JSON Lines
1246
- - "csv": CSV files
1247
- - "parquet": Parquet files
1248
- batch_size: If set, enables batch reading with this many files per batch
1249
- include_file_path: Add source filepath as column/field
1250
- concat: Combine multiple files/batches into single result
1251
- jsonlines: For JSON format, whether to read as JSON Lines
1252
- use_threads: Enable parallel file reading
1253
- verbose: Print progress information
1254
- opt_dtypes: Optimize DataFrame/Arrow Table dtypes for performance
1255
- **kwargs: Additional format-specific arguments
1256
-
1257
- Returns:
1258
- Various types depending on format and arguments:
1259
- - pl.DataFrame: For CSV and optionally JSON
1260
- - pa.Table: For Parquet
1261
- - list[pl.DataFrame | pa.Table]: Without concatenation
1262
- - Generator: If batch_size set, yields batches
1263
-
1264
- Example:
1265
- >>> fs = LocalFileSystem()
1266
- >>> # Read CSV files
1267
- >>> df = fs.read_files(
1268
- ... "data/*.csv",
1269
- ... format="csv",
1270
- ... include_file_path=True
1271
- ... )
1272
- >>> print(type(df))
1273
- <class 'polars.DataFrame'>
1274
- >>>
1275
- >>> # Batch process Parquet files
1276
- >>> for batch in fs.read_files(
1277
- ... "data/*.parquet",
1278
- ... format="parquet",
1279
- ... batch_size=100,
1280
- ... use_threads=True
1281
- ... ):
1282
- ... print(f"Batch type: {type(batch)}")
1283
- >>>
1284
- >>> # Read JSON Lines
1285
- >>> df = fs.read_files(
1286
- ... "logs/*.jsonl",
1287
- ... format="json",
1288
- ... jsonlines=True,
1289
- ... concat=True
1290
- ... )
1291
- >>> print(df.columns)
1292
- """
1293
- if format == "json":
1294
- if batch_size is not None:
1295
- return read_json(
1296
- self=self,
1297
- path=path,
1298
- batch_size=batch_size,
1299
- include_file_path=include_file_path,
1300
- jsonlines=jsonlines,
1301
- concat=concat,
1302
- use_threads=use_threads,
1303
- verbose=verbose,
1304
- opt_dtypes=opt_dtypes,
1305
- **kwargs,
1306
- )
1307
- return read_json(
1308
- self=self,
1309
- path=path,
1310
- include_file_path=include_file_path,
1311
- jsonlines=jsonlines,
1312
- concat=concat,
1313
- use_threads=use_threads,
1314
- verbose=verbose,
1315
- opt_dtypes=opt_dtypes,
1316
- **kwargs,
1317
- )
1318
- elif format == "csv":
1319
- if batch_size is not None:
1320
- return read_csv(
1321
- self=self,
1322
- path=path,
1323
- batch_size=batch_size,
1324
- include_file_path=include_file_path,
1325
- concat=concat,
1326
- use_threads=use_threads,
1327
- verbose=verbose,
1328
- opt_dtypes=opt_dtypes,
1329
- **kwargs,
1330
- )
1331
- return read_csv(
1332
- self=self,
1333
- path=path,
1334
- include_file_path=include_file_path,
1335
- use_threads=use_threads,
1336
- concat=concat,
1337
- verbose=verbose,
1338
- opt_dtypes=opt_dtypes,
1339
- **kwargs,
1340
- )
1341
- elif format == "parquet":
1342
- if batch_size is not None:
1343
- return read_parquet(
1344
- self=self,
1345
- path=path,
1346
- batch_size=batch_size,
1347
- include_file_path=include_file_path,
1348
- concat=concat,
1349
- use_threads=use_threads,
1350
- verbose=verbose,
1351
- opt_dtypes=opt_dtypes,
1352
- **kwargs,
1353
- )
1354
- return read_parquet(
1355
- self=self,
1356
- path=path,
1357
- include_file_path=include_file_path,
1358
- use_threads=use_threads,
1359
- concat=concat,
1360
- verbose=verbose,
1361
- opt_dtypes=opt_dtypes,
1362
- **kwargs,
1363
- )
1364
-
1365
-
1366
- def pyarrow_dataset(
1367
- self: AbstractFileSystem,
1368
- path: str,
1369
- format: str = "parquet",
1370
- schema: pa.Schema | None = None,
1371
- partitioning: str | list[str] | pds.Partitioning = None,
1372
- **kwargs: Any,
1373
- ) -> pds.Dataset:
1374
- """Create a PyArrow dataset from files in any supported format.
1375
-
1376
- Creates a dataset that provides optimized reading and querying capabilities
1377
- including:
1378
- - Schema inference and enforcement
1379
- - Partition discovery and pruning
1380
- - Predicate pushdown
1381
- - Column projection
1382
-
1383
- Args:
1384
- path: Base path to dataset files
1385
- format: File format. Currently supports:
1386
- - "parquet" (default)
1387
- - "csv"
1388
- - "json" (experimental)
1389
- schema: Optional schema to enforce. If None, inferred from data.
1390
- partitioning: How the dataset is partitioned. Can be:
1391
- - str: Single partition field
1392
- - list[str]: Multiple partition fields
1393
- - pds.Partitioning: Custom partitioning scheme
1394
- **kwargs: Additional arguments for dataset creation
1395
-
1396
- Returns:
1397
- pds.Dataset: PyArrow dataset instance
1398
-
1399
- Example:
1400
- >>> fs = LocalFileSystem()
1401
- >>> # Simple Parquet dataset
1402
- >>> ds = fs.pyarrow_dataset("data/")
1403
- >>> print(ds.schema)
1404
- >>>
1405
- >>> # Partitioned dataset
1406
- >>> ds = fs.pyarrow_dataset(
1407
- ... "events/",
1408
- ... partitioning=["year", "month"]
1409
- ... )
1410
- >>> # Query with partition pruning
1411
- >>> table = ds.to_table(
1412
- ... filter=(ds.field("year") == 2024)
1413
- ... )
1414
- >>>
1415
- >>> # CSV with schema
1416
- >>> ds = fs.pyarrow_dataset(
1417
- ... "logs/",
1418
- ... format="csv",
1419
- ... schema=pa.schema([
1420
- ... ("timestamp", pa.timestamp("s")),
1421
- ... ("level", pa.string()),
1422
- ... ("message", pa.string())
1423
- ... ])
1424
- ... )
1425
- """
1426
- return pds.dataset(
1427
- path,
1428
- filesystem=self,
1429
- partitioning=partitioning,
1430
- schema=schema,
1431
- format=format,
1432
- **kwargs,
1433
- )
1434
-
1435
-
1436
- def pyarrow_parquet_dataset(
1437
- self: AbstractFileSystem,
1438
- path: str,
1439
- schema: pa.Schema | None = None,
1440
- partitioning: str | list[str] | pds.Partitioning = None,
1441
- **kwargs: Any,
1442
- ) -> pds.Dataset:
1443
- """Create a PyArrow dataset optimized for Parquet files.
1444
-
1445
- Creates a dataset specifically for Parquet data, automatically handling
1446
- _metadata files for optimized reading.
1447
-
1448
- This function is particularly useful for:
1449
- - Datasets with existing _metadata files
1450
- - Multi-file datasets that should be treated as one
1451
- - Partitioned Parquet datasets
1452
-
1453
- Args:
1454
- path: Path to dataset directory or _metadata file
1455
- schema: Optional schema to enforce. If None, inferred from data.
1456
- partitioning: How the dataset is partitioned. Can be:
1457
- - str: Single partition field
1458
- - list[str]: Multiple partition fields
1459
- - pds.Partitioning: Custom partitioning scheme
1460
- **kwargs: Additional dataset arguments
1461
-
1462
- Returns:
1463
- pds.Dataset: PyArrow dataset instance
1464
-
1465
- Example:
1466
- >>> fs = LocalFileSystem()
1467
- >>> # Dataset with _metadata
1468
- >>> ds = fs.pyarrow_parquet_dataset("data/_metadata")
1469
- >>> print(ds.files) # Shows all data files
1470
- >>>
1471
- >>> # Partitioned dataset directory
1472
- >>> ds = fs.pyarrow_parquet_dataset(
1473
- ... "sales/",
1474
- ... partitioning=["year", "region"]
1475
- ... )
1476
- >>> # Query with partition pruning
1477
- >>> table = ds.to_table(
1478
- ... filter=(
1479
- ... (ds.field("year") == 2024) &
1480
- ... (ds.field("region") == "EMEA")
1481
- ... )
1482
- ... )
1483
- """
1484
- if not self.is_file(path):
1485
- path = posixpath.join(path, "_metadata")
1486
- return pds.dataset(
1487
- path,
1488
- filesystem=self,
1489
- partitioning=partitioning,
1490
- schema=schema,
1491
- **kwargs,
1492
- )
1493
-
1494
-
1495
- def pydala_dataset(
1496
- self: AbstractFileSystem,
1497
- path: str,
1498
- partitioning: str | list[str] | pds.Partitioning = None,
1499
- **kwargs: Any,
1500
- ) -> ParquetDataset: # type: ignore
1501
- """Create a Pydala dataset for advanced Parquet operations.
1502
-
1503
- Creates a dataset with additional features beyond PyArrow including:
1504
- - Delta table support
1505
- - Schema evolution
1506
- - Advanced partitioning
1507
- - Metadata management
1508
- - Sort key optimization
1509
-
1510
- Args:
1511
- path: Path to dataset directory
1512
- partitioning: How the dataset is partitioned. Can be:
1513
- - str: Single partition field
1514
- - list[str]: Multiple partition fields
1515
- - pds.Partitioning: Custom partitioning scheme
1516
- **kwargs: Additional dataset configuration
1517
-
1518
- Returns:
1519
- ParquetDataset: Pydala dataset instance
1520
-
1521
- Example:
1522
- >>> fs = LocalFileSystem()
1523
- >>> # Create dataset
1524
- >>> ds = fs.pydala_dataset(
1525
- ... "data/",
1526
- ... partitioning=["date"]
1527
- ... )
1528
- >>>
1529
- >>> # Write with delta support
1530
- >>> ds.write_to_dataset(
1531
- ... new_data,
1532
- ... mode="delta",
1533
- ... delta_subset=["id"]
1534
- ... )
1535
- >>>
1536
- >>> # Read with metadata
1537
- >>> df = ds.to_polars()
1538
- >>> print(df.columns)
1539
- """
1540
- return ParquetDataset(
1541
- path,
1542
- filesystem=self,
1543
- partitioning=partitioning,
1544
- **kwargs,
1545
- )
1546
-
1547
-
1548
- def write_parquet(
1549
- self: AbstractFileSystem,
1550
- data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict],
1551
- path: str,
1552
- schema: pa.Schema | None = None,
1553
- **kwargs: Any,
1554
- ) -> pq.FileMetaData:
1555
- """Write data to a Parquet file with automatic format conversion.
1556
-
1557
- Handles writing data from multiple input formats to Parquet with:
1558
- - Automatic conversion to PyArrow
1559
- - Schema validation/coercion
1560
- - Metadata collection
1561
- - Compression and encoding options
1562
-
1563
- Args:
1564
- data: Input data in various formats:
1565
- - Polars DataFrame/LazyFrame
1566
- - PyArrow Table
1567
- - Pandas DataFrame
1568
- - Dict or list of dicts
1569
- path: Output Parquet file path
1570
- schema: Optional schema to enforce on write
1571
- **kwargs: Additional arguments for pq.write_table()
1572
-
1573
- Returns:
1574
- pq.FileMetaData: Metadata of written Parquet file
1575
-
1576
- Raises:
1577
- SchemaError: If data doesn't match schema
1578
- ValueError: If data cannot be converted
1579
-
1580
- Example:
1581
- >>> fs = LocalFileSystem()
1582
- >>> # Write Polars DataFrame
1583
- >>> df = pl.DataFrame({
1584
- ... "id": range(1000),
1585
- ... "value": pl.Series(np.random.randn(1000))
1586
- ... })
1587
- >>> metadata = fs.write_parquet(
1588
- ... df,
1589
- ... "data.parquet",
1590
- ... compression="zstd",
1591
- ... compression_level=3
1592
- ... )
1593
- >>> print(f"Rows: {metadata.num_rows}")
1594
- >>>
1595
- >>> # Write with schema
1596
- >>> schema = pa.schema([
1597
- ... ("id", pa.int64()),
1598
- ... ("value", pa.float64())
1599
- ... ])
1600
- >>> metadata = fs.write_parquet(
1601
- ... {"id": [1, 2], "value": [0.1, 0.2]},
1602
- ... "data.parquet",
1603
- ... schema=schema
1604
- ... )
1605
- """
1606
- data = to_pyarrow_table(data, concat=False, unique=False)
1607
-
1608
- if schema is not None:
1609
- data = cast_schema(data, schema)
1610
- metadata = []
1611
- pq.write_table(data, path, filesystem=self, metadata_collector=metadata, **kwargs)
1612
- metadata = metadata[0]
1613
- metadata.set_file_path(path)
1614
- return metadata
1615
-
1616
-
1617
- def write_json(
1618
- self: AbstractFileSystem,
1619
- data: dict
1620
- | pl.DataFrame
1621
- | pl.LazyFrame
1622
- | pa.Table
1623
- | pd.DataFrame
1624
- | dict
1625
- | list[dict],
1626
- path: str,
1627
- append: bool = False,
1628
- ) -> None:
1629
- """Write data to a JSON file with flexible input support.
1630
-
1631
- Handles writing data in various formats to JSON or JSON Lines,
1632
- with optional appending for streaming writes.
1633
-
1634
- Args:
1635
- data: Input data in various formats:
1636
- - Dict or list of dicts
1637
- - Polars DataFrame/LazyFrame
1638
- - PyArrow Table
1639
- - Pandas DataFrame
1640
- path: Output JSON file path
1641
- append: Whether to append to existing file (JSON Lines mode)
1642
-
1643
- Example:
1644
- >>> fs = LocalFileSystem()
1645
- >>> # Write dictionary
1646
- >>> data = {"name": "test", "values": [1, 2, 3]}
1647
- >>> fs.write_json(data, "config.json")
1648
- >>>
1649
- >>> # Stream records
1650
- >>> df1 = pl.DataFrame({"id": [1], "value": ["first"]})
1651
- >>> df2 = pl.DataFrame({"id": [2], "value": ["second"]})
1652
- >>> fs.write_json(df1, "stream.jsonl", append=False)
1653
- >>> fs.write_json(df2, "stream.jsonl", append=True)
1654
- >>>
1655
- >>> # Convert PyArrow
1656
- >>> table = pa.table({"a": [1, 2], "b": ["x", "y"]})
1657
- >>> fs.write_json(table, "data.json")
1658
- """
1659
- if isinstance(data, pl.LazyFrame):
1660
- data = data.collect()
1661
- if isinstance(data, pl.DataFrame):
1662
- data = data.to_arrow()
1663
- data = cast_schema(
1664
- data, convert_large_types_to_standard(data.schema)
1665
- ).to_pydict()
1666
- elif isinstance(data, pd.DataFrame):
1667
- data = pa.Table.from_pandas(data, preserve_index=False).to_pydict()
1668
- elif isinstance(data, pa.Table):
1669
- data = data.to_pydict()
1670
- if append:
1671
- with self.open(path, "ab") as f:
1672
- if isinstance(data, dict):
1673
- f.write(orjson.dumps(data) + b"\n")
1674
- else:
1675
- for record in data:
1676
- f.write(orjson.dumps(record) + b"\n")
1677
- else:
1678
- with self.open(path, "wb") as f:
1679
- f.write(orjson.dumps(data))
1680
-
1681
-
1682
- def write_csv(
-     self: AbstractFileSystem,
-     data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[dict],
-     path: str,
-     append: bool = False,
-     **kwargs: Any,
- ) -> None:
-     """Write data to a CSV file with flexible input support.
-
-     Handles writing data from multiple formats to CSV with options for:
-     - Appending to existing files
-     - Custom delimiters and formatting
-     - Automatic type conversion
-     - Header handling
-
-     Args:
-         data: Input data in various formats:
-             - Polars DataFrame/LazyFrame
-             - PyArrow Table
-             - Pandas DataFrame
-             - Dict or list of dicts
-         path: Output CSV file path
-         append: Whether to append to existing file
-         **kwargs: Additional arguments for CSV writing:
-             - delimiter: Field separator (default ",")
-             - header: Whether to write header row
-             - quote_char: Character for quoting fields
-             - date_format: Format for date/time fields
-             - float_precision: Decimal places for floats
-
-     Example:
-         >>> fs = LocalFileSystem()
-         >>> # Write Polars DataFrame
-         >>> df = pl.DataFrame({
-         ...     "id": range(100),
-         ...     "name": ["item_" + str(i) for i in range(100)]
-         ... })
-         >>> fs.write_csv(df, "items.csv")
-         >>>
-         >>> # Append records
-         >>> new_items = pl.DataFrame({
-         ...     "id": range(100, 200),
-         ...     "name": ["item_" + str(i) for i in range(100, 200)]
-         ... })
-         >>> fs.write_csv(
-         ...     new_items,
-         ...     "items.csv",
-         ...     append=True,
-         ...     header=False
-         ... )
-         >>>
-         >>> # Custom formatting
-         >>> data = pa.table({
-         ...     "date": [datetime.now()],
-         ...     "value": [123.456]
-         ... })
-         >>> fs.write_csv(
-         ...     data,
-         ...     "formatted.csv",
-         ...     date_format="%Y-%m-%d",
-         ...     float_precision=2
-         ... )
-     """
-     if isinstance(data, pl.LazyFrame):
-         data = data.collect()
-     if isinstance(data, pl.DataFrame):
-         if append:
-             with self.open(path, "ab") as f:
-                 data.write_csv(f, has_header=not append, **kwargs)
-         else:
-             with self.open(path, "wb") as f:
-                 data.write_csv(f, **kwargs)
-     elif isinstance(data, (pa.Table, pd.DataFrame)):
-         pl.from_arrow(pa.table(data)).write_csv(path, **kwargs)
-     else:
-         pl.DataFrame(data).write_csv(path, **kwargs)
-
-
- def write_file(
-     self,
-     data: pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict,
-     path: str,
-     format: str,
-     **kwargs,
- ) -> None:
-     """
-     Write a DataFrame to a file in the given format.
-
-     Args:
-         data: (pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame) Data to write.
-         path (str): Path to write the data.
-         format (str): Format of the file.
-         **kwargs: Additional keyword arguments.
-
-     Returns:
-         None
-     """
-     if format == "json":
-         write_json(self, data, path, **kwargs)
-     elif format == "csv":
-         write_csv(self, data, path, **kwargs)
-     elif format == "parquet":
-         write_parquet(self, data, path, **kwargs)
-
-
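write_file above is a thin dispatcher over write_json, write_csv and write_parquet, keyed on the format string; unrecognized formats fall through silently. A minimal sketch of the call pattern, assuming the module-level functions are importable as in the removed file (paths are illustrative):

    # Illustrative sketch: dispatch on `format`; function names match the removed module.
    import polars as pl
    from fsspec import filesystem

    fs = filesystem("file")
    df = pl.DataFrame({"id": [1, 2, 3]})

    write_file(fs, df, "/tmp/out.json", format="json")        # -> write_json
    write_file(fs, df, "/tmp/out.csv", format="csv")           # -> write_csv
    write_file(fs, df, "/tmp/out.parquet", format="parquet")   # -> write_parquet

Callers that need to fail loudly on an unsupported format have to check it themselves, since there is no else branch.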
- def write_files(
-     self,
-     data: (
-         pl.DataFrame
-         | pl.LazyFrame
-         | pa.Table
-         | pa.RecordBatch
-         | pa.RecordBatchReader
-         | pd.DataFrame
-         | dict
-         | list[
-             pl.DataFrame
-             | pl.LazyFrame
-             | pa.Table
-             | pa.RecordBatch
-             | pa.RecordBatchReader
-             | pd.DataFrame
-             | dict
-         ]
-     ),
-     path: str | list[str],
-     basename: str = None,
-     format: str = None,
-     concat: bool = True,
-     unique: bool | list[str] | str = False,
-     mode: str = "append",  # append, overwrite, delete_matching, error_if_exists
-     use_threads: bool = True,
-     verbose: bool = False,
-     **kwargs,
- ) -> None:
-     """Write a DataFrame or a list of DataFrames to a file or a list of files.
-
-     Args:
-         data: (pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict | list[pl.DataFrame | pl.LazyFrame |
-             pa.Table | pd.DataFrame | dict]) Data to write.
-         path: (str | list[str]) Path to write the data.
-         basename: (str, optional) Basename of the files. Defaults to None.
-         format: (str, optional) Format of the data. Defaults to None.
-         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
-         unique: (bool, optional) If True, remove duplicates. Defaults to False.
-         mode: (str, optional) Write mode. Defaults to 'append'. Options: 'append', 'overwrite', 'delete_matching',
-             'error_if_exists'.
-         use_threads: (bool, optional) If True, use parallel processing. Defaults to True.
-         verbose: (bool, optional) If True, print verbose output. Defaults to False.
-         **kwargs: Additional keyword arguments.
-
-     Returns:
-         None
-
-     Raises:
-         FileExistsError: If file already exists and mode is 'error_if_exists'.
-     """
-     if not isinstance(data, list):
-         data = [data]
-
-     if concat:
-         if isinstance(data[0], dict):
-             data = _dict_to_dataframe(data)
-         if isinstance(data[0], pl.LazyFrame):
-             data = pl.concat([d.collect() for d in data], how="diagonal_relaxed")
-
-         if isinstance(
-             data[0], pa.Table | pa.RecordBatch | pa.RecordBatchReader | Generator
-         ):
-             data = pl.concat([pl.from_arrow(d) for d in data], how="diagonal_relaxed")
-         elif isinstance(data[0], pd.DataFrame):
-             data = pl.concat([pl.from_pandas(d) for d in data], how="diagonal_relaxed")
-
-         if unique:
-             data = data.unique(
-                 subset=None if not isinstance(unique, str | list) else unique,
-                 maintain_order=True,
-             )
-
-         data = [data]
-
-     if format is None:
-         format = (
-             path[0].split(".")[-1]
-             if isinstance(path, list) and "." in path[0]
-             else path.split(".")[-1]
-             if "." in path
-             else "parquet"
-         )
-
-     def _write(d, p, basename, i):
-         if f".{format}" not in p:
-             if not basename:
-                 basename = f"data-{dt.datetime.now().strftime('%Y%m%d_%H%M%S%f')[:-3]}-{uuid.uuid4().hex[:16]}"
-             p = f"{p}/{basename}-{i}.{format}"
-
-         if mode == "delete_matching":
-             write_file(self, d, p, format, **kwargs)
-         elif mode == "overwrite":
-             if self.exists(p):
-                 self.fs.rm(p, recursive=True)
-             write_file(self, d, p, format, **kwargs)
-         elif mode == "append":
-             if not self.exists(p):
-                 write_file(self, d, p, format, **kwargs)
-             else:
-                 p = p.replace(f".{format}", f"-{i}.{format}")
-                 write_file(self, d, p, format, **kwargs)
-         elif mode == "error_if_exists":
-             if self.exists(p):
-                 raise FileExistsError(f"File already exists: {p}")
-             else:
-                 write_file(self, d, p, format, **kwargs)
-
-     if mode == "overwrite":
-         if isinstance(path, list):
-             for p in path:
-                 # Remove existing files
-                 if self.exists(p):
-                     self.rm(p, recursive=True)
-         else:
-             # Remove existing files
-             if self.exists(path):
-                 self.rm(path, recursive=True)
-
-     if use_threads:
-         run_parallel(
-             _write,
-             d=data,
-             p=path,
-             basename=basename,
-             i=list(range(len(data))),
-             verbose=verbose,
-         )
-     else:
-         for i, d in enumerate(data):
-             p = path[i] if isinstance(path, list) else path
-             _write(d, p, basename, i)
-
-
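The helper above fans a list of frames out to one or more files and encodes its overwrite policy in mode. A hedged sketch of how the modes behave, assuming the patched fs.write_files and a local target directory; paths are illustrative:

    # Illustrative sketch of the write modes handled by _write above.
    import polars as pl
    from fsspec import filesystem

    fs = filesystem("file")
    batches = [pl.DataFrame({"id": [1]}), pl.DataFrame({"id": [2]})]

    # concat=True merges the list and writes a single auto-named file under the directory
    fs.write_files(batches, "/tmp/dataset", format="parquet", mode="append")

    # "overwrite" removes the target first; "error_if_exists" raises FileExistsError
    # instead of renaming the output the way "append" does on a path collision.
    fs.write_files(batches, "/tmp/dataset", format="parquet", mode="overwrite")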
- def write_pyarrow_dataset(
-     self,
-     data: (
-         pl.DataFrame
-         | pl.LazyFrame
-         | pa.Table
-         | pa.RecordBatch
-         | pa.RecordBatchReader
-         | pd.DataFrame
-         | dict
-         | list[
-             pl.DataFrame
-             | pl.LazyFrame
-             | pa.Table
-             | pa.RecordBatch
-             | pa.RecordBatchReader
-             | pd.DataFrame
-             | dict
-         ]
-     ),
-     path: str,
-     basename: str | None = None,
-     schema: pa.Schema | None = None,
-     partition_by: str | list[str] | pds.Partitioning | None = None,
-     partitioning_flavor: str = "hive",
-     mode: str = "append",
-     format: str | None = "parquet",
-     compression: str = "zstd",
-     max_rows_per_file: int | None = 2_500_000,
-     row_group_size: int | None = 250_000,
-     concat: bool = True,
-     unique: bool | list[str] | str = False,
-     **kwargs,
- ) -> list[pq.FileMetaData] | None:
-     """
-     Write tabular data to a PyArrow dataset.
-
-     Args:
-         data: (pl.DataFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader |
-             pd.DataFrame | list[pl.DataFrame] | list[pa.Table] | list[pa.RecordBatch] |
-             list[pa.RecordBatchReader] | list[pd.DataFrame]) Data to write.
-         path: (str) Path to write the data.
-         basename: (str, optional) Basename of the files. Defaults to None.
-         schema: (pa.Schema, optional) Schema of the data. Defaults to None.
-         partition_by: (str | list[str] | pds.Partitioning, optional) Partitioning of the data.
-             Defaults to None.
-         partitioning_flavor: (str, optional) Partitioning flavor. Defaults to 'hive'.
-         mode: (str, optional) Write mode. Defaults to 'append'.
-         format: (str, optional) Format of the data. Defaults to 'parquet'.
-         compression: (str, optional) Compression algorithm. Defaults to 'zstd'.
-         max_rows_per_file: (int, optional) Maximum number of rows per file. Defaults to 2_500_000.
-         row_group_size: (int, optional) Row group size. Defaults to 250_000.
-         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
-         unique: (bool | str | list[str], optional) If True, remove duplicates. Defaults to False.
-         **kwargs: Additional keyword arguments for `pds.write_dataset`.
-
-     Returns:
-         (list[pq.FileMetaData] | None): List of Parquet file metadata or None.
-     """
-     data = to_pyarrow_table(data, concat=concat, unique=unique)
-
-     if mode == "delete_matching":
-         existing_data_behavior = "delete_matching"
-     elif mode == "append":
-         existing_data_behavior = "overwrite_or_ignore"
-     elif mode == "overwrite":
-         self.rm(path, recursive=True)
-         existing_data_behavior = "overwrite_or_ignore"
-     else:
-         existing_data_behavior = mode
-
-     if basename is None:
-         basename_template = (
-             "data-"
-             f"{dt.datetime.now().strftime('%Y%m%d_%H%M%S%f')[:-3]}-{uuid.uuid4().hex[:16]}-{{i}}.parquet"
-         )
-     else:
-         basename_template = f"{basename}-{{i}}.parquet"
-
-     file_options = pds.ParquetFileFormat().make_write_options(compression=compression)
-
-     create_dir: bool = False
-
-     if hasattr(self, "fs"):
-         if "local" in self.fs.protocol:
-             create_dir = True
-     else:
-         if "local" in self.protocol:
-             create_dir = True
-
-     if format == "parquet":
-         metadata = []
-
-         def file_visitor(written_file):
-             file_metadata = written_file.metadata
-             file_metadata.set_file_path(written_file.path)
-             metadata.append(file_metadata)
-
-     pds.write_dataset(
-         data=data,
-         base_dir=path,
-         basename_template=basename_template,
-         partitioning=partition_by,
-         partitioning_flavor=partitioning_flavor,
-         filesystem=self,
-         existing_data_behavior=existing_data_behavior,
-         min_rows_per_group=row_group_size,
-         max_rows_per_group=row_group_size,
-         max_rows_per_file=max_rows_per_file,
-         schema=schema,
-         format=format,
-         create_dir=create_dir,
-         file_options=file_options,
-         file_visitor=file_visitor if format == "parquet" else None,
-         **kwargs,
-     )
-     if format == "parquet":
-         return metadata
-
-
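For orientation, a sketch of a partitioned dataset write through the function above, assuming the patched fs.write_pyarrow_dataset; column names and paths are made up for the example:

    # Illustrative sketch: hive-partitioned Parquet dataset with collected metadata.
    import polars as pl
    from fsspec import filesystem

    fs = filesystem("file")
    df = pl.DataFrame({"year": [2023, 2023, 2024], "value": [1.0, 2.0, 3.0]})

    metadata = fs.write_pyarrow_dataset(
        df,
        "/tmp/pa_dataset",
        partition_by=["year"],    # written as year=2023/, year=2024/ directories
        mode="delete_matching",   # replace only the partitions touched by this write
        compression="zstd",
    )
    print([m.num_rows for m in metadata])  # one FileMetaData entry per written file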
- def write_pydala_dataset(
-     self,
-     data: (
-         pl.DataFrame
-         | pl.LazyFrame
-         | pa.Table
-         | pa.RecordBatch
-         | pa.RecordBatchReader
-         | pd.DataFrame
-         | dict
-         | list[
-             pl.DataFrame
-             | pl.LazyFrame
-             | pa.Table
-             | pa.RecordBatch
-             | pa.RecordBatchReader
-             | pd.DataFrame
-             | dict
-         ]
-     ),
-     path: str,
-     mode: str = "append",  # "delta", "overwrite"
-     basename: str | None = None,
-     partition_by: str | list[str] | None = None,
-     partitioning_flavor: str = "hive",
-     max_rows_per_file: int | None = 2_500_000,
-     row_group_size: int | None = 250_000,
-     compression: str = "zstd",
-     concat: bool = True,
-     sort_by: str | list[str] | list[tuple[str, str]] | None = None,
-     unique: bool | str | list[str] = False,
-     delta_subset: str | list[str] | None = None,
-     update_metadata: bool = True,
-     alter_schema: bool = False,
-     timestamp_column: str | None = None,
-     verbose: bool = False,
-     **kwargs,
- ) -> None:
-     """Write tabular data to a Pydala dataset.
-
-     Args:
-         data: (pl.DataFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader |
-             pd.DataFrame | list[pl.DataFrame] | list[pa.Table] | list[pa.RecordBatch] |
-             list[pa.RecordBatchReader] | list[pd.DataFrame]) Data to write.
-         path: (str) Path to write the data.
-         mode: (str, optional) Write mode. Defaults to 'append'. Options: 'delta', 'overwrite'.
-         basename: (str, optional) Basename of the files. Defaults to None.
-         partition_by: (str | list[str], optional) Partitioning of the data. Defaults to None.
-         partitioning_flavor: (str, optional) Partitioning flavor. Defaults to 'hive'.
-         max_rows_per_file: (int, optional) Maximum number of rows per file. Defaults to 2_500_000.
-         row_group_size: (int, optional) Row group size. Defaults to 250_000.
-         compression: (str, optional) Compression algorithm. Defaults to 'zstd'.
-         concat: (bool, optional) If True, concatenate the DataFrames. Defaults to True.
-         sort_by: (str | list[str] | list[tuple[str, str]], optional) Columns to sort by. Defaults to None.
-         unique: (bool | str | list[str], optional) If True, ensure unique values. Defaults to False.
-         delta_subset: (str | list[str], optional) Subset of columns to include in delta table. Defaults to None.
-         update_metadata: (bool, optional) If True, update metadata. Defaults to True.
-         alter_schema: (bool, optional) If True, alter schema. Defaults to False.
-         timestamp_column: (str, optional) Timestamp column. Defaults to None.
-         verbose: (bool, optional) If True, print verbose output. Defaults to False.
-         **kwargs: Additional keyword arguments for `ParquetDataset.write_to_dataset`.
-
-     Returns:
-         None
-     """
-     data = to_pyarrow_table(data, concat=concat, unique=unique)
-
-     ds = pydala_dataset(self=self, path=path, partitioning=partitioning_flavor)
-     ds.write_to_dataset(
-         data=data,
-         mode=mode,
-         basename=basename,
-         partition_by=partition_by,
-         max_rows_per_file=max_rows_per_file,
-         row_group_size=row_group_size,
-         compression=compression,
-         sort_by=sort_by,
-         unique=unique,
-         delta_subset=delta_subset,
-         update_metadata=update_metadata,
-         alter_schema=alter_schema,
-         timestamp_column=timestamp_column,
-         verbose=verbose,
-         **kwargs,
-     )
-
-
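And a sketch of an incremental write through the pydala-backed variant above, assuming the patched fs.write_pydala_dataset and the optional pydala dependency; column names and paths are illustrative:

    # Illustrative sketch: "delta" mode only appends rows not already in the dataset.
    import polars as pl
    from fsspec import filesystem

    fs = filesystem("file")
    new_rows = pl.DataFrame({"id": [1, 2], "day": ["2024-01-01", "2024-01-02"]})

    fs.write_pydala_dataset(
        new_rows,
        "/tmp/pydala_dataset",
        mode="delta",           # skip rows that already exist
        delta_subset=["id"],    # uniqueness is judged on these columns
        partition_by="day",
        update_metadata=True,
    )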
- AbstractFileSystem.read_json_file = read_json_file
- AbstractFileSystem.read_json = read_json
- AbstractFileSystem.read_csv_file = read_csv_file
- AbstractFileSystem.read_csv = read_csv
- AbstractFileSystem.read_parquet_file = read_parquet_file
- AbstractFileSystem.read_parquet = read_parquet
- AbstractFileSystem.read_files = read_files
- AbstractFileSystem.pyarrow_dataset = pyarrow_dataset
- AbstractFileSystem.pydala_dataset = pydala_dataset
- AbstractFileSystem.pyarrow_parquet_dataset = pyarrow_parquet_dataset
- AbstractFileSystem.write_parquet = write_parquet
- AbstractFileSystem.write_json = write_json
- AbstractFileSystem.write_csv = write_csv
- AbstractFileSystem.write_file = write_file
- AbstractFileSystem.write_files = write_files
- AbstractFileSystem.write_pyarrow_dataset = write_pyarrow_dataset
- AbstractFileSystem.write_pydala_dataset = write_pydala_dataset
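Because the helpers are bound directly onto AbstractFileSystem, importing the module was enough to expose them on every fsspec filesystem instance. A minimal sketch of that call surface, assuming the old flowerpower.fs.ext import path (an assumption; the exact module layout is not shown in this hunk):

    # Illustrative sketch: importing the extension module patches AbstractFileSystem,
    # so any fsspec filesystem (local, memory, s3, ...) exposes the methods bound above.
    import polars as pl
    from fsspec import filesystem

    import flowerpower.fs.ext  # noqa: F401  (imported only for its side effect)

    fs = filesystem("memory")
    fs.write_json({"name": "test", "values": [1, 2, 3]}, "/config.json")
    fs.write_csv(pl.DataFrame({"id": [1, 2]}), "/items.csv")
    items = fs.read_csv("/items.csv")  # reader counterparts are bound above; signature not shown in this hunk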