flowerpower-0.11.6.19-py3-none-any.whl → flowerpower-0.20.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. flowerpower/cfg/__init__.py +3 -3
  2. flowerpower/cfg/pipeline/__init__.py +5 -3
  3. flowerpower/cfg/project/__init__.py +3 -3
  4. flowerpower/cfg/project/job_queue.py +1 -128
  5. flowerpower/cli/__init__.py +5 -5
  6. flowerpower/cli/cfg.py +0 -3
  7. flowerpower/cli/job_queue.py +401 -133
  8. flowerpower/cli/pipeline.py +14 -413
  9. flowerpower/cli/utils.py +0 -1
  10. flowerpower/flowerpower.py +537 -28
  11. flowerpower/job_queue/__init__.py +5 -94
  12. flowerpower/job_queue/base.py +201 -3
  13. flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
  14. flowerpower/job_queue/rq/manager.py +388 -77
  15. flowerpower/pipeline/__init__.py +2 -0
  16. flowerpower/pipeline/base.py +2 -2
  17. flowerpower/pipeline/io.py +14 -16
  18. flowerpower/pipeline/manager.py +21 -642
  19. flowerpower/pipeline/pipeline.py +571 -0
  20. flowerpower/pipeline/registry.py +242 -10
  21. flowerpower/pipeline/visualizer.py +1 -2
  22. flowerpower/plugins/_io/__init__.py +8 -0
  23. flowerpower/plugins/mqtt/manager.py +6 -6
  24. flowerpower/settings/backend.py +0 -2
  25. flowerpower/settings/job_queue.py +1 -57
  26. flowerpower/utils/misc.py +0 -256
  27. flowerpower/utils/monkey.py +1 -83
  28. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
  29. flowerpower-0.20.0.dist-info/RECORD +58 -0
  30. flowerpower/fs/__init__.py +0 -29
  31. flowerpower/fs/base.py +0 -662
  32. flowerpower/fs/ext.py +0 -2143
  33. flowerpower/fs/storage_options.py +0 -1420
  34. flowerpower/job_queue/apscheduler/__init__.py +0 -11
  35. flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
  36. flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
  37. flowerpower/job_queue/apscheduler/manager.py +0 -1051
  38. flowerpower/job_queue/apscheduler/setup.py +0 -554
  39. flowerpower/job_queue/apscheduler/trigger.py +0 -169
  40. flowerpower/job_queue/apscheduler/utils.py +0 -311
  41. flowerpower/pipeline/job_queue.py +0 -583
  42. flowerpower/pipeline/runner.py +0 -603
  43. flowerpower/plugins/io/base.py +0 -2520
  44. flowerpower/plugins/io/helpers/datetime.py +0 -298
  45. flowerpower/plugins/io/helpers/polars.py +0 -875
  46. flowerpower/plugins/io/helpers/pyarrow.py +0 -570
  47. flowerpower/plugins/io/helpers/sql.py +0 -202
  48. flowerpower/plugins/io/loader/__init__.py +0 -28
  49. flowerpower/plugins/io/loader/csv.py +0 -37
  50. flowerpower/plugins/io/loader/deltatable.py +0 -190
  51. flowerpower/plugins/io/loader/duckdb.py +0 -19
  52. flowerpower/plugins/io/loader/json.py +0 -37
  53. flowerpower/plugins/io/loader/mqtt.py +0 -159
  54. flowerpower/plugins/io/loader/mssql.py +0 -26
  55. flowerpower/plugins/io/loader/mysql.py +0 -26
  56. flowerpower/plugins/io/loader/oracle.py +0 -26
  57. flowerpower/plugins/io/loader/parquet.py +0 -35
  58. flowerpower/plugins/io/loader/postgres.py +0 -26
  59. flowerpower/plugins/io/loader/pydala.py +0 -19
  60. flowerpower/plugins/io/loader/sqlite.py +0 -23
  61. flowerpower/plugins/io/metadata.py +0 -244
  62. flowerpower/plugins/io/saver/__init__.py +0 -28
  63. flowerpower/plugins/io/saver/csv.py +0 -36
  64. flowerpower/plugins/io/saver/deltatable.py +0 -186
  65. flowerpower/plugins/io/saver/duckdb.py +0 -19
  66. flowerpower/plugins/io/saver/json.py +0 -36
  67. flowerpower/plugins/io/saver/mqtt.py +0 -28
  68. flowerpower/plugins/io/saver/mssql.py +0 -26
  69. flowerpower/plugins/io/saver/mysql.py +0 -26
  70. flowerpower/plugins/io/saver/oracle.py +0 -26
  71. flowerpower/plugins/io/saver/parquet.py +0 -36
  72. flowerpower/plugins/io/saver/postgres.py +0 -26
  73. flowerpower/plugins/io/saver/pydala.py +0 -20
  74. flowerpower/plugins/io/saver/sqlite.py +0 -24
  75. flowerpower/utils/scheduler.py +0 -311
  76. flowerpower-0.11.6.19.dist-info/RECORD +0 -102
  77. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
  78. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
  79. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
  80. {flowerpower-0.11.6.19.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
flowerpower/plugins/io/base.py
@@ -1,2520 +0,0 @@
1
- import importlib
2
- import os
3
- import posixpath
4
- from typing import Any, Generator
5
-
6
- if importlib.util.find_spec("datafusion"):
7
- import datafusion
8
- else:
9
- raise ImportError("To use this module, please install `flowerpower[io]`.")
10
- import sqlite3
11
-
12
- import duckdb
13
- import msgspec
14
- import pandas as pd
15
- import pyarrow as pa
16
- import pyarrow.dataset as pds
17
- from fsspec import AbstractFileSystem
18
- from msgspec import field
19
- from pydala.dataset import ParquetDataset
20
- from sqlalchemy import create_engine, text
21
-
22
- from ...fs import get_filesystem
23
- from ...fs.ext import _dict_to_dataframe, path_to_glob
24
- from ...fs.storage_options import (AwsStorageOptions, AzureStorageOptions,
25
- GcsStorageOptions, GitHubStorageOptions,
26
- GitLabStorageOptions, StorageOptions)
27
- from ...utils.misc import convert_large_types_to_standard, to_pyarrow_table
28
- from .helpers.polars import pl
29
- from .helpers.pyarrow import opt_dtype
30
- from .helpers.sql import sql2polars_filter, sql2pyarrow_filter
31
- from .metadata import get_dataframe_metadata, get_pyarrow_dataset_metadata
32
-
33
-
34
- # @attrs.define # Removed
35
- class BaseFileIO(msgspec.Struct, gc=False):
36
- """
37
- Base class for file I/O operations supporting various storage backends.
38
- This class provides a foundation for file operations across different storage systems
39
- including AWS S3, Google Cloud Storage, Azure Blob Storage, GitHub, and GitLab.
40
-
41
- Args:
42
- path (str | list[str]): Path or list of paths to file(s).
43
- storage_options (AwsStorageOptions | GcsStorageOptions | AzureStorageOptions |
44
- GitHubStorageOptions | GitLabStorageOptions | dict[str, Any] | None, optional):
45
- Storage-specific options for accessing remote filesystems.
46
- fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
47
- format (str, optional): File format extension (without dot).
48
-
49
- Notes:
50
- ```python
51
- file_io = BaseFileIO(
52
- path="s3://bucket/path/to/files",
53
- storage_options=AwsStorageOptions(
54
- key="access_key",
55
- secret="secret_key"
56
- files = file_io.list_files()
57
- ```
58
- Notes:
59
- - Supports multiple cloud storage backends through different storage options
60
- - Automatically handles filesystem initialization based on path protocol
61
- - Supports both single path and multiple path inputs
62
- - Can read credentials from environment variables when using from_env() methods
63
-
64
- """
65
-
66
- path: str | list[str]
67
- storage_options: (
68
- StorageOptions
69
- | AwsStorageOptions
70
- | AzureStorageOptions
71
- | GcsStorageOptions
72
- | GitLabStorageOptions
73
- | GitHubStorageOptions
74
- | dict[str, Any]
75
- | None
76
- ) = field(default=None)
77
- fs: AbstractFileSystem | None = field(default=None)
78
- format: str | None = None
79
- # _base_path: str | list[str] | None = field(default=None)
80
- # _full_path: str | list[str] | None = field(default=None)
81
- # _rel_path: str | list[str] | None = field(default=None)
82
- # _glob_path
83
- _metadata: dict[str, Any] | None = field(default=None)
84
-
85
- def __post_init__(self):
86
- # self._base_path = self.path if isinstance(self.path, str) else os.path.commonpath(self.path)
87
-
88
- # if self.fs is None:
89
- self.fs = get_filesystem(
90
- path=self._base_path,
91
- storage_options=self.storage_options,
92
- fs=self.fs,
93
- dirfs=True,
94
- )
95
-
96
- self.storage_options = (
97
- self.storage_options or self.fs.storage_options
98
- if self.protocol != "dir"
99
- else self.fs.fs.storage_options
100
- )
101
-
102
- @property
103
- def protocol(self):
104
- """Get the protocol of the filesystem."""
105
- protocol = (
106
- self.fs.protocol if self.fs.protocol != "dir" else self.fs.fs.protocol
107
- )
108
- if isinstance(protocol, list | tuple):
109
- protocol = protocol[0]
110
- return protocol
111
-
112
- @property
113
- def _base_path(self) -> str:
114
- """Get the base path for the filesystem."""
115
- if isinstance(self.path, list):
116
- base_path = posixpath.commonpath(self.path).rstrip("/*")
117
- else:
118
- base_path = self.path
119
-
120
- if self.format in base_path:
121
- base_path = posixpath.dirname(base_path).rstrip("/")
122
-
123
- return base_path
124
-
125
- @property
126
- def _path(self) -> str | list[str]:
127
- if self.fs.protocol == "dir":
128
- if isinstance(self.path, list):
129
- return [
130
- p.replace(self._base_path.lstrip("/"), "").lstrip("/")
131
- for p in self.path
132
- ]
133
- else:
134
- return self.path.replace(self._base_path.lstrip("/"), "").lstrip("/")
135
- return self.path
136
-
137
- @property
138
- def _glob_path(self) -> str | list[str]:
139
- if isinstance(self._path, list):
140
- return self._path
141
- return path_to_glob(self._path, self.format)
142
-
143
- @property
144
- def _root_path(self) -> str:
145
- if self.fs.protocol == "dir":
146
- return self._base_path.replace(self.fs.path, "")
147
- return self._base_path
148
-
149
- def list_files(self) -> list[str]:
150
- if isinstance(self._path, list):
151
- return self._path
152
-
153
- return self.fs.glob(self._glob_path)
154
-
155
-
156
- # @attrs.define # Removed
157
- class BaseFileReader(BaseFileIO, gc=False):
158
- """
159
- Base class for file loading operations supporting various file formats.
160
- This class provides a foundation for file loading operations across different file formats
161
- including CSV, Parquet, JSON, Arrow, and IPC.
162
-
163
- Args:
164
- path (str | list[str]): Path or list of paths to file(s).
165
- format (str, optional): File format extension (without dot).
166
- fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
167
- include_file_path (bool, optional): Include file path in the output DataFrame.
168
- concat (bool, optional): Concatenate multiple files into a single DataFrame.
169
- conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
170
- ctx (datafusion.SessionContext, optional): DataFusion session context instance.
171
-
172
- Examples:
173
- ```python
174
- file_loader = BaseFileReader(
175
- path="s3://bucket/path/to/files",
176
- format="csv",
177
- include_file_path=True,
178
- concat=True,
179
- conn=duckdb.connect(),
180
- ctx=datafusion.SessionContext()
181
- data = file_loader.to_polars()
182
- ```
183
- Notes:
184
- - Supports multiple file formats including CSV, Parquet, JSON, Arrow, and IPC
185
- - Automatically handles filesystem initialization based on path protocol
186
- - Supports both single path and multiple path inputs
187
- - Supports loading data into DuckDB and DataFusion for SQL operations
188
-
189
- """
190
-
191
- include_file_path: bool = field(default=False)
192
- concat: bool = field(default=True)
193
- batch_size: int | None = field(default=None)
194
- opt_dtypes: bool = field(default=False)
195
- use_threads: bool = field(default=True)
196
- conn: duckdb.DuckDBPyConnection | None = field(default=None)
197
- ctx: datafusion.SessionContext | None = field(default=None)
198
- jsonlines: bool | None = field(default=None)
199
- partitioning: str | list[str] | pds.Partitioning | None = field(default=None)
200
- verbose: bool | None = field(default=None)
201
- _data: Any | None = field(default=None)
202
-
203
- def _load(
204
- self,
205
- metadata: bool = False,
206
- reload: bool = False,
207
- batch_size: int | None = None,
208
- include_file_path: bool = False,
209
- concat: bool | None = None,
210
- use_threads: bool | None = None,
211
- verbose: bool | None = None,
212
- opt_dtypes: bool | None = None,
213
- **kwargs,
214
- ):
215
- if batch_size is not None:
216
- if self.batch_size != batch_size:
217
- reload = True
218
- self.batch_size = batch_size
219
-
220
- if include_file_path is not None:
221
- if self.include_file_path != include_file_path:
222
- reload = True
223
- self.include_file_path = include_file_path
224
-
225
- if concat is not None:
226
- if self.concat != concat:
227
- reload = True
228
- self.concat = concat
229
-
230
- if use_threads is not None:
231
- if self.use_threads != use_threads:
232
- reload = True
233
- self.use_threads = use_threads
234
-
235
- if verbose is not None:
236
- if self.verbose != verbose:
237
- reload = True
238
- self.verbose = verbose
239
-
240
- if opt_dtypes is not None:
241
- if self.opt_dtypes != opt_dtypes:
242
- reload = True
243
- self.opt_dtypes = opt_dtypes
244
-
245
- if "partitioning" in kwargs:
246
- if self.partitioning != kwargs["partitioning"]:
247
- reload = True
248
- self.partitioning = kwargs.pop("partitioning")
249
-
250
- if not hasattr(self, "_data") or self._data is None or reload:
251
- self._data = self.fs.read_files(
252
- path=self._glob_path,
253
- format=self.format,
254
- include_file_path=True if metadata or self.include_file_path else False,
255
- concat=self.concat,
256
- jsonlines=self.jsonlines or None,
257
- batch_size=self.batch_size,
258
- partitioning=self.partitioning,
259
- opt_dtypes=self.opt_dtypes,
260
- verbose=self.verbose,
261
- use_threads=self.use_threads,
262
- **kwargs,
263
- )
264
- if metadata:
265
- if isinstance(self._data, tuple | list):
266
- self._metadata = [
267
- get_dataframe_metadata(
268
- df=df,
269
- path=self.path,
270
- format=self.format,
271
- num_files=pl.from_arrow(df.select(["file_path"])).select(
272
- pl.n_unique("file_path")
273
- )[0, 0]
274
- if isinstance(df, pa.Table)
275
- else df.select(pl.n_unique("file_path"))[0, 0],
276
- )
277
- for df in self._data
278
- ]
279
- if not self.include_file_path:
280
- self._data = [df.drop("file_path") for df in self._data]
281
-
282
- elif isinstance(self._data, pa.Table):
283
- self._metadata = get_dataframe_metadata(
284
- df=self._data,
285
- path=self.path,
286
- format=self.format,
287
- num_files=pl.from_arrow(
288
- self._data.select(pl.n_unique("file_path"))
289
- )[0, 0],
290
- )
291
- if not self.include_file_path:
292
- self._data = self._data.drop("file_path")
293
-
294
- elif isinstance(self._data, pl.DataFrame | pl.LazyFrame):
295
- self._metadata = get_dataframe_metadata(
296
- df=self._data,
297
- path=self.path,
298
- format=self.format,
299
- num_files=self._data.select(pl.n_unique("file_path"))[0, 0]
300
- if isinstance(self._data, pl.DataFrame)
301
- else self._data.select(pl.n_unique("file_path")).collect()[
302
- 0, 0
303
- ],
304
- )
305
-
306
- if not self.include_file_path:
307
- self._data = self._data.drop("file_path")
308
- else:
309
- metadata = {}
310
- else:
311
- self._metadata = {}
312
-
313
- def to_pandas(
314
- self,
315
- metadata: bool = False,
316
- reload: bool = False,
317
- include_file_path: bool = False,
318
- concat: bool | None = None,
319
- use_threads: bool | None = None,
320
- verbose: bool | None = None,
321
- opt_dtypes: bool | None = None,
322
- **kwargs,
323
- ) -> (
324
- tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]]
325
- | pd.DataFrame
326
- | list[pd.DataFrame]
327
- ):
328
- """Convert data to Pandas DataFrame(s).
329
-
330
- Args:
331
- metadata (bool, optional): Include metadata in the output. Default is False.
332
- reload (bool, optional): Reload data if True. Default is False.
333
- include_file_path (bool, optional): Include file path in the output. Default is False.
334
- concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
335
- use_threads (bool, optional): Use threads for reading data. Default is True.
336
- verbose (bool, optional): Verbose output. Default is None.
337
- opt_dtypes (bool, optional): Optimize data types. Default is True.
338
- kwargs: Additional keyword arguments.
339
-
340
- Returns:
341
- tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]] | pd.DataFrame | list[pd.DataFrame]: Pandas
342
- DataFrame or list of DataFrames and optional metadata.
343
- """
344
- kwargs.pop("batch_size", None)
345
- self._load(
346
- reload=reload,
347
- metadata=metadata,
348
- batch_size=None,
349
- include_file_path=include_file_path,
350
- concat=concat,
351
- use_threads=use_threads,
352
- verbose=verbose,
353
- opt_dtypes=opt_dtypes,
354
- **kwargs,
355
- )
356
- if isinstance(self._data, list):
357
- df = [
358
- df if isinstance(df, pd.DataFrame) else df.to_pandas()
359
- for df in self._data
360
- ]
361
- df = pd.concat(df) if self.concat else df
362
- else:
363
- df = (
364
- self._data
365
- if isinstance(self._data, pd.DataFrame)
366
- else self._data.to_pandas()
367
- )
368
- if metadata:
369
- # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
370
- return df, self._metadata
371
- return df
372
-
373
- def iter_pandas(
374
- self,
375
- reload: bool = False,
376
- batch_size: int | None = None,
377
- include_file_path: bool = False,
378
- concat: bool | None = None,
379
- use_threads: bool | None = None,
380
- verbose: bool | None = None,
381
- opt_dtypes: bool | None = None,
382
- **kwargs,
383
- ) -> Generator[pd.DataFrame, None, None]:
384
- """Iterate over Pandas DataFrames.
385
-
386
- Args:
387
- batch_size (int, optional): Batch size for iteration. Default is 1.
388
- reload (bool, optional): Reload data if True. Default is False.
389
- include_file_path (bool, optional): Include file path in the output. Default is False.
390
- concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
391
- use_threads (bool, optional): Use threads for reading data. Default is True.
392
- verbose (bool, optional): Verbose output. Default is None.
393
- opt_dtypes (bool, optional): Optimize data types. Default is True.
394
- kwargs: Additional keyword arguments.
395
-
396
- Returns:
397
- Generator[pd.DataFrame, None, None]: Generator of Pandas DataFrames.
398
- """
399
- batch_size = batch_size or self.batch_size or 1
400
-
401
- self._load(
402
- reload=reload,
403
- batch_size=batch_size,
404
- include_file_path=include_file_path,
405
- concat=concat,
406
- use_threads=use_threads,
407
- verbose=verbose,
408
- opt_dtypes=opt_dtypes,
409
- **kwargs,
410
- )
411
-
412
- if isinstance(self._data, list | Generator):
413
- for df in self._data:
414
- yield df if isinstance(df, pd.DataFrame) else df.to_pandas()
415
- else:
416
- yield (
417
- self._data
418
- if isinstance(self._data, pd.DataFrame)
419
- else self._data.to_pandas()
420
- )
421
-
422
- def _to_polars_dataframe(
423
- self,
424
- metadata: bool = False,
425
- reload: bool = False,
426
- include_file_path: bool = False,
427
- concat: bool | None = None,
428
- use_threads: bool | None = None,
429
- verbose: bool | None = None,
430
- opt_dtypes: bool | None = None,
431
- **kwargs,
432
- ) -> (
433
- tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]]
434
- | pl.DataFrame
435
- | list[pl.DataFrame]
436
- ):
437
- """Convert data to Polars DataFrame(s).
438
-
439
- Args:
440
- metadata (bool, optional): Include metadata in the output. Default is False.
441
- reload (bool, optional): Reload data if True. Default is False.
442
- include_file_path (bool, optional): Include file path in the output. Default is False.
443
- concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
444
- use_threads (bool, optional): Use threads for reading data. Default is True.
445
- verbose (bool, optional): Verbose output. Default is None.
446
- opt_dtypes (bool, optional): Optimize data types. Default is True.
447
- kwargs: Additional keyword arguments.
448
-
449
- Returns:
450
- tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]] | pl.DataFrame | list[pl.DataFrame]: Polars
451
- DataFrame or list of DataFrames and optional metadata.
452
- """
453
- kwargs.pop("batch_size", None)
454
-
455
- self._load(
456
- metadata=metadata,
457
- reload=reload,
458
- batch_size=None,
459
- include_file_path=include_file_path,
460
- concat=concat,
461
- use_threads=use_threads,
462
- verbose=verbose,
463
- opt_dtypes=opt_dtypes,
464
- **kwargs,
465
- )
466
- if isinstance(self._data, list):
467
- df = [
468
- df if isinstance(self._data, pl.DataFrame) else pl.from_arrow(df)
469
- for df in self._data
470
- ]
471
- df = pl.concat(df) if self.concat else df
472
- else:
473
- df = (
474
- self._data
475
- if isinstance(self._data, pl.DataFrame)
476
- else pl.from_arrow(self._data)
477
- )
478
- if metadata:
479
- # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
480
- return df, self._metadata
481
- return df
482
-
483
- def _iter_polars_dataframe(
484
- self,
485
- reload: bool = False,
486
- batch_size: int | None = None,
487
- include_file_path: bool = False,
488
- concat: bool | None = None,
489
- use_threads: bool | None = None,
490
- verbose: bool | None = None,
491
- opt_dtypes: bool | None = None,
492
- **kwargs,
493
- ) -> Generator[pl.DataFrame, None, None]:
494
- """Iterate over Polars DataFrames.
495
-
496
- Args:
497
- batch_size (int, optional): Batch size for iteration. Default is 1.
498
- reload (bool, optional): Reload data if True. Default is False.
499
- include_file_path (bool, optional): Include file path in the output. Default is False.
500
- concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
501
- use_threads (bool, optional): Use threads for reading data. Default is True.
502
- verbose (bool, optional): Verbose output. Default is None.
503
- opt_dtypes (bool, optional): Optimize data types. Default is True.
504
- kwargs: Additional keyword arguments.
505
-
506
- Returns:
507
- Generator[pl.DataFrame, None, None]: Generator of Polars DataFrames.
508
- """
509
- batch_size = batch_size or self.batch_size or 1
510
-
511
- self._load(
512
- reload=reload,
513
- batch_size=batch_size,
514
- include_file_path=include_file_path,
515
- concat=concat,
516
- use_threads=use_threads,
517
- verbose=verbose,
518
- opt_dtypes=opt_dtypes,
519
- **kwargs,
520
- )
521
- if isinstance(self._data, list | Generator):
522
- for df in self._data:
523
- yield df if isinstance(df, pl.DataFrame) else pl.from_arrow(df)
524
- else:
525
- yield (
526
- self._data
527
- if isinstance(self._data, pl.DataFrame)
528
- else pl.from_arrow(self._data)
529
- )
530
-
531
- def _to_polars_lazyframe(
532
- self,
533
- metadata: bool = False,
534
- reload: bool = False,
535
- include_file_path: bool = False,
536
- concat: bool | None = None,
537
- use_threads: bool | None = None,
538
- verbose: bool | None = None,
539
- opt_dtypes: bool | None = None,
540
- **kwargs,
541
- ) -> (
542
- tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]]
543
- | pl.LazyFrame
544
- | list[pl.LazyFrame]
545
- ):
546
- """Convert data to Polars LazyFrame(s).
547
-
548
- Args:
549
- metadata (bool, optional): Include metadata in the output. Default is False.
550
- reload (bool, optional): Reload data if True. Default is False.
551
- include_file_path (bool, optional): Include file path in the output. Default is False.
552
- concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
553
- use_threads (bool, optional): Use threads for reading data. Default is True.
554
- verbose (bool, optional): Verbose output. Default is None.
555
- opt_dtypes (bool, optional): Optimize data types. Default is True.
556
- kwargs: Additional keyword arguments.
557
-
558
- Returns:
559
- tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]] | pl.LazyFrame | list[pl.LazyFrame]: Polars
560
- LazyFrame or list of LazyFrames and optional metadata.
561
- """
562
- kwargs.pop("batch_size", None)
563
-
564
- self._load(
565
- metadata=metadata,
566
- reload=reload,
567
- batch_size=None,
568
- include_file_path=include_file_path,
569
- concat=concat,
570
- use_threads=use_threads,
571
- verbose=verbose,
572
- opt_dtypes=opt_dtypes,
573
- **kwargs,
574
- )
575
- if not self.concat:
576
- df = [df.lazy() for df in self._to_polars_dataframe()]
577
-
578
- else:
579
- df = self._to_polars_dataframe().lazy()
580
- if metadata:
581
- # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
582
- return df, self._metadata
583
- return df
584
-
585
- def _iter_polars_lazyframe(
586
- self,
587
- reload: bool = False,
588
- batch_size: int | None = None,
589
- include_file_path: bool = False,
590
- concat: bool | None = None,
591
- use_threads: bool | None = None,
592
- verbose: bool | None = None,
593
- opt_dtypes: bool | None = None,
594
- **kwargs,
595
- ) -> Generator[pl.LazyFrame, None, None]:
596
- """Iterate over Polars LazyFrames.
597
-
598
- Args:
599
- batch_size (int, optional): Batch size for iteration. Default is 1.
600
- reload (bool, optional): Reload data if True. Default is False.
601
- include_file_path (bool, optional): Include file path in the output. Default is False.
602
- concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
603
- use_threads (bool, optional): Use threads for reading data. Default is True.
604
- verbose (bool, optional): Verbose output. Default is None.
605
- opt_dtypes (bool, optional): Optimize data types. Default is True.
606
- kwargs: Additional keyword arguments.
607
-
608
- Returns:
609
- Generator[pl.LazyFrame, None, None]: Generator of Polars LazyFrames.
610
- """
611
- batch_size = batch_size or self.batch_size or 1
612
-
613
- self._load(
614
- reload=reload,
615
- batch_size=batch_size,
616
- include_file_path=include_file_path,
617
- concat=concat,
618
- use_threads=use_threads,
619
- verbose=verbose,
620
- opt_dtypes=opt_dtypes,
621
- **kwargs,
622
- )
623
- if isinstance(self._data, list | Generator):
624
- for df in self._data:
625
- yield (
626
- df.lazy()
627
- if isinstance(df, pl.DataFrame)
628
- else pl.from_arrow(df).lazy()
629
- )
630
- else:
631
- yield (
632
- self._data.lazy()
633
- if isinstance(self._data, pl.DataFrame)
634
- else pl.from_arrow(self._data).lazy()
635
- )
636
-
637
- def to_polars(
638
- self,
639
- lazy: bool = False,
640
- metadata: bool = False,
641
- reload: bool = False,
642
- include_file_path: bool = False,
643
- concat: bool | None = None,
644
- use_threads: bool | None = None,
645
- verbose: bool | None = None,
646
- opt_dtypes: bool | None = None,
647
- **kwargs,
648
- ) -> (
649
- pl.DataFrame
650
- | pl.LazyFrame
651
- | list[pl.DataFrame]
652
- | list[pl.LazyFrame]
653
- | tuple[
654
- pl.DataFrame | pl.LazyFrame | list[pl.DataFrame] | list[pl.LazyFrame],
655
- dict[str, Any],
656
- ]
657
- ):
658
- """Convert data to Polars DataFrame or LazyFrame.
659
-
660
- Args:
661
- lazy (bool, optional): Return a LazyFrame if True, else a DataFrame.
662
- metadata (bool, optional): Include metadata in the output. Default is False.
663
- reload (bool, optional): Reload data if True. Default is False.
664
- batch_size (int, optional): Batch size for iteration. Default is 1.
665
- include_file_path (bool, optional): Include file path in the output. Default is False.
666
- concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
667
- use_threads (bool, optional): Use threads for reading data. Default is True.
668
- verbose (bool, optional): Verbose output. Default is None.
669
- opt_dtypes (bool, optional): Optimize data types. Default is True.
670
- kwargs: Additional keyword arguments.
671
-
672
- Returns:
673
- pl.DataFrame | pl.LazyFrame | list[pl.DataFrame] | list[pl.LazyFrame] | tuple[pl.DataFrame | pl.LazyFrame
674
- | list[pl.DataFrame] | list[pl.LazyFrame], dict[str, Any]]: Polars DataFrame or LazyFrame and optional
675
- metadata.
676
- """
677
- kwargs.pop("batch_size", None)
678
- if lazy:
679
- return self._to_polars_lazyframe(
680
- metadata=metadata,
681
- reload=reload,
682
- batch_size=None,
683
- include_file_path=include_file_path,
684
- concat=concat,
685
- use_threads=use_threads,
686
- verbose=verbose,
687
- opt_dtypes=opt_dtypes,
688
- **kwargs,
689
- )
690
- return self._to_polars_dataframe(
691
- metadata=metadata,
692
- reload=reload,
693
- batch_size=None,
694
- include_file_path=include_file_path,
695
- concat=concat,
696
- use_threads=use_threads,
697
- verbose=verbose,
698
- opt_dtypes=opt_dtypes,
699
- **kwargs,
700
- )
701
-
702
- def iter_polars(
703
- self,
704
- lazy: bool = False,
705
- reload: bool = False,
706
- batch_size: int | None = None,
707
- include_file_path: bool = False,
708
- concat: bool | None = None,
709
- use_threads: bool | None = None,
710
- verbose: bool | None = None,
711
- opt_dtypes: bool | None = None,
712
- **kwargs,
713
- ) -> Generator[pl.DataFrame | pl.LazyFrame, None, None]:
714
- """Iterate over Polars DataFrames or LazyFrames.
715
-
716
- Args:
717
- lazy (bool, optional): Return a LazyFrame if True, else a DataFrame. Default is False.
718
- reload (bool, optional): Reload data if True. Default is False.
719
- batch_size (int, optional): Batch size for iteration. Default is 1.
720
- include_file_path (bool, optional): Include file path in the output. Default is False.
721
- concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
722
- use_threads (bool, optional): Use threads for reading data. Default is True.
723
- verbose (bool, optional): Verbose output. Default is None.
724
- opt_dtypes (bool, optional): Optimize data types. Default is True.
725
- kwargs: Additional keyword arguments.
726
-
727
- Returns:
728
- Generator[pl.DataFrame | pl.LazyFrame, None, None]: Generator of Polars DataFrames or LazyFrames.
729
- """
730
- if lazy:
731
- yield from self._iter_polars_lazyframe(
732
- reload=reload,
733
- batch_size=batch_size,
734
- include_file_path=include_file_path,
735
- concat=concat,
736
- use_threads=use_threads,
737
- verbose=verbose,
738
- opt_dtypes=opt_dtypes,
739
- **kwargs,
740
- )
741
- yield from self._iter_polars_dataframe(
742
- reload=reload,
743
- batch_size=batch_size,
744
- include_file_path=include_file_path,
745
- concat=concat,
746
- use_threads=use_threads,
747
- verbose=verbose,
748
- opt_dtypes=opt_dtypes,
749
- **kwargs,
750
- )
751
-
752
- def to_pyarrow_table(
753
- self,
754
- metadata: bool = False,
755
- reload: bool = False,
756
- include_file_path: bool = False,
757
- use_threads: bool | None = None,
758
- verbose: bool | None = None,
759
- opt_dtypes: bool | None = None,
760
- **kwargs,
761
- ) -> pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]:
762
- """Convert data to PyArrow Table(s).
763
-
764
- Args:
765
- metadata (bool, optional): Include metadata in the output. Default is False.
766
- reload (bool, optional): Reload data if True. Default is False.
767
- include_file_path (bool, optional): Include file path in the output. Default is False.
768
- use_threads (bool, optional): Use threads for reading data. Default is True.
769
- verbose (bool, optional): Verbose output. Default is None.
770
- opt_dtypes (bool, optional): Optimize data types. Default is True.
771
- kwargs: Additional keyword arguments.
772
-
773
- Returns:
774
- pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]: PyArrow Table or list of
775
- Tables and optional metadata.
776
- """
777
- kwargs.pop("batch_size", None)
778
- self._load(
779
- reload=reload,
780
- metadata=metadata,
781
- batch_size=None,
782
- include_file_path=include_file_path,
783
- concat=None,
784
- use_threads=use_threads,
785
- verbose=verbose,
786
- opt_dtypes=opt_dtypes,
787
- **kwargs,
788
- )
789
- if isinstance(self._data, list):
790
- df = [
791
- df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
792
- for df in self._data
793
- ]
794
- df = pa.concat_tables(df) if self.concat else df
795
- else:
796
- df = (
797
- self._data.to_arrow(**kwargs)
798
- if isinstance(self._data, pl.DataFrame)
799
- else self._data
800
- )
801
- if metadata:
802
- # metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
803
- return df, self._metadata
804
- return df
805
-
806
- def iter_pyarrow_table(
807
- self,
808
- reload: bool = False,
809
- batch_size: int | None = None,
810
- include_file_path: bool = False,
811
- concat: bool | None = None,
812
- use_threads: bool | None = None,
813
- verbose: bool | None = None,
814
- opt_dtypes: bool | None = None,
815
- **kwargs,
816
- ) -> Generator[pa.Table, None, None]:
817
- """Iterate over PyArrow Tables.
818
-
819
- Args:
820
- reload (bool, optional): Reload data if True. Default is False.
821
- include_file_path (bool, optional): Include file path in the output. Default is False.
822
- concat (bool, optional): Concatenate multiple files into a single Table. Default is True.
823
- batch_size (int, optional): Batch size for iteration. Default is 1.
824
- use_threads (bool, optional): Use threads for reading data. Default is True.
825
- verbose (bool, optional): Verbose output. Default is None.
826
- opt_dtypes (bool, optional): Optimize data types. Default is True.
827
- kwargs: Additional keyword arguments.
828
-
829
- Returns:
830
- Generator[pa.Table, None, None]: Generator of PyArrow Tables.
831
- """
832
- batch_size = batch_size or self.batch_size or 1
833
-
834
- self._load(
835
- reload=reload,
836
- batch_size=batch_size,
837
- include_file_path=include_file_path,
838
- concat=concat,
839
- use_threads=use_threads,
840
- verbose=verbose,
841
- opt_dtypes=opt_dtypes,
842
- **kwargs,
843
- )
844
- if isinstance(self._data, list | Generator):
845
- for df in self._data:
846
- yield df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
847
- else:
848
- yield (
849
- self._data.to_arrow(**kwargs)
850
- if isinstance(self._data, pl.DataFrame)
851
- else self._data
852
- )
853
-
854
- def to_duckdb_relation(
855
- self,
856
- conn: duckdb.DuckDBPyConnection | None = None,
857
- metadata: bool = False,
858
- reload: bool = False,
859
- include_file_path: bool = False,
860
- use_threads: bool | None = None,
861
- verbose: bool | None = None,
862
- opt_dtypes: bool | None = None,
863
- **kwargs,
864
- ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
865
- """Convert data to DuckDB relation.
866
-
867
- Args:
868
- conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
869
- metadata (bool, optional): Include metadata in the output. Default is False.
870
- reload (bool, optional): Reload data if True. Default is False.
871
- include_file_path (bool, optional): Include file path in the output. Default is False.
872
- use_threads (bool, optional): Use threads for reading data. Default is True.
873
- verbose (bool, optional): Verbose output. Default is None.
874
- opt_dtypes (bool, optional): Optimize data types. Default is True.
875
- kwargs: Additional keyword arguments.
876
-
877
- Returns:
878
- duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]: DuckDB relation and optional
879
- metadata.
880
- """
881
- kwargs.pop("batch_size", None)
882
- if self._conn is None:
883
- if conn is None:
884
- conn = duckdb.connect()
885
- self._conn = conn
886
-
887
- if metadata:
888
- return self._conn.from_arrow(
889
- self.to_pyarrow_table(
890
- metadata=metadata,
891
- reload=reload,
892
- batch_size=None,
893
- include_file_path=include_file_path,
894
- se_threads=use_threads,
895
- verbose=verbose,
896
- opt_dtypes=opt_dtypes,
897
- **kwargs,
898
- ),
899
- ), self._metadata
900
- return self._conn.from_arrow(
901
- self.to_pyarrow_table(
902
- reload=reload,
903
- batch_size=None,
904
- include_file_path=include_file_path,
905
- use_threads=use_threads,
906
- verbose=verbose,
907
- opt_dtypes=opt_dtypes,
908
- **kwargs,
909
- )
910
- )
911
-
912
- def register_in_duckdb(
913
- self,
914
- conn: duckdb.DuckDBPyConnection,
915
- name: str | None = None,
916
- metadata: bool = False,
917
- reload: bool = False,
918
- include_file_path: bool = False,
919
- use_threads: bool | None = None,
920
- verbose: bool | None = None,
921
- opt_dtypes: bool | None = None,
922
- **kwargs,
923
- ) -> duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]:
924
- """Register data in DuckDB.
925
-
926
- Args:
927
- conn (duckdb.DuckDBPyConnection): DuckDB connection instance.
928
- name (str, optional): Name for the DuckDB table.
929
- metadata (bool, optional): Include metadata in the output. Default is False.
930
- reload (bool, optional): Reload data if True. Default is False.
931
- include_file_path (bool, optional): Include file path in the output. Default is False.
932
- use_threads (bool, optional): Use threads for reading data. Default is True.
933
- verbose (bool, optional): Verbose output. Default is None.
934
- opt_dtypes (bool, optional): Optimize data types. Default is True.
935
- kwargs: Additional keyword arguments.
936
-
937
- Returns:
938
- duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB connection instance
939
- or DuckDB connection instance and optional metadata.
940
- """
941
- kwargs.pop("batch_size", None)
942
- if name is None:
943
- name = f"{self.format}:{self.path}"
944
-
945
- if self._conn is None:
946
- if conn is None:
947
- conn = duckdb.connect()
948
- self._conn = conn
949
-
950
- self._conn.register(
951
- name,
952
- self.to_pyarrow_table(
953
- metadata=metadata,
954
- reload=reload,
955
- include_file_path=include_file_path,
956
- use_threads=use_threads,
957
- verbose=verbose,
958
- opt_dtypes=opt_dtypes,
959
- **kwargs,
960
- ),
961
- )
962
- if metadata:
963
- return self._conn, self._metadata
964
- return self._conn
965
-
966
- def to_duckdb(
967
- self,
968
- as_relation: bool = True,
969
- conn: duckdb.DuckDBPyConnection | None = None,
970
- name: str | None = None,
971
- metadata: bool = False,
972
- reload: bool = False,
973
- include_file_path: bool = False,
974
- use_threads: bool | None = None,
975
- verbose: bool | None = None,
976
- opt_dtypes: bool | None = None,
977
- **kwargs,
978
- ) -> (
979
- duckdb.DuckDBPyRelation
980
- | duckdb.DuckDBPyConnection
981
- | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]
982
- | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]
983
- ):
984
- """Convert data to DuckDB relation or register in DuckDB.
985
-
986
- Args:
987
- as_relation (bool, optional): Return a DuckDB relation if True, else register in DuckDB. Default is True.
988
- conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
989
- name (str, optional): Name for the DuckDB table.
990
- metadata (bool, optional): Include metadata in the output. Default is False.
991
- reload (bool, optional): Reload data if True. Default is False.
992
- include_file_path (bool, optional): Include file path in the output. Default is False.
993
- use_threads (bool, optional): Use threads for reading data. Default is True.
994
- verbose (bool, optional): Verbose output. Default is None.
995
- opt_dtypes (bool, optional): Optimize data types. Default is True.
996
- **kwargs: Additional keyword arguments.
997
-
998
- Returns:
999
- duckdb.DuckDBPyRelation | duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyRelation, dict[str, Any]] |
1000
- tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB relation or connection instance
1001
- or DuckDB relation or connection instance and optional metadata.
1002
-
1003
- """
1004
- kwargs.pop("batch_size", None)
1005
- if as_relation:
1006
- return self.to_duckdb_relation(
1007
- conn=conn,
1008
- metadata=metadata,
1009
- reload=reload,
1010
- include_file_path=include_file_path,
1011
- use_threads=use_threads,
1012
- verbose=verbose,
1013
- opt_dtypes=opt_dtypes,
1014
- **kwargs,
1015
- )
1016
- return self.register_in_duckdb(
1017
- conn=conn,
1018
- name=name,
1019
- metadata=metadata,
1020
- reload=reload,
1021
- include_file_path=include_file_path,
1022
- use_threads=use_threads,
1023
- verbose=verbose,
1024
- opt_dtypes=opt_dtypes,
1025
- **kwargs,
1026
- )
1027
-
1028
- def register_in_datafusion(
1029
- self,
1030
- ctx: datafusion.SessionContext,
1031
- name: str | None = None,
1032
- metadata: bool = False,
1033
- reload: bool = False,
1034
- include_file_path: bool = False,
1035
- use_threads: bool | None = None,
1036
- verbose: bool | None = None,
1037
- opt_dtypes: bool | None = None,
1038
- **kwargs,
1039
- ) -> datafusion.SessionContext | tuple[datafusion.SessionContext, dict[str, Any]]:
1040
- """Register data in DataFusion.
1041
-
1042
- Args:
1043
- ctx (datafusion.SessionContext): DataFusion session context instance.
1044
- name (str, optional): Name for the DataFusion table.
1045
- metadata (bool, optional): Include metadata in the output. Default is False.
1046
- reload (bool, optional): Reload data if True. Default is False.
1047
- **kwargs: Additional keyword arguments.
1048
-
1049
- Returns:
1050
- None
1051
- """
1052
- kwargs.pop("batch_size", None)
1053
- if name is None:
1054
- name = f"{self.format}:{self.path}"
1055
-
1056
- if self._ctx is None:
1057
- if ctx is None:
1058
- ctx = datafusion.SessionContext()
1059
- self._ctx = ctx
1060
-
1061
- self._ctx.register_record_batches(
1062
- name,
1063
- [
1064
- self.to_pyarrow_table(
1065
- reload=reload,
1066
- include_file_path=include_file_path,
1067
- use_threads=use_threads,
1068
- opt_dtypes=opt_dtypes,
1069
- verbose=verbose,
1070
- **kwargs,
1071
- ).to_batches()
1072
- ],
1073
- )
1074
- if metadata:
1075
- return self._ctx, self._metadata
1076
- return self._ctx
1077
-
1078
- def filter(
1079
- self, filter_expr: str | pl.Expr | pa.compute.Expression
1080
- ) -> (
1081
- pl.DataFrame
1082
- | pl.LazyFrame
1083
- | pa.Table
1084
- | list[pl.DataFrame]
1085
- | list[pl.LazyFrame]
1086
- | list[pa.Table]
1087
- ):
1088
- """Filter data based on a filter expression.
1089
-
1090
- Args:
1091
- filter_expr (str | pl.Expr | pa.compute.Expression): Filter expression. Can be a SQL expression, Polars
1092
- expression, or PyArrow compute expression.
1093
-
1094
- Returns:
1095
- pl.DataFrame | pl.LazyFrame | pa.Table | list[pl.DataFrame] | list[pl.LazyFrame]
1096
- | list[pa.Table]: Filtered data.
1097
- """
1098
- if isinstance(self._data, pl.DataFrame | pl.LazyFrame):
1099
- pl_schema = (
1100
- self._data.schema
1101
- if isinstance(self._data, pl.DataFrame)
1102
- else self._data.collect_schema()
1103
- )
1104
- filter_expr = (
1105
- sql2polars_filter(filter_expr, pl_schema)
1106
- if isinstance(filter_expr, str)
1107
- else filter_expr
1108
- )
1109
- return self._data.filter(filter_expr)
1110
-
1111
- elif isinstance(self._data, pa.Table):
1112
- pa_schema = self._data.schema
1113
- filter_expr = (
1114
- sql2pyarrow_filter(filter_expr, pa_schema)
1115
- if isinstance(filter_expr, str)
1116
- else filter_expr
1117
- )
1118
- return self._data.filter(filter_expr)
1119
-
1120
- if isinstance(self._data, str):
1121
- if isinstance(self._data[0], pl.DataFrame | pl.LazyFrame):
1122
- pl_schema = (
1123
- self._data.schema
1124
- if isinstance(self._data[0], pl.DataFrame)
1125
- else self._data[0].collect_schema()
1126
- )
1127
- filter_expr = (
1128
- sql2polars_filter(filter_expr, pl_schema)
1129
- if isinstance(filter_expr, str)
1130
- else filter_expr
1131
- )
1132
- return [d.filter(filter_expr) for d in self._data]
1133
- elif isinstance(self._data[0], pa.Table):
1134
- pa_schema = self._data[0].schema
1135
- filter_expr = (
1136
- sql2pyarrow_filter(filter_expr, pa_schema)
1137
- if isinstance(filter_expr, str)
1138
- else filter_expr
1139
- )
1140
- return [d.filter(filter_expr) for d in self._data]
1141
-
1142
- @property
1143
- def metadata(self):
1144
- if not hasattr(self, "_metadata"):
1145
- self._load()
1146
- return self._metadata
1147
-
1148
-
1149
- # @attrs.define # Removed
1150
- class BaseDatasetReader(BaseFileReader, gc=False):
1151
- """
1152
- Base class for dataset loading operations supporting various file formats.
1153
- This class provides a foundation for dataset loading operations across different file formats
1154
- including CSV, Parquet, JSON, Arrow, and IPC.
1155
-
1156
- Args:
1157
- path (str | list[str]): Path or list of paths to file(s).
1158
- format (str, optional): File format extension (without dot).
1159
- fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
1160
- include_file_path (bool, optional): Include file path in the output DataFrame.
1161
- concat (bool, optional): Concatenate multiple files into a single DataFrame.
1162
- conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
1163
- ctx (datafusion.SessionContext, optional): DataFusion session context instance.
1164
- schema (pa.Schema, optional): PyArrow schema for the dataset.
1165
- partitioning (str | list[str] | pds.Partitioning, optional): Dataset partitioning scheme.
1166
-
1167
- Examples:
1168
- ```python
1169
- dataset_loader = BaseDatasetReader(
1170
- path="s3://bucket/path/to/files",
1171
- format="csv",
1172
- include_file_path=True,
1173
- concat=True,
1174
- conn=duckdb.connect(),
1175
- ctx=datafusion.SessionContext(),
1176
- schema=pa.schema([
1177
- pa.field("column1", pa.int64()),
1178
- pa.field("column2", pa.string())
1179
- ]),
1180
- partitioning="hive"
1181
- )
1182
- data = dataset_loader.to_polars()
1183
- ```
1184
- Notes:
1185
- - Supports multiple file formats including CSV, Parquet, JSON, Arrow, and IPC
1186
- - Automatically handles filesystem initialization based on path protocol
1187
- - Supports both single path and multiple path inputs
1188
- - Supports loading data into DuckDB and DataFusion for SQL operations
1189
- - Supports custom schema and partitioning for datasets
1190
-
1191
- """
1192
-
1193
- schema_: pa.Schema | None = field(default=None)
1194
- _dataset: pds.Dataset | None = field(default=None)
1195
- _pydala_dataset: Any | None = field(default=None)
1196
-
1197
- def to_pyarrow_dataset(
1198
- self,
1199
- metadata: bool = False,
1200
- reload: bool = False,
1201
- **kwargs,
1202
- ) -> pds.Dataset | tuple[pds.Dataset, dict[str, Any]]:
1203
- """
1204
- Convert data to PyArrow Dataset.
1205
-
1206
- Args:
1207
- metadata (bool, optional): Include metadata in the output. Default is False.
1208
- reload (bool, optional): Reload data if True. Default is False.
1209
-
1210
- Returns:
1211
- pds.Dataset: PyArrow Dataset.
1212
- """
1213
- if self._dataset is not None and not reload:
1214
- if metadata:
1215
- return self._dataset, self._metadata
1216
- return self._dataset
1217
-
1218
- if self.format == ["csv", "arrow", "ipc"]:
1219
- self._dataset = self.fs.pyarrow_dataset(
1220
- self._path,
1221
- format=self.format,
1222
- schema=self.schema_,
1223
- partitioning=self.partitioning,
1224
- **kwargs,
1225
- )
1226
- self._metadata = get_pyarrow_dataset_metadata(
1227
- self._dataset, path=self.path, format=self.format
1228
- )
1229
- elif self.format == "parquet":
1230
- if self.fs.exists(posixpath.join(self._root_path, "_metadata")):
1231
- self._dataset = self.fs.parquet_dataset(
1232
- posixpath.join(self._root_path, "_metadata"),
1233
- schema=self.schema_,
1234
- partitioning=self.partitioning,
1235
- **kwargs,
1236
- )
1237
- else:
1238
- self._dataset = self.fs.pyarrow_dataset(
1239
- self._path,
1240
- format=self.format,
1241
- schema=self.schema_,
1242
- partitioning=self.partitioning,
1243
- **kwargs,
1244
- )
1245
- self._metadata = get_pyarrow_dataset_metadata(
1246
- self._dataset, path=self.path, format=self.format
1247
- )
1248
- else:
1249
- raise ValueError(f"Unsupported format: {self.format}")
1250
- if metadata:
1251
- return self._dataset, self._metadata
1252
- return self._dataset
1253
-
1254
- def to_pandas(
1255
- self, metadata: bool = False, reload: bool = False, **kwargs
1256
- ) -> pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]:
1257
- """
1258
- Convert data to Pandas DataFrame.
1259
-
1260
- Args:
1261
- metadata (bool, optional): Include metadata in the output. Default is False.
1262
-
1263
- Returns:
1264
- pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]: Pandas DataFrame and optional metadata.
1265
- """
1266
- self.to_pyarrow_dataset(reload=reload, **kwargs)
1267
- df = self._dataset.to_table().to_pandas()
1268
- if metadata:
1269
- metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
1270
- return df, metadata
1271
- return df
1272
-
1273
- def _to_polars_dataframe(
1274
- self, metadata: bool = False, reload: bool = False, **kwargs
1275
- ) -> pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]:
1276
- self.to_pyarrow_dataset(reload=reload, **kwargs)
1277
- df = pl.from_arrow(self._dataset.to_table())
1278
- if metadata:
1279
- metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
1280
- return df, metadata
1281
- return df
1282
-
1283
- def _to_polars_lazyframe(
1284
- self, metadata: bool = False, reload: bool = False, **kwargs
1285
- ) -> pl.LazyFrame | tuple[pl.LazyFrame, dict[str, Any]]:
1286
- self.to_pyarrow_dataset(reload=reload, **kwargs)
1287
- df = pl.scan_pyarrow_dataset(self._dataset)
1288
- if metadata:
1289
- metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
1290
- return df, metadata
1291
- return df
1292
-
1293
- def to_polars(
1294
- self, lazy: bool = True, metadata: bool = False, reload: bool = False, **kwargs
1295
- ) -> (
1296
- pl.DataFrame | pl.LazyFrame | tuple[pl.DataFrame | pl.LazyFrame, dict[str, Any]]
1297
- ):
1298
- """
1299
- Convert data to Polars DataFrame or LazyFrame.
1300
-
1301
- Args:
1302
- lazy (bool, optional): Return a LazyFrame if True, else a DataFrame.
1303
- metadata (bool, optional): Include metadata in the output. Default is False.
1304
- reload (bool, optional): Reload data if True. Default is False.
1305
-
1306
- Returns:
1307
- pl.DataFrame | pl.LazyFrame | tuple[pl.DataFrame | pl.LazyFrame, dict[str, Any]]: Polars DataFrame or
1308
- LazyFrame and optional metadata.
1309
- """
1310
- df = (
1311
- self._to_polars_lazyframe(reload=reload, **kwargs)
1312
- if lazy
1313
- else self._to_polars_dataframe(reload=reload, **kwargs)
1314
- )
1315
- if metadata:
1316
- metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
1317
- return df, metadata
1318
- return df
1319
-
1320
- def to_pyarrow_table(
1321
- self, metadata: bool = False, reload: bool = False, **kwargs
1322
- ) -> pa.Table | tuple[pa.Table, dict]:
1323
- """Convert data to PyArrow Table.
1324
-
1325
- Args:
1326
- metadata (bool, optional): Include metadata in the output. Default is False.
1327
- reload (bool, optional): Reload data if True. Default is False.
1328
-
1329
- Returns:
1330
- pa.Table | tuple[pa.Table, dict]: PyArrow Table and optional metadata.
1331
- """
1332
- self.to_pyarrow_dataset(reload=reload, **kwargs)
1333
- df = self._dataset.to_table()
1334
- if metadata:
1335
- metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
1336
- return df, metadata
1337
- return df
1338
-
1339
- def to_pydala_dataset(
1340
- self, metadata: bool = False, reload: bool = False, **kwargs
1341
- ) -> ParquetDataset | tuple[ParquetDataset, dict[str, Any]]: # type: ignore
1342
- """Convert data to Pydala ParquetDataset.
1343
-
1344
- Args:
1345
- metadata (bool, optional): Include metadata in the output. Default is False.
1346
-
1347
- Returns:
1348
- ParquetDataset: Pydala ParquetDataset.
1349
- """
1350
- if ParquetDataset is None:
1351
- raise ImportError("pydala is not installed.")
1352
- if not hasattr(self, "_pydala_dataset") or reload:
1353
- if not hasattr(self, "conn"):
1354
- self._conn = duckdb.connect()
1355
- self._pydala_dataset = self.fs.pydala_dataset(
1356
- self._path,
1357
- partitioning=self.partitioning,
1358
- ddb_con=self._conn,
1359
- **kwargs,
1360
- )
1361
- self._pydala_dataset.load(update_metadata=True)
1362
- self._metadata = get_pyarrow_dataset_metadata(
1363
- self._pydala_dataset._arrow_dataset, path=self.path, format=self.format
1364
- )
1365
- if metadata:
1366
- return self._pydala_dataset, self._metadata
1367
- return self._pydala_dataset
1368
-
1369
- def to_duckdb_relation(
1370
- self,
1371
- conn: duckdb.DuckDBPyConnection | None = None,
1372
- metadata: bool = False,
1373
- reload: bool = False,
1374
- **kwargs,
1375
- ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
1376
- """Convert data to DuckDB relation.
1377
-
1378
- Args:
1379
- conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
1380
- metadata (bool, optional): Include metadata in the output. Default is False.
1381
- reload (bool, optional): Reload data if True. Default is False.
1382
-
1383
- Returns:
1384
- duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]: DuckDB relation and optional
1385
- metadata.
1386
- """
1387
- if self._conn is None:
1388
- if conn is None:
1389
- conn = duckdb.connect()
1390
- self._conn = conn
1391
-
1392
- self.to_pyarrow_dataset(reload=reload, **kwargs)
1393
- if metadata:
1394
- return self._conn.from_arrow(self._dataset), self._metadata
1395
- return self._conn.from_arrow(self._dataset)
1396
-
1397
- def register_in_duckdb(
1398
- self,
1399
- conn: duckdb.DuckDBPyConnection | None = None,
1400
- name: str | None = None,
1401
- metadata: bool = False,
1402
- reload: bool = False,
1403
- **kwargs,
1404
- ) -> duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]:
1405
- """Register data in DuckDB.
1406
-
1407
- Args:
1408
- conn (duckdb.DuckDBPyConnection): DuckDB connection instance.
1409
- name (str, optional): Name for the DuckDB table.
1410
- metadata (bool, optional): Include metadata in the output. Default is False.
1411
- reload (bool, optional): Reload data if True. Default is False.
1412
-
1413
- Returns:
1414
- duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB connection instance
1415
- or DuckDB connection instance and optional metadata.
1416
- """
1417
- if name is None:
1418
- name = f"{self.format}:{self.path}"
1419
-
1420
- if self._conn is None:
1421
- if conn is None:
1422
- conn = duckdb.connect()
1423
- self._conn = conn
1424
-
1425
- self._conn.register(name, self._dataset)
1426
- if metadata:
1427
- return self._conn, self._metadata
1428
- return self._conn
1429
-
1430
- def to_duckdb(
1431
- self,
1432
- as_relation: bool = True,
1433
- conn: duckdb.DuckDBPyConnection | None = None,
1434
- name: str | None = None,
1435
- metadata: bool = False,
1436
- reload: bool = False,
1437
- **kwargs,
1438
- ) -> (
1439
- duckdb.DuckDBPyRelation
1440
- | duckdb.DuckDBPyConnection
1441
- | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]
1442
- | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]
1443
- ):
1444
- """Convert data to DuckDB relation or register in DuckDB.
1445
-
1446
- Args:
1447
- as_relation (bool, optional): Return a DuckDB relation if True, else register in DuckDB. Default is True.
1448
- conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
1449
- name (str, optional): Name for the DuckDB table.
1450
- metadata (bool, optional): Include metadata in the output. Default is False.
1451
- reload (bool, optional): Reload data if True. Default is False.
1452
- **kwargs: Additional keyword arguments.
1453
-
1454
- Returns:
1455
- duckdb.DuckDBPyRelation | duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyRelation, dict[str, Any]] |
1456
- tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB relation or connection instance
1457
- or DuckDB relation or connection instance and optional metadata.
1458
-
1459
- """
1460
- if as_relation:
1461
- return self.to_duckdb_relation(
1462
- conn=conn, metadata=metadata, reload=reload, **kwargs
1463
- )
1464
- return self.register_in_duckdb(
1465
- conn=conn, name=name, metadata=metadata, reload=reload, **kwargs
1466
- )
1467
-
1468
- def register_in_datafusion(
1469
- self,
1470
- ctx: datafusion.SessionContext,
1471
- name: str | None = None,
1472
- metadata: bool = False,
1473
- reload: bool = False,
1474
- **kwargs,
1475
- ) -> datafusion.SessionContext | tuple[datafusion.SessionContext, dict[str, Any]]:
1476
- """Register data in DataFusion.
1477
-
1478
- Args:
1479
- ctx (datafusion.SessionContext): DataFusion session context instance.
1480
- name (str, optional): Name for the DataFusion table.
1481
- metadata (bool, optional): Include metadata in the output. Default is False.
1482
- reload (bool, optional): Reload data if True. Default is False.
1483
- **kwargs: Additional keyword arguments.
1484
-
1485
- Returns:
1486
- None
1487
- """
1488
- if name is None:
1489
- name = f"{self.format}:{self.path}"
1490
-
1491
- if self._ctx is None:
1492
- if ctx is None:
1493
- ctx = datafusion.SessionContext()
1494
- self._ctx = ctx
1495
-
1496
- self._ctx.register_record_batches(name, [self.to_pyarrow_table().to_batches()])
1497
-
1498
- def filter(
1499
- self, filter_expr: str | pl.Expr | pa.compute.Expression
1500
- ) -> (
1501
- pl.DataFrame
1502
- | pl.LazyFrame
1503
- | pa.Table
1504
- | list[pl.DataFrame]
1505
- | list[pl.LazyFrame]
1506
- | list[pa.Table]
1507
- ):
1508
- """Filter data based on a filter expression.
1509
-
1510
- Args:
1511
- filter_expr (str | pl.Expr | pa.compute.Expression): Filter expression. Can be a SQL expression, Polars
1512
- expression, or PyArrow compute expression.
1513
-
1514
- Returns:
1515
- pl.DataFrame | pl.LazyFrame | pa.Table | list[pl.DataFrame] | list[pl.LazyFrame]
1516
- | list[pa.Table]: Filtered data.
1517
- """
1518
- if isinstance(self._data, pl.DataFrame | pl.LazyFrame):
1519
- pl_schema = (
1520
- self._data.schema
1521
- if isinstance(self._data, pl.DataFrame)
1522
- else self._data.collect_schema()
1523
- )
1524
- filter_expr = (
1525
- sql2polars_filter(filter_expr, pl_schema)
1526
- if isinstance(filter_expr, str)
1527
- else filter_expr
1528
- )
1529
- return self._data.filter(filter_expr)
1530
-
1531
- elif isinstance(self._data, pa.Table):
1532
- pa_schema = self._data.schema
1533
- filter_expr = (
1534
- sql2pyarrow_filter(filter_expr, pa_schema)
1535
- if isinstance(filter_expr, str)
1536
- else filter_expr
1537
- )
1538
- return self._data.filter(filter_expr)
1539
-
1540
- if isinstance(self._data, list):
1541
- if isinstance(self._data[0], pl.DataFrame | pl.LazyFrame):
1542
- pl_schema = (
1543
- self._data[0].schema
1544
- if isinstance(self._data[0], pl.DataFrame)
1545
- else self._data[0].collect_schema()
1546
- )
1547
- filter_expr = (
1548
- sql2polars_filter(filter_expr, pl_schema)
1549
- if isinstance(filter_expr, str)
1550
- else filter_expr
1551
- )
1552
- return [d.filter(filter_expr) for d in self._data]
1553
- elif isinstance(self._data[0], pa.Table):
1554
- pa_schema = self._data[0].schema
1555
- filter_expr = (
1556
- sql2pyarrow_filter(filter_expr, pa_schema)
1557
- if isinstance(filter_expr, str)
1558
- else filter_expr
1559
- )
1560
- return [d.filter(filter_expr) for d in self._data]
1561
-
1562
- @property
1563
- def metadata(self):
1564
- if not hasattr(self, "_metadata"):
1565
- self._load()
1566
- return self._metadata
1567
-
1568
-
1569
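The `filter` method above accepts the predicate in three forms: a SQL string (translated via `sql2polars_filter` / `sql2pyarrow_filter`), a Polars expression, or a PyArrow compute expression, and applies it to whichever in-memory representation is loaded. The snippet below is a standalone illustration of the Polars and PyArrow expression forms on toy data; it does not use the reader class itself.

```python
import polars as pl
import pyarrow as pa
import pyarrow.compute as pc

tbl = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
df = pl.from_arrow(tbl)

# Polars expression form
print(df.filter(pl.col("a") > 1))

# PyArrow compute expression form (Table.filter with an Expression needs a recent pyarrow)
print(tbl.filter(pc.field("a") > 1))
```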
- # @attrs.define # Removed
1570
- class BaseFileWriter(BaseFileIO, gc=False):
1571
- """
1572
- Base class for file writing operations supporting various storage backends.
1573
- This class provides a foundation for file writing operations across different storage systems
1574
- including AWS S3, Google Cloud Storage, Azure Blob Storage, GitHub, and GitLab.
1575
-
1576
- Args:
1577
- path (str | list[str]): Path or list of paths to file(s).
1578
- storage_options (AwsStorageOptions | GcsStorageOptions | AzureStorageOptions |
1579
- GitHubStorageOptions | GitLabStorageOptions | dict[str, Any] | None, optional):
1580
- Storage-specific options for accessing remote filesystems.
1581
- fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
1582
- format (str, optional): File format extension (without dot).
1583
- basename (str, optional): Basename for the output file(s).
1584
- concat (bool, optional): Concatenate multiple files into a single DataFrame.
1585
- mode (str, optional): Write mode (append, overwrite, delete_matching, error_if_exists).
1586
- unique (bool | list[str] | str, optional): Unique columns for deduplication.
1587
-
1588
- Examples:
1589
- ```python
1590
- file_writer = BaseFileWriter(
1591
- path="s3://bucket/path/to/files",
1592
- storage_options=AwsStorageOptions(
1593
- key="access_key",
1594
- secret="secret_key"),
1595
- format="csv",
1596
- basename="output",
1597
- concat=True,
1598
- mode="append",
1599
- unique=True
1600
- )
1601
- file_writer.write(data=df)
1602
- ```
1603
-
1604
- Notes:
1605
- - Supports multiple cloud storage backends through different storage options
1606
- - Automatically handles filesystem initialization based on path protocol
1607
- - Supports both single path and multiple path inputs
1608
- - Supports writing data to cloud storage with various write modes
1609
- """
1610
-
1611
- basename: str | None = field(default=None)
1612
- concat: bool = field(default=False)
1613
- mode: str = field(default="append")
1614
- unique: bool | list[str] | str = field(default=False)
1615
-
1616
- def write(
1617
- self,
1618
- data: (
1619
- pl.DataFrame
1620
- | pl.LazyFrame
1621
- | pa.Table
1622
- | pd.DataFrame
1623
- | dict[str, Any]
1624
- | list[
1625
- pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]
1626
- ]
1627
- ),
1628
- basename: str | None = None,
1629
- concat: bool | None = None,
1630
- unique: bool | list[str] | str | None = None,
1631
- mode: str | None = None,
1632
- **kwargs,
1633
- ) -> dict[str, Any]:
1634
- """
1635
- Write data to file.
1636
-
1637
- Args:
1638
- data (pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any] | list[pl.DataFrame |
1639
- pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]] | None, optional): Data to write.
1640
- basename (str, optional): Basename for the output file(s).
1641
- concat (bool, optional): Concatenate multiple files into a single DataFrame.
1642
- unique (bool | list[str] | str, optional): Unique columns for deduplication.
1643
- mode (str, optional): Write mode (append, overwrite, delete_matching, error_if_exists).
1644
- **kwargs: Additional keyword arguments.
1645
-
1646
- Returns:
1647
- dict[str, Any]: Metadata for the written data
1648
- """
1649
- if isinstance(data, list):
1650
- if isinstance(data[0], dict):
1651
- data = _dict_to_dataframe(data)
1652
- if isinstance(data, dict):
1653
- data = _dict_to_dataframe(data)
1654
-
1655
- self._metadata = get_dataframe_metadata(
1656
- df=data, path=self.path, format=self.format
1657
- )
1658
-
1659
- self.fs.write_files(
1660
- data=data, # if data is not None else self.data,
1661
- path=self._path,
1662
- basename=basename or self.basename,
1663
- concat=concat or self.concat,
1664
- unique=unique or self.unique,
1665
- mode=mode or self.mode,
1666
- **kwargs,
1667
- )
1668
- return self._metadata
1669
-
1670
- @property
1671
- def metadata(self):
1672
- if not hasattr(self, "_metadata"):
1673
- return {}
1674
- return self._metadata
1675
-
1676
-
1677
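For context, a hypothetical local-filesystem use of the removed `BaseFileWriter`, mirroring the docstring example above. The import path, path, and data are illustrative only, and per-call arguments fall back to the instance fields through the `x or self.x` pattern visible in `write`.

```python
# Hypothetical usage of the pre-0.20 API; not runnable against 0.20.0,
# where flowerpower/plugins/io/base.py has been removed.
import polars as pl
from flowerpower.plugins.io.base import BaseFileWriter  # old import path

df = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]})

writer = BaseFileWriter(
    path="./out",        # local directory instead of a cloud bucket
    format="csv",
    basename="example",
    mode="append",
)
# The mode passed here overrides the instance default for this call only.
metadata = writer.write(data=df, mode="overwrite")
print(metadata)
```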
- # @attrs.define # Removed
1678
- class BaseDatasetWriter(BaseFileWriter, gc=False):
1679
- """
1680
- Base class for dataset writing operations supporting various file formats.
1681
- This class provides a foundation for dataset writing operations across different file formats
1682
- including CSV, Parquet, JSON, Arrow, and IPC.
1683
-
1684
- Args:
1685
- path (str | list[str]): Path or list of paths to file(s).
1686
- format (str, optional): File format extension (without dot).
1687
- storage_options (AwsStorageOptions | GcsStorageOptions | AzureStorageOptions |
1688
- GitHubStorageOptions | GitLabStorageOptions | dict[str, Any] | None, optional):
1689
- Storage-specific options for accessing remote filesystems.
1690
- fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
1691
- basename (str, optional): Basename for the output file(s).
1692
- schema (pa.Schema, optional): PyArrow schema for the dataset.
1693
- partition_by (str | list[str] | pds.Partitioning, optional): Dataset partitioning scheme.
1694
- partitioning_flavor (str, optional): Partitioning flavor for the dataset.
1695
- compression (str, optional): Compression codec for the dataset.
1696
- row_group_size (int, optional): Row group size for the dataset.
1697
- max_rows_per_file (int, optional): Maximum number of rows per file.
1698
- concat (bool, optional): Concatenate multiple files into a single DataFrame.
1699
- unique (bool | list[str] | str, optional): Unique columns for deduplication.
1700
- mode (str, optional): Write mode (append, overwrite, delete_matching, error_if_exists).
1701
- is_pydala_dataset (bool, optional): Write data as a Pydala ParquetDataset.
1702
-
1703
- Examples:
1704
- ```python
1705
- dataset_writer = BaseDatasetWriter(
1706
- path="s3://bucket/path/to/files",
1707
- format="parquet",
1708
- storage_options=AwsStorageOptions(
1709
- key="access_key",
1710
- secret="secret_key"),
1711
- basename="output",
1712
- schema=pa.schema([
1713
- pa.field("column1", pa.int64()),
1714
- pa.field("column2", pa.string())
1715
- ]),
1716
- partition_by="column1",
1717
- partitioning_flavor="hive",
1718
- compression="zstd",
1719
- row_group_size=250_000,
1720
- max_rows_per_file=2_500_000,
1721
- concat=True,
1722
- unique=True,
1723
- mode="append",
1724
- is_pydala_dataset=False
1725
- )
1726
- dataset_writer.write(data=df)
1727
- ```
1728
- Notes:
1729
- - Supports multiple file formats including CSV, Parquet, JSON, Arrow, and IPC
1730
- - Automatically handles filesystem initialization based on path protocol
1731
- - Supports both single path and multiple path inputs
1732
- - Supports writing data to cloud storage with various write modes
1733
- - Supports writing data as a Pydala ParquetDataset
1734
- """
1735
-
1736
- # basename, concat, unique, mode are inherited from BaseFileWriter
1737
- schema_: pa.Schema | None = None
1738
- partition_by: str | list[str] | pds.Partitioning | None = None
1739
- partitioning_flavor: str | None = None
1740
- compression: str = "zstd"
1741
- row_group_size: int | None = 250_000
1742
- max_rows_per_file: int | None = 2_500_000
1743
- is_pydala_dataset: bool = False
1744
-
1745
- def write(
1746
- self,
1747
- data: (
1748
- pl.DataFrame
1749
- | pl.LazyFrame
1750
- | pa.Table
1751
- | pa.RecordBatch
1752
- | pa.RecordBatchReader
1753
- | pd.DataFrame
1754
- | dict[str, Any]
1755
- | list[
1756
- pl.DataFrame
1757
- | pl.LazyFrame
1758
- | pa.Table
1759
- | pa.RecordBatch
1760
- | pa.RecordBatchReader
1761
- | pd.DataFrame
1762
- | dict[str, Any]
1763
- ]
1764
- ),
1765
- concat: bool | None = None,
1766
- unique: bool | list[str] | str | None = None,
1767
- mode: str | None = None,
1768
- delta_subset: str | None = None,
1769
- alter_schema: bool = False,
1770
- update_metadata: bool = True,
1771
- timestamp_column: str | None = None,
1772
- verbose: bool = False,
1773
- **kwargs,
1774
- ) -> dict[str, Any]:
1775
- """
1776
- Write data to dataset.
1777
-
1778
- Args:
1779
- data (pl.DataFrame | pl.LazyFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader | pd.DataFrame |
1780
- dict[str, Any] | list[pl.DataFrame | pl.LazyFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader |
1781
- pd.DataFrame | dict[str, Any]] | None, optional): Data to write.
1782
- concat (bool, optional): Concatenate multiple inputs into a single DataFrame before writing.
- unique (bool | list[str] | str, optional): Unique columns for deduplication.
- mode (str, optional): Write mode (append, overwrite, delete_matching, error_if_exists).
1783
- delta_subset (str | None, optional): Delta subset for incremental updates.
1784
- alter_schema (bool, optional): Alter schema for compatibility.
1785
- update_metadata (bool, optional): Update metadata.
1786
- timestamp_column (str | None, optional): Timestamp column for updates.
1787
- verbose (bool, optional): Verbose output.
1788
- **kwargs: Additional keyword arguments.
1789
-
1790
- Returns:
1791
- dict[str, Any]: Metadata of the written data.
1792
- """
1793
- basename = kwargs.pop("basename", self.basename)
1794
- schema = kwargs.pop("schema", self.schema_)
1795
- partition_by = kwargs.pop("partition_by", self.partition_by)
1796
- partitioning_flavor = kwargs.pop(
1797
- "partitioning_flavor", self.partitioning_flavor
1798
- )
1799
- compression = kwargs.pop("compression", self.compression)
1800
- row_group_size = kwargs.pop("row_group_size", self.row_group_size)
1801
- max_rows_per_file = kwargs.pop("max_rows_per_file", self.max_rows_per_file)
1802
-
1803
- if isinstance(data, list):
1804
- if isinstance(data[0], dict):
1805
- data = _dict_to_dataframe(data)
1806
- if isinstance(data, dict):
1807
- data = _dict_to_dataframe(data)
1808
-
1809
- self._metadata = get_dataframe_metadata(
1810
- df=data, path=self.path, format=self.format
1811
- )
1812
-
1813
- if not self.is_pydala_dataset:
1814
- self.fs.write_pyarrow_dataset(
1815
- data=data, # if data is not None else self.data,
1816
- path=self._path,
1817
- basename=basename or self.basename,
1818
- schema=schema or self.schema_,
1819
- partition_by=partition_by or self.partition_by,
1820
- partitioning_flavor=partitioning_flavor or self.partitioning_flavor,
1821
- format=self.format,
1822
- compression=compression or self.compression,
1823
- row_group_size=row_group_size or self.row_group_size,
1824
- max_rows_per_file=max_rows_per_file or self.max_rows_per_file,
1825
- concat=concat or self.concat,
1826
- unique=unique or self.unique,
1827
- mode=mode or self.mode,
1828
- **kwargs,
1829
- )
1830
- else:
1831
- self.fs.write_pydala_dataset(
1832
- data=data, # if data is not None else self.data,
1833
- path=self._path,
1834
- mode=mode or self.mode,
1835
- basename=basename or self.basename,
1836
- schema=schema or self.schema_,
1837
- partition_by=partition_by or self.partition_by,
1838
- compression=compression or self.compression,
1839
- row_group_size=row_group_size or self.row_group_size,
1840
- max_rows_per_file=max_rows_per_file or self.max_rows_per_file,
1841
- concat=concat or self.concat,
1842
- unique=unique or self.unique,
1843
- delta_subset=delta_subset,
1844
- alter_schema=alter_schema,
1845
- update_metadata=update_metadata,
1846
- timestamp_column=timestamp_column,
1847
- verbose=verbose,
1848
- **kwargs,
1849
- )
1850
- return self._metadata
1851
-
1852
- @property
1853
- def metadata(self):
1854
- if not hasattr(self, "_metadata"):
1855
- return {}
1856
- return self._metadata
1857
-
1858
-
1859
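The non-Pydala branch of `write` delegates to `fs.write_pyarrow_dataset`. As a rough, standalone point of reference (an assumption about what such a helper boils down to, not the helper itself), a hive-partitioned Parquet write with `pyarrow.dataset` looks like this:

```python
import pyarrow as pa
import pyarrow.dataset as pds

tbl = pa.table({"column1": [1, 1, 2], "column2": ["a", "b", "c"]})

pds.write_dataset(
    tbl,
    base_dir="./dataset_out",             # illustrative local path
    format="parquet",
    partitioning=["column1"],
    partitioning_flavor="hive",
    existing_data_behavior="overwrite_or_ignore",
)
```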
- # @attrs.define # Removed
1860
- class BaseDatabaseIO(msgspec.Struct, gc=False):
1861
- """
1862
- Base class for database read/write operations supporting various database systems.
1863
- This class provides a foundation for database read/write operations across different database systems
1864
- including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle.
1865
-
1866
- Args:
1867
- type_ (str): Database type (sqlite, duckdb, postgres, mysql, mssql, oracle).
1868
- table_name (str): Table name in the database.
1869
- path (str | None, optional): File path for SQLite or DuckDB databases.
1870
- connection_string (str | None, optional): Connection string for SQLAlchemy-based databases.
1871
- username (str | None, optional): Username for the database.
1872
- password (str | None, optional): Password for the database.
1873
- server (str | None, optional): Server address for the database.
1874
- port (str | None, optional): Port number for the database.
1875
- database (str | None, optional): Database name.
1876
-
1877
- Examples:
1878
- ```python
1879
- db_reader = BaseDatabaseIO(
1880
- type_="sqlite",
1881
- table_name="table_name",
1882
- path="path/to/database.db"
1883
- )
1884
- data = db_reader.read()
1885
- ```
1886
-
1887
- Notes:
1888
- - Supports multiple database systems including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle
1889
- - Automatically handles database initialization based on connection parameters
1890
- - Supports reading data from databases into DataFrames
1891
- - Supports writing data to databases from DataFrames
1892
- """
1893
-
1894
- type_: str
1895
- table_name: str = field(default="")
1896
- path: str | None = field(default=None)
1897
- username: str | None = field(default=None)
1898
- password: str | None = field(default=None)
1899
- server: str | None = field(default=None)
1900
- port: str | int | None = field(default=None)
1901
- database: str | None = field(default=None)
1902
- ssl: bool = field(default=False)
1903
- connection_string: str | None = field(default=None)
1904
- _metadata: dict[str, Any] = field(default_factory=dict)
1905
- _data: pa.Table | pl.DataFrame | pl.LazyFrame | pd.DataFrame | None = field(
1906
- default=None
1907
- )
1908
- _conn: duckdb.DuckDBPyConnection | None = field(default=None)
1909
- _ctx: datafusion.SessionContext | None = field(default=None)
1910
-
1911
- def __post_init__(self): # Renamed from __attrs_post_init__
1912
- db = self.type_.lower()
1913
- if (
1914
- db in ["postgres", "mysql", "mssql", "oracle"]
1915
- and not self.connection_string
1916
- ):
1917
- if not all([
1918
- self.username,
1919
- self.password,
1920
- self.server,
1921
- self.port,
1922
- self.database,
1923
- ]):
1924
- raise ValueError(
1925
- f"{self.type_} requires connection_string or username, password, server, port, and table_name "
1926
- "to build it."
1927
- )
1928
- if db == "postgres":
1929
- ssl_mode = "?sslmode=require" if self.ssl else ""
1930
- self.connection_string = (
1931
- f"postgresql://{self.username}:{self.password}@{self.server}:{self.port}/"
1932
- f"{self.database}{ssl_mode}"
1933
- )
1934
- elif db == "mysql":
1935
- ssl_mode = "?ssl=true" if self.ssl else ""
1936
- self.connection_string = (
1937
- f"mysql+pymysql://{self.username}:{self.password}@{self.server}:{self.port}/"
1938
- f"{self.database}{ssl_mode}"
1939
- )
1940
- elif db == "mssql":
1941
- ssl_mode = ";Encrypt=yes;TrustServerCertificate=yes" if self.ssl else ""
1942
- self.connection_string = (
1943
- f"mssql+pyodbc://{self.username}:{self.password}@{self.server}:{self.port}/"
1944
- f"{self.database}?driver=ODBC+Driver+17+for+SQL+Server{ssl_mode}"
1945
- )
1946
- elif db == "oracle":
1947
- ssl_mode = "?ssl=true" if self.ssl else ""
1948
- self.connection_string = (
1949
- f"oracle+cx_oracle://{self.username}:{self.password}@{self.server}:{self.port}/"
1950
- f"{self.database}{ssl_mode}"
1951
- )
1952
- if db in ["sqlite", "sqlite3"]:
1953
- if not self.path:
1954
- raise ValueError("SQLite requires a file path.")
1955
- self.connection_string = f"sqlite:///{self.path}"
1956
- elif db == "duckdb":
1957
- if not self.path:
1958
- raise ValueError("DuckDB requires a file path.")
1959
- self.connection_string = self.path
1960
-
1961
- def execute(self, query: str, cursor: bool = True, **query_kwargs):
1962
- """Execute a SQL query.
1963
-
1964
- Args:
1965
- query (str): SQL query.
1966
- cursor (bool, optional): Use cursor for execution. Default is True.
1967
- **query_kwargs: Additional keyword arguments.
1968
- """
1969
- query = query.format(**query_kwargs)
1970
- if self.type_ == "sqlite" or self.type_ == "duckdb":
1971
- with self.connect() as conn:
1972
- if cursor:
1973
- cur = conn.cursor()
1974
- res = cur.execute(query)
1975
-
1976
- else:
1977
- res = conn.execute(query)
1978
-
1979
- conn.commit()
1980
- return res
1981
-
1982
- with self.connect() as conn:
- res = conn.execute(text(query))
- conn.commit()
- return res
1987
-
1988
- def _to_pandas(
1989
- self,
1990
- data: pl.DataFrame
1991
- | pl.LazyFrame
1992
- | pa.Table
1993
- | pa.RecordBatch
1994
- | pa.RecordBatchReader
1995
- | pd.DataFrame
1996
- | dict[str, Any],
1997
- ) -> pd.DataFrame | list[pd.DataFrame]:
1998
- # convert data to pandas DataFrame if needed
1999
- if isinstance(data, pl.DataFrame):
2000
- return data.to_pandas()
2001
- elif isinstance(data, pa.Table):
2002
- return data.to_pandas()
2003
- elif isinstance(data, pl.LazyFrame):
2004
- return data.collect().to_pandas()
2005
- elif isinstance(data, pa.RecordBatch):
2006
- return pa.Table.from_batches([data]).to_pandas()
2007
- elif isinstance(data, pa.RecordBatchReader):
2008
- return data.read_all().to_pandas()
2009
- elif isinstance(data, dict):
2010
- return pd.DataFrame(data)
2011
- return data
2012
-
2013
- def create_engine(self):
2014
- return create_engine(self.connection_string)
2015
-
2016
- def connect(self):
2017
- if self.type_ == "sqlite":
2018
- conn = sqlite3.connect(self.path)
2019
- # Activate WAL mode:
2020
- conn.execute("PRAGMA journal_mode=WAL;")
2021
- return conn
2022
- if self.type_ == "duckdb":
2023
- return duckdb.connect(database=self.path)
2024
- return self.create_engine().connect()
2025
-
2026
-
2027
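`__post_init__` assembles a SQLAlchemy-style URL from the individual connection fields when no `connection_string` is given. A minimal sketch of the PostgreSQL case as a plain function (placeholder values, no real credentials):

```python
def postgres_url(username: str, password: str, server: str, port: int | str,
                 database: str, ssl: bool = False) -> str:
    # Mirrors the string built for type_="postgres" above
    ssl_mode = "?sslmode=require" if ssl else ""
    return f"postgresql://{username}:{password}@{server}:{port}/{database}{ssl_mode}"

print(postgres_url("user", "secret", "localhost", 5432, "mydb", ssl=True))
# -> postgresql://user:secret@localhost:5432/mydb?sslmode=require
```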
- # @attrs.define # Removed
2028
- class BaseDatabaseWriter(BaseDatabaseIO, gc=False):
2029
- """
2030
- Base class for database writing operations supporting various database systems.
2031
- This class provides a foundation for database writing operations across different database systems
2032
- including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle.
2033
-
2034
- Args:
2035
- type_ (str): Database type (sqlite, duckdb, postgres, mysql, mssql, oracle).
2036
- table_name (str): Table name in the database.
2037
- path (str | None, optional): File path for SQLite or DuckDB databases.
2038
- connection_string (str | None, optional): Connection string for SQLAlchemy-based databases.
2039
- username (str | None, optional): Username for the database.
2040
- password (str | None, optional): Password for the database.
2041
- server (str | None, optional): Server address for the database.
2042
- port (str | None, optional): Port number for the database.
2043
- database (str | None, optional): Database name.
2044
- mode (str, optional): Write mode (append, replace, fail).
2045
- concat (bool, optional): Concatenate multiple files into a single DataFrame.
2046
- unique (bool | list[str] | str, optional): Unique columns for deduplication.
2047
-
2048
- Examples:
2049
- ```python
2050
- db_writer = BaseDatabaseWriter(
2051
- type_="sqlite",
2052
- table_name="table_name",
2053
- path="path/to/database.db"
2054
- )
2055
- db_writer.write(data=df)
2056
- ```
2057
-
2058
- Notes:
2059
- - Supports multiple database systems including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle
2060
- - Automatically handles database initialization based on connection parameters
2061
- - Supports writing data to databases from DataFrames
2062
- """
2063
-
2064
- mode: str = field(default="append") # append, replace, fail
2065
- concat: bool = field(default=False)
2066
- unique: bool | list[str] | str = field(default=False)
2067
-
2068
- def _write_sqlite(
2069
- self,
2070
- data: pl.DataFrame
2071
- | pl.LazyFrame
2072
- | pa.Table
2073
- | pa.RecordBatch
2074
- | pa.RecordBatchReader
2075
- | pd.DataFrame
2076
- | dict[str, Any]
2077
- | list[pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]],
2078
- mode: str | None = None,
2079
- concat: bool | None = None,
2080
- unique: bool | list[str] | str | None = None,
2081
- ) -> dict[str, Any]:
2082
- if not self.path:
2083
- raise ValueError("SQLite requires a file path.")
2084
-
2085
- data = to_pyarrow_table(
2086
- data, unique=unique or self.unique, concat=concat or self.concat
2087
- )
2088
- if not isinstance(data, list):
2089
- data = [data]
2090
-
2091
- with sqlite3.connect(self.path) as conn:
2092
- # Activate WAL mode:
2093
- conn.execute("PRAGMA journal_mode=WAL;")
2094
-
2095
- self._metadata = get_dataframe_metadata(
2096
- df=data, path=self.connection_string, format=self.type_
2097
- )
2098
-
2099
- for n, _data in enumerate(data):
2100
- df = self._to_pandas(_data)
2101
- df.to_sql(self.table_name, conn, if_exists=mode or self.mode, index=False)
2102
-
2103
- return self._metadata
2104
-
2105
- def _write_duckdb(
2106
- self,
2107
- data: pl.DataFrame
2108
- | pl.LazyFrame
2109
- | pa.Table
2110
- | pa.RecordBatch
2111
- | pa.RecordBatchReader
2112
- | pd.DataFrame
2113
- | dict[str, Any]
2114
- | list[pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]],
2115
- mode: str | None = None,
2116
- concat: bool | None = None,
2117
- unique: bool | list[str] | str | None = None,
2118
- ) -> dict[str, Any]:
2119
- if not self.path:
2120
- raise ValueError("DuckDB requires a file path.")
2121
-
2122
- data = to_pyarrow_table(
2123
- data, unique=unique or self.unique, concat=concat or self.concat
2124
- )
2125
- if not isinstance(data, list):
2126
- data = [data]
2127
-
2128
- self._metadata = get_dataframe_metadata(
2129
- df=data, path=self.connection_string, format=self.type_
2130
- )
2131
-
2132
- with duckdb.connect(database=self.path) as conn:
2133
- mode = mode or self.mode
2134
- for _data in data:
2135
- conn.register(f"temp_{self.table_name}", _data)
2136
- if mode == "append":
2137
- conn.execute(
2138
- f"CREATE TABLE IF NOT EXISTS {self.table_name} AS SELECT * FROM temp_{self.table_name} LIMIT 0;"
2139
- )
2140
- conn.execute(
2141
- f"INSERT INTO {self.table_name} SELECT * FROM temp_{self.table_name};"
2142
- )
2143
- elif mode == "replace":
2144
- conn.execute(
2145
- f"CREATE OR REPLACE TABLE {self.table_name} AS SELECT * FROM temp_{self.table_name};"
2146
- )
2147
- elif mode == "fail":
2148
- try:
2149
- conn.execute(
2150
- f"CREATE TABLE {self.table_name} AS SELECT * FROM temp_{self.table_name};"
2151
- )
2152
- except Exception as e:
2153
- raise e
2154
-
2155
- conn.execute(
2156
- f"DROP TABLE temp_{self.table_name};"
2157
- ) # Fixed: TABLE not VIEW
2158
-
2159
- return self._metadata
2160
-
2161
- def _write_sqlalchemy(
2162
- self,
2163
- data: pl.DataFrame
2164
- | pl.LazyFrame
2165
- | pa.Table
2166
- | pa.RecordBatch
2167
- | pa.RecordBatchReader
2168
- | pd.DataFrame
2169
- | dict[str, Any]
2170
- | list[pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]],
2171
- mode: str | None = None,
2172
- concat: bool | None = None,
2173
- unique: bool | list[str] | str | None = None,
2174
- ) -> dict[str, Any]:
2175
- if not self.connection_string:
2176
- raise ValueError(f"{self.type_} requires a connection string.")
2177
-
2178
- data = to_pyarrow_table(
2179
- data, unique=unique or self.unique, concat=concat or self.concat
2180
- )
2181
- if not isinstance(data, list):
2182
- data = [data]
2183
-
2184
- self._metadata = get_dataframe_metadata(
2185
- df=data, path=self.connection_string, format=self.type_
2186
- )
2187
-
2188
- engine = create_engine(self.connection_string)
2189
- for _data in data:
2190
- df = self._to_pandas(_data)
2191
- df.to_sql(self.table_name, engine, if_exists=mode or self.mode, index=False)
2192
- engine.dispose()
2193
-
2194
- return self._metadata
2195
-
2196
- def write(
2197
- self,
2198
- data: pl.DataFrame
2199
- | pl.LazyFrame
2200
- | pa.Table
2201
- | pa.RecordBatch
2202
- | pa.RecordBatchReader
2203
- | pd.DataFrame
2204
- | dict[str, Any]
2205
- | list[pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]],
2206
- mode: str | None = None,
2207
- concat: bool | None = None,
2208
- unique: bool | list[str] | str | None = None,
2209
- ) -> dict[str, Any]:
2210
- """
2211
- Write data to database.
2212
-
2213
- Args:
2214
- data (pl.DataFrame | pl.LazyFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader | pd.DataFrame |
2215
- dict[str, Any] | list[pl.DataFrame | pl.LazyFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader |
2216
- pd.DataFrame | dict[str, Any]], optional): Data to write.
2217
- mode (str, optional): Write mode (append, replace, fail).
2218
- concat (bool, optional): Concatenate multiple files into a single DataFrame.
2219
- unique (bool | list[str] | str, optional): Unique columns for deduplication.
2220
-
2221
- Returns:
2222
- dict[str, Any]: Metadata of the written data
2223
- """
2224
- db = self.type_.lower()
2225
- if db == "sqlite":
2226
- return self._write_sqlite(
2227
- data=data, mode=mode, concat=concat, unique=unique
2228
- )
2229
- elif db == "duckdb":
2230
- return self._write_duckdb(
2231
- data=data, mode=mode, concat=concat, unique=unique
2232
- )
2233
- elif db in ["postgres", "mysql", "mssql", "oracle"]:
2234
- return self._write_sqlalchemy(
2235
- data=data, mode=mode, concat=concat, unique=unique
2236
- )
2237
- else:
2238
- raise ValueError(f"Unsupported database type: {self.type_}")
2239
-
2240
- @property
2241
- def metadata(self):
2242
- if not hasattr(self, "_metadata"):
2243
- return {}
2244
- return self._metadata
2245
-
2246
-
2247
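`_write_duckdb` implements append mode by registering the incoming Arrow data, creating the target table from its schema if it does not yet exist, and inserting from the registration. A standalone sketch of that SQL sequence (in-memory database, made-up table names; `unregister` is used here as one way to drop the temporary registration):

```python
import duckdb
import pyarrow as pa

tbl = pa.table({"id": [1, 2], "value": ["a", "b"]})

with duckdb.connect() as conn:
    conn.register("temp_events", tbl)
    # Create the target table with the same schema but no rows, if needed
    conn.execute("CREATE TABLE IF NOT EXISTS events AS SELECT * FROM temp_events LIMIT 0;")
    conn.execute("INSERT INTO events SELECT * FROM temp_events;")
    conn.unregister("temp_events")
    print(conn.sql("SELECT count(*) FROM events").fetchone())
```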
- # @attrs.define # Removed
2248
- class BaseDatabaseReader(BaseDatabaseIO, gc=False):
2249
- """
2250
- Base class for database read operations supporting various database systems.
2251
- This class provides a foundation for database read operations across different database systems
2252
- including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle.
2253
-
2254
- Args:
2255
- type_ (str): Database type (sqlite, duckdb, postgres, mysql, mssql, oracle).
2256
- table_name (str): Table name in the database.
2257
- path (str | None, optional): File path for SQLite or DuckDB databases.
2258
- connection_string (str | None, optional): Connection string for SQLAlchemy-based databases.
2259
- username (str | None, optional): Username for the database.
2260
- password (str | None, optional): Password for the database.
2261
- server (str | None, optional): Server address for the database.
2262
- port (str | None, optional): Port number for the database.
2263
- database (str | None, optional): Database name.
2264
- query (str | None, optional): SQL query to execute.
2265
-
2266
- Examples:
2267
- ```python
2268
- db_reader = BaseDatabaseReader(
2269
- type_="sqlite",
2270
- table_name="table_name",
2271
- path="path/to/database.db"
2272
- )
2273
- data = db_reader.read()
2274
- ```
2275
- Notes:
2276
- - Supports multiple database systems including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle
2277
- - Automatically handles database initialization based on connection parameters
2278
- - Supports reading data from databases into DataFrames
2279
- """
2280
-
2281
- query: str | None = None
2282
-
2283
- def __post_init__(self): # Renamed from __attrs_post_init__
2284
- super().__post_init__() # Call super's post_init if BaseDatabaseIO has one and it's needed
2285
- if self.connection_string is not None:
2286
- if "+" in self.connection_string:
2287
- self.connection_string = (
2288
- f"{self.connection_string.split('+')[0]}://"
2289
- f"{self.connection_string.split('://')[1]}"
2290
- )
2291
-
2292
- def _load(self, query: str | None = None, reload: bool = False, **kwargs) -> None:
2293
- """Load data from database.
2294
-
2295
- Args:
2296
- query (str, optional): SQL query to execute. If None, loads all data from the table.
2297
- reload (bool, optional): Reload data if True.
2298
- **kwargs: Additional keyword arguments.
2299
-
2300
- Returns:
2301
- None
2302
- """
2303
- if query is None:
2304
- query = f"SELECT * FROM {self.table_name}"
2305
- else:
2306
- query = query.replace("table", self.table_name)
2307
-
2308
- if "engine" in kwargs:
2309
- engine = kwargs.pop("engine", "adbc")
2310
- else:
2311
- engine = "adbc"
2312
-
2313
- if query != self.query:
2314
- reload = True
2315
-
2316
- self.query = query
2317
-
2318
- if self.type_ == "duckdb":
2319
- if not self.path:
2320
- raise ValueError("DuckDB requires a file path.")
2321
-
2322
- if not hasattr(self, "_data") or self._data is None or reload:
2323
- with duckdb.connect(database=self.path) as conn:
2324
- self._data = conn.execute(query).arrow()
2325
-
2326
- else:
2327
- if not self.connection_string:
2328
- raise ValueError(f"{self.type_} requires a connection string.")
2329
- if not hasattr(self, "_data") or self._data is None or reload:
2330
- if engine == "connectorx":
2331
- cs = self.connection_string.replace("///", "//")
2332
- else:
2333
- cs = self.connection_string
2334
- data = (
2335
- pl.read_database_uri(
2336
- query=query,
2337
- uri=cs,
2338
- engine=engine,
2339
- **kwargs,
2340
- )
2341
- ).to_arrow()
2342
- self._data = data.cast(convert_large_types_to_standard(data.schema))
2343
-
2344
- self._metadata = get_dataframe_metadata(
2345
- self._data, path=self.connection_string, format=self.type_
2346
- )
2347
-
2348
- def to_polars(
2349
- self,
2350
- query: str | None = None,
2351
- reload: bool = False,
2352
- metadata: bool = False,
2353
- **kwargs,
2354
- ) -> pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]:
2355
- """Convert data to Polars DataFrame.
2356
-
2357
- Args:
2358
- query (str, optional): SQL query to execute. If None, loads all data from the table.
2359
- reload (bool, optional): Reload data if True.
2360
- metadata (bool, optional): Include metadata in the output. Default is False.
2361
- **kwargs: Additional keyword arguments.
2362
-
2363
- Returns:
2364
- pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]: Polars DataFrame or tuple of DataFrame and metadata.
2365
- """
2366
- self._load(query=query, reload=reload, **kwargs)
2367
- df = pl.from_arrow(self._data)
2368
- if metadata:
2369
- return df, self.metadata
2370
- return df
2371
-
2372
- def to_pandas(
2373
- self,
2374
- query: str | None = None,
2375
- reload: bool = False,
2376
- metadata: bool = False,
2377
- **kwargs,
2378
- ) -> pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]:
2379
- """Convert data to Pandas DataFrame.
2380
-
2381
- Args:
2382
- query (str, optional): SQL query to execute. If None, loads all data from the table.
2383
- reload (bool, optional): Reload data if True.
2384
- metadata (bool, optional): Include metadata in the output. Default is False.
2385
- **kwargs: Additional keyword arguments.
2386
-
2387
- Returns:
2388
- pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]: Pandas DataFrame or tuple of DataFrame and metadata.
2389
- """
2390
- self._load(query=query, reload=reload, **kwargs)
2391
- df = self._data.to_pandas()
2392
- if metadata:
2393
- return df, self.metadata
2394
- return df
2395
-
2396
- def to_pyarrow_table(
2397
- self,
2398
- query: str | None = None,
2399
- reload: bool = False,
2400
- metadata: bool = False,
2401
- **kwargs,
2402
- ) -> pa.Table | tuple[pa.Table, dict[str, Any]]:
2403
- """Convert data to PyArrow Table.
2404
-
2405
- Args:
2406
- query (str, optional): SQL query to execute. If None, loads all data from the table.
2407
- reload (bool, optional): Reload data if True.
2408
- metadata (bool, optional): Include metadata in the output. Default is False.
2409
- **kwargs: Additional keyword arguments.
2410
-
2411
- Returns:
2412
- pa.Table | tuple[pa.Table, dict[str, Any]]: PyArrow Table or tuple of Table and metadata.
2413
- """
2414
- self._load(query=query, reload=reload, **kwargs)
2415
- if metadata:
2416
- return self._data, self.metadata
2417
- return self._data
2418
-
2419
- def to_duckdb_relation(
2420
- self,
2421
- query: str | None = None,
2422
- reload: bool = False,
2423
- metadata: bool = False,
2424
- conn: duckdb.DuckDBPyConnection | None = None,
2425
- **kwargs,
2426
- ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
2427
- """Convert data to DuckDB relation.
2428
-
2429
- Args:
2430
- query (str, optional): SQL query to execute. If None, loads all data from the table.
2431
- reload (bool, optional): Reload data if True.
2432
- conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
2433
- metadata (bool, optional): Include metadata in the output. Default is False.
2434
- **kwargs: Additional keyword arguments.
2435
-
2436
- Returns:
2437
- duckdb.DuckDBPyRelation: DuckDB relation.
2438
- """
2439
- self._load(query=query, reload=reload, **kwargs)
2440
- if self._conn is None:
2441
- if conn is None:
2442
- conn = duckdb.connect()
2443
- self._conn = conn
2444
- if metadata:
2445
- return self._conn.from_arrow(self._data), self.metadata
2446
- return self._conn.from_arrow(self._data)
2447
-
2448
- def register_in_duckdb(
2449
- self,
2450
- query: str | None = None,
2451
- reload: bool = False,
2452
- conn: duckdb.DuckDBPyConnection | None = None,
2453
- name: str | None = None,
2454
- **kwargs,
2455
- ) -> None:
2456
- """Register data in DuckDB.
2457
-
2458
- Args:
2459
- query (str, optional): SQL query to execute. If None, loads all data from the table.
2460
- reload (bool, optional): Reload data if True.
2461
- conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
2462
- name (str, optional): Name of the relation.
2463
- **kwargs: Additional keyword arguments.
2464
-
2465
- Returns:
2466
- None
2467
- """
2468
- if name is None:
2469
- name = f"{self.type_}:{self.table_name}"
2470
-
2471
- if self._conn is None:
2472
- if conn is None:
2473
- conn = duckdb.connect()
2474
- self._conn = conn
2475
-
2476
- self._load(query=query, reload=reload, **kwargs)
2477
- self._conn.register(name, self._data)
2478
-
2479
- def register_in_datafusion(
2480
- self,
2481
- query: str | None = None,
2482
- reload: bool = False,
2483
- ctx: datafusion.SessionContext | None = None,
2484
- name: str | None = None,
2485
- **kwargs,
2486
- ) -> None:
2487
- """Register data in DataFusion.
2488
-
2489
- Args:
2490
- query (str, optional): SQL query to execute. If None, loads all data from the table.
2491
- reload (bool, optional): Reload data if True.
2492
- ctx (datafusion.SessionContext, optional): DataFusion session context instance.
2493
- name (str, optional): Name of the relation.
2494
- **kwargs: Additional keyword arguments.
2495
-
2496
- Returns:
2497
- None
2498
- """
2499
- if name is None:
2500
- name = f"{self.type_}:{self.table_name}"
2501
-
2502
- if self._ctx is None:
2503
- if ctx is None:
2504
- ctx = datafusion.SessionContext()
2505
- self._ctx = ctx
2506
-
2507
- self._load(query=query, reload=reload, **kwargs)
2508
-
2509
- self._ctx.register_record_batches(name, [self.to_pyarrow_table().to_batches()])
2510
-
2511
- @property
2512
- def metadata(self):
2513
- if not hasattr(self, "_metadata"):
2514
- self._load()
2515
- return self._metadata
2516
-
2517
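Finally, note that `_load` treats a custom query as a template: the literal substring "table" is replaced with the configured `table_name` before execution, and results are cached until the query changes or `reload=True`. A hypothetical usage of the removed reader (pre-0.20 API, illustrative paths only):

```python
# Hypothetical usage of the pre-0.20 API; the module was removed in 0.20.0.
from flowerpower.plugins.io.base import BaseDatabaseReader  # old import path

reader = BaseDatabaseReader(
    type_="sqlite",
    table_name="events",
    path="path/to/database.db",
)

# Executed as "SELECT * FROM events WHERE id > 10" after substitution
df = reader.to_polars(query="SELECT * FROM table WHERE id > 10")
```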