FlowerPower 0.11.6.20-py3-none-any.whl → 0.21.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/__init__.py +2 -6
- flowerpower/cfg/__init__.py +7 -14
- flowerpower/cfg/base.py +29 -25
- flowerpower/cfg/pipeline/__init__.py +8 -6
- flowerpower/cfg/pipeline/_schedule.py +32 -0
- flowerpower/cfg/pipeline/adapter.py +0 -5
- flowerpower/cfg/pipeline/builder.py +377 -0
- flowerpower/cfg/pipeline/run.py +36 -0
- flowerpower/cfg/project/__init__.py +11 -24
- flowerpower/cfg/project/adapter.py +0 -12
- flowerpower/cli/__init__.py +2 -21
- flowerpower/cli/cfg.py +0 -3
- flowerpower/cli/mqtt.py +0 -6
- flowerpower/cli/pipeline.py +22 -415
- flowerpower/cli/utils.py +0 -1
- flowerpower/flowerpower.py +345 -146
- flowerpower/pipeline/__init__.py +2 -0
- flowerpower/pipeline/base.py +21 -12
- flowerpower/pipeline/io.py +58 -54
- flowerpower/pipeline/manager.py +165 -726
- flowerpower/pipeline/pipeline.py +643 -0
- flowerpower/pipeline/registry.py +285 -18
- flowerpower/pipeline/visualizer.py +5 -6
- flowerpower/plugins/io/__init__.py +8 -0
- flowerpower/plugins/mqtt/__init__.py +7 -11
- flowerpower/settings/__init__.py +0 -2
- flowerpower/settings/{backend.py → _backend.py} +0 -21
- flowerpower/settings/logging.py +1 -1
- flowerpower/utils/logging.py +24 -12
- flowerpower/utils/misc.py +17 -256
- flowerpower/utils/monkey.py +1 -83
- flowerpower-0.21.0.dist-info/METADATA +463 -0
- flowerpower-0.21.0.dist-info/RECORD +44 -0
- flowerpower/cfg/pipeline/schedule.py +0 -74
- flowerpower/cfg/project/job_queue.py +0 -238
- flowerpower/cli/job_queue.py +0 -1061
- flowerpower/fs/__init__.py +0 -29
- flowerpower/fs/base.py +0 -662
- flowerpower/fs/ext.py +0 -2143
- flowerpower/fs/storage_options.py +0 -1420
- flowerpower/job_queue/__init__.py +0 -294
- flowerpower/job_queue/apscheduler/__init__.py +0 -11
- flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
- flowerpower/job_queue/apscheduler/manager.py +0 -1051
- flowerpower/job_queue/apscheduler/setup.py +0 -554
- flowerpower/job_queue/apscheduler/trigger.py +0 -169
- flowerpower/job_queue/apscheduler/utils.py +0 -311
- flowerpower/job_queue/base.py +0 -413
- flowerpower/job_queue/rq/__init__.py +0 -10
- flowerpower/job_queue/rq/_trigger.py +0 -37
- flowerpower/job_queue/rq/concurrent_workers/gevent_worker.py +0 -226
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -231
- flowerpower/job_queue/rq/manager.py +0 -1582
- flowerpower/job_queue/rq/setup.py +0 -154
- flowerpower/job_queue/rq/utils.py +0 -69
- flowerpower/mqtt.py +0 -12
- flowerpower/pipeline/job_queue.py +0 -583
- flowerpower/pipeline/runner.py +0 -603
- flowerpower/plugins/io/base.py +0 -2520
- flowerpower/plugins/io/helpers/datetime.py +0 -298
- flowerpower/plugins/io/helpers/polars.py +0 -875
- flowerpower/plugins/io/helpers/pyarrow.py +0 -570
- flowerpower/plugins/io/helpers/sql.py +0 -202
- flowerpower/plugins/io/loader/__init__.py +0 -28
- flowerpower/plugins/io/loader/csv.py +0 -37
- flowerpower/plugins/io/loader/deltatable.py +0 -190
- flowerpower/plugins/io/loader/duckdb.py +0 -19
- flowerpower/plugins/io/loader/json.py +0 -37
- flowerpower/plugins/io/loader/mqtt.py +0 -159
- flowerpower/plugins/io/loader/mssql.py +0 -26
- flowerpower/plugins/io/loader/mysql.py +0 -26
- flowerpower/plugins/io/loader/oracle.py +0 -26
- flowerpower/plugins/io/loader/parquet.py +0 -35
- flowerpower/plugins/io/loader/postgres.py +0 -26
- flowerpower/plugins/io/loader/pydala.py +0 -19
- flowerpower/plugins/io/loader/sqlite.py +0 -23
- flowerpower/plugins/io/metadata.py +0 -244
- flowerpower/plugins/io/saver/__init__.py +0 -28
- flowerpower/plugins/io/saver/csv.py +0 -36
- flowerpower/plugins/io/saver/deltatable.py +0 -186
- flowerpower/plugins/io/saver/duckdb.py +0 -19
- flowerpower/plugins/io/saver/json.py +0 -36
- flowerpower/plugins/io/saver/mqtt.py +0 -28
- flowerpower/plugins/io/saver/mssql.py +0 -26
- flowerpower/plugins/io/saver/mysql.py +0 -26
- flowerpower/plugins/io/saver/oracle.py +0 -26
- flowerpower/plugins/io/saver/parquet.py +0 -36
- flowerpower/plugins/io/saver/postgres.py +0 -26
- flowerpower/plugins/io/saver/pydala.py +0 -20
- flowerpower/plugins/io/saver/sqlite.py +0 -24
- flowerpower/plugins/mqtt/cfg.py +0 -17
- flowerpower/plugins/mqtt/manager.py +0 -962
- flowerpower/settings/job_queue.py +0 -87
- flowerpower/utils/scheduler.py +0 -311
- flowerpower-0.11.6.20.dist-info/METADATA +0 -537
- flowerpower-0.11.6.20.dist-info/RECORD +0 -102
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/top_level.txt +0 -0
flowerpower/plugins/io/base.py
DELETED
@@ -1,2520 +0,0 @@
|
|
1
|
-
import importlib
|
2
|
-
import os
|
3
|
-
import posixpath
|
4
|
-
from typing import Any, Generator
|
5
|
-
|
6
|
-
if importlib.util.find_spec("datafusion"):
|
7
|
-
import datafusion
|
8
|
-
else:
|
9
|
-
raise ImportError("To use this module, please install `flowerpower[io]`.")
|
10
|
-
import sqlite3
|
11
|
-
|
12
|
-
import duckdb
|
13
|
-
import msgspec
|
14
|
-
import pandas as pd
|
15
|
-
import pyarrow as pa
|
16
|
-
import pyarrow.dataset as pds
|
17
|
-
from fsspec import AbstractFileSystem
|
18
|
-
from msgspec import field
|
19
|
-
from pydala.dataset import ParquetDataset
|
20
|
-
from sqlalchemy import create_engine, text
|
21
|
-
|
22
|
-
from ...fs import get_filesystem
|
23
|
-
from ...fs.ext import _dict_to_dataframe, path_to_glob
|
24
|
-
from ...fs.storage_options import (AwsStorageOptions, AzureStorageOptions,
|
25
|
-
GcsStorageOptions, GitHubStorageOptions,
|
26
|
-
GitLabStorageOptions, StorageOptions)
|
27
|
-
from ...utils.misc import convert_large_types_to_standard, to_pyarrow_table
|
28
|
-
from .helpers.polars import pl
|
29
|
-
from .helpers.pyarrow import opt_dtype
|
30
|
-
from .helpers.sql import sql2polars_filter, sql2pyarrow_filter
|
31
|
-
from .metadata import get_dataframe_metadata, get_pyarrow_dataset_metadata
|
32
|
-
|
33
|
-
|
34
|
-
# @attrs.define # Removed
|
35
|
-
class BaseFileIO(msgspec.Struct, gc=False):
|
36
|
-
"""
|
37
|
-
Base class for file I/O operations supporting various storage backends.
|
38
|
-
This class provides a foundation for file operations across different storage systems
|
39
|
-
including AWS S3, Google Cloud Storage, Azure Blob Storage, GitHub, and GitLab.
|
40
|
-
|
41
|
-
Args:
|
42
|
-
path (str | list[str]): Path or list of paths to file(s).
|
43
|
-
storage_options (AwsStorageOptions | GcsStorageOptions | AzureStorageOptions |
|
44
|
-
GitHubStorageOptions | GitLabStorageOptions | dict[str, Any] | None, optional):
|
45
|
-
Storage-specific options for accessing remote filesystems.
|
46
|
-
fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
|
47
|
-
format (str, optional): File format extension (without dot).
|
48
|
-
|
49
|
-
Notes:
|
50
|
-
```python
|
51
|
-
file_io = BaseFileIO(
|
52
|
-
path="s3://bucket/path/to/files",
|
53
|
-
storage_options=AwsStorageOptions(
|
54
|
-
key="access_key",
|
55
|
-
secret="secret_key"
|
56
|
-
files = file_io.list_files()
|
57
|
-
```
|
58
|
-
Notes:
|
59
|
-
- Supports multiple cloud storage backends through different storage options
|
60
|
-
- Automatically handles filesystem initialization based on path protocol
|
61
|
-
- Supports both single path and multiple path inputs
|
62
|
-
- Can read credentials from environment variables when using from_env() methods
|
63
|
-
|
64
|
-
"""
|
65
|
-
|
66
|
-
path: str | list[str]
|
67
|
-
storage_options: (
|
68
|
-
StorageOptions
|
69
|
-
| AwsStorageOptions
|
70
|
-
| AzureStorageOptions
|
71
|
-
| GcsStorageOptions
|
72
|
-
| GitLabStorageOptions
|
73
|
-
| GitHubStorageOptions
|
74
|
-
| dict[str, Any]
|
75
|
-
| None
|
76
|
-
) = field(default=None)
|
77
|
-
fs: AbstractFileSystem | None = field(default=None)
|
78
|
-
format: str | None = None
|
79
|
-
# _base_path: str | list[str] | None = field(default=None)
|
80
|
-
# _full_path: str | list[str] | None = field(default=None)
|
81
|
-
# _rel_path: str | list[str] | None = field(default=None)
|
82
|
-
# _glob_path
|
83
|
-
_metadata: dict[str, Any] | None = field(default=None)
|
84
|
-
|
85
|
-
def __post_init__(self):
|
86
|
-
# self._base_path = self.path if isinstance(self.path, str) else os.path.commonpath(self.path)
|
87
|
-
|
88
|
-
# if self.fs is None:
|
89
|
-
self.fs = get_filesystem(
|
90
|
-
path=self._base_path,
|
91
|
-
storage_options=self.storage_options,
|
92
|
-
fs=self.fs,
|
93
|
-
dirfs=True,
|
94
|
-
)
|
95
|
-
|
96
|
-
self.storage_options = (
|
97
|
-
self.storage_options or self.fs.storage_options
|
98
|
-
if self.protocol != "dir"
|
99
|
-
else self.fs.fs.storage_options
|
100
|
-
)
|
101
|
-
|
102
|
-
@property
|
103
|
-
def protocol(self):
|
104
|
-
"""Get the protocol of the filesystem."""
|
105
|
-
protocol = (
|
106
|
-
self.fs.protocol if self.fs.protocol != "dir" else self.fs.fs.protocol
|
107
|
-
)
|
108
|
-
if isinstance(protocol, list | tuple):
|
109
|
-
protocol = protocol[0]
|
110
|
-
return protocol
|
111
|
-
|
112
|
-
@property
|
113
|
-
def _base_path(self) -> str:
|
114
|
-
"""Get the base path for the filesystem."""
|
115
|
-
if isinstance(self.path, list):
|
116
|
-
base_path = posixpath.commonpath(self.path).rstrip("/*")
|
117
|
-
else:
|
118
|
-
base_path = self.path
|
119
|
-
|
120
|
-
if self.format in base_path:
|
121
|
-
base_path = posixpath.dirname(base_path).rstrip("/")
|
122
|
-
|
123
|
-
return base_path
|
124
|
-
|
125
|
-
@property
|
126
|
-
def _path(self) -> str | list[str]:
|
127
|
-
if self.fs.protocol == "dir":
|
128
|
-
if isinstance(self.path, list):
|
129
|
-
return [
|
130
|
-
p.replace(self._base_path.lstrip("/"), "").lstrip("/")
|
131
|
-
for p in self.path
|
132
|
-
]
|
133
|
-
else:
|
134
|
-
return self.path.replace(self._base_path.lstrip("/"), "").lstrip("/")
|
135
|
-
return self.path
|
136
|
-
|
137
|
-
@property
|
138
|
-
def _glob_path(self) -> str | list[str]:
|
139
|
-
if isinstance(self._path, list):
|
140
|
-
return self._path
|
141
|
-
return path_to_glob(self._path, self.format)
|
142
|
-
|
143
|
-
@property
|
144
|
-
def _root_path(self) -> str:
|
145
|
-
if self.fs.protocol == "dir":
|
146
|
-
return self._base_path.replace(self.fs.path, "")
|
147
|
-
return self._base_path
|
148
|
-
|
149
|
-
def list_files(self) -> list[str]:
|
150
|
-
if isinstance(self._path, list):
|
151
|
-
return self._path
|
152
|
-
|
153
|
-
return self.fs.glob(self._glob_path)
|
154
|
-
|
155
|
-
|
156
|
-
# @attrs.define # Removed
|
157
|
-
class BaseFileReader(BaseFileIO, gc=False):
|
158
|
-
"""
|
159
|
-
Base class for file loading operations supporting various file formats.
|
160
|
-
This class provides a foundation for file loading operations across different file formats
|
161
|
-
including CSV, Parquet, JSON, Arrow, and IPC.
|
162
|
-
|
163
|
-
Args:
|
164
|
-
path (str | list[str]): Path or list of paths to file(s).
|
165
|
-
format (str, optional): File format extension (without dot).
|
166
|
-
fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
|
167
|
-
include_file_path (bool, optional): Include file path in the output DataFrame.
|
168
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame.
|
169
|
-
conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
|
170
|
-
ctx (datafusion.SessionContext, optional): DataFusion session context instance.
|
171
|
-
|
172
|
-
Examples:
|
173
|
-
```python
|
174
|
-
file_loader = BaseFileReader(
|
175
|
-
path="s3://bucket/path/to/files",
|
176
|
-
format="csv",
|
177
|
-
include_file_path=True,
|
178
|
-
concat=True,
|
179
|
-
conn=duckdb.connect(),
|
180
|
-
ctx=datafusion.SessionContext()
|
181
|
-
data = file_loader.to_polars()
|
182
|
-
```
|
183
|
-
Notes:
|
184
|
-
- Supports multiple file formats including CSV, Parquet, JSON, Arrow, and IPC
|
185
|
-
- Automatically handles filesystem initialization based on path protocol
|
186
|
-
- Supports both single path and multiple path inputs
|
187
|
-
- Supports loading data into DuckDB and DataFusion for SQL operations
|
188
|
-
|
189
|
-
"""
|
190
|
-
|
191
|
-
include_file_path: bool = field(default=False)
|
192
|
-
concat: bool = field(default=True)
|
193
|
-
batch_size: int | None = field(default=None)
|
194
|
-
opt_dtypes: bool = field(default=False)
|
195
|
-
use_threads: bool = field(default=True)
|
196
|
-
conn: duckdb.DuckDBPyConnection | None = field(default=None)
|
197
|
-
ctx: datafusion.SessionContext | None = field(default=None)
|
198
|
-
jsonlines: bool | None = field(default=None)
|
199
|
-
partitioning: str | list[str] | pds.Partitioning | None = field(default=None)
|
200
|
-
verbose: bool | None = field(default=None)
|
201
|
-
_data: Any | None = field(default=None)
|
202
|
-
|
203
|
-
def _load(
|
204
|
-
self,
|
205
|
-
metadata: bool = False,
|
206
|
-
reload: bool = False,
|
207
|
-
batch_size: int | None = None,
|
208
|
-
include_file_path: bool = False,
|
209
|
-
concat: bool | None = None,
|
210
|
-
use_threads: bool | None = None,
|
211
|
-
verbose: bool | None = None,
|
212
|
-
opt_dtypes: bool | None = None,
|
213
|
-
**kwargs,
|
214
|
-
):
|
215
|
-
if batch_size is not None:
|
216
|
-
if self.batch_size != batch_size:
|
217
|
-
reload = True
|
218
|
-
self.batch_size = batch_size
|
219
|
-
|
220
|
-
if include_file_path is not None:
|
221
|
-
if self.include_file_path != include_file_path:
|
222
|
-
reload = True
|
223
|
-
self.include_file_path = include_file_path
|
224
|
-
|
225
|
-
if concat is not None:
|
226
|
-
if self.concat != concat:
|
227
|
-
reload = True
|
228
|
-
self.concat = concat
|
229
|
-
|
230
|
-
if use_threads is not None:
|
231
|
-
if self.use_threads != use_threads:
|
232
|
-
reload = True
|
233
|
-
self.use_threads = use_threads
|
234
|
-
|
235
|
-
if verbose is not None:
|
236
|
-
if self.verbose != verbose:
|
237
|
-
reload = True
|
238
|
-
self.verbose = verbose
|
239
|
-
|
240
|
-
if opt_dtypes is not None:
|
241
|
-
if self.opt_dtypes != opt_dtypes:
|
242
|
-
reload = True
|
243
|
-
self.opt_dtypes = opt_dtypes
|
244
|
-
|
245
|
-
if "partitioning" in kwargs:
|
246
|
-
if self.partitioning != kwargs["partitioning"]:
|
247
|
-
reload = True
|
248
|
-
self.partitioning = kwargs.pop("partitioning")
|
249
|
-
|
250
|
-
if not hasattr(self, "_data") or self._data is None or reload:
|
251
|
-
self._data = self.fs.read_files(
|
252
|
-
path=self._glob_path,
|
253
|
-
format=self.format,
|
254
|
-
include_file_path=True if metadata or self.include_file_path else False,
|
255
|
-
concat=self.concat,
|
256
|
-
jsonlines=self.jsonlines or None,
|
257
|
-
batch_size=self.batch_size,
|
258
|
-
partitioning=self.partitioning,
|
259
|
-
opt_dtypes=self.opt_dtypes,
|
260
|
-
verbose=self.verbose,
|
261
|
-
use_threads=self.use_threads,
|
262
|
-
**kwargs,
|
263
|
-
)
|
264
|
-
if metadata:
|
265
|
-
if isinstance(self._data, tuple | list):
|
266
|
-
self._metadata = [
|
267
|
-
get_dataframe_metadata(
|
268
|
-
df=df,
|
269
|
-
path=self.path,
|
270
|
-
format=self.format,
|
271
|
-
num_files=pl.from_arrow(df.select(["file_path"])).select(
|
272
|
-
pl.n_unique("file_path")
|
273
|
-
)[0, 0]
|
274
|
-
if isinstance(df, pa.Table)
|
275
|
-
else df.select(pl.n_unique("file_path"))[0, 0],
|
276
|
-
)
|
277
|
-
for df in self._data
|
278
|
-
]
|
279
|
-
if not self.include_file_path:
|
280
|
-
self._data = [df.drop("file_path") for df in self._data]
|
281
|
-
|
282
|
-
elif isinstance(self._data, pa.Table):
|
283
|
-
self._metadata = get_dataframe_metadata(
|
284
|
-
df=self._data,
|
285
|
-
path=self.path,
|
286
|
-
format=self.format,
|
287
|
-
num_files=pl.from_arrow(
|
288
|
-
self._data.select(pl.n_unique("file_path"))
|
289
|
-
)[0, 0],
|
290
|
-
)
|
291
|
-
if not self.include_file_path:
|
292
|
-
self._data = self._data.drop("file_path")
|
293
|
-
|
294
|
-
elif isinstance(self._data, pl.DataFrame | pl.LazyFrame):
|
295
|
-
self._metadata = get_dataframe_metadata(
|
296
|
-
df=self._data,
|
297
|
-
path=self.path,
|
298
|
-
format=self.format,
|
299
|
-
num_files=self._data.select(pl.n_unique("file_path"))[0, 0]
|
300
|
-
if isinstance(self._data, pl.DataFrame)
|
301
|
-
else self._data.select(pl.n_unique("file_path")).collect()[
|
302
|
-
0, 0
|
303
|
-
],
|
304
|
-
)
|
305
|
-
|
306
|
-
if not self.include_file_path:
|
307
|
-
self._data = self._data.drop("file_path")
|
308
|
-
else:
|
309
|
-
metadata = {}
|
310
|
-
else:
|
311
|
-
self._metadata = {}
|
312
|
-
|
313
|
-
def to_pandas(
|
314
|
-
self,
|
315
|
-
metadata: bool = False,
|
316
|
-
reload: bool = False,
|
317
|
-
include_file_path: bool = False,
|
318
|
-
concat: bool | None = None,
|
319
|
-
use_threads: bool | None = None,
|
320
|
-
verbose: bool | None = None,
|
321
|
-
opt_dtypes: bool | None = None,
|
322
|
-
**kwargs,
|
323
|
-
) -> (
|
324
|
-
tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]]
|
325
|
-
| pd.DataFrame
|
326
|
-
| list[pd.DataFrame]
|
327
|
-
):
|
328
|
-
"""Convert data to Pandas DataFrame(s).
|
329
|
-
|
330
|
-
Args:
|
331
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
332
|
-
reload (bool, optional): Reload data if True. Default is False.
|
333
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
334
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
335
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
336
|
-
verbose (bool, optional): Verbose output. Default is None.
|
337
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
338
|
-
kwargs: Additional keyword arguments.
|
339
|
-
|
340
|
-
Returns:
|
341
|
-
tuple[pd.DataFrame | list[pd.DataFrame], dict[str, Any]] | pd.DataFrame | list[pd.DataFrame]: Pandas
|
342
|
-
DataFrame or list of DataFrames and optional metadata.
|
343
|
-
"""
|
344
|
-
kwargs.pop("batch_size", None)
|
345
|
-
self._load(
|
346
|
-
reload=reload,
|
347
|
-
metadata=metadata,
|
348
|
-
batch_size=None,
|
349
|
-
include_file_path=include_file_path,
|
350
|
-
concat=concat,
|
351
|
-
use_threads=use_threads,
|
352
|
-
verbose=verbose,
|
353
|
-
opt_dtypes=opt_dtypes,
|
354
|
-
**kwargs,
|
355
|
-
)
|
356
|
-
if isinstance(self._data, list):
|
357
|
-
df = [
|
358
|
-
df if isinstance(df, pd.DataFrame) else df.to_pandas()
|
359
|
-
for df in self._data
|
360
|
-
]
|
361
|
-
df = pd.concat(df) if self.concat else df
|
362
|
-
else:
|
363
|
-
df = (
|
364
|
-
self._data
|
365
|
-
if isinstance(self._data, pd.DataFrame)
|
366
|
-
else self._data.to_pandas()
|
367
|
-
)
|
368
|
-
if metadata:
|
369
|
-
# metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
370
|
-
return df, self._metadata
|
371
|
-
return df
|
372
|
-
|
373
|
-
def iter_pandas(
|
374
|
-
self,
|
375
|
-
reload: bool = False,
|
376
|
-
batch_size: int | None = None,
|
377
|
-
include_file_path: bool = False,
|
378
|
-
concat: bool | None = None,
|
379
|
-
use_threads: bool | None = None,
|
380
|
-
verbose: bool | None = None,
|
381
|
-
opt_dtypes: bool | None = None,
|
382
|
-
**kwargs,
|
383
|
-
) -> Generator[pd.DataFrame, None, None]:
|
384
|
-
"""Iterate over Pandas DataFrames.
|
385
|
-
|
386
|
-
Args:
|
387
|
-
batch_size (int, optional): Batch size for iteration. Default is 1.
|
388
|
-
reload (bool, optional): Reload data if True. Default is False.
|
389
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
390
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
391
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
392
|
-
verbose (bool, optional): Verbose output. Default is None.
|
393
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
394
|
-
kwargs: Additional keyword arguments.
|
395
|
-
|
396
|
-
Returns:
|
397
|
-
Generator[pd.DataFrame, None, None]: Generator of Pandas DataFrames.
|
398
|
-
"""
|
399
|
-
batch_size = batch_size or self.batch_size or 1
|
400
|
-
|
401
|
-
self._load(
|
402
|
-
reload=reload,
|
403
|
-
batch_size=batch_size,
|
404
|
-
include_file_path=include_file_path,
|
405
|
-
concat=concat,
|
406
|
-
use_threads=use_threads,
|
407
|
-
verbose=verbose,
|
408
|
-
opt_dtypes=opt_dtypes,
|
409
|
-
**kwargs,
|
410
|
-
)
|
411
|
-
|
412
|
-
if isinstance(self._data, list | Generator):
|
413
|
-
for df in self._data:
|
414
|
-
yield df if isinstance(df, pd.DataFrame) else df.to_pandas()
|
415
|
-
else:
|
416
|
-
yield (
|
417
|
-
self._data
|
418
|
-
if isinstance(self._data, pd.DataFrame)
|
419
|
-
else self._data.to_pandas()
|
420
|
-
)
|
421
|
-
|
422
|
-
def _to_polars_dataframe(
|
423
|
-
self,
|
424
|
-
metadata: bool = False,
|
425
|
-
reload: bool = False,
|
426
|
-
include_file_path: bool = False,
|
427
|
-
concat: bool | None = None,
|
428
|
-
use_threads: bool | None = None,
|
429
|
-
verbose: bool | None = None,
|
430
|
-
opt_dtypes: bool | None = None,
|
431
|
-
**kwargs,
|
432
|
-
) -> (
|
433
|
-
tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]]
|
434
|
-
| pl.DataFrame
|
435
|
-
| list[pl.DataFrame]
|
436
|
-
):
|
437
|
-
"""Convert data to Polars DataFrame(s).
|
438
|
-
|
439
|
-
Args:
|
440
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
441
|
-
reload (bool, optional): Reload data if True. Default is False.
|
442
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
443
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
444
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
445
|
-
verbose (bool, optional): Verbose output. Default is None.
|
446
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
447
|
-
kwargs: Additional keyword arguments.
|
448
|
-
|
449
|
-
Returns:
|
450
|
-
tuple[pl.DataFrame | list[pl.DataFrame], dict[str, Any]] | pl.DataFrame | list[pl.DataFrame]: Polars
|
451
|
-
DataFrame or list of DataFrames and optional metadata.
|
452
|
-
"""
|
453
|
-
kwargs.pop("batch_size", None)
|
454
|
-
|
455
|
-
self._load(
|
456
|
-
metadata=metadata,
|
457
|
-
reload=reload,
|
458
|
-
batch_size=None,
|
459
|
-
include_file_path=include_file_path,
|
460
|
-
concat=concat,
|
461
|
-
use_threads=use_threads,
|
462
|
-
verbose=verbose,
|
463
|
-
opt_dtypes=opt_dtypes,
|
464
|
-
**kwargs,
|
465
|
-
)
|
466
|
-
if isinstance(self._data, list):
|
467
|
-
df = [
|
468
|
-
df if isinstance(self._data, pl.DataFrame) else pl.from_arrow(df)
|
469
|
-
for df in self._data
|
470
|
-
]
|
471
|
-
df = pl.concat(df) if self.concat else df
|
472
|
-
else:
|
473
|
-
df = (
|
474
|
-
self._data
|
475
|
-
if isinstance(self._data, pl.DataFrame)
|
476
|
-
else pl.from_arrow(self._data)
|
477
|
-
)
|
478
|
-
if metadata:
|
479
|
-
# metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
480
|
-
return df, self._metadata
|
481
|
-
return df
|
482
|
-
|
483
|
-
def _iter_polars_dataframe(
|
484
|
-
self,
|
485
|
-
reload: bool = False,
|
486
|
-
batch_size: int | None = None,
|
487
|
-
include_file_path: bool = False,
|
488
|
-
concat: bool | None = None,
|
489
|
-
use_threads: bool | None = None,
|
490
|
-
verbose: bool | None = None,
|
491
|
-
opt_dtypes: bool | None = None,
|
492
|
-
**kwargs,
|
493
|
-
) -> Generator[pl.DataFrame, None, None]:
|
494
|
-
"""Iterate over Polars DataFrames.
|
495
|
-
|
496
|
-
Args:
|
497
|
-
batch_size (int, optional): Batch size for iteration. Default is 1.
|
498
|
-
reload (bool, optional): Reload data if True. Default is False.
|
499
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
500
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
501
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
502
|
-
verbose (bool, optional): Verbose output. Default is None.
|
503
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
504
|
-
kwargs: Additional keyword arguments.
|
505
|
-
|
506
|
-
Returns:
|
507
|
-
Generator[pl.DataFrame, None, None]: Generator of Polars DataFrames.
|
508
|
-
"""
|
509
|
-
batch_size = batch_size or self.batch_size or 1
|
510
|
-
|
511
|
-
self._load(
|
512
|
-
reload=reload,
|
513
|
-
batch_size=batch_size,
|
514
|
-
include_file_path=include_file_path,
|
515
|
-
concat=concat,
|
516
|
-
use_threads=use_threads,
|
517
|
-
verbose=verbose,
|
518
|
-
opt_dtypes=opt_dtypes,
|
519
|
-
**kwargs,
|
520
|
-
)
|
521
|
-
if isinstance(self._data, list | Generator):
|
522
|
-
for df in self._data:
|
523
|
-
yield df if isinstance(df, pl.DataFrame) else pl.from_arrow(df)
|
524
|
-
else:
|
525
|
-
yield (
|
526
|
-
self._data
|
527
|
-
if isinstance(self._data, pl.DataFrame)
|
528
|
-
else pl.from_arrow(self._data)
|
529
|
-
)
|
530
|
-
|
531
|
-
def _to_polars_lazyframe(
|
532
|
-
self,
|
533
|
-
metadata: bool = False,
|
534
|
-
reload: bool = False,
|
535
|
-
include_file_path: bool = False,
|
536
|
-
concat: bool | None = None,
|
537
|
-
use_threads: bool | None = None,
|
538
|
-
verbose: bool | None = None,
|
539
|
-
opt_dtypes: bool | None = None,
|
540
|
-
**kwargs,
|
541
|
-
) -> (
|
542
|
-
tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]]
|
543
|
-
| pl.LazyFrame
|
544
|
-
| list[pl.LazyFrame]
|
545
|
-
):
|
546
|
-
"""Convert data to Polars LazyFrame(s).
|
547
|
-
|
548
|
-
Args:
|
549
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
550
|
-
reload (bool, optional): Reload data if True. Default is False.
|
551
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
552
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
553
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
554
|
-
verbose (bool, optional): Verbose output. Default is None.
|
555
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
556
|
-
kwargs: Additional keyword arguments.
|
557
|
-
|
558
|
-
Returns:
|
559
|
-
tuple[pl.LazyFrame | list[pl.LazyFrame], dict[str, Any]] | pl.LazyFrame | list[pl.LazyFrame]: Polars
|
560
|
-
LazyFrame or list of LazyFrames and optional metadata.
|
561
|
-
"""
|
562
|
-
kwargs.pop("batch_size", None)
|
563
|
-
|
564
|
-
self._load(
|
565
|
-
metadata=metadata,
|
566
|
-
reload=reload,
|
567
|
-
batch_size=None,
|
568
|
-
include_file_path=include_file_path,
|
569
|
-
concat=concat,
|
570
|
-
use_threads=use_threads,
|
571
|
-
verbose=verbose,
|
572
|
-
opt_dtypes=opt_dtypes,
|
573
|
-
**kwargs,
|
574
|
-
)
|
575
|
-
if not self.concat:
|
576
|
-
df = [df.lazy() for df in self._to_polars_dataframe()]
|
577
|
-
|
578
|
-
else:
|
579
|
-
df = self._to_polars_dataframe().lazy()
|
580
|
-
if metadata:
|
581
|
-
# metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
582
|
-
return df, self._metadata
|
583
|
-
return df
|
584
|
-
|
585
|
-
def _iter_polars_lazyframe(
|
586
|
-
self,
|
587
|
-
reload: bool = False,
|
588
|
-
batch_size: int | None = None,
|
589
|
-
include_file_path: bool = False,
|
590
|
-
concat: bool | None = None,
|
591
|
-
use_threads: bool | None = None,
|
592
|
-
verbose: bool | None = None,
|
593
|
-
opt_dtypes: bool | None = None,
|
594
|
-
**kwargs,
|
595
|
-
) -> Generator[pl.LazyFrame, None, None]:
|
596
|
-
"""Iterate over Polars LazyFrames.
|
597
|
-
|
598
|
-
Args:
|
599
|
-
batch_size (int, optional): Batch size for iteration. Default is 1.
|
600
|
-
reload (bool, optional): Reload data if True. Default is False.
|
601
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
602
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
603
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
604
|
-
verbose (bool, optional): Verbose output. Default is None.
|
605
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
606
|
-
kwargs: Additional keyword arguments.
|
607
|
-
|
608
|
-
Returns:
|
609
|
-
Generator[pl.LazyFrame, None, None]: Generator of Polars LazyFrames.
|
610
|
-
"""
|
611
|
-
batch_size = batch_size or self.batch_size or 1
|
612
|
-
|
613
|
-
self._load(
|
614
|
-
reload=reload,
|
615
|
-
batch_size=batch_size,
|
616
|
-
include_file_path=include_file_path,
|
617
|
-
concat=concat,
|
618
|
-
use_threads=use_threads,
|
619
|
-
verbose=verbose,
|
620
|
-
opt_dtypes=opt_dtypes,
|
621
|
-
**kwargs,
|
622
|
-
)
|
623
|
-
if isinstance(self._data, list | Generator):
|
624
|
-
for df in self._data:
|
625
|
-
yield (
|
626
|
-
df.lazy()
|
627
|
-
if isinstance(df, pl.DataFrame)
|
628
|
-
else pl.from_arrow(df).lazy()
|
629
|
-
)
|
630
|
-
else:
|
631
|
-
yield (
|
632
|
-
self._data.lazy()
|
633
|
-
if isinstance(self._data, pl.DataFrame)
|
634
|
-
else pl.from_arrow(self._data).lazy()
|
635
|
-
)
|
636
|
-
|
637
|
-
def to_polars(
|
638
|
-
self,
|
639
|
-
lazy: bool = False,
|
640
|
-
metadata: bool = False,
|
641
|
-
reload: bool = False,
|
642
|
-
include_file_path: bool = False,
|
643
|
-
concat: bool | None = None,
|
644
|
-
use_threads: bool | None = None,
|
645
|
-
verbose: bool | None = None,
|
646
|
-
opt_dtypes: bool | None = None,
|
647
|
-
**kwargs,
|
648
|
-
) -> (
|
649
|
-
pl.DataFrame
|
650
|
-
| pl.LazyFrame
|
651
|
-
| list[pl.DataFrame]
|
652
|
-
| list[pl.LazyFrame]
|
653
|
-
| tuple[
|
654
|
-
pl.DataFrame | pl.LazyFrame | list[pl.DataFrame] | list[pl.LazyFrame],
|
655
|
-
dict[str, Any],
|
656
|
-
]
|
657
|
-
):
|
658
|
-
"""Convert data to Polars DataFrame or LazyFrame.
|
659
|
-
|
660
|
-
Args:
|
661
|
-
lazy (bool, optional): Return a LazyFrame if True, else a DataFrame.
|
662
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
663
|
-
reload (bool, optional): Reload data if True. Default is False.
|
664
|
-
batch_size (int, optional): Batch size for iteration. Default is 1.
|
665
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
666
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
667
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
668
|
-
verbose (bool, optional): Verbose output. Default is None.
|
669
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
670
|
-
kwargs: Additional keyword arguments.
|
671
|
-
|
672
|
-
Returns:
|
673
|
-
pl.DataFrame | pl.LazyFrame | list[pl.DataFrame] | list[pl.LazyFrame] | tuple[pl.DataFrame | pl.LazyFrame
|
674
|
-
| list[pl.DataFrame] | list[pl.LazyFrame], dict[str, Any]]: Polars DataFrame or LazyFrame and optional
|
675
|
-
metadata.
|
676
|
-
"""
|
677
|
-
kwargs.pop("batch_size", None)
|
678
|
-
if lazy:
|
679
|
-
return self._to_polars_lazyframe(
|
680
|
-
metadata=metadata,
|
681
|
-
reload=reload,
|
682
|
-
batch_size=None,
|
683
|
-
include_file_path=include_file_path,
|
684
|
-
concat=concat,
|
685
|
-
use_threads=use_threads,
|
686
|
-
verbose=verbose,
|
687
|
-
opt_dtypes=opt_dtypes,
|
688
|
-
**kwargs,
|
689
|
-
)
|
690
|
-
return self._to_polars_dataframe(
|
691
|
-
metadata=metadata,
|
692
|
-
reload=reload,
|
693
|
-
batch_size=None,
|
694
|
-
include_file_path=include_file_path,
|
695
|
-
concat=concat,
|
696
|
-
use_threads=use_threads,
|
697
|
-
verbose=verbose,
|
698
|
-
opt_dtypes=opt_dtypes,
|
699
|
-
**kwargs,
|
700
|
-
)
|
701
|
-
|
702
|
-
def iter_polars(
|
703
|
-
self,
|
704
|
-
lazy: bool = False,
|
705
|
-
reload: bool = False,
|
706
|
-
batch_size: int | None = None,
|
707
|
-
include_file_path: bool = False,
|
708
|
-
concat: bool | None = None,
|
709
|
-
use_threads: bool | None = None,
|
710
|
-
verbose: bool | None = None,
|
711
|
-
opt_dtypes: bool | None = None,
|
712
|
-
**kwargs,
|
713
|
-
) -> Generator[pl.DataFrame | pl.LazyFrame, None, None]:
|
714
|
-
"""Iterate over Polars DataFrames or LazyFrames.
|
715
|
-
|
716
|
-
Args:
|
717
|
-
lazy (bool, optional): Return a LazyFrame if True, else a DataFrame. Default is False.
|
718
|
-
reload (bool, optional): Reload data if True. Default is False.
|
719
|
-
batch_size (int, optional): Batch size for iteration. Default is 1.
|
720
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
721
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame. Default is True.
|
722
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
723
|
-
verbose (bool, optional): Verbose output. Default is None.
|
724
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
725
|
-
kwargs: Additional keyword arguments.
|
726
|
-
|
727
|
-
Returns:
|
728
|
-
Generator[pl.DataFrame | pl.LazyFrame, None, None]: Generator of Polars DataFrames or LazyFrames.
|
729
|
-
"""
|
730
|
-
if lazy:
|
731
|
-
yield from self._iter_polars_lazyframe(
|
732
|
-
reload=reload,
|
733
|
-
batch_size=batch_size,
|
734
|
-
include_file_path=include_file_path,
|
735
|
-
concat=concat,
|
736
|
-
use_threads=use_threads,
|
737
|
-
verbose=verbose,
|
738
|
-
opt_dtypes=opt_dtypes,
|
739
|
-
**kwargs,
|
740
|
-
)
|
741
|
-
yield from self._iter_polars_dataframe(
|
742
|
-
reload=reload,
|
743
|
-
batch_size=batch_size,
|
744
|
-
include_file_path=include_file_path,
|
745
|
-
concat=concat,
|
746
|
-
use_threads=use_threads,
|
747
|
-
verbose=verbose,
|
748
|
-
opt_dtypes=opt_dtypes,
|
749
|
-
**kwargs,
|
750
|
-
)
|
751
|
-
|
752
|
-
def to_pyarrow_table(
|
753
|
-
self,
|
754
|
-
metadata: bool = False,
|
755
|
-
reload: bool = False,
|
756
|
-
include_file_path: bool = False,
|
757
|
-
use_threads: bool | None = None,
|
758
|
-
verbose: bool | None = None,
|
759
|
-
opt_dtypes: bool | None = None,
|
760
|
-
**kwargs,
|
761
|
-
) -> pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]:
|
762
|
-
"""Convert data to PyArrow Table(s).
|
763
|
-
|
764
|
-
Args:
|
765
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
766
|
-
reload (bool, optional): Reload data if True. Default is False.
|
767
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
768
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
769
|
-
verbose (bool, optional): Verbose output. Default is None.
|
770
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
771
|
-
kwargs: Additional keyword arguments.
|
772
|
-
|
773
|
-
Returns:
|
774
|
-
pa.Table | list[pa.Table] | tuple[pa.Table | list[pa.Table], dict[str, Any]]: PyArrow Table or list of
|
775
|
-
Tables and optional metadata.
|
776
|
-
"""
|
777
|
-
kwargs.pop("batch_size", None)
|
778
|
-
self._load(
|
779
|
-
reload=reload,
|
780
|
-
metadata=metadata,
|
781
|
-
batch_size=None,
|
782
|
-
include_file_path=include_file_path,
|
783
|
-
concat=None,
|
784
|
-
use_threads=use_threads,
|
785
|
-
verbose=verbose,
|
786
|
-
opt_dtypes=opt_dtypes,
|
787
|
-
**kwargs,
|
788
|
-
)
|
789
|
-
if isinstance(self._data, list):
|
790
|
-
df = [
|
791
|
-
df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
|
792
|
-
for df in self._data
|
793
|
-
]
|
794
|
-
df = pa.concat_tables(df) if self.concat else df
|
795
|
-
else:
|
796
|
-
df = (
|
797
|
-
self._data.to_arrow(**kwargs)
|
798
|
-
if isinstance(self._data, pl.DataFrame)
|
799
|
-
else self._data
|
800
|
-
)
|
801
|
-
if metadata:
|
802
|
-
# metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
803
|
-
return df, self._metadata
|
804
|
-
return df
|
805
|
-
|
806
|
-
def iter_pyarrow_table(
|
807
|
-
self,
|
808
|
-
reload: bool = False,
|
809
|
-
batch_size: int | None = None,
|
810
|
-
include_file_path: bool = False,
|
811
|
-
concat: bool | None = None,
|
812
|
-
use_threads: bool | None = None,
|
813
|
-
verbose: bool | None = None,
|
814
|
-
opt_dtypes: bool | None = None,
|
815
|
-
**kwargs,
|
816
|
-
) -> Generator[pa.Table, None, None]:
|
817
|
-
"""Iterate over PyArrow Tables.
|
818
|
-
|
819
|
-
Args:
|
820
|
-
reload (bool, optional): Reload data if True. Default is False.
|
821
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
822
|
-
concat (bool, optional): Concatenate multiple files into a single Table. Default is True.
|
823
|
-
batch_size (int, optional): Batch size for iteration. Default is 1.
|
824
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
825
|
-
verbose (bool, optional): Verbose output. Default is None.
|
826
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
827
|
-
kwargs: Additional keyword arguments.
|
828
|
-
|
829
|
-
Returns:
|
830
|
-
Generator[pa.Table, None, None]: Generator of PyArrow Tables.
|
831
|
-
"""
|
832
|
-
batch_size = batch_size or self.batch_size or 1
|
833
|
-
|
834
|
-
self._load(
|
835
|
-
reload=reload,
|
836
|
-
batch_size=batch_size,
|
837
|
-
include_file_path=include_file_path,
|
838
|
-
concat=concat,
|
839
|
-
use_threads=use_threads,
|
840
|
-
verbose=verbose,
|
841
|
-
opt_dtypes=opt_dtypes,
|
842
|
-
**kwargs,
|
843
|
-
)
|
844
|
-
if isinstance(self._data, list | Generator):
|
845
|
-
for df in self._data:
|
846
|
-
yield df.to_arrow(**kwargs) if isinstance(df, pl.DataFrame) else df
|
847
|
-
else:
|
848
|
-
yield (
|
849
|
-
self._data.to_arrow(**kwargs)
|
850
|
-
if isinstance(self._data, pl.DataFrame)
|
851
|
-
else self._data
|
852
|
-
)
|
853
|
-
|
854
|
-
def to_duckdb_relation(
|
855
|
-
self,
|
856
|
-
conn: duckdb.DuckDBPyConnection | None = None,
|
857
|
-
metadata: bool = False,
|
858
|
-
reload: bool = False,
|
859
|
-
include_file_path: bool = False,
|
860
|
-
use_threads: bool | None = None,
|
861
|
-
verbose: bool | None = None,
|
862
|
-
opt_dtypes: bool | None = None,
|
863
|
-
**kwargs,
|
864
|
-
) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
|
865
|
-
"""Convert data to DuckDB relation.
|
866
|
-
|
867
|
-
Args:
|
868
|
-
conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
|
869
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
870
|
-
reload (bool, optional): Reload data if True. Default is False.
|
871
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
872
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
873
|
-
verbose (bool, optional): Verbose output. Default is None.
|
874
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
875
|
-
kwargs: Additional keyword arguments.
|
876
|
-
|
877
|
-
Returns:
|
878
|
-
duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]: DuckDB relation and optional
|
879
|
-
metadata.
|
880
|
-
"""
|
881
|
-
kwargs.pop("batch_size", None)
|
882
|
-
if self._conn is None:
|
883
|
-
if conn is None:
|
884
|
-
conn = duckdb.connect()
|
885
|
-
self._conn = conn
|
886
|
-
|
887
|
-
if metadata:
|
888
|
-
return self._conn.from_arrow(
|
889
|
-
self.to_pyarrow_table(
|
890
|
-
metadata=metadata,
|
891
|
-
reload=reload,
|
892
|
-
batch_size=None,
|
893
|
-
include_file_path=include_file_path,
|
894
|
-
se_threads=use_threads,
|
895
|
-
verbose=verbose,
|
896
|
-
opt_dtypes=opt_dtypes,
|
897
|
-
**kwargs,
|
898
|
-
),
|
899
|
-
), self._metadata
|
900
|
-
return self._conn.from_arrow(
|
901
|
-
self.to_pyarrow_table(
|
902
|
-
reload=reload,
|
903
|
-
batch_size=None,
|
904
|
-
include_file_path=include_file_path,
|
905
|
-
use_threads=use_threads,
|
906
|
-
verbose=verbose,
|
907
|
-
opt_dtypes=opt_dtypes,
|
908
|
-
**kwargs,
|
909
|
-
)
|
910
|
-
)
|
911
|
-
|
912
|
-
def register_in_duckdb(
|
913
|
-
self,
|
914
|
-
conn: duckdb.DuckDBPyConnection,
|
915
|
-
name: str | None = None,
|
916
|
-
metadata: bool = False,
|
917
|
-
reload: bool = False,
|
918
|
-
include_file_path: bool = False,
|
919
|
-
use_threads: bool | None = None,
|
920
|
-
verbose: bool | None = None,
|
921
|
-
opt_dtypes: bool | None = None,
|
922
|
-
**kwargs,
|
923
|
-
) -> duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]:
|
924
|
-
"""Register data in DuckDB.
|
925
|
-
|
926
|
-
Args:
|
927
|
-
conn (duckdb.DuckDBPyConnection): DuckDB connection instance.
|
928
|
-
name (str, optional): Name for the DuckDB table.
|
929
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
930
|
-
reload (bool, optional): Reload data if True. Default is False.
|
931
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
932
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
933
|
-
verbose (bool, optional): Verbose output. Default is None.
|
934
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
935
|
-
kwargs: Additional keyword arguments.
|
936
|
-
|
937
|
-
Returns:
|
938
|
-
duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB connection instance
|
939
|
-
or DuckDB connection instance and optional metadata.
|
940
|
-
"""
|
941
|
-
kwargs.pop("batch_size", None)
|
942
|
-
if name is None:
|
943
|
-
name = f"{self.format}:{self.path}"
|
944
|
-
|
945
|
-
if self._conn is None:
|
946
|
-
if conn is None:
|
947
|
-
conn = duckdb.connect()
|
948
|
-
self._conn = conn
|
949
|
-
|
950
|
-
self._conn.register(
|
951
|
-
name,
|
952
|
-
self.to_pyarrow_table(
|
953
|
-
metadata=metadata,
|
954
|
-
reload=reload,
|
955
|
-
include_file_path=include_file_path,
|
956
|
-
use_threads=use_threads,
|
957
|
-
verbose=verbose,
|
958
|
-
opt_dtypes=opt_dtypes,
|
959
|
-
**kwargs,
|
960
|
-
),
|
961
|
-
)
|
962
|
-
if metadata:
|
963
|
-
return self._conn, self._metadata
|
964
|
-
return self._conn
|
965
|
-
|
966
|
-
def to_duckdb(
|
967
|
-
self,
|
968
|
-
as_relation: bool = True,
|
969
|
-
conn: duckdb.DuckDBPyConnection | None = None,
|
970
|
-
name: str | None = None,
|
971
|
-
metadata: bool = False,
|
972
|
-
reload: bool = False,
|
973
|
-
include_file_path: bool = False,
|
974
|
-
use_threads: bool | None = None,
|
975
|
-
verbose: bool | None = None,
|
976
|
-
opt_dtypes: bool | None = None,
|
977
|
-
**kwargs,
|
978
|
-
) -> (
|
979
|
-
duckdb.DuckDBPyRelation
|
980
|
-
| duckdb.DuckDBPyConnection
|
981
|
-
| tuple[duckdb.DuckDBPyRelation, dict[str, Any]]
|
982
|
-
| tuple[duckdb.DuckDBPyConnection, dict[str, Any]]
|
983
|
-
):
|
984
|
-
"""Convert data to DuckDB relation or register in DuckDB.
|
985
|
-
|
986
|
-
Args:
|
987
|
-
as_relation (bool, optional): Return a DuckDB relation if True, else register in DuckDB. Default is True.
|
988
|
-
conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
|
989
|
-
name (str, optional): Name for the DuckDB table.
|
990
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
991
|
-
reload (bool, optional): Reload data if True. Default is False.
|
992
|
-
include_file_path (bool, optional): Include file path in the output. Default is False.
|
993
|
-
use_threads (bool, optional): Use threads for reading data. Default is True.
|
994
|
-
verbose (bool, optional): Verbose output. Default is None.
|
995
|
-
opt_dtypes (bool, optional): Optimize data types. Default is True.
|
996
|
-
**kwargs: Additional keyword arguments.
|
997
|
-
|
998
|
-
Returns:
|
999
|
-
duckdb.DuckDBPyRelation | duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyRelation, dict[str, Any]] |
|
1000
|
-
tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB relation or connection instance
|
1001
|
-
or DuckDB relation or connection instance and optional metadata.
|
1002
|
-
|
1003
|
-
"""
|
1004
|
-
kwargs.pop("batch_size", None)
|
1005
|
-
if as_relation:
|
1006
|
-
return self.to_duckdb_relation(
|
1007
|
-
conn=conn,
|
1008
|
-
metadata=metadata,
|
1009
|
-
reload=reload,
|
1010
|
-
include_file_path=include_file_path,
|
1011
|
-
use_threads=use_threads,
|
1012
|
-
verbose=verbose,
|
1013
|
-
opt_dtypes=opt_dtypes,
|
1014
|
-
**kwargs,
|
1015
|
-
)
|
1016
|
-
return self.register_in_duckdb(
|
1017
|
-
conn=conn,
|
1018
|
-
name=name,
|
1019
|
-
metadata=metadata,
|
1020
|
-
reload=reload,
|
1021
|
-
include_file_path=include_file_path,
|
1022
|
-
use_threads=use_threads,
|
1023
|
-
verbose=verbose,
|
1024
|
-
opt_dtypes=opt_dtypes,
|
1025
|
-
**kwargs,
|
1026
|
-
)
|
1027
|
-
|
1028
|
-
def register_in_datafusion(
|
1029
|
-
self,
|
1030
|
-
ctx: datafusion.SessionContext,
|
1031
|
-
name: str | None = None,
|
1032
|
-
metadata: bool = False,
|
1033
|
-
reload: bool = False,
|
1034
|
-
include_file_path: bool = False,
|
1035
|
-
use_threads: bool | None = None,
|
1036
|
-
verbose: bool | None = None,
|
1037
|
-
opt_dtypes: bool | None = None,
|
1038
|
-
**kwargs,
|
1039
|
-
) -> datafusion.SessionContext | tuple[datafusion.SessionContext, dict[str, Any]]:
|
1040
|
-
"""Register data in DataFusion.
|
1041
|
-
|
1042
|
-
Args:
|
1043
|
-
ctx (datafusion.SessionContext): DataFusion session context instance.
|
1044
|
-
name (str, optional): Name for the DataFusion table.
|
1045
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
1046
|
-
reload (bool, optional): Reload data if True. Default is False.
|
1047
|
-
**kwargs: Additional keyword arguments.
|
1048
|
-
|
1049
|
-
Returns:
|
1050
|
-
None
|
1051
|
-
"""
|
1052
|
-
kwargs.pop("batch_size", None)
|
1053
|
-
if name is None:
|
1054
|
-
name = f"{self.format}:{self.path}"
|
1055
|
-
|
1056
|
-
if self._ctx is None:
|
1057
|
-
if ctx is None:
|
1058
|
-
ctx = datafusion.SessionContext()
|
1059
|
-
self._ctx = ctx
|
1060
|
-
|
1061
|
-
self._ctx.register_record_batches(
|
1062
|
-
name,
|
1063
|
-
[
|
1064
|
-
self.to_pyarrow_table(
|
1065
|
-
reload=reload,
|
1066
|
-
include_file_path=include_file_path,
|
1067
|
-
use_threads=use_threads,
|
1068
|
-
opt_dtypes=opt_dtypes,
|
1069
|
-
verbose=verbose,
|
1070
|
-
**kwargs,
|
1071
|
-
).to_batches()
|
1072
|
-
],
|
1073
|
-
)
|
1074
|
-
if metadata:
|
1075
|
-
return self._ctx, self._metadata
|
1076
|
-
return self._ctx
|
1077
|
-
|
1078
|
-
def filter(
|
1079
|
-
self, filter_expr: str | pl.Expr | pa.compute.Expression
|
1080
|
-
) -> (
|
1081
|
-
pl.DataFrame
|
1082
|
-
| pl.LazyFrame
|
1083
|
-
| pa.Table
|
1084
|
-
| list[pl.DataFrame]
|
1085
|
-
| list[pl.LazyFrame]
|
1086
|
-
| list[pa.Table]
|
1087
|
-
):
|
1088
|
-
"""Filter data based on a filter expression.
|
1089
|
-
|
1090
|
-
Args:
|
1091
|
-
filter_expr (str | pl.Expr | pa.compute.Expression): Filter expression. Can be a SQL expression, Polars
|
1092
|
-
expression, or PyArrow compute expression.
|
1093
|
-
|
1094
|
-
Returns:
|
1095
|
-
pl.DataFrame | pl.LazyFrame | pa.Table | list[pl.DataFrame] | list[pl.LazyFrame]
|
1096
|
-
| list[pa.Table]: Filtered data.
|
1097
|
-
"""
|
1098
|
-
if isinstance(self._data, pl.DataFrame | pl.LazyFrame):
|
1099
|
-
pl_schema = (
|
1100
|
-
self._data.schema
|
1101
|
-
if isinstance(self._data, pl.DataFrame)
|
1102
|
-
else self._data.collect_schema()
|
1103
|
-
)
|
1104
|
-
filter_expr = (
|
1105
|
-
sql2polars_filter(filter_expr, pl_schema)
|
1106
|
-
if isinstance(filter_expr, str)
|
1107
|
-
else filter_expr
|
1108
|
-
)
|
1109
|
-
return self._data.filter(filter_expr)
|
1110
|
-
|
1111
|
-
elif isinstance(self._data, pa.Table):
|
1112
|
-
pa_schema = self._data.schema
|
1113
|
-
filter_expr = (
|
1114
|
-
sql2pyarrow_filter(filter_expr, pa_schema)
|
1115
|
-
if isinstance(filter_expr, str)
|
1116
|
-
else filter_expr
|
1117
|
-
)
|
1118
|
-
return self._data.filter(filter_expr)
|
1119
|
-
|
1120
|
-
if isinstance(self._data, str):
|
1121
|
-
if isinstance(self._data[0], pl.DataFrame | pl.LazyFrame):
|
1122
|
-
pl_schema = (
|
1123
|
-
self._data.schema
|
1124
|
-
if isinstance(self._data[0], pl.DataFrame)
|
1125
|
-
else self._data[0].collect_schema()
|
1126
|
-
)
|
1127
|
-
filter_expr = (
|
1128
|
-
sql2polars_filter(filter_expr, pl_schema)
|
1129
|
-
if isinstance(filter_expr, str)
|
1130
|
-
else filter_expr
|
1131
|
-
)
|
1132
|
-
return [d.filter(filter_expr) for d in self._data]
|
1133
|
-
elif isinstance(self._data[0], pa.Table):
|
1134
|
-
pa_schema = self._data[0].schema
|
1135
|
-
filter_expr = (
|
1136
|
-
sql2pyarrow_filter(filter_expr, pa_schema)
|
1137
|
-
if isinstance(filter_expr, str)
|
1138
|
-
else filter_expr
|
1139
|
-
)
|
1140
|
-
return [d.filter(filter_expr) for d in self._data]
|
1141
|
-
|
1142
|
-
@property
|
1143
|
-
def metadata(self):
|
1144
|
-
if not hasattr(self, "_metadata"):
|
1145
|
-
self._load()
|
1146
|
-
return self._metadata
|
1147
|
-
|
1148
|
-
|
1149
|
-
# @attrs.define # Removed
|
1150
|
-
class BaseDatasetReader(BaseFileReader, gc=False):
|
1151
|
-
"""
|
1152
|
-
Base class for dataset loading operations supporting various file formats.
|
1153
|
-
This class provides a foundation for dataset loading operations across different file formats
|
1154
|
-
including CSV, Parquet, JSON, Arrow, and IPC.
|
1155
|
-
|
1156
|
-
Args:
|
1157
|
-
path (str | list[str]): Path or list of paths to file(s).
|
1158
|
-
format (str, optional): File format extension (without dot).
|
1159
|
-
fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
|
1160
|
-
include_file_path (bool, optional): Include file path in the output DataFrame.
|
1161
|
-
concat (bool, optional): Concatenate multiple files into a single DataFrame.
|
1162
|
-
conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
|
1163
|
-
ctx (datafusion.SessionContext, optional): DataFusion session context instance.
|
1164
|
-
schema (pa.Schema, optional): PyArrow schema for the dataset.
|
1165
|
-
partitioning (str | list[str] | pds.Partitioning, optional): Dataset partitioning scheme.
|
1166
|
-
|
1167
|
-
Examples:
|
1168
|
-
```python
|
1169
|
-
dataset_loader = BaseDatasetReader(
|
1170
|
-
path="s3://bucket/path/to/files",
|
1171
|
-
format="csv",
|
1172
|
-
include_file_path=True,
|
1173
|
-
concat=True,
|
1174
|
-
conn=duckdb.connect(),
|
1175
|
-
ctx=datafusion.SessionContext(),
|
1176
|
-
schema=pa.schema([
|
1177
|
-
pa.field("column1", pa.int64()),
|
1178
|
-
pa.field("column2", pa.string())
|
1179
|
-
]),
|
1180
|
-
partitioning="hive"
|
1181
|
-
)
|
1182
|
-
data = dataset_loader.to_polars()
|
1183
|
-
```
|
1184
|
-
Notes:
|
1185
|
-
- Supports multiple file formats including CSV, Parquet, JSON, Arrow, and IPC
|
1186
|
-
- Automatically handles filesystem initialization based on path protocol
|
1187
|
-
- Supports both single path and multiple path inputs
|
1188
|
-
- Supports loading data into DuckDB and DataFusion for SQL operations
|
1189
|
-
- Supports custom schema and partitioning for datasets
|
1190
|
-
|
1191
|
-
"""
|
1192
|
-
|
1193
|
-
schema_: pa.Schema | None = field(default=None)
|
1194
|
-
_dataset: pds.Dataset | None = field(default=None)
|
1195
|
-
_pydala_dataset: Any | None = field(default=None)
|
1196
|
-
|
1197
|
-
def to_pyarrow_dataset(
|
1198
|
-
self,
|
1199
|
-
metadata: bool = False,
|
1200
|
-
reload: bool = False,
|
1201
|
-
**kwargs,
|
1202
|
-
) -> pds.Dataset | tuple[pds.Dataset, dict[str, Any]]:
|
1203
|
-
"""
|
1204
|
-
Convert data to PyArrow Dataset.
|
1205
|
-
|
1206
|
-
Args:
|
1207
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
1208
|
-
reload (bool, optional): Reload data if True. Default is False.
|
1209
|
-
|
1210
|
-
Returns:
|
1211
|
-
pds.Dataset: PyArrow Dataset.
|
1212
|
-
"""
|
1213
|
-
if self._dataset is not None and not reload:
|
1214
|
-
if metadata:
|
1215
|
-
return self._dataset, self._metadata
|
1216
|
-
return self._dataset
|
1217
|
-
|
1218
|
-
if self.format == ["csv", "arrow", "ipc"]:
|
1219
|
-
self._dataset = self.fs.pyarrow_dataset(
|
1220
|
-
self._path,
|
1221
|
-
format=self.format,
|
1222
|
-
schema=self.schema_,
|
1223
|
-
partitioning=self.partitioning,
|
1224
|
-
**kwargs,
|
1225
|
-
)
|
1226
|
-
self._metadata = get_pyarrow_dataset_metadata(
|
1227
|
-
self._dataset, path=self.path, format=self.format
|
1228
|
-
)
|
1229
|
-
elif self.format == "parquet":
|
1230
|
-
if self.fs.exists(posixpath.join(self._root_path, "_metadata")):
|
1231
|
-
self._dataset = self.fs.parquet_dataset(
|
1232
|
-
posixpath.join(self._root_path, "_metadata"),
|
1233
|
-
schema=self.schema_,
|
1234
|
-
partitioning=self.partitioning,
|
1235
|
-
**kwargs,
|
1236
|
-
)
|
1237
|
-
else:
|
1238
|
-
self._dataset = self.fs.pyarrow_dataset(
|
1239
|
-
self._path,
|
1240
|
-
format=self.format,
|
1241
|
-
schema=self.schema_,
|
1242
|
-
partitioning=self.partitioning,
|
1243
|
-
**kwargs,
|
1244
|
-
)
|
1245
|
-
self._metadata = get_pyarrow_dataset_metadata(
|
1246
|
-
self._dataset, path=self.path, format=self.format
|
1247
|
-
)
|
1248
|
-
else:
|
1249
|
-
raise ValueError(f"Unsupported format: {self.format}")
|
1250
|
-
if metadata:
|
1251
|
-
return self._dataset, self._metadata
|
1252
|
-
return self._dataset
|
1253
|
-
|
1254
|
-
def to_pandas(
|
1255
|
-
self, metadata: bool = False, reload: bool = False, **kwargs
|
1256
|
-
) -> pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]:
|
1257
|
-
"""
|
1258
|
-
Convert data to Pandas DataFrame.
|
1259
|
-
|
1260
|
-
Args:
|
1261
|
-
metadata (bool, optional): Include metadata in the output. Default is False.
|
1262
|
-
|
1263
|
-
Returns:
|
1264
|
-
pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]: Pandas DataFrame and optional metadata.
|
1265
|
-
"""
|
1266
|
-
self.to_pyarrow_dataset(reload=reload, **kwargs)
|
1267
|
-
df = self._dataset.to_table().to_pandas()
|
1268
|
-
if metadata:
|
1269
|
-
metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
1270
|
-
return df, metadata
|
1271
|
-
return df
|
1272
|
-
|
1273
|
-
def _to_polars_dataframe(
|
1274
|
-
self, metadata: bool = False, reload: bool = False, **kwargs
|
1275
|
-
) -> pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]:
|
1276
|
-
self.to_pyarrow_dataset(reload=reload, **kwargs)
|
1277
|
-
df = pl.from_arrow(self._dataset.to_table())
|
1278
|
-
if metadata:
|
1279
|
-
metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
1280
|
-
return df, metadata
|
1281
|
-
return df
|
1282
|
-
|
1283
|
-
def _to_polars_lazyframe(
|
1284
|
-
self, metadata: bool = False, reload: bool = False, **kwargs
|
1285
|
-
) -> pl.LazyFrame | tuple[pl.LazyFrame, dict[str, Any]]:
|
1286
|
-
self.to_pyarrow_dataset(reload=reload, **kwargs)
|
1287
|
-
df = pl.scan_pyarrow_dataset(self._dataset)
|
1288
|
-
if metadata:
|
1289
|
-
metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
|
1290
|
-
return df, metadata
|
1291
|
-
return df
|
1292
|
-
|
1293
|
    def to_polars(
        self, lazy: bool = True, metadata: bool = False, reload: bool = False, **kwargs
    ) -> (
        pl.DataFrame | pl.LazyFrame | tuple[pl.DataFrame | pl.LazyFrame, dict[str, Any]]
    ):
        """
        Convert data to Polars DataFrame or LazyFrame.

        Args:
            lazy (bool, optional): Return a LazyFrame if True, else a DataFrame.
            metadata (bool, optional): Include metadata in the output. Default is False.
            reload (bool, optional): Reload data if True. Default is False.

        Returns:
            pl.DataFrame | pl.LazyFrame | tuple[pl.DataFrame | pl.LazyFrame, dict[str, Any]]: Polars DataFrame or
                LazyFrame and optional metadata.
        """
        df = (
            self._to_polars_lazyframe(reload=reload, **kwargs)
            if lazy
            else self._to_polars_dataframe(reload=reload, **kwargs)
        )
        if metadata:
            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
            return df, metadata
        return df

    def to_pyarrow_table(
        self, metadata: bool = False, reload: bool = False, **kwargs
    ) -> pa.Table | tuple[pa.Table, dict]:
        """Convert data to PyArrow Table.

        Args:
            metadata (bool, optional): Include metadata in the output. Default is False.
            reload (bool, optional): Reload data if True. Default is False.

        Returns:
            pa.Table | tuple[pa.Table, dict]: PyArrow Table and optional metadata.
        """
        self.to_pyarrow_dataset(reload=reload, **kwargs)
        df = self._dataset.to_table()
        if metadata:
            metadata = get_dataframe_metadata(df, path=self.path, format=self.format)
            return df, metadata
        return df
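The conversion methods above all route through `to_pyarrow_dataset`, so they differ mainly in how eagerly the data is materialized. A minimal usage sketch, assuming `reader` is an instance of the reader class these methods belong to and points at an existing dataset:

```python
# Sketch only: `reader` stands in for an instance of the reader class above.
lf = reader.to_polars(lazy=True)    # pl.LazyFrame via pl.scan_pyarrow_dataset; nothing is read yet
df, meta = reader.to_polars(lazy=False, metadata=True)  # eager pl.DataFrame plus a metadata dict
table = reader.to_pyarrow_table()   # materializes the dataset with Dataset.to_table()
pdf = reader.to_pandas()            # same path, converted to pandas at the end
```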
    def to_pydala_dataset(
        self, metadata: bool = False, reload: bool = False, **kwargs
    ) -> ParquetDataset | tuple[ParquetDataset, dict[str, Any]]:  # type: ignore
        """Convert data to Pydala ParquetDataset.

        Args:
            metadata (bool, optional): Include metadata in the output. Default is False.
            reload (bool, optional): Reload data if True. Default is False.

        Returns:
            ParquetDataset | tuple[ParquetDataset, dict[str, Any]]: Pydala ParquetDataset and optional metadata.
        """
        if ParquetDataset is None:
            raise ImportError("pydala is not installed.")
        if not hasattr(self, "_pydala_dataset") or reload:
            if self._conn is None:
                # Reuse an existing DuckDB connection if one was already created.
                self._conn = duckdb.connect()
            self._pydala_dataset = self.fs.pydala_dataset(
                self._path,
                partitioning=self.partitioning,
                ddb_con=self._conn,
                **kwargs,
            )
            self._pydala_dataset.load(update_metadata=True)
            self._metadata = get_pyarrow_dataset_metadata(
                self._pydala_dataset._arrow_dataset, path=self.path, format=self.format
            )
        if metadata:
            return self._pydala_dataset, self._metadata
        return self._pydala_dataset
    def to_duckdb_relation(
        self,
        conn: duckdb.DuckDBPyConnection | None = None,
        metadata: bool = False,
        reload: bool = False,
        **kwargs,
    ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
        """Convert data to DuckDB relation.

        Args:
            conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
            metadata (bool, optional): Include metadata in the output. Default is False.
            reload (bool, optional): Reload data if True. Default is False.

        Returns:
            duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]: DuckDB relation and optional
                metadata.
        """
        if self._conn is None:
            if conn is None:
                conn = duckdb.connect()
            self._conn = conn

        self.to_pyarrow_dataset(reload=reload, **kwargs)
        if metadata:
            return self._conn.from_arrow(self._dataset), self._metadata
        return self._conn.from_arrow(self._dataset)

    def register_in_duckdb(
        self,
        conn: duckdb.DuckDBPyConnection | None = None,
        name: str | None = None,
        metadata: bool = False,
        reload: bool = False,
        **kwargs,
    ) -> duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]:
        """Register data in DuckDB.

        Args:
            conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
            name (str, optional): Name for the DuckDB table.
            metadata (bool, optional): Include metadata in the output. Default is False.
            reload (bool, optional): Reload data if True. Default is False.

        Returns:
            duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB connection instance
                or DuckDB connection instance and optional metadata.
        """
        if name is None:
            name = f"{self.format}:{self.path}"

        if self._conn is None:
            if conn is None:
                conn = duckdb.connect()
            self._conn = conn

        # Load the dataset before registering it; otherwise `reload` is ignored
        # and `self._dataset` may not exist yet.
        self.to_pyarrow_dataset(reload=reload, **kwargs)
        self._conn.register(name, self._dataset)
        if metadata:
            return self._conn, self._metadata
        return self._conn

    def to_duckdb(
        self,
        as_relation: bool = True,
        conn: duckdb.DuckDBPyConnection | None = None,
        name: str | None = None,
        metadata: bool = False,
        reload: bool = False,
        **kwargs,
    ) -> (
        duckdb.DuckDBPyRelation
        | duckdb.DuckDBPyConnection
        | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]
        | tuple[duckdb.DuckDBPyConnection, dict[str, Any]]
    ):
        """Convert data to DuckDB relation or register in DuckDB.

        Args:
            as_relation (bool, optional): Return a DuckDB relation if True, else register in DuckDB. Default is True.
            conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
            name (str, optional): Name for the DuckDB table.
            metadata (bool, optional): Include metadata in the output. Default is False.
            reload (bool, optional): Reload data if True. Default is False.
            **kwargs: Additional keyword arguments.

        Returns:
            duckdb.DuckDBPyRelation | duckdb.DuckDBPyConnection | tuple[duckdb.DuckDBPyRelation, dict[str, Any]] |
                tuple[duckdb.DuckDBPyConnection, dict[str, Any]]: DuckDB relation or connection instance,
                optionally paired with metadata.
        """
        if as_relation:
            return self.to_duckdb_relation(
                conn=conn, metadata=metadata, reload=reload, **kwargs
            )
        return self.register_in_duckdb(
            conn=conn, name=name, metadata=metadata, reload=reload, **kwargs
        )
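`to_duckdb` is a thin dispatcher over the two methods above; the practical difference is a composable relation versus a named object you query with plain SQL. A sketch, again assuming `reader` from above (the column name `value` is made up):

```python
import duckdb

conn = duckdb.connect()

# as_relation=True: a relation object for lazy composition
rel = reader.to_duckdb(as_relation=True, conn=conn)
print(rel.filter("value > 10").count("*").fetchone())

# as_relation=False: register under a name; the default name is
# f"{format}:{path}" and contains a colon, so quote it in SQL
reader.to_duckdb(as_relation=False, conn=conn, name="events")
conn.sql('SELECT count(*) FROM "events"').show()
```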
    def register_in_datafusion(
        self,
        ctx: datafusion.SessionContext | None = None,
        name: str | None = None,
        metadata: bool = False,
        reload: bool = False,
        **kwargs,
    ) -> None:
        """Register data in DataFusion.

        Args:
            ctx (datafusion.SessionContext, optional): DataFusion session context instance.
            name (str, optional): Name for the DataFusion table.
            metadata (bool, optional): Include metadata in the output. Default is False.
            reload (bool, optional): Reload data if True. Default is False.
            **kwargs: Additional keyword arguments.

        Returns:
            None
        """
        if name is None:
            name = f"{self.format}:{self.path}"

        if self._ctx is None:
            if ctx is None:
                ctx = datafusion.SessionContext()
            self._ctx = ctx

        self._ctx.register_record_batches(
            name, [self.to_pyarrow_table(reload=reload, **kwargs).to_batches()]
        )
    def filter(
        self, filter_expr: str | pl.Expr | pa.compute.Expression
    ) -> (
        pl.DataFrame
        | pl.LazyFrame
        | pa.Table
        | list[pl.DataFrame]
        | list[pl.LazyFrame]
        | list[pa.Table]
    ):
        """Filter data based on a filter expression.

        Args:
            filter_expr (str | pl.Expr | pa.compute.Expression): Filter expression. Can be a SQL expression, Polars
                expression, or PyArrow compute expression.

        Returns:
            pl.DataFrame | pl.LazyFrame | pa.Table | list[pl.DataFrame] | list[pl.LazyFrame]
                | list[pa.Table]: Filtered data.
        """
        if isinstance(self._data, pl.DataFrame | pl.LazyFrame):
            pl_schema = (
                self._data.schema
                if isinstance(self._data, pl.DataFrame)
                else self._data.collect_schema()
            )
            filter_expr = (
                sql2polars_filter(filter_expr, pl_schema)
                if isinstance(filter_expr, str)
                else filter_expr
            )
            return self._data.filter(filter_expr)

        elif isinstance(self._data, pa.Table):
            pa_schema = self._data.schema
            filter_expr = (
                sql2pyarrow_filter(filter_expr, pa_schema)
                if isinstance(filter_expr, str)
                else filter_expr
            )
            return self._data.filter(filter_expr)

        if isinstance(self._data, list):  # a list of frames/tables, not a single object
            if isinstance(self._data[0], pl.DataFrame | pl.LazyFrame):
                pl_schema = (
                    self._data[0].schema
                    if isinstance(self._data[0], pl.DataFrame)
                    else self._data[0].collect_schema()
                )
                filter_expr = (
                    sql2polars_filter(filter_expr, pl_schema)
                    if isinstance(filter_expr, str)
                    else filter_expr
                )
                return [d.filter(filter_expr) for d in self._data]
            elif isinstance(self._data[0], pa.Table):
                pa_schema = self._data[0].schema
                filter_expr = (
                    sql2pyarrow_filter(filter_expr, pa_schema)
                    if isinstance(filter_expr, str)
                    else filter_expr
                )
                return [d.filter(filter_expr) for d in self._data]
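The SQL branch relies on `sql2polars_filter` / `sql2pyarrow_filter` to translate a SQL predicate string into a native expression against the current schema. A rough, standalone equivalent of what the PyArrow branch ends up doing (the helper itself is not reproduced here, so a hand-built `pc.Expression` stands in for its output):

```python
import pyarrow as pa
import pyarrow.compute as pc

table = pa.table({"value": [5, 15, 25], "tag": ["a", "b", "a"]})

# A SQL string like "value > 10 AND tag = 'a'" would be translated by
# sql2pyarrow_filter into a compute expression roughly like this:
expr = (pc.field("value") > 10) & (pc.field("tag") == "a")
print(table.filter(expr).to_pydict())  # {'value': [25], 'tag': ['a']}
```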
    @property
    def metadata(self):
        if not hasattr(self, "_metadata"):
            self._load()
        return self._metadata
# @attrs.define # Removed
class BaseFileWriter(BaseFileIO, gc=False):
    """
    Base class for file writing operations supporting various storage backends.

    This class provides a foundation for file writing operations across different storage systems,
    including AWS S3, Google Cloud Storage, Azure Blob Storage, GitHub, and GitLab.

    Args:
        path (str | list[str]): Path or list of paths to file(s).
        storage_options (AwsStorageOptions | GcsStorageOptions | AzureStorageOptions |
            GitHubStorageOptions | GitLabStorageOptions | dict[str, Any] | None, optional):
            Storage-specific options for accessing remote filesystems.
        fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
        format (str, optional): File format extension (without dot).
        basename (str, optional): Basename for the output file(s).
        concat (bool, optional): Concatenate multiple files into a single DataFrame.
        mode (str, optional): Write mode (append, overwrite, delete_matching, error_if_exists).
        unique (bool | list[str] | str, optional): Unique columns for deduplication.

    Examples:
        ```python
        file_writer = BaseFileWriter(
            path="s3://bucket/path/to/files",
            storage_options=AwsStorageOptions(
                key="access_key",
                secret="secret_key"),
            format="csv",
            basename="output",
            concat=True,
            mode="append",
            unique=True
        )
        file_writer.write(data=df)
        ```

    Notes:
        - Supports multiple cloud storage backends through different storage options
        - Automatically handles filesystem initialization based on path protocol
        - Supports both single path and multiple path inputs
        - Supports writing data to cloud storage with various write modes
    """

    basename: str | None = field(default=None)
    concat: bool = field(default=False)
    mode: str = field(default="append")
    unique: bool | list[str] | str = field(default=False)

    def write(
        self,
        data: (
            pl.DataFrame
            | pl.LazyFrame
            | pa.Table
            | pd.DataFrame
            | dict[str, Any]
            | list[
                pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]
            ]
        ),
        basename: str | None = None,
        concat: bool | None = None,
        unique: bool | list[str] | str | None = None,
        mode: str | None = None,
        **kwargs,
    ) -> dict[str, Any]:
        """
        Write data to file.

        Args:
            data (pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any] | list[pl.DataFrame |
                pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]]): Data to write.
            basename (str, optional): Basename for the output file(s).
            concat (bool, optional): Concatenate multiple files into a single DataFrame.
            unique (bool | list[str] | str, optional): Unique columns for deduplication.
            mode (str, optional): Write mode (append, overwrite, delete_matching, error_if_exists).
            **kwargs: Additional keyword arguments.

        Returns:
            dict[str, Any]: Metadata for the written data.
        """
        if isinstance(data, list):
            if isinstance(data[0], dict):
                data = _dict_to_dataframe(data)
        if isinstance(data, dict):
            data = _dict_to_dataframe(data)

        self._metadata = get_dataframe_metadata(
            df=data, path=self.path, format=self.format
        )

        self.fs.write_files(
            data=data,
            path=self._path,
            basename=basename or self.basename,
            concat=concat or self.concat,
            unique=unique or self.unique,
            mode=mode or self.mode,
            **kwargs,
        )
        return self._metadata

    @property
    def metadata(self):
        if not hasattr(self, "_metadata"):
            return {}
        return self._metadata

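One subtlety in `write` worth noting: the per-call overrides use `or`-fallback (`concat=concat or self.concat`), so a falsy override such as `concat=False` cannot switch off an instance-level `concat=True`. A small standalone demonstration of the pattern, not FlowerPower code:

```python
# Illustration of the `or`-fallback used in write() above.
instance_concat = True

def effective(concat: bool | None) -> bool:
    # Mirrors `concat or self.concat`: None falls back, but so does False.
    return concat or instance_concat

print(effective(None))   # True  -> intended fallback
print(effective(False))  # True  -> False is silently overridden
print(effective(True))   # True

# An explicit None check avoids the trap:
def effective_strict(concat: bool | None) -> bool:
    return instance_concat if concat is None else concat

print(effective_strict(False))  # False
```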
# @attrs.define # Removed
class BaseDatasetWriter(BaseFileWriter, gc=False):
    """
    Base class for dataset writing operations supporting various file formats.

    This class provides a foundation for dataset writing operations across different file formats,
    including CSV, Parquet, JSON, Arrow, and IPC.

    Args:
        path (str | list[str]): Path or list of paths to file(s).
        format (str, optional): File format extension (without dot).
        storage_options (AwsStorageOptions | GcsStorageOptions | AzureStorageOptions |
            GitHubStorageOptions | GitLabStorageOptions | dict[str, Any] | None, optional):
            Storage-specific options for accessing remote filesystems.
        fs (AbstractFileSystem, optional): Filesystem instance for handling file operations.
        basename (str, optional): Basename for the output file(s).
        schema (pa.Schema, optional): PyArrow schema for the dataset.
        partition_by (str | list[str] | pds.Partitioning, optional): Dataset partitioning scheme.
        partitioning_flavor (str, optional): Partitioning flavor for the dataset.
        compression (str, optional): Compression codec for the dataset.
        row_group_size (int, optional): Row group size for the dataset.
        max_rows_per_file (int, optional): Maximum number of rows per file.
        concat (bool, optional): Concatenate multiple files into a single DataFrame.
        unique (bool | list[str] | str, optional): Unique columns for deduplication.
        mode (str, optional): Write mode (append, overwrite, delete_matching, error_if_exists).
        is_pydala_dataset (bool, optional): Write data as a Pydala ParquetDataset.

    Examples:
        ```python
        dataset_writer = BaseDatasetWriter(
            path="s3://bucket/path/to/files",
            format="parquet",
            storage_options=AwsStorageOptions(
                key="access_key",
                secret="secret_key"),
            basename="output",
            schema=pa.schema([
                pa.field("column1", pa.int64()),
                pa.field("column2", pa.string())
            ]),
            partition_by="column1",
            partitioning_flavor="hive",
            compression="zstd",
            row_group_size=250_000,
            max_rows_per_file=2_500_000,
            concat=True,
            unique=True,
            mode="append",
            is_pydala_dataset=False
        )
        dataset_writer.write(data=df)
        ```

    Notes:
        - Supports multiple file formats including CSV, Parquet, JSON, Arrow, and IPC
        - Automatically handles filesystem initialization based on path protocol
        - Supports both single path and multiple path inputs
        - Supports writing data to cloud storage with various write modes
        - Supports writing data as a Pydala ParquetDataset
    """

    # basename, concat, unique, mode are inherited from BaseFileWriter
    schema_: pa.Schema | None = None
    partition_by: str | list[str] | pds.Partitioning | None = None
    partitioning_flavor: str | None = None
    compression: str = "zstd"
    row_group_size: int | None = 250_000
    max_rows_per_file: int | None = 2_500_000
    is_pydala_dataset: bool = False

    def write(
        self,
        data: (
            pl.DataFrame
            | pl.LazyFrame
            | pa.Table
            | pa.RecordBatch
            | pa.RecordBatchReader
            | pd.DataFrame
            | dict[str, Any]
            | list[
                pl.DataFrame
                | pl.LazyFrame
                | pa.Table
                | pa.RecordBatch
                | pa.RecordBatchReader
                | pd.DataFrame
                | dict[str, Any]
            ]
        ),
        concat: bool | None = None,
        unique: bool | list[str] | str | None = None,
        mode: str | None = None,
        delta_subset: str | None = None,
        alter_schema: bool = False,
        update_metadata: bool = True,
        timestamp_column: str | None = None,
        verbose: bool = False,
        **kwargs,
    ) -> dict[str, Any]:
        """
        Write data to dataset.

        Args:
            data (pl.DataFrame | pl.LazyFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader | pd.DataFrame |
                dict[str, Any] | list[pl.DataFrame | pl.LazyFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader |
                pd.DataFrame | dict[str, Any]]): Data to write.
            concat (bool, optional): Concatenate multiple inputs into a single DataFrame.
            unique (bool | list[str] | str, optional): Unique columns for deduplication.
            mode (str, optional): Write mode (append, overwrite, delete_matching, error_if_exists).
            delta_subset (str | None, optional): Delta subset for incremental updates.
            alter_schema (bool, optional): Alter schema for compatibility.
            update_metadata (bool, optional): Update metadata.
            timestamp_column (str | None, optional): Timestamp column for updates.
            verbose (bool, optional): Verbose output.
            **kwargs: Additional keyword arguments.

        Returns:
            dict[str, Any]: Metadata of the written data.
        """
        basename = kwargs.pop("basename", self.basename)
        schema = kwargs.pop("schema", self.schema_)
        partition_by = kwargs.pop("partition_by", self.partition_by)
        partitioning_flavor = kwargs.pop(
            "partitioning_flavor", self.partitioning_flavor
        )
        compression = kwargs.pop("compression", self.compression)
        row_group_size = kwargs.pop("row_group_size", self.row_group_size)
        max_rows_per_file = kwargs.pop("max_rows_per_file", self.max_rows_per_file)

        if isinstance(data, list):
            if isinstance(data[0], dict):
                data = _dict_to_dataframe(data)
        if isinstance(data, dict):
            data = _dict_to_dataframe(data)

        self._metadata = get_dataframe_metadata(
            df=data, path=self.path, format=self.format
        )

        if not self.is_pydala_dataset:
            self.fs.write_pyarrow_dataset(
                data=data,
                path=self._path,
                basename=basename or self.basename,
                schema=schema or self.schema_,
                partition_by=partition_by or self.partition_by,
                partitioning_flavor=partitioning_flavor or self.partitioning_flavor,
                format=self.format,
                compression=compression or self.compression,
                row_group_size=row_group_size or self.row_group_size,
                max_rows_per_file=max_rows_per_file or self.max_rows_per_file,
                concat=concat or self.concat,
                unique=unique or self.unique,
                mode=mode or self.mode,
                **kwargs,
            )
        else:
            self.fs.write_pydala_dataset(
                data=data,
                path=self._path,
                mode=mode or self.mode,
                basename=basename or self.basename,
                schema=schema or self.schema_,
                partition_by=partition_by or self.partition_by,
                compression=compression or self.compression,
                row_group_size=row_group_size or self.row_group_size,
                max_rows_per_file=max_rows_per_file or self.max_rows_per_file,
                concat=concat or self.concat,
                unique=unique or self.unique,
                delta_subset=delta_subset,
                alter_schema=alter_schema,
                update_metadata=update_metadata,
                timestamp_column=timestamp_column,
                verbose=verbose,
                **kwargs,
            )
        return self._metadata

    @property
    def metadata(self):
        if not hasattr(self, "_metadata"):
            return {}
        return self._metadata

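For the non-pydala branch, `fs.write_pyarrow_dataset` presumably wraps `pyarrow.dataset.write_dataset`; the writer's parameters map onto it roughly as in this standalone sketch (local filesystem, hive partitioning, made-up paths), which is an illustration rather than the actual implementation:

```python
import pyarrow as pa
import pyarrow.dataset as pds

table = pa.table({"column1": [1, 1, 2], "column2": ["x", "y", "z"]})

# Roughly what a partitioned dataset write with the options above amounts to:
pds.write_dataset(
    table,
    base_dir="out/dataset",            # stands in for self._path
    format="parquet",
    partitioning=["column1"],          # partition_by
    partitioning_flavor="hive",
    existing_data_behavior="overwrite_or_ignore",  # loosely, mode="append"
    max_rows_per_file=2_500_000,
    max_rows_per_group=250_000,        # row_group_size
)
```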
# @attrs.define # Removed
class BaseDatabaseIO(msgspec.Struct, gc=False):
    """
    Base class for database read/write operations supporting various database systems.

    This class provides a foundation for database read/write operations across different database systems,
    including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle.

    Args:
        type_ (str): Database type (sqlite, duckdb, postgres, mysql, mssql, oracle).
        table_name (str): Table name in the database.
        path (str | None, optional): File path for SQLite or DuckDB databases.
        connection_string (str | None, optional): Connection string for SQLAlchemy-based databases.
        username (str | None, optional): Username for the database.
        password (str | None, optional): Password for the database.
        server (str | None, optional): Server address for the database.
        port (str | None, optional): Port number for the database.
        database (str | None, optional): Database name.
        ssl (bool, optional): Use an encrypted connection.

    Examples:
        ```python
        db_reader = BaseDatabaseIO(
            type_="sqlite",
            table_name="table_name",
            path="path/to/database.db"
        )
        data = db_reader.read()
        ```

    Notes:
        - Supports multiple database systems including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle
        - Automatically handles database initialization based on connection parameters
        - Supports reading data from databases into DataFrames
        - Supports writing data to databases from DataFrames
    """

    type_: str
    table_name: str = field(default="")
    path: str | None = field(default=None)
    username: str | None = field(default=None)
    password: str | None = field(default=None)
    server: str | None = field(default=None)
    port: str | int | None = field(default=None)
    database: str | None = field(default=None)
    ssl: bool = field(default=False)
    connection_string: str | None = field(default=None)
    _metadata: dict[str, Any] = field(default_factory=dict)
    _data: pa.Table | pl.DataFrame | pl.LazyFrame | pd.DataFrame | None = field(
        default=None
    )
    _conn: duckdb.DuckDBPyConnection | None = field(default=None)
    _ctx: datafusion.SessionContext | None = field(default=None)

    def __post_init__(self):  # Renamed from __attrs_post_init__
        db = self.type_.lower()
        if (
            db in ["postgres", "mysql", "mssql", "oracle"]
            and not self.connection_string
        ):
            if not all([
                self.username,
                self.password,
                self.server,
                self.port,
                self.database,
            ]):
                raise ValueError(
                    f"{self.type_} requires connection_string or username, password, server, port, and database "
                    "to build it."
                )
            if db == "postgres":
                ssl_mode = "?sslmode=require" if self.ssl else ""
                self.connection_string = (
                    f"postgresql://{self.username}:{self.password}@{self.server}:{self.port}/"
                    f"{self.database}{ssl_mode}"
                )
            elif db == "mysql":
                ssl_mode = "?ssl=true" if self.ssl else ""
                self.connection_string = (
                    f"mysql+pymysql://{self.username}:{self.password}@{self.server}:{self.port}/"
                    f"{self.database}{ssl_mode}"
                )
            elif db == "mssql":
                ssl_mode = ";Encrypt=yes;TrustServerCertificate=yes" if self.ssl else ""
                self.connection_string = (
                    f"mssql+pyodbc://{self.username}:{self.password}@{self.server}:{self.port}/"
                    f"{self.database}?driver=ODBC+Driver+17+for+SQL+Server{ssl_mode}"
                )
            elif db == "oracle":
                ssl_mode = "?ssl=true" if self.ssl else ""
                self.connection_string = (
                    f"oracle+cx_oracle://{self.username}:{self.password}@{self.server}:{self.port}/"
                    f"{self.database}{ssl_mode}"
                )
        if db in ["sqlite", "sqlite3"]:
            if not self.path:
                raise ValueError("SQLite requires a file path.")
            self.connection_string = f"sqlite:///{self.path}"
        elif db == "duckdb":
            if not self.path:
                raise ValueError("DuckDB requires a file path.")
            self.connection_string = self.path

    def execute(self, query: str, cursor: bool = True, **query_kwargs):
        """Execute a SQL query.

        Args:
            query (str): SQL query.
            cursor (bool, optional): Use cursor for execution. Default is True.
            **query_kwargs: Keyword arguments interpolated into the query via str.format().

        Returns:
            The result object of the executed statement.
        """
        query = query.format(**query_kwargs)
        if self.type_ in ("sqlite", "duckdb"):
            with self.connect() as conn:
                if cursor:
                    cur = conn.cursor()
                    res = cur.execute(query)
                else:
                    res = conn.execute(query)
                conn.commit()
                return res

        with self.connect() as conn:
            res = conn.execute(text(query))
            conn.commit()
            return res

    def _to_pandas(
        self,
        data: pl.DataFrame
        | pl.LazyFrame
        | pa.Table
        | pa.RecordBatch
        | pa.RecordBatchReader
        | pd.DataFrame
        | dict[str, Any],
    ) -> pd.DataFrame | list[pd.DataFrame]:
        # convert data to pandas DataFrame if needed
        if isinstance(data, pl.DataFrame):
            return data.to_pandas()
        elif isinstance(data, pa.Table):
            return data.to_pandas()
        elif isinstance(data, pl.LazyFrame):
            return data.collect().to_pandas()
        elif isinstance(data, pa.RecordBatch):
            return pa.Table.from_batches([data]).to_pandas()
        elif isinstance(data, pa.RecordBatchReader):
            return data.read_all().to_pandas()
        elif isinstance(data, dict):
            return pd.DataFrame(data)
        return data

    def create_engine(self):
        return create_engine(self.connection_string)

    def connect(self):
        if self.type_ == "sqlite":
            conn = sqlite3.connect(self.path)
            # Activate WAL mode:
            conn.execute("PRAGMA journal_mode=WAL;")
            return conn
        if self.type_ == "duckdb":
            return duckdb.connect(database=self.path)
        return self.create_engine().connect()

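The `__post_init__` branching reduces to a small mapping from database type to SQLAlchemy URL template plus an optional SSL suffix. A standalone sketch of the same logic (URL templates copied from the code above; the helper name is made up for illustration):

```python
# Illustrative helper mirroring __post_init__ above; not part of FlowerPower.
def build_connection_string(
    type_: str, username: str, password: str, server: str,
    port: int | str, database: str, ssl: bool = False,
) -> str:
    templates = {
        "postgres": ("postgresql://{u}:{p}@{s}:{o}/{d}", "?sslmode=require"),
        "mysql": ("mysql+pymysql://{u}:{p}@{s}:{o}/{d}", "?ssl=true"),
        "mssql": (
            "mssql+pyodbc://{u}:{p}@{s}:{o}/{d}?driver=ODBC+Driver+17+for+SQL+Server",
            ";Encrypt=yes;TrustServerCertificate=yes",
        ),
        "oracle": ("oracle+cx_oracle://{u}:{p}@{s}:{o}/{d}", "?ssl=true"),
    }
    base, ssl_suffix = templates[type_.lower()]
    url = base.format(u=username, p=password, s=server, o=port, d=database)
    return url + ssl_suffix if ssl else url

print(build_connection_string("postgres", "user", "pw", "localhost", 5432, "mydb", ssl=True))
# postgresql://user:pw@localhost:5432/mydb?sslmode=require
```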
# @attrs.define # Removed
class BaseDatabaseWriter(BaseDatabaseIO, gc=False):
    """
    Base class for database writing operations supporting various database systems.

    This class provides a foundation for database writing operations across different database systems,
    including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle.

    Args:
        type_ (str): Database type (sqlite, duckdb, postgres, mysql, mssql, oracle).
        table_name (str): Table name in the database.
        path (str | None, optional): File path for SQLite or DuckDB databases.
        connection_string (str | None, optional): Connection string for SQLAlchemy-based databases.
        username (str | None, optional): Username for the database.
        password (str | None, optional): Password for the database.
        server (str | None, optional): Server address for the database.
        port (str | None, optional): Port number for the database.
        database (str | None, optional): Database name.
        mode (str, optional): Write mode (append, replace, fail).
        concat (bool, optional): Concatenate multiple inputs into a single DataFrame.
        unique (bool | list[str] | str, optional): Unique columns for deduplication.

    Examples:
        ```python
        db_writer = BaseDatabaseWriter(
            type_="sqlite",
            table_name="table_name",
            path="path/to/database.db"
        )
        db_writer.write(data=df)
        ```

    Notes:
        - Supports multiple database systems including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle
        - Automatically handles database initialization based on connection parameters
        - Supports writing data to databases from DataFrames
    """

    mode: str = field(default="append")  # append, replace, fail
    concat: bool = field(default=False)
    unique: bool | list[str] | str = field(default=False)

    def _write_sqlite(
        self,
        data: pl.DataFrame
        | pl.LazyFrame
        | pa.Table
        | pa.RecordBatch
        | pa.RecordBatchReader
        | pd.DataFrame
        | dict[str, Any]
        | list[pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]],
        mode: str | None = None,
        concat: bool | None = None,
        unique: bool | list[str] | str | None = None,
    ) -> dict[str, Any]:
        if not self.path:
            raise ValueError("SQLite requires a file path.")

        data = to_pyarrow_table(
            data, unique=unique or self.unique, concat=concat or self.concat
        )
        if not isinstance(data, list):
            data = [data]

        with sqlite3.connect(self.path) as conn:
            # Activate WAL mode:
            conn.execute("PRAGMA journal_mode=WAL;")

            self._metadata = get_dataframe_metadata(
                df=data, path=self.connection_string, format=self.type_
            )

            for _data in data:
                df = self._to_pandas(_data)
                df.to_sql(self.table_name, conn, if_exists=mode or self.mode, index=False)

        return self._metadata

    def _write_duckdb(
        self,
        data: pl.DataFrame
        | pl.LazyFrame
        | pa.Table
        | pa.RecordBatch
        | pa.RecordBatchReader
        | pd.DataFrame
        | dict[str, Any]
        | list[pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]],
        mode: str | None = None,
        concat: bool | None = None,
        unique: bool | list[str] | str | None = None,
    ) -> dict[str, Any]:
        if not self.path:
            raise ValueError("DuckDB requires a file path.")

        data = to_pyarrow_table(
            data, unique=unique or self.unique, concat=concat or self.concat
        )
        if not isinstance(data, list):
            data = [data]

        self._metadata = get_dataframe_metadata(
            df=data, path=self.connection_string, format=self.type_
        )

        with duckdb.connect(database=self.path) as conn:
            mode = mode or self.mode
            for _data in data:
                conn.register(f"temp_{self.table_name}", _data)
                if mode == "append":
                    conn.execute(
                        f"CREATE TABLE IF NOT EXISTS {self.table_name} AS SELECT * FROM temp_{self.table_name} LIMIT 0;"
                    )
                    conn.execute(
                        f"INSERT INTO {self.table_name} SELECT * FROM temp_{self.table_name};"
                    )
                elif mode == "replace":
                    conn.execute(
                        f"CREATE OR REPLACE TABLE {self.table_name} AS SELECT * FROM temp_{self.table_name};"
                    )
                elif mode == "fail":
                    # CREATE TABLE without IF NOT EXISTS raises if the table already exists.
                    conn.execute(
                        f"CREATE TABLE {self.table_name} AS SELECT * FROM temp_{self.table_name};"
                    )

                # Registered Python objects are views, not tables, so remove the
                # temporary registration instead of issuing DROP TABLE.
                conn.unregister(f"temp_{self.table_name}")

        return self._metadata

    def _write_sqlalchemy(
        self,
        data: pl.DataFrame
        | pl.LazyFrame
        | pa.Table
        | pa.RecordBatch
        | pa.RecordBatchReader
        | pd.DataFrame
        | dict[str, Any]
        | list[pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]],
        mode: str | None = None,
        concat: bool | None = None,
        unique: bool | list[str] | str | None = None,
    ) -> dict[str, Any]:
        if not self.connection_string:
            raise ValueError(f"{self.type_} requires a connection string.")

        data = to_pyarrow_table(
            data, unique=unique or self.unique, concat=concat or self.concat
        )
        if not isinstance(data, list):
            data = [data]

        self._metadata = get_dataframe_metadata(
            df=data, path=self.connection_string, format=self.type_
        )

        engine = create_engine(self.connection_string)
        for _data in data:
            df = self._to_pandas(_data)
            df.to_sql(self.table_name, engine, if_exists=mode or self.mode, index=False)
        engine.dispose()

        return self._metadata

    def write(
        self,
        data: pl.DataFrame
        | pl.LazyFrame
        | pa.Table
        | pa.RecordBatch
        | pa.RecordBatchReader
        | pd.DataFrame
        | dict[str, Any]
        | list[pl.DataFrame | pl.LazyFrame | pa.Table | pd.DataFrame | dict[str, Any]],
        mode: str | None = None,
        concat: bool | None = None,
        unique: bool | list[str] | str | None = None,
    ) -> dict[str, Any]:
        """
        Write data to database.

        Args:
            data (pl.DataFrame | pl.LazyFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader | pd.DataFrame |
                dict[str, Any] | list[pl.DataFrame | pl.LazyFrame | pa.Table | pa.RecordBatch | pa.RecordBatchReader |
                pd.DataFrame | dict[str, Any]]): Data to write.
            mode (str, optional): Write mode (append, replace, fail).
            concat (bool, optional): Concatenate multiple inputs into a single DataFrame.
            unique (bool | list[str] | str, optional): Unique columns for deduplication.

        Returns:
            dict[str, Any]: Metadata of the written data.
        """
        db = self.type_.lower()
        if db == "sqlite":
            return self._write_sqlite(
                data=data, mode=mode, concat=concat, unique=unique
            )
        elif db == "duckdb":
            return self._write_duckdb(
                data=data, mode=mode, concat=concat, unique=unique
            )
        elif db in ["postgres", "mysql", "mssql", "oracle"]:
            return self._write_sqlalchemy(
                data=data, mode=mode, concat=concat, unique=unique
            )
        else:
            raise ValueError(f"Unsupported database type: {self.type_}")

    @property
    def metadata(self):
        if not hasattr(self, "_metadata"):
            return {}
        return self._metadata

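The three DuckDB write modes differ only in the SQL issued against the registered temp view. A compact standalone demonstration of the `append` path (table and column names are made up):

```python
import duckdb
import pyarrow as pa

conn = duckdb.connect()  # the writer uses duckdb.connect(database=self.path)
batch = pa.table({"id": [1, 2], "value": ["a", "b"]})

# Mirrors the append branch of _write_duckdb:
conn.register("temp_events", batch)
conn.execute("CREATE TABLE IF NOT EXISTS events AS SELECT * FROM temp_events LIMIT 0;")
conn.execute("INSERT INTO events SELECT * FROM temp_events;")
conn.unregister("temp_events")

print(conn.sql("SELECT count(*) FROM events").fetchone())  # (2,)
```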
# @attrs.define # Removed
class BaseDatabaseReader(BaseDatabaseIO, gc=False):
    """
    Base class for database read operations supporting various database systems.

    This class provides a foundation for database read operations across different database systems,
    including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle.

    Args:
        type_ (str): Database type (sqlite, duckdb, postgres, mysql, mssql, oracle).
        table_name (str): Table name in the database.
        path (str | None, optional): File path for SQLite or DuckDB databases.
        connection_string (str | None, optional): Connection string for SQLAlchemy-based databases.
        username (str | None, optional): Username for the database.
        password (str | None, optional): Password for the database.
        server (str | None, optional): Server address for the database.
        port (str | None, optional): Port number for the database.
        database (str | None, optional): Database name.
        query (str | None, optional): SQL query to execute.

    Examples:
        ```python
        db_reader = BaseDatabaseReader(
            type_="sqlite",
            table_name="table_name",
            path="path/to/database.db"
        )
        data = db_reader.read()
        ```

    Notes:
        - Supports multiple database systems including SQLite, DuckDB, PostgreSQL, MySQL, SQL Server, and Oracle
        - Automatically handles database initialization based on connection parameters
        - Supports reading data from databases into DataFrames
    """

    query: str | None = None

    def __post_init__(self):  # Renamed from __attrs_post_init__
        super().__post_init__()  # BaseDatabaseIO.__post_init__ builds the connection string
        if self.connection_string is not None:
            if "+" in self.connection_string:
                # Strip the SQLAlchemy driver suffix, e.g. "mysql+pymysql://..." -> "mysql://..."
                self.connection_string = (
                    f"{self.connection_string.split('+')[0]}://"
                    f"{self.connection_string.split('://')[1]}"
                )

    def _load(self, query: str | None = None, reload: bool = False, **kwargs) -> None:
        """Load data from database.

        Args:
            query (str, optional): SQL query to execute. If None, loads all data from the table.
            reload (bool, optional): Reload data if True.
            **kwargs: Additional keyword arguments.

        Returns:
            None
        """
        if query is None:
            query = f"SELECT * FROM {self.table_name}"
        else:
            # Substitute the `table` placeholder in the query with the configured table name.
            query = query.replace("table", self.table_name)

        engine = kwargs.pop("engine", "adbc")

        if query != self.query:
            reload = True

        self.query = query

        if self.type_ == "duckdb":
            if not self.path:
                raise ValueError("DuckDB requires a file path.")

            if not hasattr(self, "_data") or self._data is None or reload:
                with duckdb.connect(database=self.path) as conn:
                    self._data = conn.execute(query).arrow()

        else:
            if not self.connection_string:
                raise ValueError(f"{self.type_} requires a connection string.")
            if not hasattr(self, "_data") or self._data is None or reload:
                if engine == "connectorx":
                    cs = self.connection_string.replace("///", "//")
                else:
                    cs = self.connection_string
                data = (
                    pl.read_database_uri(
                        query=query,
                        uri=cs,
                        engine=engine,
                        **kwargs,
                    )
                ).to_arrow()
                self._data = data.cast(convert_large_types_to_standard(data.schema))

        self._metadata = get_dataframe_metadata(
            self._data, path=self.connection_string, format=self.type_
        )

    def to_polars(
        self,
        query: str | None = None,
        reload: bool = False,
        metadata: bool = False,
        **kwargs,
    ) -> pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]:
        """Convert data to Polars DataFrame.

        Args:
            query (str, optional): SQL query to execute. If None, loads all data from the table.
            reload (bool, optional): Reload data if True.
            metadata (bool, optional): Include metadata in the output. Default is False.
            **kwargs: Additional keyword arguments.

        Returns:
            pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]: Polars DataFrame or tuple of DataFrame and metadata.
        """
        self._load(query=query, reload=reload, **kwargs)
        df = pl.from_arrow(self._data)
        if metadata:
            return df, self.metadata
        return df

    def to_pandas(
        self,
        query: str | None = None,
        reload: bool = False,
        metadata: bool = False,
        **kwargs,
    ) -> pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]:
        """Convert data to Pandas DataFrame.

        Args:
            query (str, optional): SQL query to execute. If None, loads all data from the table.
            reload (bool, optional): Reload data if True.
            metadata (bool, optional): Include metadata in the output. Default is False.
            **kwargs: Additional keyword arguments.

        Returns:
            pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]: Pandas DataFrame or tuple of DataFrame and metadata.
        """
        self._load(query=query, reload=reload, **kwargs)
        df = self._data.to_pandas()
        if metadata:
            return df, self.metadata
        return df

    def to_pyarrow_table(
        self,
        query: str | None = None,
        reload: bool = False,
        metadata: bool = False,
        **kwargs,
    ) -> pa.Table | tuple[pa.Table, dict[str, Any]]:
        """Convert data to PyArrow Table.

        Args:
            query (str, optional): SQL query to execute. If None, loads all data from the table.
            reload (bool, optional): Reload data if True.
            metadata (bool, optional): Include metadata in the output. Default is False.
            **kwargs: Additional keyword arguments.

        Returns:
            pa.Table | tuple[pa.Table, dict[str, Any]]: PyArrow Table or tuple of Table and metadata.
        """
        self._load(query=query, reload=reload, **kwargs)
        if metadata:
            return self._data, self.metadata
        return self._data

    def to_duckdb_relation(
        self,
        query: str | None = None,
        reload: bool = False,
        metadata: bool = False,
        conn: duckdb.DuckDBPyConnection | None = None,
        **kwargs,
    ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
        """Convert data to DuckDB relation.

        Args:
            query (str, optional): SQL query to execute. If None, loads all data from the table.
            reload (bool, optional): Reload data if True.
            conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
            metadata (bool, optional): Include metadata in the output. Default is False.
            **kwargs: Additional keyword arguments.

        Returns:
            duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]: DuckDB relation and optional
                metadata.
        """
        self._load(query=query, reload=reload, **kwargs)
        if self._conn is None:
            if conn is None:
                conn = duckdb.connect()
            self._conn = conn
        if metadata:
            return self._conn.from_arrow(self._data), self.metadata
        return self._conn.from_arrow(self._data)

    def register_in_duckdb(
        self,
        query: str | None = None,
        reload: bool = False,
        conn: duckdb.DuckDBPyConnection | None = None,
        name: str | None = None,
        **kwargs,
    ) -> None:
        """Register data in DuckDB.

        Args:
            query (str, optional): SQL query to execute. If None, loads all data from the table.
            reload (bool, optional): Reload data if True.
            conn (duckdb.DuckDBPyConnection, optional): DuckDB connection instance.
            name (str, optional): Name of the relation.
            **kwargs: Additional keyword arguments.

        Returns:
            None
        """
        if name is None:
            name = f"{self.type_}:{self.table_name}"

        if self._conn is None:
            if conn is None:
                conn = duckdb.connect()
            self._conn = conn

        self._load(query=query, reload=reload, **kwargs)
        self._conn.register(name, self._data)

    def register_in_datafusion(
        self,
        query: str | None = None,
        reload: bool = False,
        ctx: datafusion.SessionContext | None = None,
        name: str | None = None,
        **kwargs,
    ) -> None:
        """Register data in DataFusion.

        Args:
            query (str, optional): SQL query to execute. If None, loads all data from the table.
            reload (bool, optional): Reload data if True.
            ctx (datafusion.SessionContext, optional): DataFusion session context instance.
            name (str, optional): Name of the relation.
            **kwargs: Additional keyword arguments.

        Returns:
            None
        """
        if name is None:
            name = f"{self.type_}:{self.table_name}"

        if self._ctx is None:
            if ctx is None:
                ctx = datafusion.SessionContext()
            self._ctx = ctx

        self._load(query=query, reload=reload, **kwargs)

        self._ctx.register_record_batches(name, [self.to_pyarrow_table().to_batches()])

    @property
    def metadata(self):
        if not hasattr(self, "_metadata"):
            self._load()
        return self._metadata