FlowerPower 0.11.6.20-py3-none-any.whl → 0.20.0-py3-none-any.whl
This diff compares the contents of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- flowerpower/cfg/__init__.py +3 -3
- flowerpower/cfg/pipeline/__init__.py +5 -3
- flowerpower/cfg/project/__init__.py +3 -3
- flowerpower/cfg/project/job_queue.py +1 -128
- flowerpower/cli/__init__.py +5 -5
- flowerpower/cli/cfg.py +0 -3
- flowerpower/cli/job_queue.py +400 -132
- flowerpower/cli/pipeline.py +14 -413
- flowerpower/cli/utils.py +0 -1
- flowerpower/flowerpower.py +537 -28
- flowerpower/job_queue/__init__.py +5 -94
- flowerpower/job_queue/base.py +201 -3
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
- flowerpower/job_queue/rq/manager.py +388 -77
- flowerpower/pipeline/__init__.py +2 -0
- flowerpower/pipeline/base.py +2 -2
- flowerpower/pipeline/io.py +14 -16
- flowerpower/pipeline/manager.py +21 -642
- flowerpower/pipeline/pipeline.py +571 -0
- flowerpower/pipeline/registry.py +242 -10
- flowerpower/pipeline/visualizer.py +1 -2
- flowerpower/plugins/_io/__init__.py +8 -0
- flowerpower/plugins/mqtt/manager.py +6 -6
- flowerpower/settings/backend.py +0 -2
- flowerpower/settings/job_queue.py +1 -57
- flowerpower/utils/misc.py +0 -256
- flowerpower/utils/monkey.py +1 -83
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
- flowerpower-0.20.0.dist-info/RECORD +58 -0
- flowerpower/fs/__init__.py +0 -29
- flowerpower/fs/base.py +0 -662
- flowerpower/fs/ext.py +0 -2143
- flowerpower/fs/storage_options.py +0 -1420
- flowerpower/job_queue/apscheduler/__init__.py +0 -11
- flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
- flowerpower/job_queue/apscheduler/manager.py +0 -1051
- flowerpower/job_queue/apscheduler/setup.py +0 -554
- flowerpower/job_queue/apscheduler/trigger.py +0 -169
- flowerpower/job_queue/apscheduler/utils.py +0 -311
- flowerpower/pipeline/job_queue.py +0 -583
- flowerpower/pipeline/runner.py +0 -603
- flowerpower/plugins/io/base.py +0 -2520
- flowerpower/plugins/io/helpers/datetime.py +0 -298
- flowerpower/plugins/io/helpers/polars.py +0 -875
- flowerpower/plugins/io/helpers/pyarrow.py +0 -570
- flowerpower/plugins/io/helpers/sql.py +0 -202
- flowerpower/plugins/io/loader/__init__.py +0 -28
- flowerpower/plugins/io/loader/csv.py +0 -37
- flowerpower/plugins/io/loader/deltatable.py +0 -190
- flowerpower/plugins/io/loader/duckdb.py +0 -19
- flowerpower/plugins/io/loader/json.py +0 -37
- flowerpower/plugins/io/loader/mqtt.py +0 -159
- flowerpower/plugins/io/loader/mssql.py +0 -26
- flowerpower/plugins/io/loader/mysql.py +0 -26
- flowerpower/plugins/io/loader/oracle.py +0 -26
- flowerpower/plugins/io/loader/parquet.py +0 -35
- flowerpower/plugins/io/loader/postgres.py +0 -26
- flowerpower/plugins/io/loader/pydala.py +0 -19
- flowerpower/plugins/io/loader/sqlite.py +0 -23
- flowerpower/plugins/io/metadata.py +0 -244
- flowerpower/plugins/io/saver/__init__.py +0 -28
- flowerpower/plugins/io/saver/csv.py +0 -36
- flowerpower/plugins/io/saver/deltatable.py +0 -186
- flowerpower/plugins/io/saver/duckdb.py +0 -19
- flowerpower/plugins/io/saver/json.py +0 -36
- flowerpower/plugins/io/saver/mqtt.py +0 -28
- flowerpower/plugins/io/saver/mssql.py +0 -26
- flowerpower/plugins/io/saver/mysql.py +0 -26
- flowerpower/plugins/io/saver/oracle.py +0 -26
- flowerpower/plugins/io/saver/parquet.py +0 -36
- flowerpower/plugins/io/saver/postgres.py +0 -26
- flowerpower/plugins/io/saver/pydala.py +0 -20
- flowerpower/plugins/io/saver/sqlite.py +0 -24
- flowerpower/utils/scheduler.py +0 -311
- flowerpower-0.11.6.20.dist-info/RECORD +0 -102
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0
flowerpower/plugins/io/loader/mysql.py
@@ -1,26 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatabaseReader
-
-
-# @attrs.define
-class MySQLReader(BaseDatabaseReader, gc=False):
-    """MySQL loader.
-
-    This class is responsible for loading dataframes from MySQL database.
-
-    Examples:
-        ```python
-        loader = MySQLReader(table_name="table", host="localhost",
-            port=5432, username="user", password="password",
-            database="database")
-        df = loader.to_polars()
-
-        # or
-        loader = MySQLReader(table_name="table",
-            connection_string="mssql+pyodbc://user:password@localhost:5432/database")
-        df = loader.to_pyarrow_table("SELECT * FROM table WHERE column = 'value'")
-        ```
-    """
-
-    type_: str = field(default="mysql")
flowerpower/plugins/io/loader/oracle.py
@@ -1,26 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatabaseReader
-
-
-# @attrs.define
-class OracleDBReader(BaseDatabaseReader, gc=False):
-    """OracleDB loader.
-
-    This class is responsible for loading dataframes from OracleDB database.
-
-    Examples:
-        ```python
-        loader = OracleDBReader(table_name="table", host="localhost",
-            port=5432, username="user", password="password",
-            database="database")
-        df = loader.to_polars()
-
-        # or
-        loader = OracleDBReader(table_name="table",
-            connection_string="mssql+pyodbc://user:password@localhost:5432/database")
-        df = loader.to_pyarrow_table("SELECT * FROM table WHERE column = 'value'")
-        ```
-    """
-
-    type_: str = field(default="oracle")
flowerpower/plugins/io/loader/parquet.py
@@ -1,35 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatasetReader, BaseFileReader
-
-
-# @attrs.define
-class ParquetFileReader(BaseFileReader, gc=False):
-    """Parquet file loader.
-
-    This class is responsible for loading dataframes from Parquet files.
-
-    Examples:
-        ```python
-        loader = ParquetFileReader("data.parquet")
-        df = loader.load()
-        ```
-    """
-
-    format: str = field(default="parquet")
-
-
-# @attrs.define
-class ParquetDatasetReader(BaseDatasetReader, gc=False):
-    """Parquet dataset loader.
-
-    This class is responsible for loading dataframes from Parquet dataset.
-
-    Examples:
-        ```python
-        loader = ParquetDatasetReader("parquet_data/")
-        df = loader.load()
-        ```
-    """
-
-    format: str = field(default="parquet")
flowerpower/plugins/io/loader/postgres.py
@@ -1,26 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatabaseReader
-
-
-# @attrs.define
-class PostgreSQLReader(BaseDatabaseReader, gc=False):
-    """PostgreSQL loader.
-
-    This class is responsible for loading dataframes from PostgreSQL database.
-
-    Examples:
-        ```python
-        loader = PostgreSQLReader(table_name="table", host="localhost",
-            port=5432, username="user", password="password",
-            database="database")
-        df = loader.to_polars()
-
-        # or
-        loader = PostgreSQLReader(table_name="table",
-            connection_string="mssql+pyodbc://user:password@localhost:5432/database")
-        df = loader.to_pyarrow_table("SELECT * FROM table WHERE column = 'value'")
-        ```
-    """
-
-    type_: str = field(default="postgres")
flowerpower/plugins/io/loader/pydala.py
@@ -1,19 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatasetReader
-
-
-# @attrs.define
-class PydalaDatasetReader(BaseDatasetReader, gc=False):
-    """Pydala dataset loader.
-
-    This class is responsible for loading dataframes from Pydala dataset.
-
-    Examples:
-        ```python
-        loader = PydalaDatasetReader("pydala_data/")
-        df = loader.load()
-        ```
-    """
-
-    format: str = field(default="parquet")
flowerpower/plugins/io/loader/sqlite.py
@@ -1,23 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatabaseReader
-
-
-# @attrs.define
-class SQLiteReader(BaseDatabaseReader, gc=False):
-    """SQLite loader.
-
-    This class is responsible for loading dataframes from SQLite database.
-
-    Examples:
-        ```python
-        loader = SQLiteReader(table_name="table", path="data.db")
-        df = loader.to_polars("SELECT * FROM table WHERE column = 'value'")
-
-        # or
-        loader = SQLiteReader(table_name="table", connection_string="sqlite://data.db")
-        df = loader.to_pyarrow_table()
-        ```
-    """
-
-    type_: str = field(default="sqlite")
flowerpower/plugins/io/metadata.py
@@ -1,244 +0,0 @@
-import datetime as dt
-import importlib
-import os
-
-import duckdb
-import pandas as pd
-import polars as pl
-import pyarrow as pa
-import pyarrow.dataset as pds
-from deltalake import DeltaTable
-from fsspec import AbstractFileSystem
-
-from ...fs.ext import path_to_glob
-
-
-def get_serializable_schema(
-    data: (
-        pd.DataFrame
-        | pl.DataFrame
-        | pl.LazyFrame
-        | duckdb.DuckDBPyRelation
-        | pa.Table
-        | pa.Schema
-        | pa.RecordBatch
-        | pa.RecordBatchReader
-        | pds.Dataset
-    ),
-) -> dict[str, str]:
-    """
-    Convert DataFrame dtypes to a serializable dictionary.
-
-    Args:
-        data: DataFrame
-
-    Returns:
-        dict mapping column names to dtype strings
-    """
-    if isinstance(data, pd.DataFrame):
-        return {col: str(dtype) for col, dtype in data.dtypes.items()}
-    elif isinstance(data, pl.DataFrame):
-        return data.schema.to_python()
-    elif isinstance(data, pl.LazyFrame):
-        return data.collect_schema().to_python()
-    elif isinstance(data, duckdb.DuckDBPyRelation):
-        return dict(zip(data.columns, [str(dtype) for dtype in data.types]))
-    elif isinstance(
-        data, pa.Table | pa.RecordBatch | pa.RecordBatchReader | pds.Dataset
-    ):
-        return dict(zip(data.schema.names, [str(dtype) for dtype in data.schema.types]))
-    elif isinstance(data, pa.Schema):
-        return dict(zip(data.names, [str(dtype) for dtype in data.types]))
-
-
-def get_dataframe_metadata(
-    df: pd.DataFrame
-    | pl.DataFrame
-    | pl.LazyFrame
-    | pa.Table
-    | pa.RecordBatch
-    | pa.RecordBatchReader
-    | list[
-        pd.DataFrame
-        | pl.DataFrame
-        | pl.LazyFrame
-        | pa.Table
-        | pa.RecordBatch
-        | pa.RecordBatchReader
-    ],
-    path: str | list[str] | None = None,
-    format: str | None = None,
-    topic: str | None = None,
-    num_files: int | None = None,
-    partition_columns: list[str] | None = None,
-    fs: AbstractFileSystem | None = None,
-    **kwargs,
-) -> dict:
-    """
-    Get metadata for a DataFrame.
-
-    Args:
-        df: DataFrame
-        path: Path to the file(s) that the DataFrame was loaded from
-        fs: Optional filesystem
-        kwargs: Additional metadata fields
-
-    Returns:
-        dict: DataFrame metadata
-    """
-    if isinstance(df, list):
-        schema = get_serializable_schema(df[0])
-        num_rows = sum(df_.shape[0] for df_ in df)
-    else:
-        schema = get_serializable_schema(df)
-        num_rows = df.shape[0] if hasattr(df, "shape") else None
-
-    if path is not None and num_files is None:
-        if isinstance(path, list):
-            num_files = len(path)
-        else:
-            path_ = path_to_glob(path=path, format=format)
-            num_files = len(fs.glob(path_)) if fs is not None else None
-
-    if partition_columns is not None:
-        schema = {k: v for k, v in schema.items() if k not in partition_columns}
-
-    metadata = {
-        "path": path,
-        "topic": topic,
-        "format": format,
-        "timestamp": int(dt.datetime.now().timestamp() * 1000),
-        "schema": schema,
-        "partition_columns": partition_columns,
-        "num_columns": len(schema),
-        "num_rows": num_rows,
-        "num_files": num_files,
-    }
-    metadata.update(kwargs)
-    return {k: v for k, v in metadata.items() if v is not None}
-
-
-def get_duckdb_metadata(
-    rel: duckdb.DuckDBPyRelation,
-    path: str,
-    format: str,
-    fs: AbstractFileSystem | None = None,
-    include_shape: bool = False,
-    include_num_files: bool = False,
-    partition_columns: list[str] | None = None,
-    **kwargs,
-) -> dict:
-    """
-    Get metadata for a DuckDBPyRelation.
-
-    Args:
-        rel: DuckDBPyRelation
-        path: Path to the file(s) that the DuckDBPyRelation was loaded from
-        fs: Filesystem
-        include_shape: Include shape in metadata
-        include_num_files: Include number of files in metadata
-        kwargs: Additional metadata fields
-
-    Returns:
-        dict: DuckDBPyRelation metadata
-    """
-
-    schema = get_serializable_schema(rel)
-    if include_shape:
-        shape = rel.shape
-    else:
-        shape = None
-    if partition_columns is not None:
-        schema = {k: v for k, v in schema.items() if k not in partition_columns}
-
-    metadata = {
-        "path": path,
-        "format": format,
-        "timestamp": dt.datetime.now().timestamp(),
-        "schema": schema,
-        "partition_columns": partition_columns,
-        "num_columns": shape[1] if shape else None,
-        "num_rows": shape[0] if shape else None,
-        "num_files": len(fs.glob(path)) if include_num_files else None,
-    }
-    metadata.update(kwargs)
-    return {k: v for k, v in metadata.items() if v is not None}
-
-
-def get_pyarrow_dataset_metadata(
-    ds: pds.Dataset,
-    path: str,
-    format: str,
-    **kwargs,
-) -> dict:
-    schema = get_serializable_schema(ds.schema)
-    files = ds.files
-
-    metadata = {
-        "path": path or os.path.dirname(files[0]),
-        "format": format,
-        "timestamp": dt.datetime.now().timestamp(),
-        "schema": schema,
-        "partition_columns": ds.partitioning.schema.names if ds.partitioning else None,
-        "num_columns": len(ds.schema),
-        "num_rows": None,
-        "num_files": len(files),
-    }
-    metadata.update(kwargs)
-    return metadata
-
-
-def get_delta_metadata(
-    dtable: DeltaTable,
-    path: str,
-    **kwargs,
-) -> dict:
-    dt_meta = dtable.metadata()
-    dt_schema = dtable.schema().to_pyarrow()
-    metadata = {
-        "path": path,
-        "format": "delta",
-        "timestamp": dt.datetime.now().timestamp(),
-        "schema": dict(zip(dt_schema.names, [str(x) for x in dt_schema.types])),
-        "partition_columns": dt_meta.partition_columns
-        if hasattr(dt_meta, "partition_columns")
-        else None,
-        "num_columns": len(dt_schema),
-        "num_files": len(dtable.files()),
-        "name": dt_meta.name or kwargs.get("name", None),
-        "description": dt_meta.description or kwargs.get("description", None),
-        "id": dt_meta.id or kwargs.get("id", None),
-    }
-
-    return {k: v for k, v in metadata.items() if v is not None}
-
-
-if importlib.util.find_spec("orjson"):
-    import orjson
-
-    def get_mqtt_metadata(
-        payload: bytes | dict[str, any],
-        topic: str | None = None,
-        **kwargs,
-    ) -> dict:
-        if isinstance(payload, bytes):
-            payload = orjson.loads(payload)
-
-        schema = get_serializable_schema(payload)
-        metadata = {
-            "topic": topic,
-            "format": "mqtt",
-            "timestamp": dt.datetime.now().timestamp(),
-            "schema": schema,
-            "num_columns": len(schema),
-            "num_rows": len(payload),
-            "name": kwargs.get("name", None),
-            "description": kwargs.get("description", None),
-            "id": kwargs.get("id", None),
-        }
-        return metadata
-
-else:
-
-    def get_mqtt_metadata(*args, **kwargs):
-        raise ImportError("orjson not installed")
flowerpower/plugins/io/saver/__init__.py
@@ -1,28 +0,0 @@
-from .csv import CSVDatasetWriter, CSVFileWriter
-from .deltatable import DeltaTableWriter
-from .duckdb import DuckDBWriter
-from .json import JsonDatasetWriter, JsonFileWriter
-from .mssql import MSSQLWriter
-from .mysql import MySQLWriter
-from .oracle import OracleDBWriter
-from .parquet import ParquetDatasetWriter, ParquetFileWriter
-from .postgres import PostgreSQLWriter
-from .pydala import PydalaDatasetWriter
-from .sqlite import SQLiteWriter
-
-__all__ = [
-    "CSVFileWriter",
-    "CSVDatasetWriter",
-    "DeltaTableWriter",
-    "DuckDBWriter",
-    "JsonFileWriter",
-    "JsonDatasetWriter",
-    "MSSQLWriter",
-    "MySQLWriter",
-    "OracleDBWriter",
-    "ParquetFileWriter",
-    "ParquetDatasetWriter",
-    "PostgreSQLWriter",
-    "PydalaDatasetWriter",
-    "SQLiteWriter",
-]
flowerpower/plugins/io/saver/csv.py
@@ -1,36 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatasetWriter, BaseFileWriter
-
-
-# @attrs.define
-class CSVFileWriter(BaseFileWriter, gc=False):
-    """CSV file writer.
-
-    This class is responsible for writing dataframes to CSV files.
-
-    Examples:
-        ```python
-        writer = CSVFileWriter(df, "data.csv")
-        writer.write()
-        ```
-    """
-
-    format: str = field(default="csv")
-
-
-# @attrs.define
-class CSVDatasetWriter(BaseDatasetWriter, gc=False):
-    """CSV dataset writer.
-
-    This class is responsible for writing dataframes to CSV dataset.
-
-    Examples:
-        ```python
-        writer = CSVDatasetWriter(df, "csv_data/")
-        writer.write()
-        ```
-
-    """
-
-    format: str = field(default="csv")
flowerpower/plugins/io/saver/deltatable.py
@@ -1,186 +0,0 @@
-from typing import Any
-
-import pandas as pd
-import polars as pl
-import pyarrow as pa
-from deltalake.transaction import CommitProperties, PostCommitHookProperties
-from deltalake.writer import (ColumnProperties, WriterProperties,
-                              write_deltalake)
-from msgspec import field
-from redis import Redis, StrictRedis
-from sherlock import RedisLock
-
-from ....utils.misc import _dict_to_dataframe
-from ..base import BaseDatasetWriter
-from ..metadata import get_dataframe_metadata
-
-
-# @attrs.define
-class DeltaTableWriter(BaseDatasetWriter, gc=False):
-    """Delta table writer.
-
-    This class is responsible for writing dataframes to Delta tables.
-
-    Examples:
-        ```python
-        writer = DeltaTableWriter("data/")
-        writer.write(df)
-        ```
-    """
-
-    description: str | None = None
-    with_lock: bool = False
-    redis: StrictRedis | Redis | None = None
-    format: str = field(default="delta")
-
-    def __post_init__(self):
-        super().__post_init__()
-        if self.with_lock and self.redis is None:
-            raise ValueError("Redis connection is required when using locks.")
-
-    def write(
-        self,
-        data: (
-            pl.DataFrame
-            | pl.LazyFrame
-            | pa.Table
-            | pa.RecordBatch
-            | pa.RecordBatchReader
-            | pd.DataFrame
-            | dict[str, Any]
-            | list[
-                pl.DataFrame
-                | pl.LazyFrame
-                | pa.Table
-                | pa.RecordBatch
-                | pa.RecordBatchReader
-                | pd.DataFrame
-                | dict[str, Any]
-            ]
-        )
-        | None = None,
-        mode: str = "append",  # "overwrite" | "append" | "error | "ignore"
-        # schema: pa.Schema | None = None,
-        schema_mode: str | None = None,  # "merge" | "overwrite"
-        partition_by: list[str] | None = None,
-        # partition_filters: list[tuple[str, str, Any]] | None = None,
-        predicate: str | None = None,
-        target_file_size: int | None = None,
-        # large_dtypes: bool = False,
-        # custom_metadata: dict[str, Any] | None = None,
-        post_commithook_properties: PostCommitHookProperties | None = None,
-        commit_properties: CommitProperties | None = None,
-        # writerproperties
-        data_page_size_limit: int | None = None,
-        dictionary_page_size_limit: int | None = None,
-        data_page_row_count_limit: int | None = None,
-        write_batch_size: int | None = None,
-        max_row_group_size: int | None = None,
-        compression: str | None = None,
-        compression_level: int | None = None,
-        statistics_truncate_length: int | None = None,
-        default_column_properties: ColumnProperties | None = None,
-        column_properties: dict[str, ColumnProperties] | None = None,
-    ) -> dict[str, Any]:
-        """
-        Write data to a Delta table.
-
-        Args:
-            data: Data to write
-            mode: Write mode
-            schema: Schema of the data
-            schema_mode: Schema mode
-            partition_by: Columns to partition by
-            partition_filters: Filters to apply to the partitions
-            predicate: Predicate to apply to the data
-            target_file_size: Target file size
-            large_dtypes: Whether to use large dtypes
-            custom_metadata: Custom metadata
-            post_commithook_properties: Post-commit hook properties
-            commit_properties: Commit properties
-            data_page_size_limit: Data page size limit
-            dictionary_page_size_limit: Dictionary page size limit
-            data_page_row_count_limit: Data page row count limit
-            write_batch_size: Write batch size
-            max_row_group_size: Maximum row group size
-            compression: Compression method
-            compression_level: Compression level
-            statistics_truncate_length: Statistics truncate length
-            default_column_properties: Default column properties
-            column_properties: Column properties
-
-        Returns:
-            Metadata
-        """
-        if data is None:
-            data = self.data
-        if isinstance(data, dict):
-            data = _dict_to_dataframe(data)
-        if not isinstance(data, list):
-            data = [data]
-        if isinstance(data[0], dict):
-            data = [_dict_to_dataframe(d) for d in data]
-        if isinstance(data[0], pl.LazyFrame):
-            data = [d.collect() for d in data]
-        if isinstance(data[0], pl.DataFrame):
-            data = pl.concat(data, how="diagonal_relaxed").to_arrow()
-        if isinstance(data[0], pd.DataFrame):
-            data = pa.concat_tables(
-                [pa.Table.from_pandas(d, preserve_index=False) for d in data],
-                promote_options="permissive",
-            )
-        if isinstance(data[0], pa.RecordBatch | pa.RecordBatchReader):
-            data = pa.Table.from_batches(data)
-        if isinstance(data[0], pa.Table):
-            data = pa.concat_tables(data, promote_options="permissive")
-
-        metadata = get_dataframe_metadata(
-            data, path=self._base_path, format=self.format
-        )
-
-        writer_properties = WriterProperties(
-            data_page_size_limit=data_page_size_limit,
-            dictionary_page_size_limit=dictionary_page_size_limit,
-            data_page_row_count_limit=data_page_row_count_limit,
-            write_batch_size=write_batch_size,
-            max_row_group_size=max_row_group_size or self.row_group_size,
-            compression=compression or self.compression.upper(),
-            compression_level=compression_level,
-            statistics_truncate_length=statistics_truncate_length,
-            default_column_properties=default_column_properties,
-            column_properties=column_properties,
-        )
-
-        def _write():
-            write_deltalake(
-                self._base_path,
-                data,
-                mode=mode,
-                # schema=schema or self.schema_,
-                partition_by=partition_by or self.partition_by,
-                storage_options=self.storage_options.to_object_store_kwargs(),
-                description=self.description,
-                schema_mode=schema_mode,
-                # partition_filters=partition_filters,
-                predicate=predicate,
-                target_file_size=target_file_size,
-                # large_dtypes=large_dtypes,
-                # custom_metadata=custom_metadata,
-                post_commithook_properties=post_commithook_properties,
-                commit_properties=commit_properties,
-                writer_properties=writer_properties,
-            )
-
-        if self.with_lock:
-            with RedisLock(
-                lock_name=self._base_path,
-                namespace="flowerpower",
-                client=self.redis,
-                expire=10,
-                timeout=5,
-                retry_interval=0.1,
-            ):
-                _write()
-        else:
-            _write()
-        return metadata
flowerpower/plugins/io/saver/duckdb.py
@@ -1,19 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatabaseWriter
-
-
-# @attrs.define
-class DuckDBWriter(BaseDatabaseWriter, gc=False):
-    """DuckDB writer.
-
-    This class is responsible for writing dataframes to DuckDB database.
-
-    Examples:
-        ```python
-        writer = DuckDBWriter(table_name="table", path="data.db")
-        writer.write(df)
-        ```
-    """
-
-    type_: str = field(default="duckdb")