FlowerPower 0.11.6.20__py3-none-any.whl → 0.20.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flowerpower/cfg/__init__.py +3 -3
- flowerpower/cfg/pipeline/__init__.py +5 -3
- flowerpower/cfg/project/__init__.py +3 -3
- flowerpower/cfg/project/job_queue.py +1 -128
- flowerpower/cli/__init__.py +5 -5
- flowerpower/cli/cfg.py +0 -3
- flowerpower/cli/job_queue.py +400 -132
- flowerpower/cli/pipeline.py +14 -413
- flowerpower/cli/utils.py +0 -1
- flowerpower/flowerpower.py +537 -28
- flowerpower/job_queue/__init__.py +5 -94
- flowerpower/job_queue/base.py +201 -3
- flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -3
- flowerpower/job_queue/rq/manager.py +388 -77
- flowerpower/pipeline/__init__.py +2 -0
- flowerpower/pipeline/base.py +2 -2
- flowerpower/pipeline/io.py +14 -16
- flowerpower/pipeline/manager.py +21 -642
- flowerpower/pipeline/pipeline.py +571 -0
- flowerpower/pipeline/registry.py +242 -10
- flowerpower/pipeline/visualizer.py +1 -2
- flowerpower/plugins/_io/__init__.py +8 -0
- flowerpower/plugins/mqtt/manager.py +6 -6
- flowerpower/settings/backend.py +0 -2
- flowerpower/settings/job_queue.py +1 -57
- flowerpower/utils/misc.py +0 -256
- flowerpower/utils/monkey.py +1 -83
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/METADATA +308 -152
- flowerpower-0.20.0.dist-info/RECORD +58 -0
- flowerpower/fs/__init__.py +0 -29
- flowerpower/fs/base.py +0 -662
- flowerpower/fs/ext.py +0 -2143
- flowerpower/fs/storage_options.py +0 -1420
- flowerpower/job_queue/apscheduler/__init__.py +0 -11
- flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
- flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
- flowerpower/job_queue/apscheduler/manager.py +0 -1051
- flowerpower/job_queue/apscheduler/setup.py +0 -554
- flowerpower/job_queue/apscheduler/trigger.py +0 -169
- flowerpower/job_queue/apscheduler/utils.py +0 -311
- flowerpower/pipeline/job_queue.py +0 -583
- flowerpower/pipeline/runner.py +0 -603
- flowerpower/plugins/io/base.py +0 -2520
- flowerpower/plugins/io/helpers/datetime.py +0 -298
- flowerpower/plugins/io/helpers/polars.py +0 -875
- flowerpower/plugins/io/helpers/pyarrow.py +0 -570
- flowerpower/plugins/io/helpers/sql.py +0 -202
- flowerpower/plugins/io/loader/__init__.py +0 -28
- flowerpower/plugins/io/loader/csv.py +0 -37
- flowerpower/plugins/io/loader/deltatable.py +0 -190
- flowerpower/plugins/io/loader/duckdb.py +0 -19
- flowerpower/plugins/io/loader/json.py +0 -37
- flowerpower/plugins/io/loader/mqtt.py +0 -159
- flowerpower/plugins/io/loader/mssql.py +0 -26
- flowerpower/plugins/io/loader/mysql.py +0 -26
- flowerpower/plugins/io/loader/oracle.py +0 -26
- flowerpower/plugins/io/loader/parquet.py +0 -35
- flowerpower/plugins/io/loader/postgres.py +0 -26
- flowerpower/plugins/io/loader/pydala.py +0 -19
- flowerpower/plugins/io/loader/sqlite.py +0 -23
- flowerpower/plugins/io/metadata.py +0 -244
- flowerpower/plugins/io/saver/__init__.py +0 -28
- flowerpower/plugins/io/saver/csv.py +0 -36
- flowerpower/plugins/io/saver/deltatable.py +0 -186
- flowerpower/plugins/io/saver/duckdb.py +0 -19
- flowerpower/plugins/io/saver/json.py +0 -36
- flowerpower/plugins/io/saver/mqtt.py +0 -28
- flowerpower/plugins/io/saver/mssql.py +0 -26
- flowerpower/plugins/io/saver/mysql.py +0 -26
- flowerpower/plugins/io/saver/oracle.py +0 -26
- flowerpower/plugins/io/saver/parquet.py +0 -36
- flowerpower/plugins/io/saver/postgres.py +0 -26
- flowerpower/plugins/io/saver/pydala.py +0 -20
- flowerpower/plugins/io/saver/sqlite.py +0 -24
- flowerpower/utils/scheduler.py +0 -311
- flowerpower-0.11.6.20.dist-info/RECORD +0 -102
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/WHEEL +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/entry_points.txt +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/licenses/LICENSE +0 -0
- {flowerpower-0.11.6.20.dist-info → flowerpower-0.20.0.dist-info}/top_level.txt +0 -0

flowerpower/plugins/io/helpers/sql.py
@@ -1,202 +0,0 @@
-import re
-from typing import Any
-
-import pyarrow as pa
-import pyarrow.compute as pc
-from sqlglot import exp, parse_one
-
-from .datetime import timestamp_from_string
-from .polars import pl
-
-# Compile regex patterns once for efficiency
-SPLIT_PATTERN = re.compile(
-    r"<=|<|>=|>|=|!=|\s+[n,N][o,O][t,T]\s+[i,I][n,N]\s+|\s+[i,I][n,N]\s+|"
-    r"\s+[i,I][s,S]\s+[n,N][o,O][t,T]\s+[n,N][u,U][l,L]{2}\s+|\s+[i,I][s,S]\s+[n,N][u,U][l,L]{2}\s+"
-)
-LOGICAL_OPERATORS_PATTERN = re.compile(
-    r"\s+[a,A][n,N][d,D] [n,N][o,O][t,T]\s+|\s+[a,A][n,N][d,D]\s+|"
-    r"\s+[o,O][r,R] [n,N][o,O][t,T]\s+|\s+[o,O][r,R]\s+"
-)
-
-
-def sql2pyarrow_filter(string: str, schema: pa.Schema) -> pc.Expression:
-    """
-    Generates a filter expression for PyArrow based on a given string and schema.
-
-    Parameters:
-        string (str): The string containing the filter expression.
-        schema (pa.Schema): The PyArrow schema used to validate the filter expression.
-
-    Returns:
-        pc.Expression: The generated filter expression.
-
-    Raises:
-        ValueError: If the input string is invalid or contains unsupported operations.
-    """
-
-    def parse_value(val: str, type_: pa.DataType) -> Any:
-        """Parse and convert value based on the field type."""
-        if isinstance(val, (tuple, list)):
-            return type(val)(parse_value(v, type_) for v in val)
-
-        if pa.types.is_timestamp(type_):
-            return timestamp_from_string(val, exact=False, tz=type_.tz)
-        elif pa.types.is_date(type_):
-            return timestamp_from_string(val, exact=True).date()
-        elif pa.types.is_time(type_):
-            return timestamp_from_string(val, exact=True).time()
-
-        elif pa.types.is_integer(type_):
-            return int(float(val.strip("'").replace(",", ".")))
-        elif pa.types.is_floating(type_):
-            return float(val.strip("'").replace(",", "."))
-        elif pa.types.is_boolean(type_):
-            return val.lower().strip("'") in ("true", "1", "yes")
-        else:
-            return val.strip("'")
-
-    def _parse_part(part: str) -> pc.Expression:
-        match = SPLIT_PATTERN.search(part)
-        if not match:
-            raise ValueError(f"Invalid condition: {part}")
-
-        sign = match.group().lower().strip()
-        field, val = [p.strip() for p in SPLIT_PATTERN.split(part)]
-
-        if field not in schema.names:
-            raise ValueError(f"Unknown field: {field}")
-
-        type_ = schema.field(field).type
-        val = parse_value(val, type_)
-
-        operations = {
-            ">=": lambda f, v: pc.field(f) >= v,
-            ">": lambda f, v: pc.field(f) > v,
-            "<=": lambda f, v: pc.field(f) <= v,
-            "<": lambda f, v: pc.field(f) < v,
-            "=": lambda f, v: pc.field(f) == v,
-            "!=": lambda f, v: pc.field(f) != v,
-            "in": lambda f, v: pc.field(f).isin(v),
-            "not in": lambda f, v: ~pc.field(f).isin(v),
-            "is null": lambda f, v: pc.field(f).is_null(nan_is_null=True),
-            "is not null": lambda f, v: ~pc.field(f).is_null(nan_is_null=True),
-        }
-
-        if sign not in operations:
-            raise ValueError(f"Unsupported operation: {sign}")
-
-        return operations[sign](field, val)
-
-    parts = LOGICAL_OPERATORS_PATTERN.split(string)
-    operators = [op.lower().strip() for op in LOGICAL_OPERATORS_PATTERN.findall(string)]
-
-    if len(parts) == 1:
-        return _parse_part(parts[0])
-
-    expr = _parse_part(parts[0])
-    for part, operator in zip(parts[1:], operators):
-        if operator == "and":
-            expr = expr & _parse_part(part)
-        elif operator == "and not":
-            expr = expr & ~_parse_part(part)
-        elif operator == "or":
-            expr = expr | _parse_part(part)
-        elif operator == "or not":
-            expr = expr | ~_parse_part(part)
-        else:
-            raise ValueError(f"Unsupported logical operator: {operator}")
-
-    return expr
-
-
-def sql2polars_filter(string: str, schema: pl.Schema) -> pl.Expr:
-    """
-    Generates a filter expression for Polars based on a given string and schema.
-
-    Parameters:
-        string (str): The string containing the filter expression.
-        schema (pl.Schema): The Polars schema used to validate the filter expression.
-
-    Returns:
-        pl.Expr: The generated filter expression.
-
-    Raises:
-        ValueError: If the input string is invalid or contains unsupported operations.
-    """
-
-    def parse_value(val: str, dtype: pl.DataType) -> Any:
-        """Parse and convert value based on the field type."""
-        if isinstance(val, (tuple, list)):
-            return type(val)(parse_value(v, dtype) for v in val)
-
-        if dtype == pl.Datetime:
-            return timestamp_from_string(val, exact=False, tz=dtype.time_zone)
-        elif dtype == pl.Date:
-            return timestamp_from_string(val, exact=True).date()
-        elif dtype == pl.Time:
-            return timestamp_from_string(val, exact=True).time()
-        elif dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64):
-            return int(float(val.strip("'").replace(",", ".")))
-        elif dtype in (pl.Float32, pl.Float64):
-            return float(val.strip("'").replace(",", "."))
-        elif dtype == pl.Boolean:
-            return val.lower().strip("'") in ("true", "1", "yes")
-        else:
-            return val.strip("'")
-
-    def _parse_part(part: str) -> pl.Expr:
-        match = SPLIT_PATTERN.search(part)
-        if not match:
-            raise ValueError(f"Invalid condition: {part}")
-
-        sign = match.group().lower().strip()
-        field, val = [p.strip() for p in SPLIT_PATTERN.split(part)]
-
-        if field not in schema.names():
-            raise ValueError(f"Unknown field: {field}")
-
-        dtype = schema[field]
-        val = parse_value(val, dtype)
-
-        operations = {
-            ">=": lambda f, v: pl.col(f) >= v,
-            ">": lambda f, v: pl.col(f) > v,
-            "<=": lambda f, v: pl.col(f) <= v,
-            "<": lambda f, v: pl.col(f) < v,
-            "=": lambda f, v: pl.col(f) == v,
-            "!=": lambda f, v: pl.col(f) != v,
-            "in": lambda f, v: pl.col(f).is_in(v),
-            "not in": lambda f, v: ~pl.col(f).is_in(v),
-            "is null": lambda f, v: pl.col(f).is_null(),
-            "is not null": lambda f, v: pl.col(f).is_not_null(),
-        }
-
-        if sign not in operations:
-            raise ValueError(f"Unsupported operation: {sign}")
-
-        return operations[sign](field, val)
-
-    parts = LOGICAL_OPERATORS_PATTERN.split(string)
-    operators = [op.lower().strip() for op in LOGICAL_OPERATORS_PATTERN.findall(string)]
-
-    if len(parts) == 1:
-        return _parse_part(parts[0])
-
-    expr = _parse_part(parts[0])
-    for part, operator in zip(parts[1:], operators):
-        if operator == "and":
-            expr = expr & _parse_part(part)
-        elif operator == "and not":
-            expr = expr & ~_parse_part(part)
-        elif operator == "or":
-            expr = expr | _parse_part(part)
-        elif operator == "or not":
-            expr = expr | ~_parse_part(part)
-        else:
-            raise ValueError(f"Unsupported logical operator: {operator}")
-
-    return expr
-
-
-def get_table_names(sql_query):
-    return [table.name for table in parse_one(sql_query).find_all(exp.Table)]
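
For context on this removal: the two helpers above turn a small SQL-like predicate string into an engine-native filter expression. A minimal usage sketch, assuming the pre-0.20.0 module path shown in the file list, that package's own dependencies (sqlglot, pyarrow, polars) are installed, and a polars version where DataFrame.schema is a pl.Schema:

```python
# Hedged sketch, not part of the diff: calling the removed helpers directly.
import polars as pl
import pyarrow as pa

from flowerpower.plugins.io.helpers.sql import sql2polars_filter, sql2pyarrow_filter

# PyArrow: predicate string + schema -> pyarrow.compute.Expression,
# usable as dataset.to_table(filter=...).
arrow_schema = pa.schema([("age", pa.int64()), ("name", pa.string())])
arrow_filter = sql2pyarrow_filter("age >= 18 and name != 'bob'", arrow_schema)

# Polars: same predicate string against a pl.Schema, applied with .filter().
df = pl.DataFrame({"age": [17, 30], "name": ["ann", "carol"]})
adults = df.filter(sql2polars_filter("age >= 18 and name != 'bob'", df.schema))
```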

flowerpower/plugins/io/loader/__init__.py
@@ -1,28 +0,0 @@
-from .csv import CSVDatasetReader, CSVFileReader
-from .deltatable import DeltaTableReader
-from .duckdb import DuckDBReader
-from .json import JsonDatasetReader, JsonFileReader
-from .mssql import MSSQLReader
-from .mysql import MySQLReader
-from .oracle import OracleDBReader
-from .parquet import ParquetDatasetReader, ParquetFileReader
-from .postgres import PostgreSQLReader
-from .pydala import PydalaDatasetReader
-from .sqlite import SQLiteReader
-
-__all__ = [
-    "CSVFileReader",
-    "CSVDatasetReader",
-    "DeltaTableReader",
-    "DuckDBReader",
-    "JsonFileReader",
-    "JsonDatasetReader",
-    "MSSQLReader",
-    "MySQLReader",
-    "OracleDBReader",
-    "ParquetFileReader",
-    "ParquetDatasetReader",
-    "PostgreSQLReader",
-    "PydalaDatasetReader",
-    "SQLiteReader",
-]

flowerpower/plugins/io/loader/csv.py
@@ -1,37 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatasetReader, BaseFileReader
-
-
-# @attrs.define
-class CSVFileReader(BaseFileReader, gc=False):
-    """CSV file loader.
-
-    This class is responsible for loading CSV files into several dataframe formats,
-    duckdb and datafusion.
-
-    Examples:
-        ```python
-        loader = CSVFileReader("data.csv")
-        df = loader.to_pandas()
-        ```
-    """
-
-    format: str = field(default="csv")
-
-
-# @attrs.define
-class CSVDatasetReader(BaseDatasetReader, gc=False):
-    """CSV dataset loader.
-
-    This class is responsible for loading CSV files into several dataframe formats,
-    duckdb and datafusion.
-
-    Examples:
-        ```python
-        loader = CSVDatasetReader("csv_data/")
-        df = loader.to_pandas()
-        ```
-    """
-
-    format: str = field(default="csv")

flowerpower/plugins/io/loader/deltatable.py
@@ -1,190 +0,0 @@
-# import datetime as dt
-
-
-import datetime
-
-import pyarrow as pa
-import pyarrow.dataset as pds
-from deltalake import DeltaTable, table
-from deltalake.exceptions import TableNotFoundError
-from deltalake.transaction import CommitProperties, PostCommitHookProperties
-from deltalake.writer import WriterProperties
-from loguru import logger
-from msgspec import field
-from sherlock import RedisLock
-
-from ..base import BaseDatasetReader
-from ..metadata import (get_dataframe_metadata, get_delta_metadata,
-                        get_pyarrow_dataset_metadata)
-
-
-# @attrs.define
-class DeltaTableReader(BaseDatasetReader, gc=False):
-    """Delta table loader.
-
-    This class is responsible for loading Delta tables into several dataframe formats,
-    duckdb and datafusion.
-
-    """
-
-    delta_table: DeltaTable | None = None
-    with_lock: bool = False
-    redis: str | None = None
-    format: str = field(default="delta")
-
-    def __post_init__(self):
-        super().__post_init__()
-
-        self._init_dt()
-        if self.with_lock and self.redis is None:
-            raise ValueError("Redis connection is required when using locks.")
-
-    def _init_dt(self):
-        try:
-            self.delta_table = DeltaTable(
-                self._base_path,
-                storage_options=self.storage_options.to_object_store_kwargs(),
-            )
-        except TableNotFoundError:
-            logger.warning(f"Table {self._base_path} not found.")
-            self.delta_table = None
-
-    @property
-    def dt(self) -> DeltaTable:
-        return self.delta_table
-
-    def _load(self, reload: bool = False):
-        self.to_pyarrow_table(reload=reload)
-
-    def to_pyarrow_dataset(
-        self, metadata: bool = False, reload: bool = False
-    ) -> pds.Dataset | tuple[pds.Dataset, dict[str, any]]:
-        """Converts the DeltaTable to a PyArrow Dataset.
-
-        Args:
-            metadata (bool, optional): Whether to include metadata. Defaults to False.
-            reload (bool, optional): Whether to reload the dataset. Defaults to False.
-
-        Returns:
-            pds.Dataset | tuple[pds.Dataset, dict[str, any]]: PyArrow Dataset or tuple of PyArrow Dataset and metadata.
-        """
-        if self.delta_table is None:
-            self._init_dt()
-            if self.delta_table is None:
-                return None
-
-        if reload or not hasattr(self, "_dataset"):
-            self._dataset = self.delta_table.to_pyarrow_dataset()
-        if metadata:
-            metadata = get_pyarrow_dataset_metadata(
-                self._dataset, self._base_path, "parquet"
-            )
-            return self._dataset, metadata
-        return self._dataset
-
-    def to_pyarrow_table(
-        self, metadata: bool = False, reload: bool = False
-    ) -> pa.Table | tuple[pa.Table, dict[str, any]]:
-        """Converts the DeltaTable to a PyArrow Table.
-
-        Args:
-            metadata (bool, optional): Whether to include metadata. Defaults to False.
-            reload (bool, optional): Whether to reload the table. Defaults to False.
-
-        Returns:
-            pa.Table | tuple[pa.Table, dict[str, any]]: PyArrow Table or tuple of PyArrow Table and metadata.
-        """
-        if self.delta_table is None:
-            self._init_dt()
-            if self.delta_table is None:
-                return None
-
-        if reload or not hasattr(self, "_data"):
-            self._data = self.delta_table.to_pyarrow_table()
-        if metadata:
-            metadata = get_dataframe_metadata(table, self._base_path, "parquet")
-            return self._data, metadata
-        return self._data
-
-    def compact(
-        self,
-        partition_filters: list[tuple[str, str, any]] | None = None,
-        target_size: int = None,
-        max_concurrent_tasks: int = None,
-        min_commit_interval: int | datetime.timedelta | None = None,
-        writer_properties: WriterProperties = None,
-        custom_metadata: dict[str, str] | None = None,
-        post_commithook_properties: PostCommitHookProperties | None = None,
-        commit_properties: CommitProperties | None = None,
-    ) -> dict[str, any]:
-        def _compact():
-            self.delta_table.compact(
-                partition_filters=partition_filters,
-                target_size=target_size,
-                max_concurrent_tasks=max_concurrent_tasks,
-                min_commit_interval=min_commit_interval,
-                writer_properties=writer_properties,
-                custom_metadata=custom_metadata,
-                post_commithook_properties=post_commithook_properties,
-                commit_properties=commit_properties,
-            )
-
-        if self.with_lock:
-            with RedisLock(
-                lock_name=self._base_path,
-                namespace="flowerpower",
-                client=self.redis,
-                expire=10,
-                timeout=5,
-                retry_interval=0.1,
-            ):
-                _compact()
-        else:
-            _compact()
-
-    def z_order(
-        self,
-        columns: list[str],
-        partition_filters: list[tuple[str, str, any]] | None = None,
-        target_size: int = None,
-        max_concurrent_tasks: int = None,
-        min_commit_interval: int | datetime.timedelta | None = None,
-        writer_properties: WriterProperties = None,
-        custom_metadata: dict[str, str] | None = None,
-        post_commithook_properties: PostCommitHookProperties | None = None,
-        commit_properties: CommitProperties | None = None,
-    ) -> dict[str, any]:
-        def _z_order():
-            self.delta_table.z_order(
-                columns=columns,
-                partition_filters=partition_filters,
-                target_size=target_size,
-                max_concurrent_tasks=max_concurrent_tasks,
-                min_commit_interval=min_commit_interval,
-                writer_properties=writer_properties,
-                custom_metadata=custom_metadata,
-                post_commithook_properties=post_commithook_properties,
-                commit_properties=commit_properties,
-            )
-
-        if self.with_lock:
-            with RedisLock(
-                lock_name=self._base_path,
-                namespace="flowerpower",
-                client=self.redis,
-                expire=10,
-                timeout=5,
-                retry_interval=0.1,
-            ):
-                _z_order()
-        else:
-            _z_order()
-
-    @property
-    def metadata(self) -> dict:
-        if not hasattr(self, "_metadata"):
-            self._metadata = get_delta_metadata(self.delta_table, self._base_path)
-        return self._metadata
-        if not hasattr(self, "_metadata"):
-            self._metadata = get_delta_metadata(self.delta_table, self._base_path)
-        return self._metadata

flowerpower/plugins/io/loader/duckdb.py
@@ -1,19 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatabaseReader
-
-
-# @attrs.define
-class DuckDBReader(BaseDatabaseReader, gc=False):
-    """DuckDB loader.
-
-    This class is responsible for loading dataframes from DuckDB database.
-
-    Examples:
-        ```python
-        loader = DuckDBReader(table_name="table", path="data.db")
-        df = loader.to_polars("SELECT * FROM table WHERE column = 'value'")
-        ```
-    """
-
-    type_: str = field(default="duckdb")

flowerpower/plugins/io/loader/json.py
@@ -1,37 +0,0 @@
-from msgspec import field
-
-from ..base import BaseFileReader
-
-
-# @attrs.define
-class JsonFileReader(BaseFileReader, gc=False):
-    """
-    JSON file loader.
-
-    This class is responsible for loading dataframes from JSON files.
-
-    Examples:
-        ```python
-        loader = JsonFileReader("data.json")
-        df = loader.load()
-        ```
-    """
-
-    format: str = field(default="json")
-
-
-# @attrs.define
-class JsonDatasetReader(BaseFileReader, gc=False):
-    """
-    JSON dataset loader.
-
-    This class is responsible for loading dataframes from JSON dataset.
-
-    Examples:
-        ```python
-        loader = JsonDatasetReader("json_data/")
-        df = loader.load()
-        ```
-    """
-
-    format: str = field(default="json")

flowerpower/plugins/io/loader/mqtt.py
@@ -1,159 +0,0 @@
-from typing import Any
-
-import datafusion
-import duckdb
-import msgspec
-import orjson
-import pandas as pd
-import polars as pl
-import pyarrow as pa
-import pyarrow.dataset as pds
-
-from ..helpers.sql import sql2polars_filter
-from ..metadata import get_dataframe_metadata, get_duckdb_metadata
-
-
-class PayloadReader(msgspec.Struct):
-    payload: bytes | dict[str, Any]
-    topic: str | None = None
-    conn: duckdb.DuckDBPyConnection | None = None
-    ctx: datafusion.SessionContext | None = None
-    format: str = "mqtt"
-
-    def __post_init__(self):
-        if isinstance(self.payload, bytes):
-            self.payload = orjson.loads(self.payload)
-
-        self._metadata = {
-            "format": self.format,
-            "timestamp": pd.Timestamp.now(),
-            "topic": self.topic,
-        }
-
-    def to_pyarrow_table(
-        self, metadata: bool = False
-    ) -> pa.Table | tuple[pa.Table, dict[str, Any]]:
-        try:
-            df = pa.Table.from_pydict(self.payload)
-        except pa.ArrowInvalid:
-            df = pa.Table.from_pylist([self.payload])
-        if metadata:
-            self._metadata = get_dataframe_metadata(df, **self._metadata)
-            return df, self._metadata
-        return df
-
-    def to_pandas(
-        self, metadata: bool = False
-    ) -> pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]:
-        try:
-            df = pd.DataFrame(self.payload)
-        except ValueError:
-            df = pd.DataFrame([self.payload])
-        if metadata:
-            self._metadata = get_dataframe_metadata(df, **self._metadata)
-            return df, self._metadata
-        return df
-
-    def _to_polars_dataframe(
-        self, metadata: bool = False
-    ) -> pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]:
-        try:
-            df = pl.DataFrame(self.payload)
-        except pl.exceptions.ShapeError:
-            df = pl.DataFrame([self.payload])
-        if metadata:
-            self._metadata = get_dataframe_metadata(df, **self._metadata)
-            return df, self._metadata
-        return df
-
-    def _to_polars_lazyframe(
-        self, metadata: bool = False
-    ) -> pl.LazyFrame | tuple[pl.LazyFrame, dict[str, Any]]:
-        try:
-            df = pl.LazyFrame(self.payload)
-        except pl.exceptions.ShapeError:
-            df = pl.LazyFrame([self.payload])
-        if metadata:
-            self._metadata = get_dataframe_metadata(df, **self._metadata)
-            return df, self._metadata
-        return df
-
-    def to_polars(
-        self, lazy: bool = False, metadata: bool = False
-    ) -> (
-        pl.DataFrame | pl.LazyFrame | tuple[pl.DataFrame | pl.LazyFrame, dict[str, Any]]
-    ):
-        if lazy:
-            return self._to_polars_lazyframe(metadata=metadata)
-        else:
-            return self._to_polars_dataframe(metadata=metadata)
-
-    def to_duckdb_relation(
-        self, conn: duckdb.DuckDBPyConnection | None = None, metadata: bool = False
-    ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
-        if self.conn is None:
-            if conn is None:
-                conn = duckdb.connect()
-            self.conn = conn
-        rel = self.conn.from_arrow(self.to_pyarrow_table())
-        if metadata:
-            self._metadata = get_duckdb_metadata(rel, **self._metadata)
-            return rel, self._metadata
-        return rel
-
-    def to_pyarrow_dataset(
-        self, metadata: bool = False, **kwargs
-    ) -> pds.Dataset | tuple[pds.Dataset, dict[str, Any]]:
-        if metadata:
-            t, self._metadata = self.to_pyarrow_table(metadata=True)
-            return pds.dataset(t, **kwargs), self._metadata
-        return pds.dataset(self.to_pyarrow_table(), **kwargs)
-
-    def register_in_duckdb(
-        self,
-        conn: duckdb.DuckDBPyConnection | None = None,
-        name: str | None = None,
-    ) -> duckdb.DuckDBPyConnection:
-        if name is None:
-            name = f"mqtt:{self.topic}"
-
-        if self.conn is None:
-            if conn is None:
-                conn = duckdb.connect()
-            self.conn = conn
-
-        self.conn.register(name, self.to_pyarrow_table())
-        return self.conn
-
-    def register_in_datafusion(
-        self,
-        ctx: datafusion.SessionContext | None = None,
-        name: str | None = None,
-    ) -> None:
-        if name is None:
-            name = f"mqtt:{self.topic}"
-
-        if self.ctx is None:
-            if ctx is None:
-                ctx = datafusion.SessionContext()
-            self.ctx = ctx
-
-        self.ctx.register(name, [self.to_pyarrow_table()])
-
-        return self.ctx
-
-    def filter(self, filter_expr: str | pl.Expr) -> pl.DataFrame | pl.LazyFrame:
-        self._data = self.to_polars()
-
-        pl_schema = (
-            self._data.schema
-            if isinstance(self._data, pl.DataFrame)
-            else self._data.collect_schema()
-        )
-        filter_expr = (
-            sql2polars_filter(filter_expr, pl_schema)
-            if isinstance(filter_expr, str)
-            else filter_expr
-        )
-        return self._data.filter(filter_expr)
-        return self._data.filter(filter_expr)
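
The removed PayloadReader above is essentially a thin wrapper over orjson, pyarrow, polars, and duckdb. A rough inline equivalent of its payload handling, using only those libraries (the payload bytes and column names here are illustrative stand-ins, not taken from the diff):

```python
# Hedged sketch, not part of the diff: what the removed PayloadReader boils down to.
import duckdb
import orjson
import pyarrow as pa

payload = b'{"temperature": [21.5, 22.1], "humidity": [40, 42]}'  # stand-in MQTT payload

record = orjson.loads(payload)              # bytes -> dict, as in __post_init__
try:
    table = pa.Table.from_pydict(record)    # columnar payloads -> multi-row table
except pa.ArrowInvalid:
    table = pa.Table.from_pylist([record])  # scalar payloads -> one-row table

conn = duckdb.connect()
rel = conn.from_arrow(table)                      # mirrors to_duckdb_relation()
warm = rel.filter("temperature > 22").arrow()     # SQL-style filtering, back to pyarrow
```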

flowerpower/plugins/io/loader/mssql.py
@@ -1,26 +0,0 @@
-from msgspec import field
-
-from ..base import BaseDatabaseReader
-
-
-# @attrs.define
-class MSSQLReader(BaseDatabaseReader, gc=False):
-    """MSSQL loader.
-
-    This class is responsible for loading dataframes from MSSQL database.
-
-    Examples:
-        ```python
-        loader = MSSQLReader(table_name="table", host="localhost",
-                             port=5432, username="user", password="password",
-                             database="database")
-        df = loader.to_polars()
-
-        # or
-        loader = MSSQLReader(table_name="table",
-                             connection_string="mssql+pyodbc://user:password@localhost:5432/database")
-        df = loader.to_pyarrow_table("SELECT * FROM table WHERE column = 'value'")
-        ```
-    """
-
-    type_: str = field(default="mssql")