FlowerPower 0.11.6.20__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. flowerpower/__init__.py +2 -6
  2. flowerpower/cfg/__init__.py +7 -14
  3. flowerpower/cfg/base.py +29 -25
  4. flowerpower/cfg/pipeline/__init__.py +8 -6
  5. flowerpower/cfg/pipeline/_schedule.py +32 -0
  6. flowerpower/cfg/pipeline/adapter.py +0 -5
  7. flowerpower/cfg/pipeline/builder.py +377 -0
  8. flowerpower/cfg/pipeline/run.py +36 -0
  9. flowerpower/cfg/project/__init__.py +11 -24
  10. flowerpower/cfg/project/adapter.py +0 -12
  11. flowerpower/cli/__init__.py +2 -21
  12. flowerpower/cli/cfg.py +0 -3
  13. flowerpower/cli/mqtt.py +0 -6
  14. flowerpower/cli/pipeline.py +22 -415
  15. flowerpower/cli/utils.py +0 -1
  16. flowerpower/flowerpower.py +345 -146
  17. flowerpower/pipeline/__init__.py +2 -0
  18. flowerpower/pipeline/base.py +21 -12
  19. flowerpower/pipeline/io.py +58 -54
  20. flowerpower/pipeline/manager.py +165 -726
  21. flowerpower/pipeline/pipeline.py +643 -0
  22. flowerpower/pipeline/registry.py +285 -18
  23. flowerpower/pipeline/visualizer.py +5 -6
  24. flowerpower/plugins/io/__init__.py +8 -0
  25. flowerpower/plugins/mqtt/__init__.py +7 -11
  26. flowerpower/settings/__init__.py +0 -2
  27. flowerpower/settings/{backend.py → _backend.py} +0 -21
  28. flowerpower/settings/logging.py +1 -1
  29. flowerpower/utils/logging.py +24 -12
  30. flowerpower/utils/misc.py +17 -256
  31. flowerpower/utils/monkey.py +1 -83
  32. flowerpower-0.21.0.dist-info/METADATA +463 -0
  33. flowerpower-0.21.0.dist-info/RECORD +44 -0
  34. flowerpower/cfg/pipeline/schedule.py +0 -74
  35. flowerpower/cfg/project/job_queue.py +0 -238
  36. flowerpower/cli/job_queue.py +0 -1061
  37. flowerpower/fs/__init__.py +0 -29
  38. flowerpower/fs/base.py +0 -662
  39. flowerpower/fs/ext.py +0 -2143
  40. flowerpower/fs/storage_options.py +0 -1420
  41. flowerpower/job_queue/__init__.py +0 -294
  42. flowerpower/job_queue/apscheduler/__init__.py +0 -11
  43. flowerpower/job_queue/apscheduler/_setup/datastore.py +0 -110
  44. flowerpower/job_queue/apscheduler/_setup/eventbroker.py +0 -93
  45. flowerpower/job_queue/apscheduler/manager.py +0 -1051
  46. flowerpower/job_queue/apscheduler/setup.py +0 -554
  47. flowerpower/job_queue/apscheduler/trigger.py +0 -169
  48. flowerpower/job_queue/apscheduler/utils.py +0 -311
  49. flowerpower/job_queue/base.py +0 -413
  50. flowerpower/job_queue/rq/__init__.py +0 -10
  51. flowerpower/job_queue/rq/_trigger.py +0 -37
  52. flowerpower/job_queue/rq/concurrent_workers/gevent_worker.py +0 -226
  53. flowerpower/job_queue/rq/concurrent_workers/thread_worker.py +0 -231
  54. flowerpower/job_queue/rq/manager.py +0 -1582
  55. flowerpower/job_queue/rq/setup.py +0 -154
  56. flowerpower/job_queue/rq/utils.py +0 -69
  57. flowerpower/mqtt.py +0 -12
  58. flowerpower/pipeline/job_queue.py +0 -583
  59. flowerpower/pipeline/runner.py +0 -603
  60. flowerpower/plugins/io/base.py +0 -2520
  61. flowerpower/plugins/io/helpers/datetime.py +0 -298
  62. flowerpower/plugins/io/helpers/polars.py +0 -875
  63. flowerpower/plugins/io/helpers/pyarrow.py +0 -570
  64. flowerpower/plugins/io/helpers/sql.py +0 -202
  65. flowerpower/plugins/io/loader/__init__.py +0 -28
  66. flowerpower/plugins/io/loader/csv.py +0 -37
  67. flowerpower/plugins/io/loader/deltatable.py +0 -190
  68. flowerpower/plugins/io/loader/duckdb.py +0 -19
  69. flowerpower/plugins/io/loader/json.py +0 -37
  70. flowerpower/plugins/io/loader/mqtt.py +0 -159
  71. flowerpower/plugins/io/loader/mssql.py +0 -26
  72. flowerpower/plugins/io/loader/mysql.py +0 -26
  73. flowerpower/plugins/io/loader/oracle.py +0 -26
  74. flowerpower/plugins/io/loader/parquet.py +0 -35
  75. flowerpower/plugins/io/loader/postgres.py +0 -26
  76. flowerpower/plugins/io/loader/pydala.py +0 -19
  77. flowerpower/plugins/io/loader/sqlite.py +0 -23
  78. flowerpower/plugins/io/metadata.py +0 -244
  79. flowerpower/plugins/io/saver/__init__.py +0 -28
  80. flowerpower/plugins/io/saver/csv.py +0 -36
  81. flowerpower/plugins/io/saver/deltatable.py +0 -186
  82. flowerpower/plugins/io/saver/duckdb.py +0 -19
  83. flowerpower/plugins/io/saver/json.py +0 -36
  84. flowerpower/plugins/io/saver/mqtt.py +0 -28
  85. flowerpower/plugins/io/saver/mssql.py +0 -26
  86. flowerpower/plugins/io/saver/mysql.py +0 -26
  87. flowerpower/plugins/io/saver/oracle.py +0 -26
  88. flowerpower/plugins/io/saver/parquet.py +0 -36
  89. flowerpower/plugins/io/saver/postgres.py +0 -26
  90. flowerpower/plugins/io/saver/pydala.py +0 -20
  91. flowerpower/plugins/io/saver/sqlite.py +0 -24
  92. flowerpower/plugins/mqtt/cfg.py +0 -17
  93. flowerpower/plugins/mqtt/manager.py +0 -962
  94. flowerpower/settings/job_queue.py +0 -87
  95. flowerpower/utils/scheduler.py +0 -311
  96. flowerpower-0.11.6.20.dist-info/METADATA +0 -537
  97. flowerpower-0.11.6.20.dist-info/RECORD +0 -102
  98. {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/WHEEL +0 -0
  99. {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/entry_points.txt +0 -0
  100. {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/licenses/LICENSE +0 -0
  101. {flowerpower-0.11.6.20.dist-info → flowerpower-0.21.0.dist-info}/top_level.txt +0 -0
flowerpower/plugins/io/helpers/sql.py
@@ -1,202 +0,0 @@
- import re
- from typing import Any
-
- import pyarrow as pa
- import pyarrow.compute as pc
- from sqlglot import exp, parse_one
-
- from .datetime import timestamp_from_string
- from .polars import pl
-
- # Compile regex patterns once for efficiency
- SPLIT_PATTERN = re.compile(
-     r"<=|<|>=|>|=|!=|\s+[n,N][o,O][t,T]\s+[i,I][n,N]\s+|\s+[i,I][n,N]\s+|"
-     r"\s+[i,I][s,S]\s+[n,N][o,O][t,T]\s+[n,N][u,U][l,L]{2}\s+|\s+[i,I][s,S]\s+[n,N][u,U][l,L]{2}\s+"
- )
- LOGICAL_OPERATORS_PATTERN = re.compile(
-     r"\s+[a,A][n,N][d,D] [n,N][o,O][t,T]\s+|\s+[a,A][n,N][d,D]\s+|"
-     r"\s+[o,O][r,R] [n,N][o,O][t,T]\s+|\s+[o,O][r,R]\s+"
- )
-
-
- def sql2pyarrow_filter(string: str, schema: pa.Schema) -> pc.Expression:
-     """
-     Generates a filter expression for PyArrow based on a given string and schema.
-
-     Parameters:
-         string (str): The string containing the filter expression.
-         schema (pa.Schema): The PyArrow schema used to validate the filter expression.
-
-     Returns:
-         pc.Expression: The generated filter expression.
-
-     Raises:
-         ValueError: If the input string is invalid or contains unsupported operations.
-     """
-
-     def parse_value(val: str, type_: pa.DataType) -> Any:
-         """Parse and convert value based on the field type."""
-         if isinstance(val, (tuple, list)):
-             return type(val)(parse_value(v, type_) for v in val)
-
-         if pa.types.is_timestamp(type_):
-             return timestamp_from_string(val, exact=False, tz=type_.tz)
-         elif pa.types.is_date(type_):
-             return timestamp_from_string(val, exact=True).date()
-         elif pa.types.is_time(type_):
-             return timestamp_from_string(val, exact=True).time()
-
-         elif pa.types.is_integer(type_):
-             return int(float(val.strip("'").replace(",", ".")))
-         elif pa.types.is_floating(type_):
-             return float(val.strip("'").replace(",", "."))
-         elif pa.types.is_boolean(type_):
-             return val.lower().strip("'") in ("true", "1", "yes")
-         else:
-             return val.strip("'")
-
-     def _parse_part(part: str) -> pc.Expression:
-         match = SPLIT_PATTERN.search(part)
-         if not match:
-             raise ValueError(f"Invalid condition: {part}")
-
-         sign = match.group().lower().strip()
-         field, val = [p.strip() for p in SPLIT_PATTERN.split(part)]
-
-         if field not in schema.names:
-             raise ValueError(f"Unknown field: {field}")
-
-         type_ = schema.field(field).type
-         val = parse_value(val, type_)
-
-         operations = {
-             ">=": lambda f, v: pc.field(f) >= v,
-             ">": lambda f, v: pc.field(f) > v,
-             "<=": lambda f, v: pc.field(f) <= v,
-             "<": lambda f, v: pc.field(f) < v,
-             "=": lambda f, v: pc.field(f) == v,
-             "!=": lambda f, v: pc.field(f) != v,
-             "in": lambda f, v: pc.field(f).isin(v),
-             "not in": lambda f, v: ~pc.field(f).isin(v),
-             "is null": lambda f, v: pc.field(f).is_null(nan_is_null=True),
-             "is not null": lambda f, v: ~pc.field(f).is_null(nan_is_null=True),
-         }
-
-         if sign not in operations:
-             raise ValueError(f"Unsupported operation: {sign}")
-
-         return operations[sign](field, val)
-
-     parts = LOGICAL_OPERATORS_PATTERN.split(string)
-     operators = [op.lower().strip() for op in LOGICAL_OPERATORS_PATTERN.findall(string)]
-
-     if len(parts) == 1:
-         return _parse_part(parts[0])
-
-     expr = _parse_part(parts[0])
-     for part, operator in zip(parts[1:], operators):
-         if operator == "and":
-             expr = expr & _parse_part(part)
-         elif operator == "and not":
-             expr = expr & ~_parse_part(part)
-         elif operator == "or":
-             expr = expr | _parse_part(part)
-         elif operator == "or not":
-             expr = expr | ~_parse_part(part)
-         else:
-             raise ValueError(f"Unsupported logical operator: {operator}")
-
-     return expr
-
-
- def sql2polars_filter(string: str, schema: pl.Schema) -> pl.Expr:
-     """
-     Generates a filter expression for Polars based on a given string and schema.
-
-     Parameters:
-         string (str): The string containing the filter expression.
-         schema (pl.Schema): The Polars schema used to validate the filter expression.
-
-     Returns:
-         pl.Expr: The generated filter expression.
-
-     Raises:
-         ValueError: If the input string is invalid or contains unsupported operations.
-     """
-
-     def parse_value(val: str, dtype: pl.DataType) -> Any:
-         """Parse and convert value based on the field type."""
-         if isinstance(val, (tuple, list)):
-             return type(val)(parse_value(v, dtype) for v in val)
-
-         if dtype == pl.Datetime:
-             return timestamp_from_string(val, exact=False, tz=dtype.time_zone)
-         elif dtype == pl.Date:
-             return timestamp_from_string(val, exact=True).date()
-         elif dtype == pl.Time:
-             return timestamp_from_string(val, exact=True).time()
-         elif dtype in (pl.Int8, pl.Int16, pl.Int32, pl.Int64):
-             return int(float(val.strip("'").replace(",", ".")))
-         elif dtype in (pl.Float32, pl.Float64):
-             return float(val.strip("'").replace(",", "."))
-         elif dtype == pl.Boolean:
-             return val.lower().strip("'") in ("true", "1", "yes")
-         else:
-             return val.strip("'")
-
-     def _parse_part(part: str) -> pl.Expr:
-         match = SPLIT_PATTERN.search(part)
-         if not match:
-             raise ValueError(f"Invalid condition: {part}")
-
-         sign = match.group().lower().strip()
-         field, val = [p.strip() for p in SPLIT_PATTERN.split(part)]
-
-         if field not in schema.names():
-             raise ValueError(f"Unknown field: {field}")
-
-         dtype = schema[field]
-         val = parse_value(val, dtype)
-
-         operations = {
-             ">=": lambda f, v: pl.col(f) >= v,
-             ">": lambda f, v: pl.col(f) > v,
-             "<=": lambda f, v: pl.col(f) <= v,
-             "<": lambda f, v: pl.col(f) < v,
-             "=": lambda f, v: pl.col(f) == v,
-             "!=": lambda f, v: pl.col(f) != v,
-             "in": lambda f, v: pl.col(f).is_in(v),
-             "not in": lambda f, v: ~pl.col(f).is_in(v),
-             "is null": lambda f, v: pl.col(f).is_null(),
-             "is not null": lambda f, v: pl.col(f).is_not_null(),
-         }
-
-         if sign not in operations:
-             raise ValueError(f"Unsupported operation: {sign}")
-
-         return operations[sign](field, val)
-
-     parts = LOGICAL_OPERATORS_PATTERN.split(string)
-     operators = [op.lower().strip() for op in LOGICAL_OPERATORS_PATTERN.findall(string)]
-
-     if len(parts) == 1:
-         return _parse_part(parts[0])
-
-     expr = _parse_part(parts[0])
-     for part, operator in zip(parts[1:], operators):
-         if operator == "and":
-             expr = expr & _parse_part(part)
-         elif operator == "and not":
-             expr = expr & ~_parse_part(part)
-         elif operator == "or":
-             expr = expr | _parse_part(part)
-         elif operator == "or not":
-             expr = expr | ~_parse_part(part)
-         else:
-             raise ValueError(f"Unsupported logical operator: {operator}")
-
-     return expr
-
-
- def get_table_names(sql_query):
-     return [table.name for table in parse_one(sql_query).find_all(exp.Table)]
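The helpers removed above parsed a small SQL-like filter grammar (comparison operators plus `and`/`or`/`not in`/`is null`) into PyArrow or Polars expressions. A minimal usage sketch against the old 0.11.x API, for orientation only; the example data is invented and both functions are gone in 0.21.0:

```python
# Sketch only: how sql2polars_filter was typically called in flowerpower <= 0.11.x.
# The helper module (flowerpower/plugins/io/helpers/sql.py) no longer exists in 0.21.0.
import polars as pl

from flowerpower.plugins.io.helpers.sql import sql2polars_filter  # removed in 0.21.0

df = pl.DataFrame({"age": [25, 31, 47], "city": ["Berlin", "Paris", "Berlin"]})

# The string is split on comparison operators and the logical-operator pattern,
# and each right-hand value is coerced using the column's dtype from the schema.
expr = sql2polars_filter("age >= 30 and city = 'Berlin'", df.schema)

# Roughly equivalent hand-written expression:
# (pl.col("age") >= 30) & (pl.col("city") == "Berlin")
print(df.filter(expr))
```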
flowerpower/plugins/io/loader/__init__.py
@@ -1,28 +0,0 @@
- from .csv import CSVDatasetReader, CSVFileReader
- from .deltatable import DeltaTableReader
- from .duckdb import DuckDBReader
- from .json import JsonDatasetReader, JsonFileReader
- from .mssql import MSSQLReader
- from .mysql import MySQLReader
- from .oracle import OracleDBReader
- from .parquet import ParquetDatasetReader, ParquetFileReader
- from .postgres import PostgreSQLReader
- from .pydala import PydalaDatasetReader
- from .sqlite import SQLiteReader
-
- __all__ = [
-     "CSVFileReader",
-     "CSVDatasetReader",
-     "DeltaTableReader",
-     "DuckDBReader",
-     "JsonFileReader",
-     "JsonDatasetReader",
-     "MSSQLReader",
-     "MySQLReader",
-     "OracleDBReader",
-     "ParquetFileReader",
-     "ParquetDatasetReader",
-     "PostgreSQLReader",
-     "PydalaDatasetReader",
-     "SQLiteReader",
- ]
flowerpower/plugins/io/loader/csv.py
@@ -1,37 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatasetReader, BaseFileReader
-
-
- # @attrs.define
- class CSVFileReader(BaseFileReader, gc=False):
-     """CSV file loader.
-
-     This class is responsible for loading CSV files into several dataframe formats,
-     duckdb and datafusion.
-
-     Examples:
-         ```python
-         loader = CSVFileReader("data.csv")
-         df = loader.to_pandas()
-         ```
-     """
-
-     format: str = field(default="csv")
-
-
- # @attrs.define
- class CSVDatasetReader(BaseDatasetReader, gc=False):
-     """CSV dataset loader.
-
-     This class is responsible for loading CSV files into several dataframe formats,
-     duckdb and datafusion.
-
-     Examples:
-         ```python
-         loader = CSVDatasetReader("csv_data/")
-         df = loader.to_pandas()
-         ```
-     """
-
-     format: str = field(default="csv")
flowerpower/plugins/io/loader/deltatable.py
@@ -1,190 +0,0 @@
- # import datetime as dt
-
-
- import datetime
-
- import pyarrow as pa
- import pyarrow.dataset as pds
- from deltalake import DeltaTable, table
- from deltalake.exceptions import TableNotFoundError
- from deltalake.transaction import CommitProperties, PostCommitHookProperties
- from deltalake.writer import WriterProperties
- from loguru import logger
- from msgspec import field
- from sherlock import RedisLock
-
- from ..base import BaseDatasetReader
- from ..metadata import (get_dataframe_metadata, get_delta_metadata,
-                         get_pyarrow_dataset_metadata)
-
-
- # @attrs.define
- class DeltaTableReader(BaseDatasetReader, gc=False):
-     """Delta table loader.
-
-     This class is responsible for loading Delta tables into several dataframe formats,
-     duckdb and datafusion.
-
-     """
-
-     delta_table: DeltaTable | None = None
-     with_lock: bool = False
-     redis: str | None = None
-     format: str = field(default="delta")
-
-     def __post_init__(self):
-         super().__post_init__()
-
-         self._init_dt()
-         if self.with_lock and self.redis is None:
-             raise ValueError("Redis connection is required when using locks.")
-
-     def _init_dt(self):
-         try:
-             self.delta_table = DeltaTable(
-                 self._base_path,
-                 storage_options=self.storage_options.to_object_store_kwargs(),
-             )
-         except TableNotFoundError:
-             logger.warning(f"Table {self._base_path} not found.")
-             self.delta_table = None
-
-     @property
-     def dt(self) -> DeltaTable:
-         return self.delta_table
-
-     def _load(self, reload: bool = False):
-         self.to_pyarrow_table(reload=reload)
-
-     def to_pyarrow_dataset(
-         self, metadata: bool = False, reload: bool = False
-     ) -> pds.Dataset | tuple[pds.Dataset, dict[str, any]]:
-         """Converts the DeltaTable to a PyArrow Dataset.
-
-         Args:
-             metadata (bool, optional): Whether to include metadata. Defaults to False.
-             reload (bool, optional): Whether to reload the dataset. Defaults to False.
-
-         Returns:
-             pds.Dataset | tuple[pds.Dataset, dict[str, any]]: PyArrow Dataset or tuple of PyArrow Dataset and metadata.
-         """
-         if self.delta_table is None:
-             self._init_dt()
-             if self.delta_table is None:
-                 return None
-
-         if reload or not hasattr(self, "_dataset"):
-             self._dataset = self.delta_table.to_pyarrow_dataset()
-         if metadata:
-             metadata = get_pyarrow_dataset_metadata(
-                 self._dataset, self._base_path, "parquet"
-             )
-             return self._dataset, metadata
-         return self._dataset
-
-     def to_pyarrow_table(
-         self, metadata: bool = False, reload: bool = False
-     ) -> pa.Table | tuple[pa.Table, dict[str, any]]:
-         """Converts the DeltaTable to a PyArrow Table.
-
-         Args:
-             metadata (bool, optional): Whether to include metadata. Defaults to False.
-             reload (bool, optional): Whether to reload the table. Defaults to False.
-
-         Returns:
-             pa.Table | tuple[pa.Table, dict[str, any]]: PyArrow Table or tuple of PyArrow Table and metadata.
-         """
-         if self.delta_table is None:
-             self._init_dt()
-             if self.delta_table is None:
-                 return None
-
-         if reload or not hasattr(self, "_data"):
-             self._data = self.delta_table.to_pyarrow_table()
-         if metadata:
-             metadata = get_dataframe_metadata(table, self._base_path, "parquet")
-             return self._data, metadata
-         return self._data
-
-     def compact(
-         self,
-         partition_filters: list[tuple[str, str, any]] | None = None,
-         target_size: int = None,
-         max_concurrent_tasks: int = None,
-         min_commit_interval: int | datetime.timedelta | None = None,
-         writer_properties: WriterProperties = None,
-         custom_metadata: dict[str, str] | None = None,
-         post_commithook_properties: PostCommitHookProperties | None = None,
-         commit_properties: CommitProperties | None = None,
-     ) -> dict[str, any]:
-         def _compact():
-             self.delta_table.compact(
-                 partition_filters=partition_filters,
-                 target_size=target_size,
-                 max_concurrent_tasks=max_concurrent_tasks,
-                 min_commit_interval=min_commit_interval,
-                 writer_properties=writer_properties,
-                 custom_metadata=custom_metadata,
-                 post_commithook_properties=post_commithook_properties,
-                 commit_properties=commit_properties,
-             )
-
-         if self.with_lock:
-             with RedisLock(
-                 lock_name=self._base_path,
-                 namespace="flowerpower",
-                 client=self.redis,
-                 expire=10,
-                 timeout=5,
-                 retry_interval=0.1,
-             ):
-                 _compact()
-         else:
-             _compact()
-
-     def z_order(
-         self,
-         columns: list[str],
-         partition_filters: list[tuple[str, str, any]] | None = None,
-         target_size: int = None,
-         max_concurrent_tasks: int = None,
-         min_commit_interval: int | datetime.timedelta | None = None,
-         writer_properties: WriterProperties = None,
-         custom_metadata: dict[str, str] | None = None,
-         post_commithook_properties: PostCommitHookProperties | None = None,
-         commit_properties: CommitProperties | None = None,
-     ) -> dict[str, any]:
-         def _z_order():
-             self.delta_table.z_order(
-                 columns=columns,
-                 partition_filters=partition_filters,
-                 target_size=target_size,
-                 max_concurrent_tasks=max_concurrent_tasks,
-                 min_commit_interval=min_commit_interval,
-                 writer_properties=writer_properties,
-                 custom_metadata=custom_metadata,
-                 post_commithook_properties=post_commithook_properties,
-                 commit_properties=commit_properties,
-             )
-
-         if self.with_lock:
-             with RedisLock(
-                 lock_name=self._base_path,
-                 namespace="flowerpower",
-                 client=self.redis,
-                 expire=10,
-                 timeout=5,
-                 retry_interval=0.1,
-             ):
-                 _z_order()
-         else:
-             _z_order()
-
-     @property
-     def metadata(self) -> dict:
-         if not hasattr(self, "_metadata"):
-             self._metadata = get_delta_metadata(self.delta_table, self._base_path)
-         return self._metadata
-         if not hasattr(self, "_metadata"):
-             self._metadata = get_delta_metadata(self.delta_table, self._base_path)
-         return self._metadata
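For context, a hedged sketch of how the removed DeltaTableReader was typically driven. The positional path argument mirrors the docstring examples of the sibling readers; the actual constructor lives in the also-removed BaseDatasetReader, so treat the call signature and the path as assumptions:

```python
# Sketch only, against flowerpower <= 0.11.x; loader/deltatable.py is gone in 0.21.0.
from flowerpower.plugins.io.loader.deltatable import DeltaTableReader

# Constructor signature assumed from sibling readers (e.g. CSVDatasetReader("csv_data/"));
# storage options and path handling come from the removed BaseDatasetReader.
reader = DeltaTableReader("path/to/delta_table")

table = reader.to_pyarrow_table()                  # cached on the instance as _data
dataset, meta = reader.to_pyarrow_dataset(metadata=True)

# Maintenance helpers wrap deltalake's compact()/z_order(); when with_lock=True and a
# Redis client is supplied, they run inside a sherlock RedisLock named after the path.
reader.compact()
reader.z_order(columns=["event_date"])
```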
flowerpower/plugins/io/loader/duckdb.py
@@ -1,19 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatabaseReader
-
-
- # @attrs.define
- class DuckDBReader(BaseDatabaseReader, gc=False):
-     """DuckDB loader.
-
-     This class is responsible for loading dataframes from DuckDB database.
-
-     Examples:
-         ```python
-         loader = DuckDBReader(table_name="table", path="data.db")
-         df = loader.to_polars("SELECT * FROM table WHERE column = 'value'")
-         ```
-     """
-
-     type_: str = field(default="duckdb")
flowerpower/plugins/io/loader/json.py
@@ -1,37 +0,0 @@
- from msgspec import field
-
- from ..base import BaseFileReader
-
-
- # @attrs.define
- class JsonFileReader(BaseFileReader, gc=False):
-     """
-     JSON file loader.
-
-     This class is responsible for loading dataframes from JSON files.
-
-     Examples:
-         ```python
-         loader = JsonFileReader("data.json")
-         df = loader.load()
-         ```
-     """
-
-     format: str = field(default="json")
-
-
- # @attrs.define
- class JsonDatasetReader(BaseFileReader, gc=False):
-     """
-     JSON dataset loader.
-
-     This class is responsible for loading dataframes from JSON dataset.
-
-     Examples:
-         ```python
-         loader = JsonDatasetReader("json_data/")
-         df = loader.load()
-         ```
-     """
-
-     format: str = field(default="json")
flowerpower/plugins/io/loader/mqtt.py
@@ -1,159 +0,0 @@
- from typing import Any
-
- import datafusion
- import duckdb
- import msgspec
- import orjson
- import pandas as pd
- import polars as pl
- import pyarrow as pa
- import pyarrow.dataset as pds
-
- from ..helpers.sql import sql2polars_filter
- from ..metadata import get_dataframe_metadata, get_duckdb_metadata
-
-
- class PayloadReader(msgspec.Struct):
-     payload: bytes | dict[str, Any]
-     topic: str | None = None
-     conn: duckdb.DuckDBPyConnection | None = None
-     ctx: datafusion.SessionContext | None = None
-     format: str = "mqtt"
-
-     def __post_init__(self):
-         if isinstance(self.payload, bytes):
-             self.payload = orjson.loads(self.payload)
-
-         self._metadata = {
-             "format": self.format,
-             "timestamp": pd.Timestamp.now(),
-             "topic": self.topic,
-         }
-
-     def to_pyarrow_table(
-         self, metadata: bool = False
-     ) -> pa.Table | tuple[pa.Table, dict[str, Any]]:
-         try:
-             df = pa.Table.from_pydict(self.payload)
-         except pa.ArrowInvalid:
-             df = pa.Table.from_pylist([self.payload])
-         if metadata:
-             self._metadata = get_dataframe_metadata(df, **self._metadata)
-             return df, self._metadata
-         return df
-
-     def to_pandas(
-         self, metadata: bool = False
-     ) -> pd.DataFrame | tuple[pd.DataFrame, dict[str, Any]]:
-         try:
-             df = pd.DataFrame(self.payload)
-         except ValueError:
-             df = pd.DataFrame([self.payload])
-         if metadata:
-             self._metadata = get_dataframe_metadata(df, **self._metadata)
-             return df, self._metadata
-         return df
-
-     def _to_polars_dataframe(
-         self, metadata: bool = False
-     ) -> pl.DataFrame | tuple[pl.DataFrame, dict[str, Any]]:
-         try:
-             df = pl.DataFrame(self.payload)
-         except pl.exceptions.ShapeError:
-             df = pl.DataFrame([self.payload])
-         if metadata:
-             self._metadata = get_dataframe_metadata(df, **self._metadata)
-             return df, self._metadata
-         return df
-
-     def _to_polars_lazyframe(
-         self, metadata: bool = False
-     ) -> pl.LazyFrame | tuple[pl.LazyFrame, dict[str, Any]]:
-         try:
-             df = pl.LazyFrame(self.payload)
-         except pl.exceptions.ShapeError:
-             df = pl.LazyFrame([self.payload])
-         if metadata:
-             self._metadata = get_dataframe_metadata(df, **self._metadata)
-             return df, self._metadata
-         return df
-
-     def to_polars(
-         self, lazy: bool = False, metadata: bool = False
-     ) -> (
-         pl.DataFrame | pl.LazyFrame | tuple[pl.DataFrame | pl.LazyFrame, dict[str, Any]]
-     ):
-         if lazy:
-             return self._to_polars_lazyframe(metadata=metadata)
-         else:
-             return self._to_polars_dataframe(metadata=metadata)
-
-     def to_duckdb_relation(
-         self, conn: duckdb.DuckDBPyConnection | None = None, metadata: bool = False
-     ) -> duckdb.DuckDBPyRelation | tuple[duckdb.DuckDBPyRelation, dict[str, Any]]:
-         if self.conn is None:
-             if conn is None:
-                 conn = duckdb.connect()
-             self.conn = conn
-         rel = self.conn.from_arrow(self.to_pyarrow_table())
-         if metadata:
-             self._metadata = get_duckdb_metadata(rel, **self._metadata)
-             return rel, self._metadata
-         return rel
-
-     def to_pyarrow_dataset(
-         self, metadata: bool = False, **kwargs
-     ) -> pds.Dataset | tuple[pds.Dataset, dict[str, Any]]:
-         if metadata:
-             t, self._metadata = self.to_pyarrow_table(metadata=True)
-             return pds.dataset(t, **kwargs), self._metadata
-         return pds.dataset(self.to_pyarrow_table(), **kwargs)
-
-     def register_in_duckdb(
-         self,
-         conn: duckdb.DuckDBPyConnection | None = None,
-         name: str | None = None,
-     ) -> duckdb.DuckDBPyConnection:
-         if name is None:
-             name = f"mqtt:{self.topic}"
-
-         if self.conn is None:
-             if conn is None:
-                 conn = duckdb.connect()
-             self.conn = conn
-
-         self.conn.register(name, self.to_pyarrow_table())
-         return self.conn
-
-     def register_in_datafusion(
-         self,
-         ctx: datafusion.SessionContext | None = None,
-         name: str | None = None,
-     ) -> None:
-         if name is None:
-             name = f"mqtt:{self.topic}"
-
-         if self.ctx is None:
-             if ctx is None:
-                 ctx = datafusion.SessionContext()
-             self.ctx = ctx
-
-         self.ctx.register(name, [self.to_pyarrow_table()])
-
-         return self.ctx
-
-     def filter(self, filter_expr: str | pl.Expr) -> pl.DataFrame | pl.LazyFrame:
-         self._data = self.to_polars()
-
-         pl_schema = (
-             self._data.schema
-             if isinstance(self._data, pl.DataFrame)
-             else self._data.collect_schema()
-         )
-         filter_expr = (
-             sql2polars_filter(filter_expr, pl_schema)
-             if isinstance(filter_expr, str)
-             else filter_expr
-         )
-         return self._data.filter(filter_expr)
-         return self._data.filter(filter_expr)
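The removed PayloadReader decoded a raw MQTT payload (JSON bytes via orjson, or an already-parsed dict) and exposed it through several table APIs. A brief sketch of the old usage, with the payload values invented for illustration:

```python
# Sketch only, against flowerpower <= 0.11.x; loader/mqtt.py is gone in 0.21.0.
from flowerpower.plugins.io.loader.mqtt import PayloadReader

reader = PayloadReader(payload=b'{"sensor": "s1", "value": 21.5}', topic="home/temp")

df = reader.to_polars()      # one-row polars DataFrame (pass lazy=True for a LazyFrame)
pdf = reader.to_pandas()     # the same payload as a pandas DataFrame

# filter() accepts either a polars expression or the SQL-ish string grammar
# handled by the (also removed) sql2polars_filter helper shown earlier.
hot = reader.filter("value > 20")

# The payload can also be registered as a DuckDB view named "mqtt:<topic>".
conn = reader.register_in_duckdb()
conn.sql('SELECT * FROM "mqtt:home/temp"').show()
```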
flowerpower/plugins/io/loader/mssql.py
@@ -1,26 +0,0 @@
- from msgspec import field
-
- from ..base import BaseDatabaseReader
-
-
- # @attrs.define
- class MSSQLReader(BaseDatabaseReader, gc=False):
-     """MSSQL loader.
-
-     This class is responsible for loading dataframes from MSSQL database.
-
-     Examples:
-         ```python
-         loader = MSSQLReader(table_name="table", host="localhost",
-                              port=5432, username="user", password="password",
-                              database="database")
-         df = loader.to_polars()
-
-         # or
-         loader = MSSQLReader(table_name="table",
-                              connection_string="mssql+pyodbc://user:password@localhost:5432/database")
-         df = loader.to_pyarrow_table("SELECT * FROM table WHERE column = 'value'")
-         ```
-     """
-
-     type_: str = field(default="mssql")