feldera 0.27.0__tar.gz → 0.29.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {feldera-0.27.0 → feldera-0.29.0}/PKG-INFO +14 -3
- {feldera-0.27.0 → feldera-0.29.0}/README.md +10 -1
- feldera-0.29.0/feldera/__init__.py +11 -0
- {feldera-0.27.0 → feldera-0.29.0}/feldera/_callback_runner.py +12 -11
- {feldera-0.27.0 → feldera-0.29.0}/feldera/_helpers.py +39 -35
- {feldera-0.27.0 → feldera-0.29.0}/feldera/enums.py +1 -1
- {feldera-0.27.0 → feldera-0.29.0}/feldera/output_handler.py +11 -4
- {feldera-0.27.0 → feldera-0.29.0}/feldera/pipeline.py +111 -24
- {feldera-0.27.0 → feldera-0.29.0}/feldera/pipeline_builder.py +15 -4
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/__init__.py +1 -1
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/_httprequests.py +69 -52
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/config.py +5 -5
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/errors.py +14 -11
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/feldera_client.py +172 -38
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/pipeline.py +18 -10
- feldera-0.29.0/feldera/rest/sql_table.py +23 -0
- feldera-0.29.0/feldera/rest/sql_view.py +23 -0
- {feldera-0.27.0 → feldera-0.29.0}/feldera/runtime_config.py +11 -12
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/PKG-INFO +14 -3
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/SOURCES.txt +1 -0
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/requires.txt +2 -0
- {feldera-0.27.0 → feldera-0.29.0}/pyproject.toml +4 -2
- {feldera-0.27.0 → feldera-0.29.0}/tests/test_pipeline.py +82 -5
- {feldera-0.27.0 → feldera-0.29.0}/tests/test_pipeline_builder.py +181 -78
- feldera-0.29.0/tests/test_udf.py +312 -0
- {feldera-0.27.0 → feldera-0.29.0}/tests/test_variant.py +23 -23
- feldera-0.27.0/feldera/__init__.py +0 -3
- feldera-0.27.0/feldera/rest/sql_table.py +0 -17
- feldera-0.27.0/feldera/rest/sql_view.py +0 -17
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/dependency_links.txt +0 -0
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/top_level.txt +0 -0
- {feldera-0.27.0 → feldera-0.29.0}/setup.cfg +0 -0
{feldera-0.27.0 → feldera-0.29.0}/PKG-INFO

````diff
@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: feldera
-Version: 0.27.0
+Version: 0.29.0
 Summary: The feldera python client
 Author-email: Abhinav <abhinav.gyawali@feldera.com>
 License: MIT
 Project-URL: Homepage, https://www.feldera.com
-Project-URL: Documentation, https://docs.feldera.com
+Project-URL: Documentation, https://docs.feldera.com/python
 Project-URL: Repository, https://github.com/feldera/feldera
 Project-URL: Issues, https://github.com/feldera/feldera/issues
 Keywords: feldera,python
@@ -19,6 +19,8 @@ Requires-Dist: requests
 Requires-Dist: pandas
 Requires-Dist: typing-extensions
 Requires-Dist: numpy<2
+Requires-Dist: pretty-errors
+Requires-Dist: ruff>=0.6.9
 
 # Feldera Python SDK
 
@@ -44,6 +46,15 @@ $ pip install git+https://github.com/feldera/feldera@{BRANCH_NAME}#subdirectory=
 
 Replace `{BRANCH_NAME}` with the name of the branch you want to install from.
 
+### Installing from Local Directory
+
+If you have cloned the Feldera repo, you can install the python SDK as follows:
+
+```bash
+# the Feldera Python SDK is present inside the python/ directory
+pip install python/
+```
+
 Checkout the docs [here](./feldera/__init__.py) for an example on how to use the SDK.
 
 ## Documentation
@@ -90,5 +101,5 @@ To run the aggregate tests use:
 
 ```bash
 cd python
-PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/
+PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/main.py
 ```
````
{feldera-0.27.0 → feldera-0.29.0}/README.md

````diff
@@ -22,6 +22,15 @@ $ pip install git+https://github.com/feldera/feldera@{BRANCH_NAME}#subdirectory=
 
 Replace `{BRANCH_NAME}` with the name of the branch you want to install from.
 
+### Installing from Local Directory
+
+If you have cloned the Feldera repo, you can install the python SDK as follows:
+
+```bash
+# the Feldera Python SDK is present inside the python/ directory
+pip install python/
+```
+
 Checkout the docs [here](./feldera/__init__.py) for an example on how to use the SDK.
 
 ## Documentation
@@ -68,5 +77,5 @@ To run the aggregate tests use:
 
 ```bash
 cd python
-PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/
+PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/main.py
 ```
````
feldera-0.29.0/feldera/__init__.py (new file)

```diff
@@ -0,0 +1,11 @@
+from feldera.rest.feldera_client import FelderaClient as FelderaClient
+from feldera.pipeline import Pipeline as Pipeline
+from feldera.pipeline_builder import PipelineBuilder as PipelineBuilder
+
+import pretty_errors
+
+pretty_errors.configure(
+    line_number_first=True,
+)
+
+pretty_errors.activate()
```
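With the new `__init__.py`, the three main entry points are importable straight from the `feldera` package (and importing the SDK now activates `pretty_errors`). A minimal sketch; the URL and pipeline name are assumptions, not part of the diff:

```python
from feldera import FelderaClient, Pipeline, PipelineBuilder

# Assumes a Feldera instance listening locally; adjust the URL as needed.
client = FelderaClient("http://localhost:8080")

# Pipeline.get (shown later in this diff) fetches an existing pipeline by name.
pipeline = Pipeline.get("example-pipeline", client)
```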
{feldera-0.27.0 → feldera-0.29.0}/feldera/_callback_runner.py

```diff
@@ -15,12 +15,12 @@ class _CallbackRunnerInstruction(Enum):
 
 class CallbackRunner(Thread):
     def __init__(
-        …
+        self,
+        client: FelderaClient,
+        pipeline_name: str,
+        view_name: str,
+        callback: Callable[[pd.DataFrame, int], None],
+        queue: Optional[Queue],
     ):
         super().__init__()
         self.daemon = True
@@ -49,7 +49,9 @@ class CallbackRunner(Thread):
            break
 
        if self.schema is None:
-            raise ValueError(f"Table or View {self.view_name} not found in the pipeline schema.")
+            raise ValueError(
+                f"Table or View {self.view_name} not found in the pipeline schema."
+            )
 
        # by default, we assume that the pipeline has been started
        ack: _CallbackRunnerInstruction = _CallbackRunnerInstruction.PipelineStarted
@@ -60,12 +62,12 @@ class CallbackRunner(Thread):
        ack: _CallbackRunnerInstruction = self.queue.get()
 
        match ack:
-
            # if the pipeline has actually been started, we start a listener
            case _CallbackRunnerInstruction.PipelineStarted:
-
                # listen to the pipeline
-                gen_obj = self.client.listen_to_pipeline(self.pipeline_name, self.view_name, format="json")
+                gen_obj = self.client.listen_to_pipeline(
+                    self.pipeline_name, self.view_name, format="json"
+                )
 
                # if there is a queue set up, inform the main thread that the listener has been started, and it can
                # proceed with starting the pipeline
@@ -90,7 +92,6 @@ class CallbackRunner(Thread):
 
            # if the queue has received a message
            if again_ack:
-
                match again_ack:
                    case _CallbackRunnerInstruction.RanToCompletion:
                        # stop blocking the main thread on `join` and return from this thread
```
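`CallbackRunner` is internal, but the `Callable[[pd.DataFrame, int], None]` type above is the callback shape that `Pipeline.foreach_chunk` (see the pipeline.py diff below) ultimately drives. A minimal sketch, assuming a local Feldera instance and an existing pipeline with a view `my_view`:

```python
import pandas as pd
from feldera import FelderaClient, Pipeline

client = FelderaClient("http://localhost:8080")      # assumed local instance
pipeline = Pipeline.get("example-pipeline", client)  # assumed existing pipeline

def on_chunk(df: pd.DataFrame, seq_no: int) -> None:
    # Each output chunk arrives as a DataFrame plus a sequence number.
    print(f"chunk {seq_no}: {len(df)} rows")

pipeline.foreach_chunk("my_view", on_chunk)  # register before starting
pipeline.start()
```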
{feldera-0.27.0 → feldera-0.29.0}/feldera/_helpers.py

```diff
@@ -8,37 +8,37 @@ def sql_type_to_pandas_type(sql_type: str):
     """
 
     match sql_type.upper():
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
+        case "BOOLEAN":
+            return "boolean"
+        case "TINYINT":
+            return "Int8"
+        case "SMALLINT":
+            return "Int16"
+        case "INTEGER":
+            return "Int32"
+        case "BIGINT":
+            return "Int64"
+        case "REAL":
+            return "Float32"
+        case "DOUBLE":
+            return "Float64"
+        case "DECIMAL":
             return None
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
+        case "CHAR":
+            return "str"
+        case "VARCHAR":
+            return "str"
+        case "DATE" | "TIMESTAMP":
+            return "datetime64[ns]"
+        case "TIME" | "INTERVAL":
+            return "timedelta64[ns]"
+        case "ARRAY":
             return None
-        case …
+        case "NULL":
             return None
-        case …
+        case "BINARY" | "VARBINARY":
             return None
-        case …
+        case "STRUCT" | "MAP":
             return None
 
 
@@ -65,17 +65,22 @@ def dataframe_from_response(buffer: list[list[dict]], schema: dict):
 
     decimal_col = []
 
-    for column in schema[…
-        column_name = column[…
-        …
-        …
+    for column in schema["fields"]:
+        column_name = column["name"]
+        if not column["case_sensitive"]:
+            column_name = column_name.lower()
+        column_type = column["columntype"]["type"]
+        if column_type == "DECIMAL":
             decimal_col.append(column_name)
 
         pd_schema[column_name] = sql_type_to_pandas_type(column_type)
 
     data = [
-        {**item[…
-        …
+        {**item["insert"], "insert_delete": 1}
+        if "insert" in item
+        else {**item["delete"], "insert_delete": -1}
+        for sublist in buffer
+        for item in sublist
     ]
 
     if len(decimal_col) != 0:
@@ -84,7 +89,6 @@ def dataframe_from_response(buffer: list[list[dict]], schema: dict):
                 if datum[col] is not None:
                     datum[col] = Decimal(datum[col])
 
-
     df = pd.DataFrame(data)
     df = df.astype(pd_schema)
 
@@ -97,4 +101,4 @@ def chunk_dataframe(df, chunk_size=1000):
     """
 
     for i in range(0, len(df), chunk_size):
-        yield df.iloc[i:i + chunk_size]
+        yield df.iloc[i : i + chunk_size]
```
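A small illustration of the two helpers reformatted above; both live in the private `feldera._helpers` module, so this is a sketch rather than documented API:

```python
import pandas as pd
from feldera._helpers import sql_type_to_pandas_type, chunk_dataframe

# SQL-to-pandas dtype mapping, as defined in the match statement above.
assert sql_type_to_pandas_type("BIGINT") == "Int64"
assert sql_type_to_pandas_type("VARCHAR") == "str"
assert sql_type_to_pandas_type("DECIMAL") is None  # DECIMALs are converted separately

# chunk_dataframe yields slices of at most chunk_size rows (default 1000).
df = pd.DataFrame({"x": range(2500)})
print([len(chunk) for chunk in chunk_dataframe(df)])  # [1000, 1000, 500]
```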
{feldera-0.27.0 → feldera-0.29.0}/feldera/output_handler.py

```diff
@@ -7,7 +7,13 @@ from feldera._callback_runner import CallbackRunner
 
 
 class OutputHandler:
-    def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, queue: Optional[Queue]):
+    def __init__(
+        self,
+        client: FelderaClient,
+        pipeline_name: str,
+        view_name: str,
+        queue: Optional[Queue],
+    ):
         """
         Initializes the output handler, but doesn't start it.
         To start the output handler, call the `.OutputHandler.start` method.
@@ -25,7 +31,9 @@ class OutputHandler:
             self.buffer.append(df)
 
         # sets up the callback runner
-        self.handler = CallbackRunner(self.client, self.pipeline_name, self.view_name, callback, queue)
+        self.handler = CallbackRunner(
+            self.client, self.pipeline_name, self.view_name, callback, queue
+        )
 
     def start(self):
         """
@@ -56,5 +64,4 @@ class OutputHandler:
         :param clear_buffer: Whether to clear the buffer after getting the output.
         """
 
-        return self.to_pandas(clear_buffer).to_dict(orient=…
-
+        return self.to_pandas(clear_buffer).to_dict(orient="records")
```
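`OutputHandler` is obtained via `Pipeline.listen` and buffers a view's output until it is read back. A hedged sketch: `to_pandas`/`to_dict` follow the calls visible above, `wait_for_completion` is inferred from the error message in the pipeline.py diff below, and the instance URL and names are assumptions:

```python
from feldera import FelderaClient, Pipeline

client = FelderaClient("http://localhost:8080")      # assumed local instance
pipeline = Pipeline.get("example-pipeline", client)  # assumed existing pipeline

handler = pipeline.listen("my_view")  # register the listener before starting
pipeline.start()
pipeline.wait_for_completion()        # assumes bounded input connectors

df = handler.to_pandas()              # buffered output as a DataFrame
records = handler.to_dict()           # same data as a list of dicts (orient="records")
```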
{feldera-0.27.0 → feldera-0.29.0}/feldera/pipeline.py

```diff
@@ -1,7 +1,8 @@
 import time
 import pandas
 
-from typing import List, Dict, Callable, Optional
+from typing import List, Dict, Callable, Optional, Generator, Mapping, Any
+from collections import deque
 from queue import Queue
 
 from feldera.rest.errors import FelderaAPIError
@@ -72,8 +73,12 @@ class Pipeline:
         ensure_dataframe_has_columns(df)
 
         pipeline = self.client.get_pipeline(self.name)
-        if table_name.lower() != "now" and table_name.lower() not in [
-        …
+        if table_name.lower() != "now" and table_name.lower() not in [
+            tbl.name.lower() for tbl in pipeline.tables
+        ]:
+            raise ValueError(
+                f"Cannot push to table '{table_name}' as it is not registered yet"
+            )
         else:
             # consider validating the schema here
             for datum in chunk_dataframe(df):
@@ -81,15 +86,21 @@ class Pipeline:
                     self.name,
                     table_name,
                     "json",
-                    datum.to_json(orient=…
-                    json_flavor=…
+                    datum.to_json(orient="records", date_format="epoch"),
+                    json_flavor="pandas",
                     array=True,
                     serialize=False,
                     force=force,
                 )
         return
 
-    def input_json(self, table_name: str, data: Dict | list, update_format: str = "raw", force: bool = False):
+    def input_json(
+        self,
+        table_name: str,
+        data: Dict | list,
+        update_format: str = "raw",
+        force: bool = False,
+    ):
         """
         Push this JSON data to the specified table of the pipeline.
 
@@ -112,7 +123,7 @@ class Pipeline:
             data,
             update_format=update_format,
             array=array,
-            force=force
+            force=force,
         )
 
     def listen(self, view_name: str) -> OutputHandler:
@@ -134,7 +145,9 @@ class Pipeline:
 
         return handler
 
-    def foreach_chunk(self, view_name: str, callback: Callable[[pandas.DataFrame, int], None]):
+    def foreach_chunk(
+        self, view_name: str, callback: Callable[[pandas.DataFrame, int], None]
+    ):
         """
         Run the given callback on each chunk of the output of the specified view.
 
@@ -190,11 +203,15 @@ class Pipeline:
             raise RuntimeError("Pipeline must be running to wait for completion")
 
         while True:
-            metrics: dict = self.client.get_pipeline_stats(self.name).get("global_metrics")
+            metrics: dict = self.client.get_pipeline_stats(self.name).get(
+                "global_metrics"
+            )
             pipeline_complete: bool = metrics.get("pipeline_complete")
 
             if pipeline_complete is None:
-                raise RuntimeError(…
+                raise RuntimeError(
+                    "received unknown metrics from the pipeline, pipeline_complete is None"
+                )
 
             if pipeline_complete:
                 break
@@ -215,7 +232,9 @@ class Pipeline:
 
         status = self.status()
         if status != PipelineStatus.SHUTDOWN:
-            raise RuntimeError(…
+            raise RuntimeError(
+                f"pipeline {self.name} in state: {str(status.name)} cannot be started"
+            )
 
         self.pause()
         self.__setup_output_listeners()
@@ -230,10 +249,10 @@ class Pipeline:
         self.start()
 
     def wait_for_idle(
-        …
+        self,
+        idle_interval_s: float = 5.0,
+        timeout_s: float = 600.0,
+        poll_interval_s: float = 0.2,
     ):
         """
         Wait for the pipeline to become idle and then returns.
@@ -253,12 +272,18 @@ class Pipeline:
        reached.
        """
        if idle_interval_s > timeout_s:
-            raise ValueError(…
+            raise ValueError(
+                f"idle interval ({idle_interval_s}s) cannot be larger than timeout ({timeout_s}s)"
+            )
        if poll_interval_s > timeout_s:
-            raise ValueError(…
+            raise ValueError(
+                f"poll interval ({poll_interval_s}s) cannot be larger than timeout ({timeout_s}s)"
+            )
        if poll_interval_s > idle_interval_s:
-            raise ValueError(
-            …
+            raise ValueError(
+                f"poll interval ({poll_interval_s}s) cannot be larger "
+                f"than idle interval ({idle_interval_s}s)"
+            )
 
        start_time_s = time.monotonic()
        idle_started_s = None
@@ -267,16 +292,24 @@ class Pipeline:
            now_s = time.monotonic()
 
            # Metrics retrieval
-            metrics: dict = self.client.get_pipeline_stats(self.name).get(…
+            metrics: dict = self.client.get_pipeline_stats(self.name).get(
+                "global_metrics"
+            )
            total_input_records: int | None = metrics.get("total_input_records")
            total_processed_records: int | None = metrics.get("total_processed_records")
            if total_input_records is None:
-                raise RuntimeError(…
+                raise RuntimeError(
+                    "total_input_records is missing from the pipeline metrics"
+                )
            if total_processed_records is None:
-                raise RuntimeError(…
+                raise RuntimeError(
+                    "total_processed_records is missing from the pipeline metrics"
+                )
 
            # Idle check
-            unchanged = …
+            unchanged = (
+                prev[0] == total_input_records and prev[1] == total_processed_records
+            )
            equal = total_input_records == total_processed_records
            prev = (total_input_records, total_processed_records)
            if unchanged and equal:
@@ -328,7 +361,7 @@ class Pipeline:
         self.client.delete_pipeline(self.name)
 
     @staticmethod
-    def get(name: str, client: FelderaClient) -> …
+    def get(name: str, client: FelderaClient) -> "Pipeline":
         """
         Get the pipeline if it exists.
 
@@ -344,3 +377,57 @@ class Pipeline:
         except FelderaAPIError as err:
             if err.status_code == 404:
                 raise RuntimeError(f"Pipeline with name {name} not found")
+
+    def query(self, query: str) -> Generator[Mapping[str, Any], None, None]:
+        """
+        Executes an ad-hoc SQL query on this pipeline and returns the result in the specified format.
+        For ``INSERT`` and ``DELETE`` queries, consider using :meth:`.execute` instead.
+
+        Important:
+            This method is lazy. It returns a generator and is not evaluated until you consume the result.
+
+        :param query: The SQL query to be executed.
+        :return: A generator that yields the rows of the result as Python dictionaries.
+        """
+
+        return self.client.query_as_json(self.name, query)
+
+    def query_parquet(self, query: str, path: str):
+        """
+        Executes an ad-hoc SQL query on this pipeline and saves the result to the specified path as a parquet file.
+        If the extension isn't `parquet`, it will be automatically appended to `path`.
+
+        :param query: The SQL query to be executed.
+        :param path: The path of the parquet file.
+        """
+
+        self.client.query_as_parquet(self.name, query, path)
+
+    def query_tabular(self, query: str) -> Generator[str, None, None]:
+        """
+        Executes a SQL query on this pipeline and returns the result as a formatted string.
+
+        Important:
+            This method is lazy. It returns a generator and is not evaluated until you consume the result.
+
+        :param query: The SQL query to be executed.
+        :return: A generator that yields a string representing the query result in a human-readable, tabular format.
+        """
+
+        return self.client.query_as_text(self.name, query)
+
+    def execute(self, query: str):
+        """
+        Executes an ad-hoc SQL query on the current pipeline, discarding its result.
+        Unlike the :meth:`.query` method which returns a generator for retrieving query results lazily,
+        this method processes the query eagerly and fully before returning.
+
+        This method is suitable for SQL operations like ``INSERT`` and ``DELETE``, where the user needs
+        confirmation of successful query execution, but does not require the query result.
+        If the query fails, an exception will be raised.
+
+        :param query: The SQL query to be executed.
+        """
+
+        gen = self.query_tabular(query)
+        deque(gen, maxlen=0)
```
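Taken together, `query`, `query_parquet`, `query_tabular`, and `execute` form the release's new ad-hoc query API. A hedged usage sketch, assuming a locally running Feldera instance and an existing, running pipeline with a table `t`:

```python
from feldera import FelderaClient, Pipeline

client = FelderaClient("http://localhost:8080")      # assumed local instance
pipeline = Pipeline.get("example-pipeline", client)  # assumed running pipeline

# Eager: runs the statement, discards the result, raises on failure.
pipeline.execute("INSERT INTO t VALUES (1, 'hello')")

# Lazy: nothing is fetched until the generator is consumed.
for row in pipeline.query("SELECT * FROM t"):
    print(row)  # each row is a Python dict

# Lazy, human-readable tabular text.
print("".join(pipeline.query_tabular("SELECT COUNT(*) AS n FROM t")))

# Writes the result to t_rows.parquet (the extension is appended if missing).
pipeline.query_parquet("SELECT * FROM t", "t_rows")
```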
{feldera-0.27.0 → feldera-0.29.0}/feldera/pipeline_builder.py

```diff
@@ -14,6 +14,8 @@ class PipelineBuilder:
     :param name: The name of the pipeline
     :param description: The description of the pipeline
     :param sql: The SQL code of the pipeline
+    :param udf_rust: Rust code for UDFs
+    :param udf_toml: Rust dependencies required by UDFs (in the TOML format)
     :param compilation_profile: The compilation profile to use
     :param runtime_config: The runtime config to use
     """
@@ -23,15 +25,18 @@ class PipelineBuilder:
         client: FelderaClient,
         name: str,
         sql: str,
+        udf_rust: str = "",
+        udf_toml: str = "",
         description: str = "",
         compilation_profile: CompilationProfile = CompilationProfile.OPTIMIZED,
         runtime_config: RuntimeConfig = RuntimeConfig(resources=Resources()),
-
     ):
         self.client: FelderaClient = client
         self.name: str | None = name
         self.description: str = description
         self.sql: str = sql
+        self.udf_rust: str = udf_rust
+        self.udf_toml: str = udf_toml
         self.compilation_profile: CompilationProfile = compilation_profile
         self.runtime_config: RuntimeConfig = runtime_config
 
@@ -52,8 +57,10 @@ class PipelineBuilder:
             self.name,
             description=self.description,
             sql=self.sql,
+            udf_rust=self.udf_rust,
+            udf_toml=self.udf_toml,
             program_config={
-                …
+                "profile": self.compilation_profile.value,
             },
             runtime_config=self.runtime_config.__dict__,
         )
@@ -85,10 +92,14 @@ class PipelineBuilder:
             self.name,
             description=self.description,
             sql=self.sql,
+            udf_rust=self.udf_rust,
+            udf_toml=self.udf_toml,
             program_config={
-                …
+                "profile": self.compilation_profile.value,
             },
-            runtime_config=dict(…
+            runtime_config=dict(
+                (k, v) for k, v in self.runtime_config.__dict__.items() if v is not None
+            ),
         )
 
         inner = self.client.create_or_update_pipeline(inner)
```