feldera 0.34.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of feldera might be problematic. Click here for more details.

feldera/__init__.py ADDED
@@ -0,0 +1,11 @@
1
# Public API of the `feldera` package: re-export the client, pipeline, and
# builder classes so callers can write `from feldera import FelderaClient`.
from feldera.rest.feldera_client import FelderaClient as FelderaClient
from feldera.pipeline import Pipeline as Pipeline
from feldera.pipeline_builder import PipelineBuilder as PipelineBuilder

import pretty_errors

# Prettify uncaught-exception tracebacks for interactive users; printing the
# line number first makes the failing statement easier to locate.
pretty_errors.configure(
    line_number_first=True,
)

pretty_errors.activate()
@@ -0,0 +1,116 @@
1
+ from enum import Enum
2
+ from threading import Thread
3
+ from typing import Callable, Optional
4
+ from queue import Queue, Empty
5
+
6
+ import pandas as pd
7
+ from feldera import FelderaClient
8
+ from feldera._helpers import dataframe_from_response
9
+
10
+
11
class _CallbackRunnerInstruction(Enum):
    """Control messages sent from the main thread to a `CallbackRunner` via its queue."""

    PipelineStarted = 1  # the pipeline has been (or is assumed) started; begin listening
    RanToCompletion = 2  # the pipeline finished; the runner should acknowledge and stop
14
+
15
+
16
+ class CallbackRunner(Thread):
17
+ def __init__(
18
+ self,
19
+ client: FelderaClient,
20
+ pipeline_name: str,
21
+ view_name: str,
22
+ callback: Callable[[pd.DataFrame, int], None],
23
+ queue: Optional[Queue],
24
+ ):
25
+ super().__init__()
26
+ self.daemon = True
27
+ self.client: FelderaClient = client
28
+ self.pipeline_name: str = pipeline_name
29
+ self.view_name: str = view_name
30
+ self.callback: Callable[[pd.DataFrame, int], None] = callback
31
+ self.queue: Optional[Queue] = queue
32
+ self.schema: Optional[dict] = None
33
+
34
+ def run(self):
35
+ """
36
+ The main loop of the thread. Listens for data and calls the callback function on each chunk of data received.
37
+
38
+ :meta private:
39
+ """
40
+
41
+ pipeline = self.client.get_pipeline(self.pipeline_name)
42
+ schema = pipeline.program_info["schema"]
43
+
44
+ if schema:
45
+ schemas = [relation for relation in schema["inputs"] + schema["outputs"]]
46
+ for schema in schemas:
47
+ if schema["name"] == self.view_name:
48
+ self.schema = schema
49
+ break
50
+
51
+ if self.schema is None:
52
+ raise ValueError(
53
+ f"Table or View {self.view_name} not found in the pipeline schema."
54
+ )
55
+
56
+ # by default, we assume that the pipeline has been started
57
+ ack: _CallbackRunnerInstruction = _CallbackRunnerInstruction.PipelineStarted
58
+
59
+ # if there is Queue, we wait for the instruction to start the pipeline
60
+ # this means that we are listening to the pipeline before running it, therefore, all data should be received
61
+ if self.queue:
62
+ ack: _CallbackRunnerInstruction = self.queue.get()
63
+
64
+ match ack:
65
+ # if the pipeline has actually been started, we start a listener
66
+ case _CallbackRunnerInstruction.PipelineStarted:
67
+ # listen to the pipeline
68
+ gen_obj = self.client.listen_to_pipeline(
69
+ self.pipeline_name, self.view_name, format="json"
70
+ )
71
+
72
+ # if there is a queue set up, inform the main thread that the listener has been started, and it can
73
+ # proceed with starting the pipeline
74
+ if self.queue:
75
+ # stop blocking the main thread on `join` for the previous message
76
+ self.queue.task_done()
77
+
78
+ for chunk in gen_obj:
79
+ chunk: dict = chunk
80
+ data: list[dict] = chunk.get("json_data")
81
+ seq_no: int = chunk.get("sequence_number")
82
+
83
+ if data is not None:
84
+ self.callback(dataframe_from_response([data], schema), seq_no)
85
+
86
+ if self.queue:
87
+ try:
88
+ # if a non-blocking way, check if the queue has received further instructions
89
+ # this should be a RanToCompletion instruction, which means that the pipeline has been
90
+ # completed
91
+ again_ack = self.queue.get_nowait()
92
+
93
+ # if the queue has received a message
94
+ if again_ack:
95
+ match again_ack:
96
+ case _CallbackRunnerInstruction.RanToCompletion:
97
+ # stop blocking the main thread on `join` and return from this thread
98
+ self.queue.task_done()
99
+
100
+ return
101
+
102
+ case _CallbackRunnerInstruction.PipelineStarted:
103
+ # if the pipeline has been started again, which shouldn't happen,
104
+ # ignore it and continue listening, call `task_done` to avoid blocking the main
105
+ # thread on `join`
106
+ self.queue.task_done()
107
+
108
+ continue
109
+ except Empty:
110
+ # if the queue is empty, continue listening
111
+ continue
112
+
113
+ case _CallbackRunnerInstruction.RanToCompletion:
114
+ if self.queue:
115
+ self.queue.task_done()
116
+ return
feldera/_helpers.py ADDED
@@ -0,0 +1,104 @@
1
+ import pandas as pd
2
+ from decimal import Decimal
3
+
4
+
5
def sql_type_to_pandas_type(sql_type: str):
    """
    Converts a SQL type to a pandas type.

    Returns ``None`` for SQL types without a direct pandas dtype (DECIMAL,
    ARRAY, NULL, BINARY, VARBINARY, STRUCT, MAP, and anything unrecognized);
    such columns are left untouched by the caller.
    """

    dtype_by_sql_type = {
        "BOOLEAN": "boolean",
        "TINYINT": "Int8",
        "SMALLINT": "Int16",
        "INTEGER": "Int32",
        "BIGINT": "Int64",
        "REAL": "Float32",
        "DOUBLE": "Float64",
        "CHAR": "str",
        "VARCHAR": "str",
        "DATE": "datetime64[ns]",
        "TIMESTAMP": "datetime64[ns]",
        "TIME": "timedelta64[ns]",
        "INTERVAL": "timedelta64[ns]",
    }
    # .get() yields None for unmapped types, matching the original
    # match-statement's implicit fall-through.
    return dtype_by_sql_type.get(sql_type.upper())
43
+
44
+
45
+ def ensure_dataframe_has_columns(df: pd.DataFrame):
46
+ """
47
+ Ensures that the DataFrame has column names set.
48
+ """
49
+
50
+ if [v for v in range(df.shape[1])] == list(df.columns):
51
+ raise ValueError(
52
+ """
53
+ DataFrame has no column names set.
54
+ Input DataFrame must have column names set and they must be consistent with the columns in the input table.
55
+ """
56
+ )
57
+
58
+
59
def dataframe_from_response(buffer: list[list[dict]], schema: dict):
    """
    Converts the response from Feldera to a pandas DataFrame.

    :param buffer: List of chunks; each chunk is a list of records shaped
        ``{"insert": row}`` or ``{"delete": row}``.
    :param schema: Relation schema whose ``"fields"`` list describes columns.
    """

    pd_schema = {}
    decimal_columns = []

    # Build the column -> pandas dtype mapping, remembering DECIMAL columns
    # so their raw values can be converted to `Decimal` objects.
    for field in schema["fields"]:
        name = field["name"]
        if not field["case_sensitive"]:
            name = name.lower()
        sql_type = field["columntype"]["type"]
        if sql_type == "DECIMAL":
            decimal_columns.append(name)

        pd_schema[name] = sql_type_to_pandas_type(sql_type)

    # Flatten all chunks into rows, tagging inserts with +1 and deletes with -1.
    rows = []
    for chunk in buffer:
        for record in chunk:
            if "insert" in record:
                rows.append({**record["insert"], "insert_delete": 1})
            else:
                rows.append({**record["delete"], "insert_delete": -1})

    if decimal_columns:
        for row in rows:
            for name in decimal_columns:
                if row[name] is not None:
                    row[name] = Decimal(row[name])

    frame = pd.DataFrame(rows)
    frame = frame.astype(pd_schema)

    return frame
96
+
97
+
98
def chunk_dataframe(df, chunk_size=1000):
    """
    Yield successive n-sized chunks from the given dataframe.
    """

    total_rows = len(df)
    start = 0
    while start < total_rows:
        yield df.iloc[start : start + chunk_size]
        start += chunk_size
feldera/enums.py ADDED
@@ -0,0 +1,234 @@
1
+ from enum import Enum
2
+ from typing import Optional
3
+
4
+
5
class CompilationProfile(Enum):
    """
    The compilation profile to use when compiling the program.

    NOTE(review): member values appear to be the literal strings passed to
    the compiler (``SERVER_DEFAULT`` maps to ``None``, i.e. no profile is
    specified) — confirm against the Feldera API reference.
    """

    SERVER_DEFAULT = None
    """
    The compiler server default compilation profile.
    """

    DEV = "dev"
    """
    The development compilation profile.
    """

    UNOPTIMIZED = "unoptimized"
    """
    The unoptimized compilation profile.
    """

    OPTIMIZED = "optimized"
    """
    The optimized compilation profile, the default for this API.
    """
29
+
30
+
31
class BuildMode(Enum):
    """
    How a pipeline definition is resolved against the server — create a new
    one, fetch an existing one, or fetch-if-present otherwise create.
    Presumably consumed by `PipelineBuilder`; verify against its call sites.
    """

    CREATE = 1
    GET = 2
    GET_OR_CREATE = 3
35
+
36
+
37
class PipelineStatus(Enum):
    """
    Represents the state that this pipeline is currently in.

    .. code-block:: text

        Shutdown ◄────┐
        │             │
        /deploy       │
        │             ⌛ShuttingDown
        ▼             ▲
        ⌛Provisioning │
        │             │
        Provisioned   │
        ▼             │/shutdown
        ⌛Initializing │
        │             │
        ┌────────┴─────────┴─┐
        │        ▼           │
        │      Paused        │
        │      │    ▲        │
        │/start│    │/pause  │
        │      ▼    │        │
        │      Running       │
        └──────────┬─────────┘

        Failed
    """

    NOT_FOUND = 1
    """
    The pipeline has not been created yet.
    """

    SHUTDOWN = 2
    """
    Pipeline has not been started or has been shut down.

    The pipeline remains in this state until the user triggers
    a deployment by invoking the `/deploy` endpoint.
    """

    PROVISIONING = 3
    """
    The runner triggered a deployment of the pipeline and is
    waiting for the pipeline HTTP server to come up.

    In this state, the runner provisions a runtime for the pipeline,
    starts the pipeline within this runtime and waits for it to start accepting HTTP requests.

    The user is unable to communicate with the pipeline during this
    time. The pipeline remains in this state until:

    1. Its HTTP server is up and running; the pipeline transitions to the
       `PipelineStatus.INITIALIZING` state.
    2. A pre-defined timeout has passed. The runner performs forced
       shutdown of the pipeline; returns to the `PipelineStatus.SHUTDOWN` state.
    3. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The manager performs forced shutdown of the pipeline, returns to the
       `PipelineStatus.SHUTDOWN` state.

    """

    INITIALIZING = 4
    """
    The pipeline is initializing its internal state and connectors.

    This state is part of the pipeline's deployment process. In this state,
    the pipeline's HTTP server is up and running, but its query engine
    and input and output connectors are still initializing.

    The pipeline remains in this state until:

    1. Initialization completes successfully; the pipeline transitions to the
       `PipelineStatus.PAUSED` state.
    2. Initialization fails; transitions to the `PipelineStatus.FAILED` state.
    3. A pre-defined timeout has passed. The runner performs forced
       shutdown of the pipeline; returns to the `PipelineStatus.SHUTDOWN` state.
    4. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The manager performs forced shutdown of the pipeline; returns to the
       `PipelineStatus.SHUTDOWN` state.

    """

    PAUSED = 5
    """
    The pipeline is fully initialized, but data processing has been paused.

    The pipeline remains in this state until:

    1. The user starts the pipeline by invoking the `/start` endpoint. The
       manager passes the request to the pipeline; transitions to the
       `PipelineStatus.RUNNING` state.
    2. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The manager passes the shutdown request to the pipeline to perform a
       graceful shutdown; transitions to the `PipelineStatus.SHUTTING_DOWN` state.
    3. An unexpected runtime error renders the pipeline `PipelineStatus.FAILED`.

    """

    RUNNING = 6
    """
    The pipeline is processing data.

    The pipeline remains in this state until:

    1. The user pauses the pipeline by invoking the `/pause` endpoint. The
       manager passes the request to the pipeline; transitions to the
       `PipelineStatus.PAUSED` state.
    2. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The runner passes the shutdown request to the pipeline to perform a
       graceful shutdown; transitions to the
       `PipelineStatus.SHUTTING_DOWN` state.
    3. An unexpected runtime error renders the pipeline
       `PipelineStatus.FAILED`.

    """

    SHUTTING_DOWN = 7
    """
    Graceful shutdown in progress.

    In this state, the pipeline finishes any ongoing data processing,
    produces final outputs, shuts down input/output connectors and
    terminates.

    The pipeline remains in this state until:

    1. Shutdown completes successfully; transitions to the `PipelineStatus.SHUTDOWN` state.
    2. A pre-defined timeout has passed. The manager performs forced shutdown of the pipeline; returns to the
       `PipelineStatus.SHUTDOWN` state.

    """

    FAILED = 8
    """
    The pipeline remains in this state until the users acknowledge the failure
    by issuing a call to shutdown the pipeline; transitions to the
    `PipelineStatus.SHUTDOWN` state.
    """

    UNAVAILABLE = 9
    """
    The pipeline was at least once initialized, but in the most recent status check either
    could not be reached or returned it is not yet ready.
    """

    @staticmethod
    def from_str(value):
        """
        Returns the member whose name matches *value* case-insensitively.

        :raises ValueError: if no member matches.
        """
        for member in PipelineStatus:
            if member.name.lower() == value.lower():
                return member
        raise ValueError(f"Unknown value '{value}' for enum {PipelineStatus.__name__}")

    def __eq__(self, other):
        # Value-based comparison. NOTE: this also equates members of other
        # enums that happen to share the same numeric value.
        return self.value == other.value

    # Fix: defining __eq__ implicitly sets __hash__ to None, which made
    # members unhashable (unusable in sets / as dict keys). Restore hashing
    # consistent with the value-based equality above.
    def __hash__(self):
        return hash(self._value_)
194
+
195
+
196
class ProgramStatus(Enum):
    """
    Compilation status of a pipeline's SQL program, with optional error
    details attached by `from_value`.
    """

    Pending = 1
    CompilingSql = 2
    SqlCompiled = 3
    CompilingRust = 4
    Success = 5
    SqlError = 6
    RustError = 7
    SystemError = 8

    def __init__(self, value):
        # Error details for the error variants; populated by `from_value`.
        self.error: Optional[dict] = None
        self._value_ = value

    @staticmethod
    def from_value(value):
        """
        Builds a `ProgramStatus` from its API representation: either a plain
        status string, or a single-key dict mapping the status name to error
        details.

        NOTE: the error details are stored on the singleton enum member
        itself, so a later `from_value` call for the same status overwrites
        them; read `get_error()` promptly.

        :raises ValueError: if the status name matches no member.
        """
        error = None
        if isinstance(value, dict):
            error = value
            value = list(value.keys())[0]

        for member in ProgramStatus:
            if member.name.lower() == value.lower():
                member.error = error
                return member
        raise ValueError(f"Unknown value '{value}' for enum {ProgramStatus.__name__}")

    def __eq__(self, other):
        # Value-based comparison. NOTE: this also equates members of other
        # enums that happen to share the same numeric value.
        return self.value == other.value

    # Fix: defining __eq__ implicitly sets __hash__ to None, which made
    # members unhashable (unusable in sets / as dict keys). Restore hashing
    # consistent with the value-based equality above.
    def __hash__(self):
        return hash(self._value_)

    def __str__(self):
        return self.name + (f": ({self.error})" if self.error else "")

    def get_error(self) -> Optional[dict]:
        """
        Returns the compilation error, if any.
        """

        return self.error
@@ -0,0 +1,67 @@
1
+ import pandas as pd
2
+ from typing import Optional
3
+
4
+ from queue import Queue
5
+ from feldera import FelderaClient
6
+ from feldera._callback_runner import CallbackRunner
7
+
8
+
9
class OutputHandler:
    """
    Accumulates the output of a pipeline view into an in-memory buffer of
    DataFrames, using a `CallbackRunner` thread to receive the data.
    """

    def __init__(
        self,
        client: FelderaClient,
        pipeline_name: str,
        view_name: str,
        queue: Optional[Queue],
    ):
        """
        Initializes the output handler, but doesn't start it.
        To start the output handler, call the `.OutputHandler.start` method.
        """

        self.client: FelderaClient = client
        self.pipeline_name: str = pipeline_name
        self.view_name: str = view_name
        self.queue: Optional[Queue] = queue
        self.buffer: list[pd.DataFrame] = []

        # Callback handed to the `CallbackRunner`: keep every non-empty
        # chunk; the sequence number is ignored.
        def _collect(chunk: pd.DataFrame, _seq: int):
            if not chunk.empty:
                self.buffer.append(chunk)

        # Set up (but do not start) the listener thread.
        self.handler = CallbackRunner(
            self.client, self.pipeline_name, self.view_name, _collect, queue
        )

    def start(self):
        """
        Starts the output handler in a separate thread
        """

        self.handler.start()

    def to_pandas(self, clear_buffer: bool = True):
        """
        Returns the output of the pipeline as a pandas DataFrame

        :param clear_buffer: Whether to clear the buffer after getting the output.
        """

        if not self.buffer:
            return pd.DataFrame()

        result = pd.concat(self.buffer, ignore_index=True)
        if clear_buffer:
            self.buffer.clear()

        return result

    def to_dict(self, clear_buffer: bool = True):
        """
        Returns the output of the pipeline as a list of python dictionaries

        :param clear_buffer: Whether to clear the buffer after getting the output.
        """

        return self.to_pandas(clear_buffer).to_dict(orient="records")