feldera 0.27.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of feldera might be problematic. Click here for more details.

@@ -0,0 +1,94 @@
1
+ Metadata-Version: 2.1
2
+ Name: feldera
3
+ Version: 0.27.0
4
+ Summary: The feldera python client
5
+ Author-email: Abhinav <abhinav.gyawali@feldera.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://www.feldera.com
8
+ Project-URL: Documentation, https://docs.feldera.com
9
+ Project-URL: Repository, https://github.com/feldera/feldera
10
+ Project-URL: Issues, https://github.com/feldera/feldera/issues
11
+ Keywords: feldera,python
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3.10
14
+ Classifier: Programming Language :: Python :: 3.11
15
+ Classifier: Operating System :: OS Independent
16
+ Requires-Python: >=3.10
17
+ Description-Content-Type: text/markdown
18
+ Requires-Dist: requests
19
+ Requires-Dist: pandas
20
+ Requires-Dist: typing-extensions
21
+ Requires-Dist: numpy<2
22
+
23
+ # Feldera Python SDK
24
+
25
+ Feldera Python is the Feldera SDK for Python developers.
26
+
27
+ ## Installation
28
+
29
+ ```bash
30
+ pip install feldera
31
+ ```
32
+
33
+ ### Installing from Github
34
+
35
+ ```bash
36
+ pip install git+https://github.com/feldera/feldera#subdirectory=python
37
+ ```
38
+
39
+ Similarly, to install from a specific branch:
40
+
41
+ ```bash
42
+ $ pip install git+https://github.com/feldera/feldera@{BRANCH_NAME}#subdirectory=python
43
+ ```
44
+
45
+ Replace `{BRANCH_NAME}` with the name of the branch you want to install from.
46
+
47
+ Check out the docs [here](./feldera/__init__.py) for an example of how to use the SDK.
48
+
49
+ ## Documentation
50
+
51
+ To build the HTML documentation:
52
+
53
+ Ensure that you have sphinx installed. If not, install it using `pip install sphinx`.
54
+
55
+ Then run the following commands:
56
+
57
+ ```bash
58
+ cd docs
59
+ sphinx-apidoc -o . ../feldera
60
+ make html
61
+ ```
62
+
63
+ To clean the build, run `make clean`.
64
+
65
+ ## Testing
66
+
67
+ To run unit tests:
68
+
69
+ ```bash
70
+ (cd python && python3 -m unittest)
71
+ ```
72
+
73
+ The following command runs end-to-end tests. You'll need a pipeline
74
+ manager running at `http://localhost:8080`. For the pipeline builder
75
+ tests, you'll also need a broker available at `localhost:9092` and
76
+ (from the pipelines) `redpanda:19092`. (To change those locations,
77
+ set the environment variables listed in `python/tests/__init__.py`.)
78
+
79
+ ```bash
80
+ (cd python/tests && python3 -m pytest .)
81
+ ```
82
+
83
+ To run tests from a specific file:
84
+
85
+ ```bash
86
+ (cd python/tests && python3 -m unittest ./tests/path-to-file.py)
87
+ ```
88
+
89
+ To run the aggregate tests use:
90
+
91
+ ```bash
92
+ cd python
93
+ PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/test_base.py
94
+ ```
@@ -0,0 +1,72 @@
1
+ # Feldera Python SDK
2
+
3
+ Feldera Python is the Feldera SDK for Python developers.
4
+
5
+ ## Installation
6
+
7
+ ```bash
8
+ pip install feldera
9
+ ```
10
+
11
+ ### Installing from Github
12
+
13
+ ```bash
14
+ pip install git+https://github.com/feldera/feldera#subdirectory=python
15
+ ```
16
+
17
+ Similarly, to install from a specific branch:
18
+
19
+ ```bash
20
+ $ pip install git+https://github.com/feldera/feldera@{BRANCH_NAME}#subdirectory=python
21
+ ```
22
+
23
+ Replace `{BRANCH_NAME}` with the name of the branch you want to install from.
24
+
25
+ Check out the docs [here](./feldera/__init__.py) for an example of how to use the SDK.
26
+
27
+ ## Documentation
28
+
29
+ To build the HTML documentation:
30
+
31
+ Ensure that you have sphinx installed. If not, install it using `pip install sphinx`.
32
+
33
+ Then run the following commands:
34
+
35
+ ```bash
36
+ cd docs
37
+ sphinx-apidoc -o . ../feldera
38
+ make html
39
+ ```
40
+
41
+ To clean the build, run `make clean`.
42
+
43
+ ## Testing
44
+
45
+ To run unit tests:
46
+
47
+ ```bash
48
+ (cd python && python3 -m unittest)
49
+ ```
50
+
51
+ The following command runs end-to-end tests. You'll need a pipeline
52
+ manager running at `http://localhost:8080`. For the pipeline builder
53
+ tests, you'll also need a broker available at `localhost:9092` and
54
+ (from the pipelines) `redpanda:19092`. (To change those locations,
55
+ set the environment variables listed in `python/tests/__init__.py`.)
56
+
57
+ ```bash
58
+ (cd python/tests && python3 -m pytest .)
59
+ ```
60
+
61
+ To run tests from a specific file:
62
+
63
+ ```bash
64
+ (cd python/tests && python3 -m unittest ./tests/path-to-file.py)
65
+ ```
66
+
67
+ To run the aggregate tests use:
68
+
69
+ ```bash
70
+ cd python
71
+ PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/test_base.py
72
+ ```
@@ -0,0 +1,3 @@
1
+ from feldera.rest.feldera_client import FelderaClient
2
+ from feldera.pipeline import Pipeline
3
+ from feldera.pipeline_builder import PipelineBuilder
@@ -0,0 +1,115 @@
1
+ from enum import Enum
2
+ from threading import Thread
3
+ from typing import Callable, Optional
4
+ from queue import Queue, Empty
5
+
6
+ import pandas as pd
7
+ from feldera import FelderaClient
8
+ from feldera._helpers import dataframe_from_response
9
+
10
+
11
+ class _CallbackRunnerInstruction(Enum):
12
+ PipelineStarted = 1
13
+ RanToCompletion = 2
14
+
15
+
16
+ class CallbackRunner(Thread):
17
+ def __init__(
18
+ self,
19
+ client: FelderaClient,
20
+ pipeline_name: str,
21
+ view_name: str,
22
+ callback: Callable[[pd.DataFrame, int], None],
23
+ queue: Optional[Queue],
24
+ ):
25
+ super().__init__()
26
+ self.daemon = True
27
+ self.client: FelderaClient = client
28
+ self.pipeline_name: str = pipeline_name
29
+ self.view_name: str = view_name
30
+ self.callback: Callable[[pd.DataFrame, int], None] = callback
31
+ self.queue: Optional[Queue] = queue
32
+ self.schema: Optional[dict] = None
33
+
34
+ def run(self):
35
+ """
36
+ The main loop of the thread. Listens for data and calls the callback function on each chunk of data received.
37
+
38
+ :meta private:
39
+ """
40
+
41
+ pipeline = self.client.get_pipeline(self.pipeline_name)
42
+ schema = pipeline.program_info["schema"]
43
+
44
+ if schema:
45
+ schemas = [relation for relation in schema["inputs"] + schema["outputs"]]
46
+ for schema in schemas:
47
+ if schema["name"] == self.view_name:
48
+ self.schema = schema
49
+ break
50
+
51
+ if self.schema is None:
52
+ raise ValueError(f"Table or View {self.view_name} not found in the pipeline schema.")
53
+
54
+ # by default, we assume that the pipeline has been started
55
+ ack: _CallbackRunnerInstruction = _CallbackRunnerInstruction.PipelineStarted
56
+
57
+ # if there is Queue, we wait for the instruction to start the pipeline
58
+ # this means that we are listening to the pipeline before running it, therefore, all data should be received
59
+ if self.queue:
60
+ ack: _CallbackRunnerInstruction = self.queue.get()
61
+
62
+ match ack:
63
+
64
+ # if the pipeline has actually been started, we start a listener
65
+ case _CallbackRunnerInstruction.PipelineStarted:
66
+
67
+ # listen to the pipeline
68
+ gen_obj = self.client.listen_to_pipeline(self.pipeline_name, self.view_name, format="json")
69
+
70
+ # if there is a queue set up, inform the main thread that the listener has been started, and it can
71
+ # proceed with starting the pipeline
72
+ if self.queue:
73
+ # stop blocking the main thread on `join` for the previous message
74
+ self.queue.task_done()
75
+
76
+ for chunk in gen_obj:
77
+ chunk: dict = chunk
78
+ data: list[dict] = chunk.get("json_data")
79
+ seq_no: int = chunk.get("sequence_number")
80
+
81
+ if data is not None:
82
+ self.callback(dataframe_from_response([data], schema), seq_no)
83
+
84
+ if self.queue:
85
+ try:
86
+ # if a non-blocking way, check if the queue has received further instructions
87
+ # this should be a RanToCompletion instruction, which means that the pipeline has been
88
+ # completed
89
+ again_ack = self.queue.get_nowait()
90
+
91
+ # if the queue has received a message
92
+ if again_ack:
93
+
94
+ match again_ack:
95
+ case _CallbackRunnerInstruction.RanToCompletion:
96
+ # stop blocking the main thread on `join` and return from this thread
97
+ self.queue.task_done()
98
+
99
+ return
100
+
101
+ case _CallbackRunnerInstruction.PipelineStarted:
102
+ # if the pipeline has been started again, which shouldn't happen,
103
+ # ignore it and continue listening, call `task_done` to avoid blocking the main
104
+ # thread on `join`
105
+ self.queue.task_done()
106
+
107
+ continue
108
+ except Empty:
109
+ # if the queue is empty, continue listening
110
+ continue
111
+
112
+ case _CallbackRunnerInstruction.RanToCompletion:
113
+ if self.queue:
114
+ self.queue.task_done()
115
+ return
@@ -0,0 +1,100 @@
1
+ import pandas as pd
2
+ from decimal import Decimal
3
+
4
+
5
def sql_type_to_pandas_type(sql_type: str):
    """
    Map a Feldera SQL type name to the corresponding pandas dtype string.

    Returns ``None`` for types that have no direct pandas dtype
    (DECIMAL, ARRAY, NULL, BINARY, VARBINARY, STRUCT, MAP) and for any
    unrecognized type name.
    """

    dtype_by_sql_type = {
        'BOOLEAN': 'boolean',
        'TINYINT': 'Int8',
        'SMALLINT': 'Int16',
        'INTEGER': 'Int32',
        'BIGINT': 'Int64',
        'REAL': 'Float32',
        'DOUBLE': 'Float64',
        'CHAR': 'str',
        'VARCHAR': 'str',
        'DATE': 'datetime64[ns]',
        'TIMESTAMP': 'datetime64[ns]',
        'TIME': 'timedelta64[ns]',
        'INTERVAL': 'timedelta64[ns]',
    }
    # Types absent from the table (DECIMAL, ARRAY, NULL, BINARY, VARBINARY,
    # STRUCT, MAP, ...) fall through to the default of None.
    return dtype_by_sql_type.get(sql_type.upper())
43
+
44
+
45
def ensure_dataframe_has_columns(df: pd.DataFrame):
    """
    Raise a ``ValueError`` if *df* appears to have no explicit column names.

    A DataFrame built without column names gets the default integer range
    ``0..n-1`` as its columns; such frames cannot be matched against the
    named columns of an input table, so they are rejected here.
    """

    default_columns = list(range(df.shape[1]))
    if list(df.columns) == default_columns:
        raise ValueError(
            """
            DataFrame has no column names set.
            Input DataFrame must have column names set and they must be consistent with the columns in the input table.
            """
        )
57
+
58
+
59
def dataframe_from_response(buffer: list[list[dict]], schema: dict):
    """
    Convert a Feldera JSON change-stream response into a pandas DataFrame.

    Each record becomes a row with an extra ``insert_delete`` column that is
    ``1`` for inserts and ``-1`` for deletes.  DECIMAL column values are
    converted to :class:`decimal.Decimal` before the dtypes derived from the
    relation schema are applied.
    """

    pd_schema = {}
    decimal_columns = []

    # Build the pandas dtype mapping from the relation schema, remembering
    # which columns carry DECIMAL values.
    for field in schema['fields']:
        name = field['name']
        sql_type = field['columntype']['type']
        if sql_type == 'DECIMAL':
            decimal_columns.append(name)
        pd_schema[name] = sql_type_to_pandas_type(sql_type)

    # Flatten the nested chunks into rows, tagging each with its change kind.
    rows = []
    for chunk in buffer:
        for record in chunk:
            if 'insert' in record:
                rows.append({**record['insert'], 'insert_delete': 1})
            else:
                rows.append({**record['delete'], 'insert_delete': -1})

    if decimal_columns:
        for row in rows:
            for name in decimal_columns:
                if row[name] is not None:
                    row[name] = Decimal(row[name])

    frame = pd.DataFrame(rows)
    return frame.astype(pd_schema)
92
+
93
+
94
def chunk_dataframe(df, chunk_size=1000):
    """
    Lazily yield consecutive slices of *df* with at most *chunk_size* rows each.
    """

    total = len(df)
    start = 0
    while start < total:
        yield df.iloc[start:start + chunk_size]
        start += chunk_size
@@ -0,0 +1,186 @@
1
+ from enum import Enum
2
+
3
+
4
class CompilationProfile(Enum):
    """
    Compilation profile to use when the program is compiled.
    """

    SERVER_DEFAULT = None
    """
    Defer to the compiler server's default profile.
    """

    DEV = "dev"
    """
    Development profile.
    """

    UNOPTIMIZED = "unoptimized"
    """
    Unoptimized profile.
    """

    OPTIMIZED = "optimized"
    """
    Optimized profile; the default used by this API.
    """
28
+
29
+
30
class BuildMode(Enum):
    """
    Strategy for obtaining a pipeline when building it.

    NOTE(review): member names suggest create-new / fetch-existing /
    fetch-or-create semantics — confirm against PipelineBuilder usage.
    """

    CREATE = 1
    GET = 2
    GET_OR_CREATE = 3
34
+
35
+
36
class PipelineStatus(Enum):
    """
    Represents the state that this pipeline is currently in.

    .. code-block:: text

        Shutdown ◄────┐
        │             │
        /deploy       │
        │             ⌛ShuttingDown
        ▼             ▲
        ⌛Provisioning │
        │             │
        Provisioned   │
        ▼             │/shutdown
        ⌛Initializing │
        │             │
        ┌──────┴──────┴──────┐
        │      ▼             │
        │    Paused          │
        │    │    ▲          │
        │/start│  │/pause    │
        │    ▼    │          │
        │    Running         │
        └─────────┬──────────┘
                  │
                  ▼
                Failed
    """

    NOT_FOUND = 1
    """
    The pipeline has not been created yet.
    """

    SHUTDOWN = 2
    """
    Pipeline has not been started or has been shut down.

    The pipeline remains in this state until the user triggers
    a deployment by invoking the `/deploy` endpoint.
    """

    PROVISIONING = 3
    """
    The runner triggered a deployment of the pipeline and is
    waiting for the pipeline HTTP server to come up.

    In this state, the runner provisions a runtime for the pipeline,
    starts the pipeline within this runtime and waits for it to start accepting HTTP requests.

    The user is unable to communicate with the pipeline during this
    time. The pipeline remains in this state until:

    1. Its HTTP server is up and running; the pipeline transitions to the
       `PipelineStatus.INITIALIZING` state.
    2. A pre-defined timeout has passed. The runner performs forced
       shutdown of the pipeline; returns to the `PipelineStatus.SHUTDOWN` state.
    3. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The manager performs forced shutdown of the pipeline, returns to the
       `PipelineStatus.SHUTDOWN` state.
    """

    INITIALIZING = 4
    """
    The pipeline is initializing its internal state and connectors.

    This state is part of the pipeline's deployment process. In this state,
    the pipeline's HTTP server is up and running, but its query engine
    and input and output connectors are still initializing.

    The pipeline remains in this state until:

    1. Initialization completes successfully; the pipeline transitions to the
       `PipelineStatus.PAUSED` state.
    2. Initialization fails; transitions to the `PipelineStatus.FAILED` state.
    3. A pre-defined timeout has passed. The runner performs forced
       shutdown of the pipeline; returns to the `PipelineStatus.SHUTDOWN` state.
    4. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The manager performs forced shutdown of the pipeline; returns to the
       `PipelineStatus.SHUTDOWN` state.
    """

    PAUSED = 5
    """
    The pipeline is fully initialized, but data processing has been paused.

    The pipeline remains in this state until:

    1. The user starts the pipeline by invoking the `/start` endpoint. The
       manager passes the request to the pipeline; transitions to the
       `PipelineStatus.RUNNING` state.
    2. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The manager passes the shutdown request to the pipeline to perform a
       graceful shutdown; transitions to the `PipelineStatus.SHUTTING_DOWN` state.
    3. An unexpected runtime error renders the pipeline `PipelineStatus.FAILED`.
    """

    RUNNING = 6
    """
    The pipeline is processing data.

    The pipeline remains in this state until:

    1. The user pauses the pipeline by invoking the `/pause` endpoint. The
       manager passes the request to the pipeline; transitions to the
       `PipelineStatus.PAUSED` state.
    2. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The runner passes the shutdown request to the pipeline to perform a
       graceful shutdown; transitions to the
       `PipelineStatus.SHUTTING_DOWN` state.
    3. An unexpected runtime error renders the pipeline
       `PipelineStatus.FAILED`.
    """

    SHUTTING_DOWN = 7
    """
    Graceful shutdown in progress.

    In this state, the pipeline finishes any ongoing data processing,
    produces final outputs, shuts down input/output connectors and
    terminates.

    The pipeline remains in this state until:

    1. Shutdown completes successfully; transitions to the `PipelineStatus.SHUTDOWN` state.
    2. A pre-defined timeout has passed. The manager performs forced shutdown of the pipeline; returns to the
       `PipelineStatus.SHUTDOWN` state.
    """

    FAILED = 8
    """
    The pipeline remains in this state until the users acknowledge the failure
    by issuing a call to shutdown the pipeline; transitions to the
    `PipelineStatus.SHUTDOWN` state.
    """

    @staticmethod
    def from_str(value):
        """
        Parse a status name (case-insensitively) into a `PipelineStatus`.

        :param value: The status name, e.g. ``"running"``.
        :raises ValueError: If the name does not match any member.
        """
        for member in PipelineStatus:
            if member.name.lower() == value.lower():
                return member
        raise ValueError(f"Unknown value '{value}' for enum {PipelineStatus.__name__}")

    def __eq__(self, other):
        """
        Value-based equality.

        Comparing against a non-enum operand now returns ``NotImplemented``
        (so ``==`` evaluates to ``False``) instead of raising
        ``AttributeError`` as the previous implementation did.
        """
        # Compare by value against any Enum so equality keeps working even if
        # the class object differs (e.g. the module was imported twice).
        if isinstance(other, Enum):
            return self.value == other.value
        return NotImplemented

    # Defining __eq__ implicitly sets __hash__ to None, which made members
    # unhashable (unusable in sets and as dict keys). Restore Enum hashing.
    __hash__ = Enum.__hash__
@@ -0,0 +1,60 @@
1
+ import pandas as pd
2
+ from typing import Optional
3
+
4
+ from queue import Queue
5
+ from feldera import FelderaClient
6
+ from feldera._callback_runner import CallbackRunner
7
+
8
+
9
class OutputHandler:
    """
    Buffers the output of a pipeline view, as delivered by a `CallbackRunner`,
    and exposes it as pandas or plain Python data structures.
    """

    def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, queue: Optional[Queue]):
        """
        Initializes the output handler, but doesn't start it.
        To start the output handler, call the `.OutputHandler.start` method.
        """

        self.client: FelderaClient = client
        self.pipeline_name: str = pipeline_name
        self.view_name: str = view_name
        self.queue: Optional[Queue] = queue
        self.buffer: list[pd.DataFrame] = []

        def on_chunk(df: pd.DataFrame, _: int):
            # Collect every non-empty chunk delivered by the runner.
            if df.empty:
                return
            self.buffer.append(df)

        # Background runner that feeds `on_chunk` with output chunks.
        self.handler = CallbackRunner(self.client, self.pipeline_name, self.view_name, on_chunk, queue)

    def start(self):
        """
        Starts the output handler in a separate thread
        """

        self.handler.start()

    def to_pandas(self, clear_buffer: bool = True):
        """
        Returns the output of the pipeline as a pandas DataFrame

        :param clear_buffer: Whether to clear the buffer after getting the output.
        """

        if not self.buffer:
            return pd.DataFrame()
        result = pd.concat(self.buffer, ignore_index=True)
        if clear_buffer:
            self.buffer.clear()
        return result

    def to_dict(self, clear_buffer: bool = True):
        """
        Returns the output of the pipeline as a list of python dictionaries

        :param clear_buffer: Whether to clear the buffer after getting the output.
        """

        return self.to_pandas(clear_buffer).to_dict(orient='records')
60
+