feldera 0.27.0__tar.gz → 0.29.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {feldera-0.27.0 → feldera-0.29.0}/PKG-INFO +14 -3
- {feldera-0.27.0 → feldera-0.29.0}/README.md +10 -1
- feldera-0.29.0/feldera/__init__.py +11 -0
- {feldera-0.27.0 → feldera-0.29.0}/feldera/_callback_runner.py +12 -11
- {feldera-0.27.0 → feldera-0.29.0}/feldera/_helpers.py +39 -35
- {feldera-0.27.0 → feldera-0.29.0}/feldera/enums.py +1 -1
- {feldera-0.27.0 → feldera-0.29.0}/feldera/output_handler.py +11 -4
- {feldera-0.27.0 → feldera-0.29.0}/feldera/pipeline.py +111 -24
- {feldera-0.27.0 → feldera-0.29.0}/feldera/pipeline_builder.py +15 -4
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/__init__.py +1 -1
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/_httprequests.py +69 -52
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/config.py +5 -5
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/errors.py +14 -11
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/feldera_client.py +172 -38
- {feldera-0.27.0 → feldera-0.29.0}/feldera/rest/pipeline.py +18 -10
- feldera-0.29.0/feldera/rest/sql_table.py +23 -0
- feldera-0.29.0/feldera/rest/sql_view.py +23 -0
- {feldera-0.27.0 → feldera-0.29.0}/feldera/runtime_config.py +11 -12
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/PKG-INFO +14 -3
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/SOURCES.txt +1 -0
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/requires.txt +2 -0
- {feldera-0.27.0 → feldera-0.29.0}/pyproject.toml +4 -2
- {feldera-0.27.0 → feldera-0.29.0}/tests/test_pipeline.py +82 -5
- {feldera-0.27.0 → feldera-0.29.0}/tests/test_pipeline_builder.py +181 -78
- feldera-0.29.0/tests/test_udf.py +312 -0
- {feldera-0.27.0 → feldera-0.29.0}/tests/test_variant.py +23 -23
- feldera-0.27.0/feldera/__init__.py +0 -3
- feldera-0.27.0/feldera/rest/sql_table.py +0 -17
- feldera-0.27.0/feldera/rest/sql_view.py +0 -17
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/dependency_links.txt +0 -0
- {feldera-0.27.0 → feldera-0.29.0}/feldera.egg-info/top_level.txt +0 -0
- {feldera-0.27.0 → feldera-0.29.0}/setup.cfg +0 -0
{feldera-0.27.0 → feldera-0.29.0}/PKG-INFO

````diff
@@ -1,11 +1,11 @@
 Metadata-Version: 2.1
 Name: feldera
-Version: 0.27.0
+Version: 0.29.0
 Summary: The feldera python client
 Author-email: Abhinav <abhinav.gyawali@feldera.com>
 License: MIT
 Project-URL: Homepage, https://www.feldera.com
-Project-URL: Documentation, https://docs.feldera.com
+Project-URL: Documentation, https://docs.feldera.com/python
 Project-URL: Repository, https://github.com/feldera/feldera
 Project-URL: Issues, https://github.com/feldera/feldera/issues
 Keywords: feldera,python
@@ -19,6 +19,8 @@ Requires-Dist: requests
 Requires-Dist: pandas
 Requires-Dist: typing-extensions
 Requires-Dist: numpy<2
+Requires-Dist: pretty-errors
+Requires-Dist: ruff>=0.6.9
 
 # Feldera Python SDK
 
@@ -44,6 +46,15 @@ $ pip install git+https://github.com/feldera/feldera@{BRANCH_NAME}#subdirectory=
 
 Replace `{BRANCH_NAME}` with the name of the branch you want to install from.
 
+### Installing from Local Directory
+
+If you have cloned the Feldera repo, you can install the python SDK as follows:
+
+```bash
+# the Feldera Python SDK is present inside the python/ directory
+pip install python/
+```
+
 Checkout the docs [here](./feldera/__init__.py) for an example on how to use the SDK.
 
 ## Documentation
@@ -90,5 +101,5 @@ To run the aggregate tests use:
 
 ```bash
 cd python
-PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/
+PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/main.py
 ```
````
{feldera-0.27.0 → feldera-0.29.0}/README.md

````diff
@@ -22,6 +22,15 @@ $ pip install git+https://github.com/feldera/feldera@{BRANCH_NAME}#subdirectory=
 
 Replace `{BRANCH_NAME}` with the name of the branch you want to install from.
 
+### Installing from Local Directory
+
+If you have cloned the Feldera repo, you can install the python SDK as follows:
+
+```bash
+# the Feldera Python SDK is present inside the python/ directory
+pip install python/
+```
+
 Checkout the docs [here](./feldera/__init__.py) for an example on how to use the SDK.
 
 ## Documentation
@@ -68,5 +77,5 @@ To run the aggregate tests use:
 
 ```bash
 cd python
-PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/
+PYTHONPATH=`pwd` python3 ./tests/aggregate_tests/main.py
 ```
````
feldera-0.29.0/feldera/__init__.py (new file)

```diff
@@ -0,0 +1,11 @@
+from feldera.rest.feldera_client import FelderaClient as FelderaClient
+from feldera.pipeline import Pipeline as Pipeline
+from feldera.pipeline_builder import PipelineBuilder as PipelineBuilder
+
+import pretty_errors
+
+pretty_errors.configure(
+    line_number_first=True,
+)
+
+pretty_errors.activate()
```
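With the new `__init__.py`, the three main entry points are importable straight from the `feldera` package (and importing the SDK now activates `pretty_errors`). A minimal sketch; the URL and pipeline name are assumptions, not part of the diff:

```python
from feldera import FelderaClient, Pipeline, PipelineBuilder

# Assumes a Feldera instance listening locally; adjust the URL as needed.
client = FelderaClient("http://localhost:8080")

# Pipeline.get (shown later in this diff) fetches an existing pipeline by name.
pipeline = Pipeline.get("example-pipeline", client)
```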
{feldera-0.27.0 → feldera-0.29.0}/feldera/_callback_runner.py

```diff
@@ -15,12 +15,12 @@ class _CallbackRunnerInstruction(Enum):
 
 class CallbackRunner(Thread):
     def __init__(
-        …
+        self,
+        client: FelderaClient,
+        pipeline_name: str,
+        view_name: str,
+        callback: Callable[[pd.DataFrame, int], None],
+        queue: Optional[Queue],
     ):
         super().__init__()
         self.daemon = True
@@ -49,7 +49,9 @@ class CallbackRunner(Thread):
            break
 
        if self.schema is None:
-            raise ValueError(f"Table or View {self.view_name} not found in the pipeline schema.")
+            raise ValueError(
+                f"Table or View {self.view_name} not found in the pipeline schema."
+            )
 
        # by default, we assume that the pipeline has been started
        ack: _CallbackRunnerInstruction = _CallbackRunnerInstruction.PipelineStarted
@@ -60,12 +62,12 @@ class CallbackRunner(Thread):
        ack: _CallbackRunnerInstruction = self.queue.get()
 
        match ack:
-
            # if the pipeline has actually been started, we start a listener
            case _CallbackRunnerInstruction.PipelineStarted:
-
                # listen to the pipeline
-                gen_obj = self.client.listen_to_pipeline(self.pipeline_name, self.view_name, format="json")
+                gen_obj = self.client.listen_to_pipeline(
+                    self.pipeline_name, self.view_name, format="json"
+                )
 
                # if there is a queue set up, inform the main thread that the listener has been started, and it can
                # proceed with starting the pipeline
@@ -90,7 +92,6 @@ class CallbackRunner(Thread):
 
            # if the queue has received a message
            if again_ack:
-
                match again_ack:
                    case _CallbackRunnerInstruction.RanToCompletion:
                        # stop blocking the main thread on `join` and return from this thread
```
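`CallbackRunner` is internal, but the `Callable[[pd.DataFrame, int], None]` type above is the callback shape that `Pipeline.foreach_chunk` (see the pipeline.py diff below) ultimately drives. A minimal sketch, assuming a local Feldera instance and an existing pipeline with a view `my_view`:

```python
import pandas as pd
from feldera import FelderaClient, Pipeline

client = FelderaClient("http://localhost:8080")      # assumed local instance
pipeline = Pipeline.get("example-pipeline", client)  # assumed existing pipeline

def on_chunk(df: pd.DataFrame, seq_no: int) -> None:
    # Each output chunk arrives as a DataFrame plus a sequence number.
    print(f"chunk {seq_no}: {len(df)} rows")

pipeline.foreach_chunk("my_view", on_chunk)  # register before starting
pipeline.start()
```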
{feldera-0.27.0 → feldera-0.29.0}/feldera/_helpers.py

```diff
@@ -8,37 +8,37 @@ def sql_type_to_pandas_type(sql_type: str):
     """
 
     match sql_type.upper():
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
+        case "BOOLEAN":
+            return "boolean"
+        case "TINYINT":
+            return "Int8"
+        case "SMALLINT":
+            return "Int16"
+        case "INTEGER":
+            return "Int32"
+        case "BIGINT":
+            return "Int64"
+        case "REAL":
+            return "Float32"
+        case "DOUBLE":
+            return "Float64"
+        case "DECIMAL":
             return None
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
-            return …
-        case …
+        case "CHAR":
+            return "str"
+        case "VARCHAR":
+            return "str"
+        case "DATE" | "TIMESTAMP":
+            return "datetime64[ns]"
+        case "TIME" | "INTERVAL":
+            return "timedelta64[ns]"
+        case "ARRAY":
             return None
-        case …
+        case "NULL":
             return None
-        case …
+        case "BINARY" | "VARBINARY":
             return None
-        case …
+        case "STRUCT" | "MAP":
             return None
 
 
@@ -65,17 +65,22 @@ def dataframe_from_response(buffer: list[list[dict]], schema: dict):
 
     decimal_col = []
 
-    for column in schema[…
-        column_name = column[…
-        …
-        …
+    for column in schema["fields"]:
+        column_name = column["name"]
+        if not column["case_sensitive"]:
+            column_name = column_name.lower()
+        column_type = column["columntype"]["type"]
+        if column_type == "DECIMAL":
             decimal_col.append(column_name)
 
         pd_schema[column_name] = sql_type_to_pandas_type(column_type)
 
     data = [
-        {**item[…
-        …
+        {**item["insert"], "insert_delete": 1}
+        if "insert" in item
+        else {**item["delete"], "insert_delete": -1}
+        for sublist in buffer
+        for item in sublist
     ]
 
     if len(decimal_col) != 0:
@@ -84,7 +89,6 @@ def dataframe_from_response(buffer: list[list[dict]], schema: dict):
                 if datum[col] is not None:
                     datum[col] = Decimal(datum[col])
 
-
     df = pd.DataFrame(data)
     df = df.astype(pd_schema)
 
@@ -97,4 +101,4 @@ def chunk_dataframe(df, chunk_size=1000):
     """
 
     for i in range(0, len(df), chunk_size):
-        yield df.iloc[i:i + chunk_size]
+        yield df.iloc[i : i + chunk_size]
```
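A small illustration of the two helpers reformatted above; both live in the private `feldera._helpers` module, so this is a sketch rather than documented API:

```python
import pandas as pd
from feldera._helpers import sql_type_to_pandas_type, chunk_dataframe

# SQL-to-pandas dtype mapping, as defined in the match statement above.
assert sql_type_to_pandas_type("BIGINT") == "Int64"
assert sql_type_to_pandas_type("VARCHAR") == "str"
assert sql_type_to_pandas_type("DECIMAL") is None  # DECIMALs are converted separately

# chunk_dataframe yields slices of at most chunk_size rows (default 1000).
df = pd.DataFrame({"x": range(2500)})
print([len(chunk) for chunk in chunk_dataframe(df)])  # [1000, 1000, 500]
```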
{feldera-0.27.0 → feldera-0.29.0}/feldera/output_handler.py

```diff
@@ -7,7 +7,13 @@ from feldera._callback_runner import CallbackRunner
 
 
 class OutputHandler:
-    def __init__(self, client: FelderaClient, pipeline_name: str, view_name: str, queue: Optional[Queue]):
+    def __init__(
+        self,
+        client: FelderaClient,
+        pipeline_name: str,
+        view_name: str,
+        queue: Optional[Queue],
+    ):
         """
         Initializes the output handler, but doesn't start it.
         To start the output handler, call the `.OutputHandler.start` method.
@@ -25,7 +31,9 @@ class OutputHandler:
             self.buffer.append(df)
 
         # sets up the callback runner
-        self.handler = CallbackRunner(self.client, self.pipeline_name, self.view_name, callback, queue)
+        self.handler = CallbackRunner(
+            self.client, self.pipeline_name, self.view_name, callback, queue
+        )
 
     def start(self):
         """
@@ -56,5 +64,4 @@ class OutputHandler:
         :param clear_buffer: Whether to clear the buffer after getting the output.
         """
 
-        return self.to_pandas(clear_buffer).to_dict(orient=…
-
+        return self.to_pandas(clear_buffer).to_dict(orient="records")
```
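`OutputHandler` is obtained via `Pipeline.listen` and buffers a view's output until it is read back. A hedged sketch: `to_pandas`/`to_dict` follow the calls visible above, `wait_for_completion` is inferred from the error message in the pipeline.py diff below, and the instance URL and names are assumptions:

```python
from feldera import FelderaClient, Pipeline

client = FelderaClient("http://localhost:8080")      # assumed local instance
pipeline = Pipeline.get("example-pipeline", client)  # assumed existing pipeline

handler = pipeline.listen("my_view")  # register the listener before starting
pipeline.start()
pipeline.wait_for_completion()        # assumes bounded input connectors

df = handler.to_pandas()              # buffered output as a DataFrame
records = handler.to_dict()           # same data as a list of dicts (orient="records")
```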
{feldera-0.27.0 → feldera-0.29.0}/feldera/pipeline.py

```diff
@@ -1,7 +1,8 @@
 import time
 import pandas
 
-from typing import List, Dict, Callable, Optional
+from typing import List, Dict, Callable, Optional, Generator, Mapping, Any
+from collections import deque
 from queue import Queue
 
 from feldera.rest.errors import FelderaAPIError
@@ -72,8 +73,12 @@ class Pipeline:
         ensure_dataframe_has_columns(df)
 
         pipeline = self.client.get_pipeline(self.name)
-        if table_name.lower() != "now" and table_name.lower() not in [
-        …
+        if table_name.lower() != "now" and table_name.lower() not in [
+            tbl.name.lower() for tbl in pipeline.tables
+        ]:
+            raise ValueError(
+                f"Cannot push to table '{table_name}' as it is not registered yet"
+            )
         else:
             # consider validating the schema here
             for datum in chunk_dataframe(df):
@@ -81,15 +86,21 @@ class Pipeline:
                     self.name,
                     table_name,
                     "json",
-                    datum.to_json(orient=…
-                    json_flavor=…
+                    datum.to_json(orient="records", date_format="epoch"),
+                    json_flavor="pandas",
                     array=True,
                     serialize=False,
                     force=force,
                 )
         return
 
-    def input_json(self, table_name: str, data: Dict | list, update_format: str = "raw", force: bool = False):
+    def input_json(
+        self,
+        table_name: str,
+        data: Dict | list,
+        update_format: str = "raw",
+        force: bool = False,
+    ):
         """
         Push this JSON data to the specified table of the pipeline.
 
@@ -112,7 +123,7 @@ class Pipeline:
             data,
             update_format=update_format,
             array=array,
-            force=force
+            force=force,
         )
 
     def listen(self, view_name: str) -> OutputHandler:
@@ -134,7 +145,9 @@ class Pipeline:
 
         return handler
 
-    def foreach_chunk(self, view_name: str, callback: Callable[[pandas.DataFrame, int], None]):
+    def foreach_chunk(
+        self, view_name: str, callback: Callable[[pandas.DataFrame, int], None]
+    ):
         """
         Run the given callback on each chunk of the output of the specified view.
 
@@ -190,11 +203,15 @@ class Pipeline:
             raise RuntimeError("Pipeline must be running to wait for completion")
 
         while True:
-            metrics: dict = self.client.get_pipeline_stats(self.name).get("global_metrics")
+            metrics: dict = self.client.get_pipeline_stats(self.name).get(
+                "global_metrics"
+            )
             pipeline_complete: bool = metrics.get("pipeline_complete")
 
             if pipeline_complete is None:
-                raise RuntimeError(…
+                raise RuntimeError(
+                    "received unknown metrics from the pipeline, pipeline_complete is None"
+                )
 
             if pipeline_complete:
                 break
@@ -215,7 +232,9 @@ class Pipeline:
 
         status = self.status()
         if status != PipelineStatus.SHUTDOWN:
-            raise RuntimeError(…
+            raise RuntimeError(
+                f"pipeline {self.name} in state: {str(status.name)} cannot be started"
+            )
 
         self.pause()
         self.__setup_output_listeners()
@@ -230,10 +249,10 @@ class Pipeline:
         self.start()
 
     def wait_for_idle(
-        …
+        self,
+        idle_interval_s: float = 5.0,
+        timeout_s: float = 600.0,
+        poll_interval_s: float = 0.2,
     ):
         """
         Wait for the pipeline to become idle and then returns.
@@ -253,12 +272,18 @@ class Pipeline:
        reached.
        """
        if idle_interval_s > timeout_s:
-            raise ValueError(…
+            raise ValueError(
+                f"idle interval ({idle_interval_s}s) cannot be larger than timeout ({timeout_s}s)"
+            )
        if poll_interval_s > timeout_s:
-            raise ValueError(…
+            raise ValueError(
+                f"poll interval ({poll_interval_s}s) cannot be larger than timeout ({timeout_s}s)"
+            )
        if poll_interval_s > idle_interval_s:
-            raise ValueError(
-            …
+            raise ValueError(
+                f"poll interval ({poll_interval_s}s) cannot be larger "
+                f"than idle interval ({idle_interval_s}s)"
+            )
 
        start_time_s = time.monotonic()
        idle_started_s = None
@@ -267,16 +292,24 @@ class Pipeline:
            now_s = time.monotonic()
 
            # Metrics retrieval
-            metrics: dict = self.client.get_pipeline_stats(self.name).get(…
+            metrics: dict = self.client.get_pipeline_stats(self.name).get(
+                "global_metrics"
+            )
            total_input_records: int | None = metrics.get("total_input_records")
            total_processed_records: int | None = metrics.get("total_processed_records")
            if total_input_records is None:
-                raise RuntimeError(…
+                raise RuntimeError(
+                    "total_input_records is missing from the pipeline metrics"
+                )
            if total_processed_records is None:
-                raise RuntimeError(…
+                raise RuntimeError(
+                    "total_processed_records is missing from the pipeline metrics"
+                )
 
            # Idle check
-            unchanged = …
+            unchanged = (
+                prev[0] == total_input_records and prev[1] == total_processed_records
+            )
            equal = total_input_records == total_processed_records
            prev = (total_input_records, total_processed_records)
            if unchanged and equal:
@@ -328,7 +361,7 @@ class Pipeline:
         self.client.delete_pipeline(self.name)
 
     @staticmethod
-    def get(name: str, client: FelderaClient) -> …
+    def get(name: str, client: FelderaClient) -> "Pipeline":
         """
         Get the pipeline if it exists.
 
@@ -344,3 +377,57 @@ class Pipeline:
         except FelderaAPIError as err:
             if err.status_code == 404:
                 raise RuntimeError(f"Pipeline with name {name} not found")
+
+    def query(self, query: str) -> Generator[Mapping[str, Any], None, None]:
+        """
+        Executes an ad-hoc SQL query on this pipeline and returns the result in the specified format.
+        For ``INSERT`` and ``DELETE`` queries, consider using :meth:`.execute` instead.
+
+        Important:
+            This method is lazy. It returns a generator and is not evaluated until you consume the result.
+
+        :param query: The SQL query to be executed.
+        :return: A generator that yields the rows of the result as Python dictionaries.
+        """
+
+        return self.client.query_as_json(self.name, query)
+
+    def query_parquet(self, query: str, path: str):
+        """
+        Executes an ad-hoc SQL query on this pipeline and saves the result to the specified path as a parquet file.
+        If the extension isn't `parquet`, it will be automatically appended to `path`.
+
+        :param query: The SQL query to be executed.
+        :param path: The path of the parquet file.
+        """
+
+        self.client.query_as_parquet(self.name, query, path)
+
+    def query_tabular(self, query: str) -> Generator[str, None, None]:
+        """
+        Executes a SQL query on this pipeline and returns the result as a formatted string.
+
+        Important:
+            This method is lazy. It returns a generator and is not evaluated until you consume the result.
+
+        :param query: The SQL query to be executed.
+        :return: A generator that yields a string representing the query result in a human-readable, tabular format.
+        """
+
+        return self.client.query_as_text(self.name, query)
+
+    def execute(self, query: str):
+        """
+        Executes an ad-hoc SQL query on the current pipeline, discarding its result.
+        Unlike the :meth:`.query` method which returns a generator for retrieving query results lazily,
+        this method processes the query eagerly and fully before returning.
+
+        This method is suitable for SQL operations like ``INSERT`` and ``DELETE``, where the user needs
+        confirmation of successful query execution, but does not require the query result.
+        If the query fails, an exception will be raised.
+
+        :param query: The SQL query to be executed.
+        """
+
+        gen = self.query_tabular(query)
+        deque(gen, maxlen=0)
```
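Taken together, `query`, `query_parquet`, `query_tabular`, and `execute` form the release's new ad-hoc query API. A hedged usage sketch, assuming a locally running Feldera instance and an existing, running pipeline with a table `t`:

```python
from feldera import FelderaClient, Pipeline

client = FelderaClient("http://localhost:8080")      # assumed local instance
pipeline = Pipeline.get("example-pipeline", client)  # assumed running pipeline

# Eager: runs the statement, discards the result, raises on failure.
pipeline.execute("INSERT INTO t VALUES (1, 'hello')")

# Lazy: nothing is fetched until the generator is consumed.
for row in pipeline.query("SELECT * FROM t"):
    print(row)  # each row is a Python dict

# Lazy, human-readable tabular text.
print("".join(pipeline.query_tabular("SELECT COUNT(*) AS n FROM t")))

# Writes the result to t_rows.parquet (the extension is appended if missing).
pipeline.query_parquet("SELECT * FROM t", "t_rows")
```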
{feldera-0.27.0 → feldera-0.29.0}/feldera/pipeline_builder.py

```diff
@@ -14,6 +14,8 @@ class PipelineBuilder:
     :param name: The name of the pipeline
     :param description: The description of the pipeline
     :param sql: The SQL code of the pipeline
+    :param udf_rust: Rust code for UDFs
+    :param udf_toml: Rust dependencies required by UDFs (in the TOML format)
     :param compilation_profile: The compilation profile to use
     :param runtime_config: The runtime config to use
     """
@@ -23,15 +25,18 @@ class PipelineBuilder:
         client: FelderaClient,
         name: str,
         sql: str,
+        udf_rust: str = "",
+        udf_toml: str = "",
         description: str = "",
         compilation_profile: CompilationProfile = CompilationProfile.OPTIMIZED,
         runtime_config: RuntimeConfig = RuntimeConfig(resources=Resources()),
-
     ):
         self.client: FelderaClient = client
         self.name: str | None = name
         self.description: str = description
         self.sql: str = sql
+        self.udf_rust: str = udf_rust
+        self.udf_toml: str = udf_toml
         self.compilation_profile: CompilationProfile = compilation_profile
         self.runtime_config: RuntimeConfig = runtime_config
 
@@ -52,8 +57,10 @@ class PipelineBuilder:
             self.name,
             description=self.description,
             sql=self.sql,
+            udf_rust=self.udf_rust,
+            udf_toml=self.udf_toml,
             program_config={
-                …
+                "profile": self.compilation_profile.value,
             },
             runtime_config=self.runtime_config.__dict__,
         )
@@ -85,10 +92,14 @@ class PipelineBuilder:
             self.name,
             description=self.description,
             sql=self.sql,
+            udf_rust=self.udf_rust,
+            udf_toml=self.udf_toml,
             program_config={
-                …
+                "profile": self.compilation_profile.value,
             },
-            runtime_config=dict(…
+            runtime_config=dict(
+                (k, v) for k, v in self.runtime_config.__dict__.items() if v is not None
+            ),
         )
 
         inner = self.client.create_or_update_pipeline(inner)
```