feldera 0.34.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of feldera might be problematic. Click here for more details.

feldera/__init__.py ADDED
@@ -0,0 +1,11 @@
1
# Public API of the `feldera` package: re-export the client, pipeline, and
# builder classes so callers can write `from feldera import FelderaClient`.
from feldera.rest.feldera_client import FelderaClient as FelderaClient
from feldera.pipeline import Pipeline as Pipeline
from feldera.pipeline_builder import PipelineBuilder as PipelineBuilder

import pretty_errors

# Prettify uncaught-exception tracebacks for interactive users; printing the
# line number first makes the failing statement easier to locate.
pretty_errors.configure(
    line_number_first=True,
)

pretty_errors.activate()
@@ -0,0 +1,116 @@
1
+ from enum import Enum
2
+ from threading import Thread
3
+ from typing import Callable, Optional
4
+ from queue import Queue, Empty
5
+
6
+ import pandas as pd
7
+ from feldera import FelderaClient
8
+ from feldera._helpers import dataframe_from_response
9
+
10
+
11
class _CallbackRunnerInstruction(Enum):
    """Control messages sent from the main thread to a `CallbackRunner` via its queue."""

    PipelineStarted = 1  # the pipeline has been (or is assumed) started; begin listening
    RanToCompletion = 2  # the pipeline finished; the runner should acknowledge and stop
14
+
15
+
16
+ class CallbackRunner(Thread):
17
+ def __init__(
18
+ self,
19
+ client: FelderaClient,
20
+ pipeline_name: str,
21
+ view_name: str,
22
+ callback: Callable[[pd.DataFrame, int], None],
23
+ queue: Optional[Queue],
24
+ ):
25
+ super().__init__()
26
+ self.daemon = True
27
+ self.client: FelderaClient = client
28
+ self.pipeline_name: str = pipeline_name
29
+ self.view_name: str = view_name
30
+ self.callback: Callable[[pd.DataFrame, int], None] = callback
31
+ self.queue: Optional[Queue] = queue
32
+ self.schema: Optional[dict] = None
33
+
34
+ def run(self):
35
+ """
36
+ The main loop of the thread. Listens for data and calls the callback function on each chunk of data received.
37
+
38
+ :meta private:
39
+ """
40
+
41
+ pipeline = self.client.get_pipeline(self.pipeline_name)
42
+ schema = pipeline.program_info["schema"]
43
+
44
+ if schema:
45
+ schemas = [relation for relation in schema["inputs"] + schema["outputs"]]
46
+ for schema in schemas:
47
+ if schema["name"] == self.view_name:
48
+ self.schema = schema
49
+ break
50
+
51
+ if self.schema is None:
52
+ raise ValueError(
53
+ f"Table or View {self.view_name} not found in the pipeline schema."
54
+ )
55
+
56
+ # by default, we assume that the pipeline has been started
57
+ ack: _CallbackRunnerInstruction = _CallbackRunnerInstruction.PipelineStarted
58
+
59
+ # if there is Queue, we wait for the instruction to start the pipeline
60
+ # this means that we are listening to the pipeline before running it, therefore, all data should be received
61
+ if self.queue:
62
+ ack: _CallbackRunnerInstruction = self.queue.get()
63
+
64
+ match ack:
65
+ # if the pipeline has actually been started, we start a listener
66
+ case _CallbackRunnerInstruction.PipelineStarted:
67
+ # listen to the pipeline
68
+ gen_obj = self.client.listen_to_pipeline(
69
+ self.pipeline_name, self.view_name, format="json"
70
+ )
71
+
72
+ # if there is a queue set up, inform the main thread that the listener has been started, and it can
73
+ # proceed with starting the pipeline
74
+ if self.queue:
75
+ # stop blocking the main thread on `join` for the previous message
76
+ self.queue.task_done()
77
+
78
+ for chunk in gen_obj:
79
+ chunk: dict = chunk
80
+ data: list[dict] = chunk.get("json_data")
81
+ seq_no: int = chunk.get("sequence_number")
82
+
83
+ if data is not None:
84
+ self.callback(dataframe_from_response([data], schema), seq_no)
85
+
86
+ if self.queue:
87
+ try:
88
+ # if a non-blocking way, check if the queue has received further instructions
89
+ # this should be a RanToCompletion instruction, which means that the pipeline has been
90
+ # completed
91
+ again_ack = self.queue.get_nowait()
92
+
93
+ # if the queue has received a message
94
+ if again_ack:
95
+ match again_ack:
96
+ case _CallbackRunnerInstruction.RanToCompletion:
97
+ # stop blocking the main thread on `join` and return from this thread
98
+ self.queue.task_done()
99
+
100
+ return
101
+
102
+ case _CallbackRunnerInstruction.PipelineStarted:
103
+ # if the pipeline has been started again, which shouldn't happen,
104
+ # ignore it and continue listening, call `task_done` to avoid blocking the main
105
+ # thread on `join`
106
+ self.queue.task_done()
107
+
108
+ continue
109
+ except Empty:
110
+ # if the queue is empty, continue listening
111
+ continue
112
+
113
+ case _CallbackRunnerInstruction.RanToCompletion:
114
+ if self.queue:
115
+ self.queue.task_done()
116
+ return
feldera/_helpers.py ADDED
@@ -0,0 +1,104 @@
1
+ import pandas as pd
2
+ from decimal import Decimal
3
+
4
+
5
def sql_type_to_pandas_type(sql_type: str):
    """
    Converts a SQL type to a pandas type.

    Returns ``None`` for SQL types without a direct pandas dtype (DECIMAL,
    ARRAY, NULL, BINARY, VARBINARY, STRUCT, MAP, and anything unrecognized);
    such columns are left untouched by the caller.
    """

    dtype_by_sql_type = {
        "BOOLEAN": "boolean",
        "TINYINT": "Int8",
        "SMALLINT": "Int16",
        "INTEGER": "Int32",
        "BIGINT": "Int64",
        "REAL": "Float32",
        "DOUBLE": "Float64",
        "CHAR": "str",
        "VARCHAR": "str",
        "DATE": "datetime64[ns]",
        "TIMESTAMP": "datetime64[ns]",
        "TIME": "timedelta64[ns]",
        "INTERVAL": "timedelta64[ns]",
    }
    # .get() yields None for unmapped types, matching the original
    # match-statement's implicit fall-through.
    return dtype_by_sql_type.get(sql_type.upper())
43
+
44
+
45
+ def ensure_dataframe_has_columns(df: pd.DataFrame):
46
+ """
47
+ Ensures that the DataFrame has column names set.
48
+ """
49
+
50
+ if [v for v in range(df.shape[1])] == list(df.columns):
51
+ raise ValueError(
52
+ """
53
+ DataFrame has no column names set.
54
+ Input DataFrame must have column names set and they must be consistent with the columns in the input table.
55
+ """
56
+ )
57
+
58
+
59
def dataframe_from_response(buffer: list[list[dict]], schema: dict):
    """
    Converts the response from Feldera to a pandas DataFrame.

    :param buffer: List of chunks; each chunk is a list of records shaped
        ``{"insert": row}`` or ``{"delete": row}``.
    :param schema: Relation schema whose ``"fields"`` list describes columns.
    """

    pd_schema = {}
    decimal_columns = []

    # Build the column -> pandas dtype mapping, remembering DECIMAL columns
    # so their raw values can be converted to `Decimal` objects.
    for field in schema["fields"]:
        name = field["name"]
        if not field["case_sensitive"]:
            name = name.lower()
        sql_type = field["columntype"]["type"]
        if sql_type == "DECIMAL":
            decimal_columns.append(name)

        pd_schema[name] = sql_type_to_pandas_type(sql_type)

    # Flatten all chunks into rows, tagging inserts with +1 and deletes with -1.
    rows = []
    for chunk in buffer:
        for record in chunk:
            if "insert" in record:
                rows.append({**record["insert"], "insert_delete": 1})
            else:
                rows.append({**record["delete"], "insert_delete": -1})

    if decimal_columns:
        for row in rows:
            for name in decimal_columns:
                if row[name] is not None:
                    row[name] = Decimal(row[name])

    frame = pd.DataFrame(rows)
    frame = frame.astype(pd_schema)

    return frame
96
+
97
+
98
def chunk_dataframe(df, chunk_size=1000):
    """
    Yield successive n-sized chunks from the given dataframe.
    """

    total_rows = len(df)
    start = 0
    while start < total_rows:
        yield df.iloc[start : start + chunk_size]
        start += chunk_size
feldera/enums.py ADDED
@@ -0,0 +1,234 @@
1
+ from enum import Enum
2
+ from typing import Optional
3
+
4
+
5
class CompilationProfile(Enum):
    """
    The compilation profile to use when compiling the program.

    NOTE(review): member values appear to be the literal strings passed to
    the compiler (``SERVER_DEFAULT`` maps to ``None``, i.e. no profile is
    specified) — confirm against the Feldera API reference.
    """

    SERVER_DEFAULT = None
    """
    The compiler server default compilation profile.
    """

    DEV = "dev"
    """
    The development compilation profile.
    """

    UNOPTIMIZED = "unoptimized"
    """
    The unoptimized compilation profile.
    """

    OPTIMIZED = "optimized"
    """
    The optimized compilation profile, the default for this API.
    """
29
+
30
+
31
class BuildMode(Enum):
    """
    How a pipeline definition is resolved against the server — create a new
    one, fetch an existing one, or fetch-if-present otherwise create.
    Presumably consumed by `PipelineBuilder`; verify against its call sites.
    """

    CREATE = 1
    GET = 2
    GET_OR_CREATE = 3
35
+
36
+
37
class PipelineStatus(Enum):
    """
    Represents the state that this pipeline is currently in.

    .. code-block:: text

        Shutdown ◄────┐
        │             │
        /deploy       │
        │             ⌛ShuttingDown
        ▼             ▲
        ⌛Provisioning │
        │             │
        Provisioned   │
        ▼             │/shutdown
        ⌛Initializing │
        │             │
        ┌────────┴─────────┴─┐
        │        ▼           │
        │      Paused        │
        │      │    ▲        │
        │/start│    │/pause  │
        │      ▼    │        │
        │      Running       │
        └──────────┬─────────┘

        Failed
    """

    NOT_FOUND = 1
    """
    The pipeline has not been created yet.
    """

    SHUTDOWN = 2
    """
    Pipeline has not been started or has been shut down.

    The pipeline remains in this state until the user triggers
    a deployment by invoking the `/deploy` endpoint.
    """

    PROVISIONING = 3
    """
    The runner triggered a deployment of the pipeline and is
    waiting for the pipeline HTTP server to come up.

    In this state, the runner provisions a runtime for the pipeline,
    starts the pipeline within this runtime and waits for it to start accepting HTTP requests.

    The user is unable to communicate with the pipeline during this
    time. The pipeline remains in this state until:

    1. Its HTTP server is up and running; the pipeline transitions to the
       `PipelineStatus.INITIALIZING` state.
    2. A pre-defined timeout has passed. The runner performs forced
       shutdown of the pipeline; returns to the `PipelineStatus.SHUTDOWN` state.
    3. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The manager performs forced shutdown of the pipeline, returns to the
       `PipelineStatus.SHUTDOWN` state.

    """

    INITIALIZING = 4
    """
    The pipeline is initializing its internal state and connectors.

    This state is part of the pipeline's deployment process. In this state,
    the pipeline's HTTP server is up and running, but its query engine
    and input and output connectors are still initializing.

    The pipeline remains in this state until:

    1. Initialization completes successfully; the pipeline transitions to the
       `PipelineStatus.PAUSED` state.
    2. Initialization fails; transitions to the `PipelineStatus.FAILED` state.
    3. A pre-defined timeout has passed. The runner performs forced
       shutdown of the pipeline; returns to the `PipelineStatus.SHUTDOWN` state.
    4. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The manager performs forced shutdown of the pipeline; returns to the
       `PipelineStatus.SHUTDOWN` state.

    """

    PAUSED = 5
    """
    The pipeline is fully initialized, but data processing has been paused.

    The pipeline remains in this state until:

    1. The user starts the pipeline by invoking the `/start` endpoint. The
       manager passes the request to the pipeline; transitions to the
       `PipelineStatus.RUNNING` state.
    2. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The manager passes the shutdown request to the pipeline to perform a
       graceful shutdown; transitions to the `PipelineStatus.SHUTTING_DOWN` state.
    3. An unexpected runtime error renders the pipeline `PipelineStatus.FAILED`.

    """

    RUNNING = 6
    """
    The pipeline is processing data.

    The pipeline remains in this state until:

    1. The user pauses the pipeline by invoking the `/pause` endpoint. The
       manager passes the request to the pipeline; transitions to the
       `PipelineStatus.PAUSED` state.
    2. The user cancels the pipeline by invoking the `/shutdown` endpoint.
       The runner passes the shutdown request to the pipeline to perform a
       graceful shutdown; transitions to the
       `PipelineStatus.SHUTTING_DOWN` state.
    3. An unexpected runtime error renders the pipeline
       `PipelineStatus.FAILED`.

    """

    SHUTTING_DOWN = 7
    """
    Graceful shutdown in progress.

    In this state, the pipeline finishes any ongoing data processing,
    produces final outputs, shuts down input/output connectors and
    terminates.

    The pipeline remains in this state until:

    1. Shutdown completes successfully; transitions to the `PipelineStatus.SHUTDOWN` state.
    2. A pre-defined timeout has passed. The manager performs forced shutdown of the pipeline; returns to the
       `PipelineStatus.SHUTDOWN` state.

    """

    FAILED = 8
    """
    The pipeline remains in this state until the users acknowledge the failure
    by issuing a call to shutdown the pipeline; transitions to the
    `PipelineStatus.SHUTDOWN` state.
    """

    UNAVAILABLE = 9
    """
    The pipeline was at least once initialized, but in the most recent status check either
    could not be reached or returned it is not yet ready.
    """

    @staticmethod
    def from_str(value):
        """
        Returns the member whose name matches *value* case-insensitively.

        :raises ValueError: if no member matches.
        """
        for member in PipelineStatus:
            if member.name.lower() == value.lower():
                return member
        raise ValueError(f"Unknown value '{value}' for enum {PipelineStatus.__name__}")

    def __eq__(self, other):
        # Value-based comparison. NOTE: this also equates members of other
        # enums that happen to share the same numeric value.
        return self.value == other.value

    # Fix: defining __eq__ implicitly sets __hash__ to None, which made
    # members unhashable (unusable in sets / as dict keys). Restore hashing
    # consistent with the value-based equality above.
    def __hash__(self):
        return hash(self._value_)
194
+
195
+
196
class ProgramStatus(Enum):
    """
    Compilation status of a pipeline's SQL program, with optional error
    details attached by `from_value`.
    """

    Pending = 1
    CompilingSql = 2
    SqlCompiled = 3
    CompilingRust = 4
    Success = 5
    SqlError = 6
    RustError = 7
    SystemError = 8

    def __init__(self, value):
        # Error details for the error variants; populated by `from_value`.
        self.error: Optional[dict] = None
        self._value_ = value

    @staticmethod
    def from_value(value):
        """
        Builds a `ProgramStatus` from its API representation: either a plain
        status string, or a single-key dict mapping the status name to error
        details.

        NOTE: the error details are stored on the singleton enum member
        itself, so a later `from_value` call for the same status overwrites
        them; read `get_error()` promptly.

        :raises ValueError: if the status name matches no member.
        """
        error = None
        if isinstance(value, dict):
            error = value
            value = list(value.keys())[0]

        for member in ProgramStatus:
            if member.name.lower() == value.lower():
                member.error = error
                return member
        raise ValueError(f"Unknown value '{value}' for enum {ProgramStatus.__name__}")

    def __eq__(self, other):
        # Value-based comparison. NOTE: this also equates members of other
        # enums that happen to share the same numeric value.
        return self.value == other.value

    # Fix: defining __eq__ implicitly sets __hash__ to None, which made
    # members unhashable (unusable in sets / as dict keys). Restore hashing
    # consistent with the value-based equality above.
    def __hash__(self):
        return hash(self._value_)

    def __str__(self):
        return self.name + (f": ({self.error})" if self.error else "")

    def get_error(self) -> Optional[dict]:
        """
        Returns the compilation error, if any.
        """

        return self.error
@@ -0,0 +1,67 @@
1
+ import pandas as pd
2
+ from typing import Optional
3
+
4
+ from queue import Queue
5
+ from feldera import FelderaClient
6
+ from feldera._callback_runner import CallbackRunner
7
+
8
+
9
class OutputHandler:
    """
    Accumulates the output of a pipeline view into an in-memory buffer of
    DataFrames, using a `CallbackRunner` thread to receive the data.
    """

    def __init__(
        self,
        client: FelderaClient,
        pipeline_name: str,
        view_name: str,
        queue: Optional[Queue],
    ):
        """
        Initializes the output handler, but doesn't start it.
        To start the output handler, call the `.OutputHandler.start` method.
        """

        self.client: FelderaClient = client
        self.pipeline_name: str = pipeline_name
        self.view_name: str = view_name
        self.queue: Optional[Queue] = queue
        self.buffer: list[pd.DataFrame] = []

        # Callback handed to the `CallbackRunner`: keep every non-empty
        # chunk; the sequence number is ignored.
        def _collect(chunk: pd.DataFrame, _seq: int):
            if not chunk.empty:
                self.buffer.append(chunk)

        # Set up (but do not start) the listener thread.
        self.handler = CallbackRunner(
            self.client, self.pipeline_name, self.view_name, _collect, queue
        )

    def start(self):
        """
        Starts the output handler in a separate thread
        """

        self.handler.start()

    def to_pandas(self, clear_buffer: bool = True):
        """
        Returns the output of the pipeline as a pandas DataFrame

        :param clear_buffer: Whether to clear the buffer after getting the output.
        """

        if not self.buffer:
            return pd.DataFrame()

        result = pd.concat(self.buffer, ignore_index=True)
        if clear_buffer:
            self.buffer.clear()

        return result

    def to_dict(self, clear_buffer: bool = True):
        """
        Returns the output of the pipeline as a list of python dictionaries

        :param clear_buffer: Whether to clear the buffer after getting the output.
        """

        return self.to_pandas(clear_buffer).to_dict(orient="records")