feldera 0.136.0__py3-none-any.whl → 0.138.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their public registries, and is provided for informational purposes only.


feldera/pipeline.py CHANGED
@@ -326,7 +326,7 @@ class Pipeline:
                     f" time {elapsed}s, timeout: {timeout_s}s"
                 )
 
-            pipeline_complete: bool = self.stats().global_metrics.pipeline_complete
+            pipeline_complete: bool = self.is_complete()
             if pipeline_complete is None:
                 raise RuntimeError(
                     "received unknown metrics from the pipeline, pipeline_complete is None"
@@ -339,6 +339,19 @@ class Pipeline:
         if force_stop:
             self.stop(force=True)
 
+    def is_complete(self) -> bool:
+        """
+        Check if the pipeline has completed processing all input records.
+
+        Returns True if (1) all input connectors attached to the
+        pipeline have finished reading their input data sources and issued
+        end-of-input notifications to the pipeline, and (2) all inputs received
+        from these connectors have been fully processed and corresponding
+        outputs have been sent out through the output connectors.
+        """
+
+        return self.stats().global_metrics.pipeline_complete
+
     def start(self, wait: bool = True, timeout_s: Optional[float] = None):
         """
         .. _start:
@@ -625,6 +638,8 @@ metrics"""
         :param timeout_s: The maximum time (in seconds) to wait for the
             checkpoint to complete.
 
+        :return: The checkpoint sequence number.
+
         :raises FelderaAPIError: If enterprise features are not enabled.
         """
 
@@ -647,9 +662,7 @@ pipeline '{self.name}' to make checkpoint '{seq}'"""
                 time.sleep(0.1)
                 continue
 
-            return status
-
-        return seq
+        return seq
 
     def checkpoint_status(self, seq: int) -> CheckpointStatus:
         """
@@ -889,6 +902,38 @@ pipeline '{self.name}' to sync checkpoint '{uuid}'"""
         self.refresh()
         return self._inner.program_code
 
+    def modify(
+        self,
+        sql: Optional[str] = None,
+        udf_rust: Optional[str] = None,
+        udf_toml: Optional[str] = None,
+        program_config: Optional[Mapping[str, Any]] = None,
+        runtime_config: Optional[Mapping[str, Any]] = None,
+        description: Optional[str] = None,
+    ):
+        """
+        Modify the pipeline.
+
+        Modify the values of pipeline attributes: SQL code, UDF Rust code,
+        UDF Rust dependencies (TOML), program config, runtime config, and
+        description. Only the provided attributes will be modified. Other
+        attributes will remain unchanged.
+
+        The pipeline must be in the STOPPED state to be modified.
+
+        :raises FelderaAPIError: If the pipeline is not in a STOPPED state.
+        """
+
+        self.client.patch_pipeline(
+            name=self._inner.name,
+            sql=sql,
+            udf_rust=udf_rust,
+            udf_toml=udf_toml,
+            program_config=program_config,
+            runtime_config=runtime_config,
+            description=description,
+        )
+
     def storage_status(self) -> StorageStatus:
         """
         Return the storage status of the pipeline.
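
Usage note (not part of the diff): the pipeline.py changes add two public helpers, is_complete(), which wraps the pipeline_complete global metric, and modify(), which patches selected pipeline attributes and requires the STOPPED state. A minimal sketch of how they might be called follows; the client URL, pipeline name, and SQL are illustrative placeholders, and the FelderaClient constructor arguments are assumed from common SDK usage rather than shown in this diff.

import time

from feldera.rest import FelderaClient
from feldera.pipeline_builder import PipelineBuilder

# Hypothetical local Feldera instance; URL is a placeholder.
client = FelderaClient("http://localhost:8080")

# Build a throwaway pipeline; the SQL is a placeholder program.
pipeline = PipelineBuilder(
    client,
    "example_pipeline",
    sql="create table t (x int); create materialized view v as select x from t;",
).create_or_replace()

pipeline.start()
# New in this release: poll the dedicated helper instead of reading
# stats().global_metrics.pipeline_complete directly.
while not pipeline.is_complete():
    time.sleep(1)
pipeline.stop(force=True)

# New in this release: modify() patches only the attributes passed;
# everything else is left unchanged. The pipeline must be STOPPED.
pipeline.modify(description="tweaked after the first run")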
feldera/runtime_config.py CHANGED
@@ -80,6 +80,7 @@ class RuntimeConfig:
         resources: Optional[Resources] = None,
         fault_tolerance_model: Optional[FaultToleranceModel] = None,
         checkpoint_interval_secs: Optional[int] = None,
+        dev_tweaks: Optional[dict] = None,
     ):
         self.workers = workers
         self.tracing = tracing
@@ -103,6 +104,7 @@ class RuntimeConfig:
             self.storage = storage.__dict__
         else:
             raise ValueError(f"Unknown value '{storage}' for storage")
+        self.dev_tweaks = dev_tweaks
 
     @staticmethod
     def default() -> "RuntimeConfig":
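
Usage note (not part of the diff): RuntimeConfig now accepts an optional dev_tweaks dict that is stored on the config. The only key exercised by this release's own code is "backfill_avoidance" (used by testutils.build_pipeline below); treat any other keys as assumptions about what the backend accepts. A minimal sketch:

from feldera.runtime_config import RuntimeConfig

# dev_tweaks is an opaque dict forwarded with the runtime configuration.
# "backfill_avoidance" is the only key visible in this diff.
config = RuntimeConfig(
    provisioning_timeout_secs=60,
    dev_tweaks={"backfill_avoidance": True},
)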
feldera/testutils.py CHANGED
@@ -5,12 +5,13 @@ import re
 import time
 import json
 import unittest
-from typing import cast
+from typing import List, Optional, cast
+from datetime import datetime
 
 from feldera.enums import CompilationProfile
 from feldera.pipeline import Pipeline
 from feldera.pipeline_builder import PipelineBuilder
-from feldera.runtime_config import RuntimeConfig
+from feldera.runtime_config import Resources, RuntimeConfig
 from feldera.rest import FelderaClient
 
 API_KEY = os.environ.get("FELDERA_API_KEY")
@@ -56,6 +57,87 @@ class _LazyClient:
 TEST_CLIENT = cast(FelderaClient, _LazyClient())
 
 
+# SQL index definition.
+class IndexSpec:
+    def __init__(self, name: str, columns: List[str]):
+        self.name = name
+        self.columns = columns
+
+    def __repr__(self):
+        return f"IndexSpec(name={self.name!r},columns={self.columns!r})"
+
+
+class ViewSpec:
+    """
+    SQL view definition consisting of a query that can run in Feldera or
+    datafusion, optional connector spec and aux SQL statements, e.g., indexes
+    and lateness clauses following view definition.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        query: str,
+        indexes: List[IndexSpec] = [],
+        connectors: Optional[str] = None,
+        aux: Optional[str] = None,
+        expected_hash: Optional[str] = None,
+    ):
+        if not isinstance(query, str):
+            raise TypeError("query must be a string")
+        self.name = name
+        self.query = query
+        self.connectors = connectors
+        self.indexes = indexes
+        self.aux = aux
+        self.expected_hash = expected_hash
+
+    def __repr__(self):
+        return f"ViewSpec(name={self.name!r}, query={self.query!r}, indexes={self.indexes!r}, connectors={self.connectors!r}, aux={self.aux!r}, expected_hash={self.expected_hash!r})"
+
+    def clone(self):
+        return ViewSpec(
+            self.name,
+            self.query,
+            self.indexes,
+            self.connectors,
+            self.aux,
+            self.expected_hash,
+        )
+
+    def clone_with_name(self, name: str):
+        return ViewSpec(name, self.query, self.indexes, self.connectors, self.aux)
+
+    def sql(self) -> str:
+        sql = ""
+
+        if self.connectors:
+            with_clause = f"\nwith('connectors' = '{self.connectors}')\n"
+        else:
+            with_clause = ""
+
+        sql += (
+            f"create materialized view {self.name}{with_clause} as\n{self.query};\n\n"
+        )
+
+        for index in self.indexes:
+            columns = ",".join(index.columns)
+            sql += f"create index {index.name} on {self.name}({columns});\n"
+
+        if self.aux:
+            sql += f"{self.aux}\n"
+
+        sql += "\n"
+
+        return sql
+
+
+def log(*args, **kwargs):
+    """Print like built-in print(), but prefix each line with current time."""
+    prefix = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
+    print(prefix, *args, **kwargs)
+
+
 def unique_pipeline_name(base_name: str) -> str:
     """
     In CI, multiple tests of different runs can run against the same Feldera instance, we
@@ -88,107 +170,203 @@ def datafusionize(query: str) -> str:
     return result
 
 
-def validate_view(
-    pipeline: Pipeline, view_name: str, view_query: str | tuple[str, str]
-):
-    print(f"Validating view '{view_name}'")
+def validate_view(pipeline: Pipeline, view: ViewSpec):
+    log(f"Validating view '{view.name}'")
 
     # We have two modes to verify the view, either we run the same SQL as the view against datafusion
     # by `datafusionizing` the query, or a weaker form where we pass a hash of what the result
     # should look like and check that the hash hasn't changed
-    if isinstance(view_query, tuple):
-        _view_definition, original_hash = view_query
-        view_query = f"select * from {view_name}"
+    if view.expected_hash:
+        view_query = f"select * from {view.name}"
         computed_hash = pipeline.query_hash(view_query)
-        if computed_hash != original_hash:
+        if computed_hash != view.expected_hash:
             raise AssertionError(
-                f"View {view_name} hash {computed_hash} was but expected hash {original_hash}"
+                f"View {view.name} hash {computed_hash} was but expected hash {view.expected_hash}"
             )
     else:
         # TODO: count records
-        view_query = datafusionize(view_query)
+        view_query = datafusionize(view.query)
         try:
             extra_rows = list(
-                pipeline.query(f"(select * from {view_name}) except ({view_query})")
+                pipeline.query(f"(select * from {view.name}) except ({view_query})")
             )
             missing_rows = list(
-                pipeline.query(f"({view_query}) except (select * from {view_name})")
+                pipeline.query(f"({view_query}) except (select * from {view.name})")
            )
 
             if extra_rows:
-                print(
-                    "Extra rows in Feldera output, but not in the ad hoc query output"
-                )
-                print(json.dumps(extra_rows))
+                log("Extra rows in Feldera output, but not in the ad hoc query output")
+                log(json.dumps(extra_rows))
 
             if missing_rows:
-                print(
-                    "Extra rows in the ad hoc query output, but not in Feldera output"
-                )
-                print(json.dumps(missing_rows))
+                log("Extra rows in the ad hoc query output, but not in Feldera output")
+                log(json.dumps(missing_rows))
         except Exception as e:
-            print(f"Error querying view '{view_name}': {e}")
-            print(f"Ad-hoc Query: {view_query}")
+            log(f"Error querying view '{view.name}': {e}")
+            log(f"Ad-hoc Query: {view_query}")
             raise
 
         if extra_rows or missing_rows:
-            raise AssertionError(f"Validation failed for view {view_name}")
-
-
-def run_workload(pipeline_name: str, tables: dict, views: dict):
-    """
-    Helper to run a pipeline to completion and validate the views afterwards using ad-hoc queries.
+            raise AssertionError(f"Validation failed for view {view.name}")
 
-    Use this for large-scale workload and standard benchmarks (like TPC-H etc.) where you plan to
-    ingest a lot of data and validate the results. For testing more specific functionality, see
-    frameworks in the `tests` directory.
-    """
 
+def generate_program(tables: dict, views: List[ViewSpec]) -> str:
     sql = ""
+
     for table_sql in tables.values():
         sql += f"{table_sql}\n"
 
-    for view_name, view in views.items():
-        if isinstance(view, tuple):
-            view_query, _hash = view
-            sql += f"create materialized view {view_name} as {view_query};\n\n"
-        else:
-            sql += f"create materialized view {view_name} as {view};\n\n"
+    for view in views:
+        sql += view.sql()
+
+    return sql
+
+
+def build_pipeline(
+    pipeline_name: str,
+    tables: dict,
+    views: List[ViewSpec],
+    resources: Optional[Resources] = None,
+) -> Pipeline:
+    sql = generate_program(tables, views)
 
     pipeline = PipelineBuilder(
         TEST_CLIENT,
-        unique_pipeline_name(pipeline_name),
+        pipeline_name,
         sql=sql,
         compilation_profile=CompilationProfile.OPTIMIZED,
-        runtime_config=RuntimeConfig(provisioning_timeout_secs=60),
+        runtime_config=RuntimeConfig(
+            provisioning_timeout_secs=60,
+            dev_tweaks={"backfill_avoidance": True},
+            resources=resources,
+        ),
     ).create_or_replace()
 
+    return pipeline
+
+
+def validate_outputs(pipeline: Pipeline, tables: dict, views: List[ViewSpec]):
+    for table in tables.keys():
+        row_count = list(pipeline.query(f"select count(*) from {table}"))
+        log(f"Table '{table}' count(*):\n{row_count}")
+
+    for view in views:
+        validate_view(pipeline, view)
+
+
+def check_end_of_input(pipeline: Pipeline) -> bool:
+    return all(
+        input_endpoint.metrics.end_of_input
+        for input_endpoint in pipeline.stats().inputs
+    )
+
+
+def wait_end_of_input(pipeline: Pipeline, timeout_s: Optional[int] = None):
+    start_time = time.monotonic()
+    while not check_end_of_input(pipeline):
+        if timeout_s is not None and time.monotonic() - start_time > timeout_s:
+            raise TimeoutError("Timeout waiting for end of input")
+        time.sleep(3)
+
+
+def transaction(pipeline: Pipeline, duration_seconds: int):
+    """Run a transaction for a specified duration."""
+
+    log(f"Running transaction for {duration_seconds} seconds")
+    pipeline.start_transaction()
+    time.sleep(duration_seconds)
+    log("Committing transaction")
+    commit_start = time.monotonic()
+    pipeline.commit_transaction()
+    log(f"Transaction committed in {time.monotonic() - commit_start} seconds")
+
+
+def checkpoint_pipeline(pipeline: Pipeline):
+    """Create a checkpoint and wait for it to complete."""
+
+    log("Creating checkpoint")
+    checkpoint_start = time.monotonic()
+    pipeline.checkpoint(wait=True)
+    log(f"Checkpoint complete in {time.monotonic() - checkpoint_start} seconds")
+
+
+def check_for_endpoint_errors(pipeline: Pipeline):
+    """Check for errors on all input and output endpoints."""
+
+    for input_endpoint_status in pipeline.stats().inputs:
+        input_endpoint_status.metrics
+        if input_endpoint_status.metrics.num_transport_errors > 0:
+            raise RuntimeError(
+                f"Transport errors detected on input endpoint: {input_endpoint_status.endpoint_name}"
+            )
+        if input_endpoint_status.metrics.num_parse_errors > 0:
+            raise RuntimeError(
+                f"Parse errors on input endpoint: {input_endpoint_status.endpoint_name}"
            )
+        log(f"  Input endpoint {input_endpoint_status.endpoint_name} OK")
+
+    for output_endpoint_status in pipeline.stats().outputs:
+        output_endpoint_status.metrics
+        if output_endpoint_status.metrics.num_transport_errors > 0:
+            raise RuntimeError(
+                f"Transport errors detected on output endpoint: {output_endpoint_status.endpoint_name}"
+            )
+        if output_endpoint_status.metrics.num_encode_errors > 0:
+            raise RuntimeError(
+                f"Encode errors on output endpoint: {output_endpoint_status.endpoint_name}"
            )
+        log(f"  Output endpoint {output_endpoint_status.endpoint_name} OK")
+
+
+def number_of_processed_records(pipeline: Pipeline) -> int:
+    """Get the total_processed_records metric."""
+
+    return pipeline.stats().global_metrics.total_processed_records
+
+
+def run_workload(
+    pipeline_name: str, tables: dict, views: List[ViewSpec], transaction: bool = True
+):
+    """
+    Helper to run a pipeline to completion and validate the views afterwards using ad-hoc queries.
+
+    Use this for large-scale workload and standard benchmarks (like TPC-H etc.) where you plan to
+    ingest a lot of data and validate the results. For testing more specific functionality, see
+    frameworks in the `tests` directory.
+    """
+
+    pipeline = build_pipeline(pipeline_name, tables, views)
+
     pipeline.start()
     start_time = time.monotonic()
 
-    try:
-        pipeline.start_transaction()
-    except Exception as e:
-        print(f"Error starting transaction: {e}")
+    if transaction:
+        try:
+            pipeline.start_transaction()
+        except Exception as e:
+            log(f"Error starting transaction: {e}")
+
+    if transaction:
+        wait_end_of_input(pipeline, timeout_s=3600)
+    else:
+        pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
 
-    pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
     elapsed = time.monotonic() - start_time
-    print(f"Data ingested in {elapsed}")
+    log(f"Data ingested in {elapsed}")
 
-    try:
+    if transaction:
         start_time = time.monotonic()
-        pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
-    except Exception as e:
-        print(f"Error committing transaction: {e}")
-    finally:
-        elapsed = time.monotonic() - start_time
-        print(f"Commit took {elapsed}")
+        try:
+            pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
+            log(f"Commit took {time.monotonic() - start_time}")
+        except Exception as e:
+            log(f"Error committing transaction: {e}")
 
-    for table in tables.keys():
-        row_count = list(pipeline.query(f"select count(*) from {table}"))
-        print(f"Table '{table}' count(*):\n{row_count}")
+    log("Waiting for outputs to flush")
+    start_time = time.monotonic()
+    pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
+    log(f"Flushing outputs took {time.monotonic() - start_time}")
 
-    for view_name, view_query in views.items():
-        validate_view(pipeline, view_name, view_query)
+    validate_outputs(pipeline, tables, views)
 
     pipeline.stop(force=True)
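
Usage note (not part of the diff): testutils.py replaces the old view-name-to-query dict convention with explicit ViewSpec/IndexSpec objects and factors run_workload into reusable helpers (build_pipeline, wait_end_of_input, validate_outputs, check_for_endpoint_errors, and so on). A sketch of the new calling convention follows; the table DDL, view query, and names are placeholders, not taken from the package.

from feldera.testutils import IndexSpec, ViewSpec, run_workload, unique_pipeline_name

# Placeholder schema and query; real workloads would attach input connectors
# to the tables (or to the views via ViewSpec.connectors).
tables = {
    "orders": "create table orders (id int, amount decimal(10, 2));",
}
views = [
    ViewSpec(
        name="order_totals",
        query="select id, sum(amount) as total from orders group by id",
        indexes=[IndexSpec("order_totals_idx", ["id"])],
    ),
]

# With transaction=True (the default), run_workload ingests inside a transaction,
# waits for all inputs to reach end-of-input, commits, waits for outputs to flush,
# and then validates each view against a datafusion-ized ad hoc query.
run_workload(unique_pipeline_name("example_workload"), tables, views)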
feldera-0.136.0.dist-info/METADATA → feldera-0.138.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: feldera
-Version: 0.136.0
+Version: 0.138.0
 Summary: The feldera python client
 Author-email: Feldera Team <dev@feldera.com>
 License: MIT
feldera-0.136.0.dist-info/RECORD → feldera-0.138.0.dist-info/RECORD CHANGED
@@ -3,11 +3,11 @@ feldera/_callback_runner.py,sha256=v3PD2DcT190ObYWoZDtWfS2zF9KU63gVKpguvAAEtJk,4
 feldera/_helpers.py,sha256=TQnDQW19fpljD19ppd5dASy1gUC4y8GNnnJjXxbaUmM,3019
 feldera/enums.py,sha256=MTHBojVANsdRnjbrzCyIOniDIUaH8nTYRfxB7QvajEE,9570
 feldera/output_handler.py,sha256=64J3ljhOaKIhxdjOKYi-BUz_HnMwROfmN8eE-btYygU,1930
-feldera/pipeline.py,sha256=P2yRzAxzxSCjiQpy8aVF9KBLKLNrsXhJQP35OHmZYag,42164
+feldera/pipeline.py,sha256=KIAdKzh0Mol5NTn3KzePoELd9lHsPOU-YJCV0xPSmXo,43788
 feldera/pipeline_builder.py,sha256=a750hp5SgTmlyrobTHFh1fTaK9Ed4A5qnXaYRctRM-8,4250
-feldera/runtime_config.py,sha256=MuYJPd5G_hnu_eDz4ge4BfYvSBSOvOEtv4NYh5sEwqU,4452
+feldera/runtime_config.py,sha256=DcJ44EN6Dt1X1wW-1kUvFbSkIDKDLi4-GqaDdUzPtTQ,4532
 feldera/stats.py,sha256=1qDlWhI-ORx3FktxH3b93mXWwtCOb4XuP0iJePHJTrE,5030
-feldera/testutils.py,sha256=4rDn1DfquV_Q4c0wNgV1RPXL6WGd_NZeHvL2WGs_kK4,6608
+feldera/testutils.py,sha256=yfQYhI1LglmsBsfsghFI36EUa015cuH1izjPsvVuNiQ,12172
 feldera/rest/__init__.py,sha256=Eg-EKUU3RSTDcdxTR_7wNDnCly8VpXEzsZCQUmf-y2M,308
 feldera/rest/_helpers.py,sha256=q7jWInKp9IiIli8N5o31lDG3hNUbcsJqufZXYHG04ps,222
 feldera/rest/_httprequests.py,sha256=-jYIt7fTnZf1CNqAsWvU0XVZt4exsLTOKqf9PXLrAKU,8117
@@ -19,7 +19,7 @@ feldera/rest/pipeline.py,sha256=Rmbflbwjvd86iZ5aSJ5b_bTSs6vgvEKQFwMZDtm0nxE,2835
 feldera/rest/sql_table.py,sha256=qrw-YwMzx5T81zDefNO1KOx7EyypFz1vPwGBzSUB7kc,652
 feldera/rest/sql_view.py,sha256=hN12mPM0mvwLCIPYywpb12s9Hd2Ws31IlTMXPriMisw,644
 feldera/tests/test_datafusionize.py,sha256=NGriTaTWf_WnXFud1wmpFwLFa_-XGjfCh6La3dWc3QA,1337
-feldera-0.136.0.dist-info/METADATA,sha256=Gmkq19v6uTYQlfsQh_wHMn3zFxZSRWt-yjdr01nhSFY,2368
-feldera-0.136.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-feldera-0.136.0.dist-info/top_level.txt,sha256=fB6yTqrQiO6RCbY1xP2T_mpPoTjDFtJvkJJodiee7d0,8
-feldera-0.136.0.dist-info/RECORD,,
+feldera-0.138.0.dist-info/METADATA,sha256=W-i6CyFsXvpXl6PZK9Tx6epbZ4a9XUhFvmDy6tetFmw,2368
+feldera-0.138.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+feldera-0.138.0.dist-info/top_level.txt,sha256=fB6yTqrQiO6RCbY1xP2T_mpPoTjDFtJvkJJodiee7d0,8
+feldera-0.138.0.dist-info/RECORD,,