feldera 0.137.0__tar.gz → 0.139.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of feldera might be problematic.
- {feldera-0.137.0 → feldera-0.139.0}/PKG-INFO +1 -1
- {feldera-0.137.0 → feldera-0.139.0}/feldera/enums.py +27 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/pipeline.py +88 -11
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/feldera_client.py +1 -1
- {feldera-0.137.0 → feldera-0.139.0}/feldera/runtime_config.py +2 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/stats.py +4 -1
- feldera-0.139.0/feldera/testutils.py +372 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera.egg-info/PKG-INFO +1 -1
- {feldera-0.137.0 → feldera-0.139.0}/pyproject.toml +1 -1
- feldera-0.137.0/feldera/testutils.py +0 -194
- {feldera-0.137.0 → feldera-0.139.0}/README.md +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/__init__.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/_callback_runner.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/_helpers.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/output_handler.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/pipeline_builder.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/__init__.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/_helpers.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/_httprequests.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/config.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/errors.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/feldera_config.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/pipeline.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/sql_table.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/rest/sql_view.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera/tests/test_datafusionize.py +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera.egg-info/SOURCES.txt +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera.egg-info/dependency_links.txt +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera.egg-info/requires.txt +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/feldera.egg-info/top_level.txt +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/setup.cfg +0 -0
- {feldera-0.137.0 → feldera-0.139.0}/tests/test_uda.py +0 -0
feldera/enums.py:

@@ -170,6 +170,33 @@ class PipelineStatus(Enum):
         return self.value == other.value


+class TransactionStatus(Enum):
+    """
+    Represents the transaction handling status of a pipeline.
+    """
+
+    NoTransaction = 1
+    """There is currently no active transaction."""
+
+    TransactionInProgress = 2
+    """There is an active transaction in progress."""
+
+    CommitInProgress = 3
+    """A commit is currently in progress."""
+
+    @staticmethod
+    def from_str(value):
+        for member in TransactionStatus:
+            if member.name.lower() == value.lower():
+                return member
+        raise ValueError(
+            f"Unknown value '{value}' for enum {TransactionStatus.__name__}"
+        )
+
+    def __eq__(self, other):
+        return self.value == other.value
+
+
 class ProgramStatus(Enum):
     Pending = 1
     CompilingSql = 2
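The new TransactionStatus enum is what the transaction-related Pipeline methods added below report. A minimal usage sketch, assuming only the enum shown in this hunk:

    from feldera.enums import TransactionStatus

    # from_str matches member names case-insensitively.
    status = TransactionStatus.from_str("transactioninprogress")
    assert status == TransactionStatus.TransactionInProgress

    # Unrecognized strings raise ValueError.
    try:
        TransactionStatus.from_str("bogus")
    except ValueError as err:
        print(err)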
feldera/pipeline.py:

@@ -11,7 +11,12 @@ from collections import deque
 from queue import Queue

 from feldera.rest.errors import FelderaAPIError
-from feldera.enums import PipelineStatus, ProgramStatus, CheckpointStatus
+from feldera.enums import (
+    PipelineStatus,
+    ProgramStatus,
+    CheckpointStatus,
+    TransactionStatus,
+)
 from feldera.enums import StorageStatus
 from feldera.rest.pipeline import Pipeline as InnerPipeline
 from feldera.rest.feldera_client import FelderaClient
@@ -326,7 +331,7 @@ class Pipeline:
                 f" time {elapsed}s, timeout: {timeout_s}s"
             )

-            pipeline_complete: bool = self.stats().global_metrics.pipeline_complete
+            pipeline_complete: bool = self.is_complete()
             if pipeline_complete is None:
                 raise RuntimeError(
                     "received unknown metrics from the pipeline, pipeline_complete is None"
@@ -339,6 +344,19 @@
         if force_stop:
             self.stop(force=True)

+    def is_complete(self) -> bool:
+        """
+        Check if the pipeline has completed processing all input records.
+
+        Returns True if (1) all input connectors attached to the
+        pipeline have finished reading their input data sources and issued
+        end-of-input notifications to the pipeline, and (2) all inputs received
+        from these connectors have been fully processed and corresponding
+        outputs have been sent out through the output connectors.
+        """
+
+        return self.stats().global_metrics.pipeline_complete
+
     def start(self, wait: bool = True, timeout_s: Optional[float] = None):
         """
         .. _start:
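is_complete() exposes the pipeline_complete metric that wait_for_completion() already polls internally. A hand-rolled polling loop might look like this (a running feldera.pipeline.Pipeline instance named pipeline is assumed):

    import time

    # Poll until every input connector reached end-of-input and all
    # corresponding outputs have been flushed.
    while not pipeline.is_complete():
        time.sleep(1)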
@@ -546,15 +564,17 @@ metrics"""

         self.client.start_pipeline(self.name, wait=wait, timeout_s=timeout_s)

-    def start_transaction(self):
+    def start_transaction(self) -> int:
         """
         Start a new transaction.

-
-
+        :return: Transaction ID.
+
+        :raises FelderaAPIError: If the pipeline fails to start a transaction, e.g., if the pipeline is not running or
+            there is already an active transaction.
         """

-        self.client.start_transaction(self.name)
+        return self.client.start_transaction(self.name)

     def commit_transaction(
         self,
@@ -563,7 +583,7 @@ metrics"""
         timeout_s: Optional[float] = None,
     ):
         """
-
+        Commit the currently active transaction.

         :param transaction_id: If provided, the function verifies that the currently active transaction matches this ID.
             If the active transaction ID does not match, the function raises an error.
@@ -577,11 +597,36 @@ metrics"""
         :raises RuntimeError: If there is currently no transaction in progress.
         :raises ValueError: If the provided `transaction_id` does not match the current transaction.
         :raises TimeoutError: If the transaction does not commit within the specified timeout (when `wait` is True).
-        :raises FelderaAPIError: If the pipeline fails to
+        :raises FelderaAPIError: If the pipeline fails to commit a transaction.
         """

         self.client.commit_transaction(self.name, transaction_id, wait, timeout_s)

+    def transaction_status(self) -> TransactionStatus:
+        """
+        Get pipeline's transaction handling status.
+
+        :return: Current transaction handling status of the pipeline.
+
+        :raises FelderaAPIError: If pipeline's status couldn't be read, e.g., because the pipeline is not currently running.
+        """
+
+        return self.stats().global_metrics.transaction_status
+
+    def transaction_id(self) -> Optional[int]:
+        """
+        Gets the ID of the currently active transaction or None if there is no active transaction.
+
+        :return: The ID of the transaction.
+        """
+
+        transaction_id = self.stats().global_metrics.transaction_id
+
+        if transaction_id == 0:
+            return None
+        else:
+            return transaction_id
+
     def delete(self, clear_storage: bool = False):
         """
         Deletes the pipeline.
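Taken together, start_transaction(), commit_transaction(), transaction_status(), and transaction_id() let a client drive an explicit transaction from Python. A sketch, assuming pipeline is a running Pipeline on a deployment that supports transactions (the post-commit status check is an assumption):

    from feldera.enums import TransactionStatus

    txn_id = pipeline.start_transaction()
    assert pipeline.transaction_id() == txn_id
    assert pipeline.transaction_status() == TransactionStatus.TransactionInProgress

    # ... feed input data to the pipeline here ...

    pipeline.commit_transaction(transaction_id=txn_id, wait=True, timeout_s=600)
    assert pipeline.transaction_status() == TransactionStatus.NoTransaction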
@@ -625,6 +670,8 @@ metrics"""
         :param timeout_s: The maximum time (in seconds) to wait for the
             checkpoint to complete.

+        :return: The checkpoint sequence number.
+
         :raises FelderaAPIError: If enterprise features are not enabled.
         """

@@ -647,9 +694,7 @@ pipeline '{self.name}' to make checkpoint '{seq}'"""
                 time.sleep(0.1)
                 continue

-            return
-
-        return seq
+            return seq

     def checkpoint_status(self, seq: int) -> CheckpointStatus:
         """
@@ -889,6 +934,38 @@ pipeline '{self.name}' to sync checkpoint '{uuid}'"""
         self.refresh()
         return self._inner.program_code

+    def modify(
+        self,
+        sql: Optional[str] = None,
+        udf_rust: Optional[str] = None,
+        udf_toml: Optional[str] = None,
+        program_config: Optional[Mapping[str, Any]] = None,
+        runtime_config: Optional[Mapping[str, Any]] = None,
+        description: Optional[str] = None,
+    ):
+        """
+        Modify the pipeline.
+
+        Modify the values of pipeline attributes: SQL code, UDF Rust code,
+        UDF Rust dependencies (TOML), program config, runtime config, and
+        description. Only the provided attributes will be modified. Other
+        attributes will remain unchanged.
+
+        The pipeline must be in the STOPPED state to be modified.
+
+        :raises FelderaAPIError: If the pipeline is not in a STOPPED state.
+        """
+
+        self.client.patch_pipeline(
+            name=self._inner.name,
+            sql=sql,
+            udf_rust=udf_rust,
+            udf_toml=udf_toml,
+            program_config=program_config,
+            runtime_config=runtime_config,
+            description=description,
+        )
+
     def storage_status(self) -> StorageStatus:
         """
         Return the storage status of the pipeline.
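modify() forwards to the client's patch_pipeline endpoint and only touches the attributes that are passed. A sketch, assuming pipeline is an existing, stopped Pipeline (the SQL text is purely illustrative):

    pipeline.stop(force=True)
    pipeline.modify(
        sql="CREATE TABLE t (x INT);\nCREATE MATERIALIZED VIEW v AS SELECT COUNT(*) AS n FROM t;",
        description="updated program",
    )
    pipeline.start()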
feldera/rest/feldera_client.py:

@@ -565,7 +565,7 @@ Reason: The pipeline is in a STOPPED state due to the following error:
         :raises RuntimeError: If there is currently no transaction in progress.
         :raises ValueError: If the provided `transaction_id` does not match the current transaction.
         :raises TimeoutError: If the transaction does not commit within the specified timeout (when `wait` is True).
-        :raises FelderaAPIError: If the pipeline fails to
+        :raises FelderaAPIError: If the pipeline fails to commit a transaction.
         """

         # TODO: implement this without using /stats when we have a better pipeline status reporting API.
feldera/runtime_config.py:

@@ -80,6 +80,7 @@ class RuntimeConfig:
         resources: Optional[Resources] = None,
         fault_tolerance_model: Optional[FaultToleranceModel] = None,
         checkpoint_interval_secs: Optional[int] = None,
+        dev_tweaks: Optional[dict] = None,
     ):
         self.workers = workers
         self.tracing = tracing

@@ -103,6 +104,7 @@ class RuntimeConfig:
             self.storage = storage.__dict__
         else:
             raise ValueError(f"Unknown value '{storage}' for storage")
+        self.dev_tweaks = dev_tweaks

     @staticmethod
     def default() -> "RuntimeConfig":
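dev_tweaks is an opaque dict forwarded with the runtime configuration; the only key visible in this diff is "backfill_avoidance" (used by the new testutils.py below). A construction sketch, assuming the other keyword arguments behave as they did in 0.137.0:

    from feldera.runtime_config import RuntimeConfig

    config = RuntimeConfig(
        workers=8,
        provisioning_timeout_secs=60,
        dev_tweaks={"backfill_avoidance": True},
    )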
feldera/stats.py:

@@ -1,5 +1,5 @@
 from typing import Mapping, Any, Optional, List
-from feldera.enums import PipelineStatus
+from feldera.enums import PipelineStatus, TransactionStatus
 from datetime import datetime
 import uuid


@@ -55,6 +55,8 @@ class GlobalPipelineMetrics:
         self.total_processed_records: Optional[int] = None
         self.total_completed_records: Optional[int] = None
         self.pipeline_complete: Optional[bool] = None
+        self.transaction_status: Optional[TransactionStatus] = None
+        self.transaction_id: Optional[int] = None

     @classmethod
     def from_dict(cls, d: Mapping[str, Any]):

@@ -63,6 +65,7 @@ class GlobalPipelineMetrics:
         metrics.state = PipelineStatus.from_str(d["state"])
         metrics.incarnation_uuid = uuid.UUID(d["incarnation_uuid"])
         metrics.start_time = datetime.fromtimestamp(d["start_time"])
+        metrics.transaction_status = TransactionStatus.from_str(d["transaction_status"])
         return metrics
feldera-0.139.0/feldera/testutils.py (new file):

@@ -0,0 +1,372 @@
+"Utility functions for writing tests against a Feldera instance."
+
+import os
+import re
+import time
+import json
+import unittest
+from typing import List, Optional, cast
+from datetime import datetime
+
+from feldera.enums import CompilationProfile
+from feldera.pipeline import Pipeline
+from feldera.pipeline_builder import PipelineBuilder
+from feldera.runtime_config import Resources, RuntimeConfig
+from feldera.rest import FelderaClient
+
+API_KEY = os.environ.get("FELDERA_API_KEY")
+BASE_URL = (
+    os.environ.get("FELDERA_HOST")
+    or os.environ.get("FELDERA_BASE_URL")
+    or "http://localhost:8080"
+)
+KAFKA_SERVER = os.environ.get("FELDERA_KAFKA_SERVER", "localhost:19092")
+PIPELINE_TO_KAFKA_SERVER = os.environ.get(
+    "FELDERA_PIPELINE_TO_KAFKA_SERVER", "redpanda:9092"
+)
+FELDERA_TLS_INSECURE = True if os.environ.get("FELDERA_TLS_INSECURE") else False
+FELDERA_HTTPS_TLS_CERT = os.environ.get("FELDERA_HTTPS_TLS_CERT")
+if not FELDERA_TLS_INSECURE and FELDERA_HTTPS_TLS_CERT is not None:
+    FELDERA_REQUESTS_VERIFY = FELDERA_HTTPS_TLS_CERT
+else:
+    FELDERA_REQUESTS_VERIFY = not FELDERA_TLS_INSECURE
+
+
+class _LazyClient:
+    "Construct the FelderaClient only when accessed as opposed to when imported."
+
+    __slots__ = ("_client",)
+
+    def __init__(self):
+        self._client = None
+
+    def _ensure(self):
+        if self._client is None:
+            self._client = FelderaClient(
+                connection_timeout=10,
+            )
+        return self._client
+
+    def __getattr__(self, name):
+        return getattr(self._ensure(), name)
+
+    def __call__(self, *a, **kw) -> FelderaClient:
+        return self._ensure()
+
+
+TEST_CLIENT = cast(FelderaClient, _LazyClient())
+
+
+# SQL index definition.
+class IndexSpec:
+    def __init__(self, name: str, columns: List[str]):
+        self.name = name
+        self.columns = columns
+
+    def __repr__(self):
+        return f"IndexSpec(name={self.name!r},columns={self.columns!r})"
+
+
+class ViewSpec:
+    """
+    SQL view definition consisting of a query that can run in Feldera or
+    datafusion, optional connector spec and aux SQL statements, e.g., indexes
+    and lateness clauses following view definition.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        query: str,
+        indexes: List[IndexSpec] = [],
+        connectors: Optional[str] = None,
+        aux: Optional[str] = None,
+        expected_hash: Optional[str] = None,
+    ):
+        if not isinstance(query, str):
+            raise TypeError("query must be a string")
+        self.name = name
+        self.query = query
+        self.connectors = connectors
+        self.indexes = indexes
+        self.aux = aux
+        self.expected_hash = expected_hash
+
+    def __repr__(self):
+        return f"ViewSpec(name={self.name!r}, query={self.query!r}, indexes={self.indexes!r}, connectors={self.connectors!r}, aux={self.aux!r}, expected_hash={self.expected_hash!r})"
+
+    def clone(self):
+        return ViewSpec(
+            self.name,
+            self.query,
+            self.indexes,
+            self.connectors,
+            self.aux,
+            self.expected_hash,
+        )
+
+    def clone_with_name(self, name: str):
+        return ViewSpec(name, self.query, self.indexes, self.connectors, self.aux)
+
+    def sql(self) -> str:
+        sql = ""
+
+        if self.connectors:
+            with_clause = f"\nwith('connectors' = '{self.connectors}')\n"
+        else:
+            with_clause = ""
+
+        sql += (
+            f"create materialized view {self.name}{with_clause} as\n{self.query};\n\n"
+        )
+
+        for index in self.indexes:
+            columns = ",".join(index.columns)
+            sql += f"create index {index.name} on {self.name}({columns});\n"
+
+        if self.aux:
+            sql += f"{self.aux}\n"
+
+        sql += "\n"
+
+        return sql
+
+
+def log(*args, **kwargs):
+    """Print like built-in print(), but prefix each line with current time."""
+    prefix = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
+    print(prefix, *args, **kwargs)
+
+
+def unique_pipeline_name(base_name: str) -> str:
+    """
+    In CI, multiple tests of different runs can run against the same Feldera instance, we
+    make sure the pipeline names they use are unique by appending the first 5 characters
+    of the commit SHA or 'local' if not in CI.
+    """
+    ci_tag = os.getenv("GITHUB_SHA", "local")[:5]
+    return f"{ci_tag}_{base_name}"
+
+
+def enterprise_only(fn):
+    fn._enterprise_only = True
+    return unittest.skipUnless(
+        TEST_CLIENT.get_config().edition.is_enterprise(),
+        f"{fn.__name__} is enterprise only, skipping",
+    )(fn)
+
+
+def datafusionize(query: str) -> str:
+    sort_array_pattern = re.compile(re.escape("SORT_ARRAY"), re.IGNORECASE)
+    truncate_pattern = re.compile(re.escape("TRUNCATE"), re.IGNORECASE)
+    timestamp_trunc_pattern = re.compile(
+        r"TIMESTAMP_TRUNC\s*\(\s*MAKE_TIMESTAMP\s*\(\s*([^)]+)\s*\)\s*,\s*([A-Z]+)\s*\)",
+        re.IGNORECASE,
+    )
+
+    result = sort_array_pattern.sub("array_sort", query)
+    result = truncate_pattern.sub("trunc", result)
+    result = timestamp_trunc_pattern.sub(r"DATE_TRUNC('\2', TO_TIMESTAMP(\1))", result)
+    return result
+
+
+def validate_view(pipeline: Pipeline, view: ViewSpec):
+    log(f"Validating view '{view.name}'")
+
+    # We have two modes to verify the view, either we run the same SQL as the view against datafusion
+    # by `datafusionizing` the query, or a weaker form where we pass a hash of what the result
+    # should look like and check that the hash hasn't changed
+    if view.expected_hash:
+        view_query = f"select * from {view.name}"
+        computed_hash = pipeline.query_hash(view_query)
+        if computed_hash != view.expected_hash:
+            raise AssertionError(
+                f"View {view.name} hash {computed_hash} was but expected hash {view.expected_hash}"
+            )
+    else:
+        # TODO: count records
+        view_query = datafusionize(view.query)
+        try:
+            extra_rows = list(
+                pipeline.query(f"(select * from {view.name}) except ({view_query})")
+            )
+            missing_rows = list(
+                pipeline.query(f"({view_query}) except (select * from {view.name})")
+            )
+
+            if extra_rows:
+                log("Extra rows in Feldera output, but not in the ad hoc query output")
+                log(json.dumps(extra_rows))
+
+            if missing_rows:
+                log("Extra rows in the ad hoc query output, but not in Feldera output")
+                log(json.dumps(missing_rows))
+        except Exception as e:
+            log(f"Error querying view '{view.name}': {e}")
+            log(f"Ad-hoc Query: {view_query}")
+            raise
+
+        if extra_rows or missing_rows:
+            raise AssertionError(f"Validation failed for view {view.name}")
+
+
+def generate_program(tables: dict, views: List[ViewSpec]) -> str:
+    sql = ""
+
+    for table_sql in tables.values():
+        sql += f"{table_sql}\n"
+
+    for view in views:
+        sql += view.sql()
+
+    return sql
+
+
+def build_pipeline(
+    pipeline_name: str,
+    tables: dict,
+    views: List[ViewSpec],
+    resources: Optional[Resources] = None,
+) -> Pipeline:
+    sql = generate_program(tables, views)
+
+    pipeline = PipelineBuilder(
+        TEST_CLIENT,
+        pipeline_name,
+        sql=sql,
+        compilation_profile=CompilationProfile.OPTIMIZED,
+        runtime_config=RuntimeConfig(
+            provisioning_timeout_secs=60,
+            dev_tweaks={"backfill_avoidance": True},
+            resources=resources,
+        ),
+    ).create_or_replace()
+
+    return pipeline
+
+
+def validate_outputs(pipeline: Pipeline, tables: dict, views: List[ViewSpec]):
+    for table in tables.keys():
+        row_count = list(pipeline.query(f"select count(*) from {table}"))
+        log(f"Table '{table}' count(*):\n{row_count}")
+
+    for view in views:
+        validate_view(pipeline, view)
+
+
+def check_end_of_input(pipeline: Pipeline) -> bool:
+    return all(
+        input_endpoint.metrics.end_of_input
+        for input_endpoint in pipeline.stats().inputs
+    )
+
+
+def wait_end_of_input(pipeline: Pipeline, timeout_s: Optional[int] = None):
+    start_time = time.monotonic()
+    while not check_end_of_input(pipeline):
+        if timeout_s is not None and time.monotonic() - start_time > timeout_s:
+            raise TimeoutError("Timeout waiting for end of input")
+        time.sleep(3)
+
+
+def transaction(pipeline: Pipeline, duration_seconds: int):
+    """Run a transaction for a specified duration."""
+
+    log(f"Running transaction for {duration_seconds} seconds")
+    pipeline.start_transaction()
+    time.sleep(duration_seconds)
+    log("Committing transaction")
+    commit_start = time.monotonic()
+    pipeline.commit_transaction()
+    log(f"Transaction committed in {time.monotonic() - commit_start} seconds")
+
+
+def checkpoint_pipeline(pipeline: Pipeline):
+    """Create a checkpoint and wait for it to complete."""
+
+    log("Creating checkpoint")
+    checkpoint_start = time.monotonic()
+    pipeline.checkpoint(wait=True)
+    log(f"Checkpoint complete in {time.monotonic() - checkpoint_start} seconds")
+
+
+def check_for_endpoint_errors(pipeline: Pipeline):
+    """Check for errors on all input and output endpoints."""
+
+    for input_endpoint_status in pipeline.stats().inputs:
+        input_endpoint_status.metrics
+        if input_endpoint_status.metrics.num_transport_errors > 0:
+            raise RuntimeError(
+                f"Transport errors detected on input endpoint: {input_endpoint_status.endpoint_name}"
+            )
+        if input_endpoint_status.metrics.num_parse_errors > 0:
+            raise RuntimeError(
+                f"Parse errors on input endpoint: {input_endpoint_status.endpoint_name}"
+            )
+        log(f" Input endpoint {input_endpoint_status.endpoint_name} OK")
+
+    for output_endpoint_status in pipeline.stats().outputs:
+        output_endpoint_status.metrics
+        if output_endpoint_status.metrics.num_transport_errors > 0:
+            raise RuntimeError(
+                f"Transport errors detected on output endpoint: {output_endpoint_status.endpoint_name}"
+            )
+        if output_endpoint_status.metrics.num_encode_errors > 0:
+            raise RuntimeError(
+                f"Encode errors on output endpoint: {output_endpoint_status.endpoint_name}"
+            )
+        log(f" Output endpoint {output_endpoint_status.endpoint_name} OK")
+
+
+def number_of_processed_records(pipeline: Pipeline) -> int:
+    """Get the total_processed_records metric."""
+
+    return pipeline.stats().global_metrics.total_processed_records
+
+
+def run_workload(
+    pipeline_name: str, tables: dict, views: List[ViewSpec], transaction: bool = True
+):
+    """
+    Helper to run a pipeline to completion and validate the views afterwards using ad-hoc queries.
+
+    Use this for large-scale workload and standard benchmarks (like TPC-H etc.) where you plan to
+    ingest a lot of data and validate the results. For testing more specific functionality, see
+    frameworks in the `tests` directory.
+    """
+
+    pipeline = build_pipeline(pipeline_name, tables, views)
+
+    pipeline.start()
+    start_time = time.monotonic()
+
+    if transaction:
+        try:
+            pipeline.start_transaction()
+        except Exception as e:
+            log(f"Error starting transaction: {e}")
+
+    if transaction:
+        wait_end_of_input(pipeline, timeout_s=3600)
+    else:
+        pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
+
+    elapsed = time.monotonic() - start_time
+    log(f"Data ingested in {elapsed}")
+
+    if transaction:
+        start_time = time.monotonic()
+        try:
+            pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
+            log(f"Commit took {time.monotonic() - start_time}")
+        except Exception as e:
+            log(f"Error committing transaction: {e}")

+    log("Waiting for outputs to flush")
+    start_time = time.monotonic()
+    pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
+    log(f"Flushing outputs took {time.monotonic() - start_time}")
+
+    validate_outputs(pipeline, tables, views)
+
+    pipeline.stop(force=True)
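The rewritten testutils module replaces the old dict-based view definitions with ViewSpec objects and splits run_workload into reusable helpers (build_pipeline, wait_end_of_input, validate_outputs, etc.). A sketch of how a test might use it; the table and view SQL below are illustrative, not taken from the package:

    from feldera.testutils import ViewSpec, run_workload, unique_pipeline_name

    tables = {
        "t": "create table t (x int) with ('materialized' = 'true');",
    }
    views = [
        ViewSpec(name="v_cnt", query="select count(*) as cnt from t"),
    ]

    run_workload(unique_pipeline_name("demo"), tables, views, transaction=False)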
feldera-0.137.0/feldera/testutils.py (removed):

@@ -1,194 +0,0 @@
-"Utility functions for writing tests against a Feldera instance."
-
-import os
-import re
-import time
-import json
-import unittest
-from typing import cast
-
-from feldera.enums import CompilationProfile
-from feldera.pipeline import Pipeline
-from feldera.pipeline_builder import PipelineBuilder
-from feldera.runtime_config import RuntimeConfig
-from feldera.rest import FelderaClient
-
-API_KEY = os.environ.get("FELDERA_API_KEY")
-BASE_URL = (
-    os.environ.get("FELDERA_HOST")
-    or os.environ.get("FELDERA_BASE_URL")
-    or "http://localhost:8080"
-)
-KAFKA_SERVER = os.environ.get("FELDERA_KAFKA_SERVER", "localhost:19092")
-PIPELINE_TO_KAFKA_SERVER = os.environ.get(
-    "FELDERA_PIPELINE_TO_KAFKA_SERVER", "redpanda:9092"
-)
-FELDERA_TLS_INSECURE = True if os.environ.get("FELDERA_TLS_INSECURE") else False
-FELDERA_HTTPS_TLS_CERT = os.environ.get("FELDERA_HTTPS_TLS_CERT")
-if not FELDERA_TLS_INSECURE and FELDERA_HTTPS_TLS_CERT is not None:
-    FELDERA_REQUESTS_VERIFY = FELDERA_HTTPS_TLS_CERT
-else:
-    FELDERA_REQUESTS_VERIFY = not FELDERA_TLS_INSECURE
-
-
-class _LazyClient:
-    "Construct the FelderaClient only when accessed as opposed to when imported."
-
-    __slots__ = ("_client",)
-
-    def __init__(self):
-        self._client = None
-
-    def _ensure(self):
-        if self._client is None:
-            self._client = FelderaClient(
-                connection_timeout=10,
-            )
-        return self._client
-
-    def __getattr__(self, name):
-        return getattr(self._ensure(), name)
-
-    def __call__(self, *a, **kw) -> FelderaClient:
-        return self._ensure()
-
-
-TEST_CLIENT = cast(FelderaClient, _LazyClient())
-
-
-def unique_pipeline_name(base_name: str) -> str:
-    """
-    In CI, multiple tests of different runs can run against the same Feldera instance, we
-    make sure the pipeline names they use are unique by appending the first 5 characters
-    of the commit SHA or 'local' if not in CI.
-    """
-    ci_tag = os.getenv("GITHUB_SHA", "local")[:5]
-    return f"{ci_tag}_{base_name}"
-
-
-def enterprise_only(fn):
-    fn._enterprise_only = True
-    return unittest.skipUnless(
-        TEST_CLIENT.get_config().edition.is_enterprise(),
-        f"{fn.__name__} is enterprise only, skipping",
-    )(fn)
-
-
-def datafusionize(query: str) -> str:
-    sort_array_pattern = re.compile(re.escape("SORT_ARRAY"), re.IGNORECASE)
-    truncate_pattern = re.compile(re.escape("TRUNCATE"), re.IGNORECASE)
-    timestamp_trunc_pattern = re.compile(
-        r"TIMESTAMP_TRUNC\s*\(\s*MAKE_TIMESTAMP\s*\(\s*([^)]+)\s*\)\s*,\s*([A-Z]+)\s*\)",
-        re.IGNORECASE,
-    )
-
-    result = sort_array_pattern.sub("array_sort", query)
-    result = truncate_pattern.sub("trunc", result)
-    result = timestamp_trunc_pattern.sub(r"DATE_TRUNC('\2', TO_TIMESTAMP(\1))", result)
-    return result
-
-
-def validate_view(
-    pipeline: Pipeline, view_name: str, view_query: str | tuple[str, str]
-):
-    print(f"Validating view '{view_name}'")
-
-    # We have two modes to verify the view, either we run the same SQL as the view against datafusion
-    # by `datafusionizing` the query, or a weaker form where we pass a hash of what the result
-    # should look like and check that the hash hasn't changed
-    if isinstance(view_query, tuple):
-        _view_definition, original_hash = view_query
-        view_query = f"select * from {view_name}"
-        computed_hash = pipeline.query_hash(view_query)
-        if computed_hash != original_hash:
-            raise AssertionError(
-                f"View {view_name} hash {computed_hash} was but expected hash {original_hash}"
-            )
-    else:
-        # TODO: count records
-        view_query = datafusionize(view_query)
-        try:
-            extra_rows = list(
-                pipeline.query(f"(select * from {view_name}) except ({view_query})")
-            )
-            missing_rows = list(
-                pipeline.query(f"({view_query}) except (select * from {view_name})")
-            )
-
-            if extra_rows:
-                print(
-                    "Extra rows in Feldera output, but not in the ad hoc query output"
-                )
-                print(json.dumps(extra_rows))
-
-            if missing_rows:
-                print(
-                    "Extra rows in the ad hoc query output, but not in Feldera output"
-                )
-                print(json.dumps(missing_rows))
-        except Exception as e:
-            print(f"Error querying view '{view_name}': {e}")
-            print(f"Ad-hoc Query: {view_query}")
-            raise
-
-        if extra_rows or missing_rows:
-            raise AssertionError(f"Validation failed for view {view_name}")
-
-
-def run_workload(pipeline_name: str, tables: dict, views: dict):
-    """
-    Helper to run a pipeline to completion and validate the views afterwards using ad-hoc queries.
-
-    Use this for large-scale workload and standard benchmarks (like TPC-H etc.) where you plan to
-    ingest a lot of data and validate the results. For testing more specific functionality, see
-    frameworks in the `tests` directory.
-    """
-
-    sql = ""
-    for table_sql in tables.values():
-        sql += f"{table_sql}\n"
-
-    for view_name, view in views.items():
-        if isinstance(view, tuple):
-            view_query, _hash = view
-            sql += f"create materialized view {view_name} as {view_query};\n\n"
-        else:
-            sql += f"create materialized view {view_name} as {view};\n\n"
-
-    pipeline = PipelineBuilder(
-        TEST_CLIENT,
-        unique_pipeline_name(pipeline_name),
-        sql=sql,
-        compilation_profile=CompilationProfile.OPTIMIZED,
-        runtime_config=RuntimeConfig(provisioning_timeout_secs=60),
-    ).create_or_replace()
-
-    pipeline.start()
-    start_time = time.monotonic()
-
-    try:
-        pipeline.start_transaction()
-    except Exception as e:
-        print(f"Error starting transaction: {e}")
-
-    pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
-    elapsed = time.monotonic() - start_time
-    print(f"Data ingested in {elapsed}")
-
-    try:
-        start_time = time.monotonic()
-        pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
-    except Exception as e:
-        print(f"Error committing transaction: {e}")
-    finally:
-        elapsed = time.monotonic() - start_time
-        print(f"Commit took {elapsed}")
-
-    for table in tables.keys():
-        row_count = list(pipeline.query(f"select count(*) from {table}"))
-        print(f"Table '{table}' count(*):\n{row_count}")
-
-    for view_name, view_query in views.items():
-        validate_view(pipeline, view_name, view_query)
-
-    pipeline.stop(force=True)
The remaining files listed above (marked +0 -0) are unchanged between 0.137.0 and 0.139.0.