feldera 0.69.0__py3-none-any.whl → 0.189.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of feldera might be problematic.

feldera/rest/pipeline.py CHANGED
@@ -47,15 +47,30 @@ class Pipeline:
         self.program_version: Optional[int] = None
         self.deployment_config: Optional[dict] = None
         self.deployment_desired_status: Optional[str] = None
+        self.deployment_desired_status_since: Optional[str] = None
+        self.deployment_id: Optional[str] = None
+        self.deployment_initial: Optional[str] = None
         self.deployment_error: Optional[dict] = None
         self.deployment_location: Optional[str] = None
-        self.program_binary_url: Optional[str] = None
         self.program_info: Optional[dict] = (
             None  # info about input & output connectors and the schema
         )
         self.program_status: Optional[str] = None
         self.program_status_since: Optional[str] = None
+        self.platform_version: Optional[str] = None
         self.program_error: Optional[dict] = None
+        self.storage_status: Optional[str] = None
+
+        self.deployment_resources_desired_status: Optional[str] = None
+        self.deployment_resources_desired_status_since: Optional[str] = None
+        self.deployment_resources_status: Optional[str] = None
+        self.deployment_resources_status_since: Optional[str] = None
+
+        self.deployment_runtime_desired_status: Optional[str] = None
+        self.deployment_runtime_desired_status_since: Optional[str] = None
+        self.deployment_runtime_status: Optional[str] = None
+        self.deployment_runtime_status_details: Optional[dict] = None
+        self.deployment_runtime_status_since: Optional[str] = None

     @classmethod
     def from_dict(cls, d: Mapping[str, Any]):
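
The deployment status surface is noticeably wider in 0.189.0: the desired-status field gains a `_since` timestamp, `program_binary_url` is gone, and separate resource-level and runtime-level status pairs appear alongside `storage_status` and `platform_version`. A minimal sketch of reading the new attributes off a `feldera.rest.pipeline.Pipeline` object; the helper below is illustrative and not part of the package, and it assumes the object was populated via `Pipeline.from_dict()` from an API response:

```python
from feldera.rest.pipeline import Pipeline

def summarize_deployment(p: Pipeline) -> str:
    # Attribute names come from the diff above; their values are whatever the
    # Feldera API returned, no enum values are assumed here.
    return (
        f"resources={p.deployment_resources_status} "
        f"(since {p.deployment_resources_status_since}), "
        f"runtime={p.deployment_runtime_status} "
        f"(since {p.deployment_runtime_status_since}), "
        f"storage={p.storage_status}, platform={p.platform_version}"
    )
```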
feldera/runtime_config.py CHANGED
@@ -1,4 +1,5 @@
 from typing import Optional, Any, Mapping
+from feldera.enums import FaultToleranceModel


 class Resources:
@@ -58,6 +59,11 @@ class Storage:
 class RuntimeConfig:
     """
     Runtime configuration class to define the configuration for a pipeline.
+    To create runtime config from a dictionary, use
+    :meth:`.RuntimeConfig.from_dict`.
+
+    Documentation:
+    https://docs.feldera.com/pipelines/configuration/#runtime-configuration
     """

     def __init__(
@@ -72,6 +78,10 @@ class RuntimeConfig:
         clock_resolution_usecs: Optional[int] = None,
         provisioning_timeout_secs: Optional[int] = None,
         resources: Optional[Resources] = None,
+        fault_tolerance_model: Optional[FaultToleranceModel] = None,
+        checkpoint_interval_secs: Optional[int] = None,
+        dev_tweaks: Optional[dict] = None,
+        logging: Optional[str] = None,
     ):
         self.workers = workers
         self.tracing = tracing
@@ -81,19 +91,36 @@ class RuntimeConfig:
         self.min_batch_size_records = min_batch_size_records
         self.clock_resolution_usecs = clock_resolution_usecs
         self.provisioning_timeout_secs = provisioning_timeout_secs
+        if fault_tolerance_model is not None:
+            self.fault_tolerance = {
+                "model": str(fault_tolerance_model),
+                "checkpoint_interval_secs": checkpoint_interval_secs,
+            }
         if resources is not None:
             self.resources = resources.__dict__
-        if isinstance(storage, bool):
-            self.storage = storage
-        if isinstance(storage, Storage):
-            self.storage = storage.__dict__
+        if storage is not None:
+            if isinstance(storage, bool):
+                self.storage = storage
+            elif isinstance(storage, Storage):
+                self.storage = storage.__dict__
+            else:
+                raise ValueError(f"Unknown value '{storage}' for storage")
+        self.dev_tweaks = dev_tweaks
+        self.logging = logging
+
+    @staticmethod
+    def default() -> "RuntimeConfig":
+        return RuntimeConfig(resources=Resources())

     @classmethod
     def from_dict(cls, d: Mapping[str, Any]):
         """
-        Create a `.RuntimeConfig` object from a dictionary.
+        Create a :class:`.RuntimeConfig` object from a dictionary.
         """

         conf = cls()
         conf.__dict__ = d
         return conf
+
+    def to_dict(self) -> dict:
+        return dict((k, v) for k, v in self.__dict__.items() if v is not None)
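
Besides the new fault-tolerance and logging knobs, storage handling now rejects unexpected values, and `default()`/`to_dict()` round-trip the configuration. A short usage sketch, not taken from the package: the concrete values (worker count, log level, dev-tweak key) are made up for illustration, and note that `checkpoint_interval_secs` only takes effect when a `fault_tolerance_model` is also passed:

```python
from feldera.runtime_config import RuntimeConfig

# Default config: just an empty Resources block, everything else unset.
cfg = RuntimeConfig.default()

# Explicit config exercising the new constructor parameters.
cfg = RuntimeConfig(
    workers=4,
    storage=True,  # a bool or a Storage instance; anything else raises ValueError
    dev_tweaks={"example_flag": True},  # illustrative key, not a documented tweak
    logging="info",  # illustrative value
)

# Only fields that were actually set survive serialization.
print(cfg.to_dict())
```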
feldera/stats.py ADDED
@@ -0,0 +1,152 @@
+from typing import Mapping, Any, Optional, List
+from feldera.enums import PipelineStatus, TransactionStatus
+from datetime import datetime
+import uuid
+
+
+class PipelineStatistics:
+    """
+    Represents statistics reported by a pipeline's "/stats" endpoint.
+    """
+
+    def __init__(self):
+        """
+        Initializes as an empty set of statistics.
+        """
+
+        self.global_metrics: GlobalPipelineMetrics = GlobalPipelineMetrics()
+        self.suspend_error: Optional[Any] = None
+        self.inputs: Mapping[List[InputEndpointStatus]] = {}
+        self.outputs: Mapping[List[OutputEndpointStatus]] = {}
+
+    @classmethod
+    def from_dict(cls, d: Mapping[str, Any]):
+        pipeline = cls()
+        pipeline.global_metrics = GlobalPipelineMetrics.from_dict(d["global_metrics"])
+        pipeline.inputs = [
+            InputEndpointStatus.from_dict(input) for input in d["inputs"]
+        ]
+        pipeline.outputs = [
+            OutputEndpointStatus.from_dict(output) for output in d["outputs"]
+        ]
+        return pipeline
+
+
+class GlobalPipelineMetrics:
+    """Represents the "global_metrics" object within the pipeline's
+    "/stats" endpoint reply.
+    """
+
+    def __init__(self):
+        """
+        Initializes as an empty set of metrics.
+        """
+        self.state: Optional[PipelineStatus] = None
+        self.bootstrap_in_progress: Optional[bool] = None
+        self.rss_bytes: Optional[int] = None
+        self.cpu_msecs: Optional[int] = None
+        self.start_time: Optional[datetime] = None
+        self.incarnation_uuid: Optional[uuid.UUID] = None
+        self.storage_bytes: Optional[int] = None
+        self.storage_mb_secs: Optional[int] = None
+        self.runtime_elapsed_msecs: Optional[int] = None
+        self.buffered_input_records: Optional[int] = None
+        self.total_input_records: Optional[int] = None
+        self.total_processed_records: Optional[int] = None
+        self.total_completed_records: Optional[int] = None
+        self.pipeline_complete: Optional[bool] = None
+        self.transaction_status: Optional[TransactionStatus] = None
+        self.transaction_id: Optional[int] = None
+
+    @classmethod
+    def from_dict(cls, d: Mapping[str, Any]):
+        metrics = cls()
+        metrics.__dict__.update(d)
+        metrics.state = PipelineStatus.from_str(d["state"])
+        metrics.incarnation_uuid = uuid.UUID(d["incarnation_uuid"])
+        metrics.start_time = datetime.fromtimestamp(d["start_time"])
+        metrics.transaction_status = TransactionStatus.from_str(d["transaction_status"])
+        return metrics
+
+
+class InputEndpointStatus:
+    """Represents one member of the "inputs" array within the
+    pipeline's "/stats" endpoint reply.
+    """
+
+    def __init__(self):
+        """Initializes an empty status."""
+        self.endpoint_name: Optional[str] = None
+        self.config: Optional[Mapping] = None
+        self.metrics: Optional[InputEndpointMetrics] = None
+        self.fatal_error: Optional[str] = None
+        self.paused: Optional[bool] = None
+        self.barrier: Optional[bool] = None
+
+    @classmethod
+    def from_dict(cls, d: Mapping[str, Any]):
+        status = cls()
+        status.__dict__.update(d)
+        status.metrics = InputEndpointMetrics.from_dict(d["metrics"])
+        return status
+
+
+class InputEndpointMetrics:
+    """Represents the "metrics" member within an input endpoint status
+    in the pipeline's "/stats" endpoint reply.
+    """
+
+    def __init__(self):
+        self.total_bytes: Optional[int] = None
+        self.total_records: Optional[int] = None
+        self.buffered_records: Optional[int] = None
+        self.num_transport_errors: Optional[int] = None
+        self.num_parse_errors: Optional[int] = None
+        self.end_of_input: Optional[bool] = None
+
+    @classmethod
+    def from_dict(cls, d: Mapping[str, Any]):
+        metrics = cls()
+        metrics.__dict__.update(d)
+        return metrics
+
+
+class OutputEndpointStatus:
+    """Represents one member of the "outputs" array within the
+    pipeline's "/stats" endpoint reply.
+    """
+
+    def __init__(self):
+        """Initializes an empty status."""
+        self.endpoint_name: Optional[str] = None
+        self.config: Optional[Mapping] = None
+        self.metrics: Optional[OutputEndpointMetrics] = None
+        self.fatal_error: Optional[str] = None
+
+    @classmethod
+    def from_dict(cls, d: Mapping[str, Any]):
+        status = cls()
+        status.__dict__.update(d)
+        status.metrics = OutputEndpointMetrics.from_dict(d["metrics"])
+        return status
+
+
+class OutputEndpointMetrics:
+    """Represents the "metrics" member within an output endpoint status
+    in the pipeline's "/stats" endpoint reply.
+    """
+
+    def __init__(self):
+        self.transmitted_records: Optional[int] = None
+        self.transmitted_bytes: Optional[int] = None
+        self.queued_records: Optional[int] = None
+        self.queued_batches: Optional[int] = None
+        self.num_encode_errors: Optional[int] = None
+        self.num_transport_errors: Optional[int] = None
+        self.total_processed_input_records: Optional[int] = None
+
+    @classmethod
+    def from_dict(cls, d: Mapping[str, Any]):
+        metrics = cls()
+        metrics.__dict__.update(d)
+        return metrics
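
These classes model the reply of the pipeline's `/stats` endpoint and are what `Pipeline.stats()` hands back (see `feldera/testutils.py` further down, which iterates `stats().inputs` and reads `global_metrics`). A hedged sketch of condensing a `PipelineStatistics` object into a few headline numbers; the helper is illustrative and not part of the SDK:

```python
from feldera.stats import PipelineStatistics

def ingest_summary(stats: PipelineStatistics) -> dict:
    # Only attributes defined in feldera/stats.py above are used here.
    return {
        "processed": stats.global_metrics.total_processed_records,
        "buffered": stats.global_metrics.buffered_input_records,
        "input_errors": sum(
            (ep.metrics.num_transport_errors or 0) + (ep.metrics.num_parse_errors or 0)
            for ep in stats.inputs
        ),
        "output_errors": sum(
            (ep.metrics.num_transport_errors or 0) + (ep.metrics.num_encode_errors or 0)
            for ep in stats.outputs
        ),
        "all_inputs_done": all(ep.metrics.end_of_input for ep in stats.inputs),
    }
```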
@@ -0,0 +1,38 @@
+import unittest
+
+from feldera.testutils import datafusionize
+
+
+class TestDatafusionize(unittest.TestCase):
+    def test_datafusionize(self):
+        # Test SORT_ARRAY replacement
+        query = "SELECT SORT_ARRAY(col1) FROM table1"
+        result = datafusionize(query)
+        assert "array_sort(col1)" in result
+
+        # Test TRUNCATE replacement
+        query = "SELECT TRUNCATE(value, 2) FROM table2"
+        result = datafusionize(query)
+        assert "trunc(value, 2)" in result
+
+        # Test TIMESTAMP_TRUNC replacement
+        query = "SELECT TIMESTAMP_TRUNC(MAKE_TIMESTAMP(2023, 1, 15, 10, 30, 0), DAY) FROM table3"
+        result = datafusionize(query)
+        assert "DATE_TRUNC('DAY', TO_TIMESTAMP(2023, 1, 15, 10, 30, 0))" in result
+
+        query = "TIMESTAMP_TRUNC(MAKE_TIMESTAMP(order_group_last_activity_time), hour) AS window_start_time,"
+        result = datafusionize(query)
+        assert (
+            "DATE_TRUNC('hour', TO_TIMESTAMP(order_group_last_activity_time)) AS window_start_time,"
+            in result
+        )
+
+        # Test case insensitive matching
+        query = "SELECT sort_array(col) FROM table WHERE truncate(val) > 0"
+        result = datafusionize(query)
+        assert "array_sort(col)" in result
+        assert "trunc(val)" in result
+
+
+if __name__ == "__main__":
+    unittest.main()
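
This test exercises `datafusionize` from `feldera/testutils.py` (added below), which rewrites a few Feldera SQL functions into their DataFusion equivalents so a view's query can be re-run as an ad-hoc validation query. A quick illustrative invocation, assuming the package is installed locally:

```python
from feldera.testutils import datafusionize

feldera_sql = "SELECT SORT_ARRAY(tags), TRUNCATE(price, 2) FROM items"
print(datafusionize(feldera_sql))
# -> SELECT array_sort(tags), trunc(price, 2) FROM items
```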
feldera/testutils.py ADDED
@@ -0,0 +1,382 @@
+"Utility functions for writing tests against a Feldera instance."
+
+import os
+import re
+import time
+import json
+import unittest
+from typing import List, Optional, cast
+from datetime import datetime
+
+from feldera.enums import CompilationProfile
+from feldera.pipeline import Pipeline
+from feldera.pipeline_builder import PipelineBuilder
+from feldera.runtime_config import Resources, RuntimeConfig
+from feldera.rest import FelderaClient
+from feldera.rest._helpers import requests_verify_from_env
+
+API_KEY = os.environ.get("FELDERA_API_KEY")
+
+
+# OIDC authentication support
+def _get_oidc_token():
+    """Get OIDC token if environment is configured, otherwise return None"""
+    try:
+        from feldera.testutils_oidc import get_oidc_test_helper
+
+        oidc_helper = get_oidc_test_helper()
+        if oidc_helper is not None:
+            return oidc_helper.obtain_access_token()
+    except ImportError:
+        pass
+    return None
+
+
+def _get_effective_api_key():
+    """Get effective API key - OIDC token takes precedence over static API key"""
+    oidc_token = _get_oidc_token()
+    return oidc_token if oidc_token else API_KEY
+
+
+BASE_URL = os.environ.get("FELDERA_HOST") or "http://localhost:8080"
+FELDERA_REQUESTS_VERIFY = requests_verify_from_env()
+
+
+class _LazyClient:
+    "Construct the FelderaClient only when accessed as opposed to when imported."
+
+    __slots__ = ("_client",)
+
+    def __init__(self):
+        self._client = None
+
+    def _ensure(self):
+        if self._client is None:
+            self._client = FelderaClient(
+                connection_timeout=10,
+                api_key=_get_effective_api_key(),
+            )
+        return self._client
+
+    def __getattr__(self, name):
+        return getattr(self._ensure(), name)
+
+    def __call__(self, *a, **kw) -> FelderaClient:
+        return self._ensure()
+
+
+TEST_CLIENT = cast(FelderaClient, _LazyClient())
+
+
+# SQL index definition.
+class IndexSpec:
+    def __init__(self, name: str, columns: List[str]):
+        self.name = name
+        self.columns = columns
+
+    def __repr__(self):
+        return f"IndexSpec(name={self.name!r},columns={self.columns!r})"
+
+
+class ViewSpec:
+    """
+    SQL view definition consisting of a query that can run in Feldera or
+    datafusion, optional connector spec and aux SQL statements, e.g., indexes
+    and lateness clauses following view definition.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        query: str,
+        indexes: List[IndexSpec] = [],
+        connectors: Optional[str] = None,
+        aux: Optional[str] = None,
+        expected_hash: Optional[str] = None,
+    ):
+        if not isinstance(query, str):
+            raise TypeError("query must be a string")
+        self.name = name
+        self.query = query
+        self.connectors = connectors
+        self.indexes = indexes
+        self.aux = aux
+        self.expected_hash = expected_hash
+
+    def __repr__(self):
+        return f"ViewSpec(name={self.name!r}, query={self.query!r}, indexes={self.indexes!r}, connectors={self.connectors!r}, aux={self.aux!r}, expected_hash={self.expected_hash!r})"
+
+    def clone(self):
+        return ViewSpec(
+            self.name,
+            self.query,
+            self.indexes,
+            self.connectors,
+            self.aux,
+            self.expected_hash,
+        )
+
+    def clone_with_name(self, name: str):
+        return ViewSpec(name, self.query, self.indexes, self.connectors, self.aux)
+
+    def sql(self) -> str:
+        sql = ""
+
+        if self.connectors:
+            with_clause = f"\nwith('connectors' = '{self.connectors}')\n"
+        else:
+            with_clause = ""
+
+        sql += (
+            f"create materialized view {self.name}{with_clause} as\n{self.query};\n\n"
+        )
+
+        for index in self.indexes:
+            columns = ",".join(index.columns)
+            sql += f"create index {index.name} on {self.name}({columns});\n"
+
+        if self.aux:
+            sql += f"{self.aux}\n"
+
+        sql += "\n"
+
+        return sql
+
+
+def log(*args, **kwargs):
+    """Print like built-in print(), but prefix each line with current time."""
+    prefix = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
+    print(prefix, *args, **kwargs)
+
+
+def unique_pipeline_name(base_name: str) -> str:
+    """
+    In CI, multiple tests of different runs can run against the same Feldera instance, we
+    make sure the pipeline names they use are unique by appending the first 5 characters
+    of the commit SHA or 'local' if not in CI.
+    """
+    ci_tag = os.getenv("GITHUB_SHA", "local")[:5]
+    return f"{ci_tag}_{base_name}"
+
+
+def enterprise_only(fn):
+    fn._enterprise_only = True
+    return unittest.skipUnless(
+        TEST_CLIENT.get_config().edition.is_enterprise(),
+        f"{fn.__name__} is enterprise only, skipping",
+    )(fn)
+
+
+def datafusionize(query: str) -> str:
+    sort_array_pattern = re.compile(re.escape("SORT_ARRAY"), re.IGNORECASE)
+    truncate_pattern = re.compile(re.escape("TRUNCATE"), re.IGNORECASE)
+    timestamp_trunc_pattern = re.compile(
+        r"TIMESTAMP_TRUNC\s*\(\s*MAKE_TIMESTAMP\s*\(\s*([^)]+)\s*\)\s*,\s*([A-Z]+)\s*\)",
+        re.IGNORECASE,
+    )
+
+    result = sort_array_pattern.sub("array_sort", query)
+    result = truncate_pattern.sub("trunc", result)
+    result = timestamp_trunc_pattern.sub(r"DATE_TRUNC('\2', TO_TIMESTAMP(\1))", result)
+    return result
+
+
+def validate_view(pipeline: Pipeline, view: ViewSpec):
+    log(f"Validating view '{view.name}'")
+
+    # We have two modes to verify the view, either we run the same SQL as the view against datafusion
+    # by `datafusionizing` the query, or a weaker form where we pass a hash of what the result
+    # should look like and check that the hash hasn't changed
+    if view.expected_hash:
+        view_query = f"select * from {view.name}"
+        computed_hash = pipeline.query_hash(view_query)
+        if computed_hash != view.expected_hash:
+            raise AssertionError(
+                f"View {view.name} hash {computed_hash} was but expected hash {view.expected_hash}"
+            )
+    else:
+        # TODO: count records
+        view_query = datafusionize(view.query)
+        try:
+            extra_rows = list(
+                pipeline.query(f"(select * from {view.name}) except ({view_query})")
+            )
+            missing_rows = list(
+                pipeline.query(f"({view_query}) except (select * from {view.name})")
+            )
+
+            if extra_rows:
+                log("Extra rows in Feldera output, but not in the ad hoc query output")
+                log(json.dumps(extra_rows, default=str))
+
+            if missing_rows:
+                log("Extra rows in the ad hoc query output, but not in Feldera output")
+                log(json.dumps(missing_rows, default=str))
+        except Exception as e:
+            log(f"Error querying view '{view.name}': {e}")
+            log(f"Ad-hoc Query: {view_query}")
+            raise
+
+        if extra_rows or missing_rows:
+            raise AssertionError(f"Validation failed for view {view.name}")
+
+
+def generate_program(tables: dict, views: List[ViewSpec]) -> str:
+    sql = ""
+
+    for table_sql in tables.values():
+        sql += f"{table_sql}\n"
+
+    for view in views:
+        sql += view.sql()
+
+    return sql
+
+
+def build_pipeline(
+    pipeline_name: str,
+    tables: dict,
+    views: List[ViewSpec],
+    resources: Optional[Resources] = None,
+) -> Pipeline:
+    sql = generate_program(tables, views)
+
+    pipeline = PipelineBuilder(
+        TEST_CLIENT,
+        pipeline_name,
+        sql=sql,
+        compilation_profile=CompilationProfile.OPTIMIZED,
+        runtime_config=RuntimeConfig(
+            provisioning_timeout_secs=60,
+            resources=resources,
+        ),
+    ).create_or_replace()
+
+    return pipeline
+
+
+def validate_outputs(pipeline: Pipeline, tables: dict, views: List[ViewSpec]):
+    for table in tables.keys():
+        row_count = list(pipeline.query(f"select count(*) from {table}"))
+        log(f"Table '{table}' count(*):\n{row_count}")
+
+    for view in views:
+        validate_view(pipeline, view)
+
+
+def check_end_of_input(pipeline: Pipeline) -> bool:
+    return all(
+        input_endpoint.metrics.end_of_input
+        for input_endpoint in pipeline.stats().inputs
+    )
+
+
+def wait_end_of_input(pipeline: Pipeline, timeout_s: Optional[int] = None):
+    start_time = time.monotonic()
+    while not check_end_of_input(pipeline):
+        if timeout_s is not None and time.monotonic() - start_time > timeout_s:
+            raise TimeoutError("Timeout waiting for end of input")
+        time.sleep(3)
+
+
+def transaction(pipeline: Pipeline, duration_seconds: int):
+    """Run a transaction for a specified duration."""
+
+    log(f"Running transaction for {duration_seconds} seconds")
+    pipeline.start_transaction()
+    time.sleep(duration_seconds)
+    log("Committing transaction")
+    commit_start = time.monotonic()
+    pipeline.commit_transaction()
+    log(f"Transaction committed in {time.monotonic() - commit_start} seconds")
+
+
+def checkpoint_pipeline(pipeline: Pipeline):
+    """Create a checkpoint and wait for it to complete."""
+
+    log("Creating checkpoint")
+    checkpoint_start = time.monotonic()
+    pipeline.checkpoint(wait=True)
+    log(f"Checkpoint complete in {time.monotonic() - checkpoint_start} seconds")
+
+
+def check_for_endpoint_errors(pipeline: Pipeline):
+    """Check for errors on all input and output endpoints."""
+
+    for input_endpoint_status in pipeline.stats().inputs:
+        input_endpoint_status.metrics
+        if input_endpoint_status.metrics.num_transport_errors > 0:
+            raise RuntimeError(
+                f"Transport errors detected on input endpoint: {input_endpoint_status.endpoint_name}"
+            )
+        if input_endpoint_status.metrics.num_parse_errors > 0:
+            raise RuntimeError(
+                f"Parse errors on input endpoint: {input_endpoint_status.endpoint_name}"
+            )
+        log(f" Input endpoint {input_endpoint_status.endpoint_name} OK")
+
+    for output_endpoint_status in pipeline.stats().outputs:
+        output_endpoint_status.metrics
+        if output_endpoint_status.metrics.num_transport_errors > 0:
+            raise RuntimeError(
+                f"Transport errors detected on output endpoint: {output_endpoint_status.endpoint_name}"
+            )
+        if output_endpoint_status.metrics.num_encode_errors > 0:
+            raise RuntimeError(
+                f"Encode errors on output endpoint: {output_endpoint_status.endpoint_name}"
+            )
+        log(f" Output endpoint {output_endpoint_status.endpoint_name} OK")
+
+
+def number_of_processed_records(pipeline: Pipeline) -> int:
+    """Get the total_processed_records metric."""
+
+    return pipeline.stats().global_metrics.total_processed_records
+
+
+def run_workload(
+    pipeline_name: str, tables: dict, views: List[ViewSpec], transaction: bool = True
+):
+    """
+    Helper to run a pipeline to completion and validate the views afterwards using ad-hoc queries.
+
+    Use this for large-scale workload and standard benchmarks (like TPC-H etc.) where you plan to
+    ingest a lot of data and validate the results. For testing more specific functionality, see
+    frameworks in the `tests` directory.
+    """
+
+    pipeline = build_pipeline(pipeline_name, tables, views)
+
+    pipeline.start()
+    start_time = time.monotonic()
+
+    if transaction:
+        try:
+            pipeline.start_transaction()
+        except Exception as e:
+            log(f"Error starting transaction: {e}")
+
+    if transaction:
+        wait_end_of_input(pipeline, timeout_s=3600)
+    else:
+        pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
+
+    elapsed = time.monotonic() - start_time
+    log(f"Data ingested in {elapsed}")
+
+    if transaction:
+        start_time = time.monotonic()
+        try:
+            pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
+            log(f"Commit took {time.monotonic() - start_time}")
+        except Exception as e:
+            log(f"Error committing transaction: {e}")
+
+    log("Waiting for outputs to flush")
+    start_time = time.monotonic()
+    pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
+    log(f"Flushing outputs took {time.monotonic() - start_time}")
+
+    validate_outputs(pipeline, tables, views)
+
+    pipeline.stop(force=True)
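
Putting the new test utilities together: `run_workload` builds a pipeline from a table-name-to-DDL mapping plus a list of `ViewSpec`s, runs it to completion against whatever Feldera instance `FelderaClient` resolves from the environment (e.g. `FELDERA_HOST`), and validates each view with ad-hoc queries. A hedged usage sketch; the schema and view below are invented for illustration and are not shipped with the package:

```python
from feldera.testutils import ViewSpec, run_workload, unique_pipeline_name

# Keys are table names (used for `select count(*)` sanity checks);
# values are the DDL pasted verbatim into the generated program.
tables = {
    "orders": "create table orders (id int not null, amount double) "
    "with ('materialized' = 'true');",
}

views = [
    ViewSpec(
        name="large_orders",
        query="select id, amount from orders where amount > 100",
    ),
]

run_workload(
    unique_pipeline_name("example_workload"),
    tables,
    views,
    transaction=False,  # skip the transaction wrapper for small local runs
)
```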