feldera 0.136.0__tar.gz → 0.138.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)
  1. {feldera-0.136.0 → feldera-0.138.0}/PKG-INFO +1 -1
  2. {feldera-0.136.0 → feldera-0.138.0}/feldera/pipeline.py +49 -4
  3. {feldera-0.136.0 → feldera-0.138.0}/feldera/runtime_config.py +2 -0
  4. feldera-0.138.0/feldera/testutils.py +372 -0
  5. {feldera-0.136.0 → feldera-0.138.0}/feldera.egg-info/PKG-INFO +1 -1
  6. {feldera-0.136.0 → feldera-0.138.0}/pyproject.toml +1 -1
  7. feldera-0.136.0/feldera/testutils.py +0 -194
  8. {feldera-0.136.0 → feldera-0.138.0}/README.md +0 -0
  9. {feldera-0.136.0 → feldera-0.138.0}/feldera/__init__.py +0 -0
  10. {feldera-0.136.0 → feldera-0.138.0}/feldera/_callback_runner.py +0 -0
  11. {feldera-0.136.0 → feldera-0.138.0}/feldera/_helpers.py +0 -0
  12. {feldera-0.136.0 → feldera-0.138.0}/feldera/enums.py +0 -0
  13. {feldera-0.136.0 → feldera-0.138.0}/feldera/output_handler.py +0 -0
  14. {feldera-0.136.0 → feldera-0.138.0}/feldera/pipeline_builder.py +0 -0
  15. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/__init__.py +0 -0
  16. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/_helpers.py +0 -0
  17. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/_httprequests.py +0 -0
  18. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/config.py +0 -0
  19. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/errors.py +0 -0
  20. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/feldera_client.py +0 -0
  21. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/feldera_config.py +0 -0
  22. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/pipeline.py +0 -0
  23. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/sql_table.py +0 -0
  24. {feldera-0.136.0 → feldera-0.138.0}/feldera/rest/sql_view.py +0 -0
  25. {feldera-0.136.0 → feldera-0.138.0}/feldera/stats.py +0 -0
  26. {feldera-0.136.0 → feldera-0.138.0}/feldera/tests/test_datafusionize.py +0 -0
  27. {feldera-0.136.0 → feldera-0.138.0}/feldera.egg-info/SOURCES.txt +0 -0
  28. {feldera-0.136.0 → feldera-0.138.0}/feldera.egg-info/dependency_links.txt +0 -0
  29. {feldera-0.136.0 → feldera-0.138.0}/feldera.egg-info/requires.txt +0 -0
  30. {feldera-0.136.0 → feldera-0.138.0}/feldera.egg-info/top_level.txt +0 -0
  31. {feldera-0.136.0 → feldera-0.138.0}/setup.cfg +0 -0
  32. {feldera-0.136.0 → feldera-0.138.0}/tests/test_uda.py +0 -0
{feldera-0.136.0 → feldera-0.138.0}/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: feldera
- Version: 0.136.0
+ Version: 0.138.0
  Summary: The feldera python client
  Author-email: Feldera Team <dev@feldera.com>
  License: MIT
{feldera-0.136.0 → feldera-0.138.0}/feldera/pipeline.py
@@ -326,7 +326,7 @@ class Pipeline:
                      f" time {elapsed}s, timeout: {timeout_s}s"
                  )

-             pipeline_complete: bool = self.stats().global_metrics.pipeline_complete
+             pipeline_complete: bool = self.is_complete()
              if pipeline_complete is None:
                  raise RuntimeError(
                      "received unknown metrics from the pipeline, pipeline_complete is None"
@@ -339,6 +339,19 @@ class Pipeline:
          if force_stop:
              self.stop(force=True)

+     def is_complete(self) -> bool:
+         """
+         Check if the pipeline has completed processing all input records.
+
+         Returns True if (1) all input connectors attached to the
+         pipeline have finished reading their input data sources and issued
+         end-of-input notifications to the pipeline, and (2) all inputs received
+         from these connectors have been fully processed and corresponding
+         outputs have been sent out through the output connectors.
+         """
+
+         return self.stats().global_metrics.pipeline_complete
+
      def start(self, wait: bool = True, timeout_s: Optional[float] = None):
          """
          .. _start:
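Illustrative usage of the new Pipeline.is_complete() helper (not part of the package diff; the client URL, pipeline name, and SQL are placeholder assumptions):

    import time

    from feldera.rest import FelderaClient
    from feldera.pipeline_builder import PipelineBuilder

    # Hedged sketch: poll is_complete() instead of reading
    # stats().global_metrics.pipeline_complete directly.
    client = FelderaClient("http://localhost:8080")
    pipeline = PipelineBuilder(
        client,
        "is_complete_demo",
        sql="create table t (x int); create materialized view v as select * from t;",
    ).create_or_replace()

    pipeline.start()
    # Loop exits once all connectors hit end-of-input and outputs are flushed.
    while not pipeline.is_complete():
        time.sleep(1)
    pipeline.stop(force=True)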
@@ -625,6 +638,8 @@ metrics"""
          :param timeout_s: The maximum time (in seconds) to wait for the
              checkpoint to complete.

+         :return: The checkpoint sequence number.
+
          :raises FelderaAPIError: If enterprise features are not enabled.
          """

@@ -647,9 +662,7 @@ pipeline '{self.name}' to make checkpoint '{seq}'"""
                  time.sleep(0.1)
                  continue

-             return status
-
-             return seq
+             return seq

      def checkpoint_status(self, seq: int) -> CheckpointStatus:
          """
@@ -889,6 +902,38 @@ pipeline '{self.name}' to sync checkpoint '{uuid}'"""
          self.refresh()
          return self._inner.program_code

+     def modify(
+         self,
+         sql: Optional[str] = None,
+         udf_rust: Optional[str] = None,
+         udf_toml: Optional[str] = None,
+         program_config: Optional[Mapping[str, Any]] = None,
+         runtime_config: Optional[Mapping[str, Any]] = None,
+         description: Optional[str] = None,
+     ):
+         """
+         Modify the pipeline.
+
+         Modify the values of pipeline attributes: SQL code, UDF Rust code,
+         UDF Rust dependencies (TOML), program config, runtime config, and
+         description. Only the provided attributes will be modified. Other
+         attributes will remain unchanged.
+
+         The pipeline must be in the STOPPED state to be modified.
+
+         :raises FelderaAPIError: If the pipeline is not in a STOPPED state.
+         """
+
+         self.client.patch_pipeline(
+             name=self._inner.name,
+             sql=sql,
+             udf_rust=udf_rust,
+             udf_toml=udf_toml,
+             program_config=program_config,
+             runtime_config=runtime_config,
+             description=description,
+         )
+
      def storage_status(self) -> StorageStatus:
          """
          Return the storage status of the pipeline.
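Illustrative usage of the new Pipeline.modify() method (not part of the diff; the SQL text and description are placeholders, and `pipeline` is assumed to be an existing Pipeline object):

    # Hedged sketch: patch only selected attributes of a stopped pipeline.
    pipeline.stop(force=True)
    pipeline.modify(
        sql="create table t (x int); create materialized view v as select x from t;",
        description="updated program; unspecified attributes stay unchanged",
    )
    pipeline.start()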
{feldera-0.136.0 → feldera-0.138.0}/feldera/runtime_config.py
@@ -80,6 +80,7 @@ class RuntimeConfig:
          resources: Optional[Resources] = None,
          fault_tolerance_model: Optional[FaultToleranceModel] = None,
          checkpoint_interval_secs: Optional[int] = None,
+         dev_tweaks: Optional[dict] = None,
      ):
          self.workers = workers
          self.tracing = tracing
@@ -103,6 +104,7 @@ class RuntimeConfig:
              self.storage = storage.__dict__
          else:
              raise ValueError(f"Unknown value '{storage}' for storage")
+         self.dev_tweaks = dev_tweaks

      @staticmethod
      def default() -> "RuntimeConfig":
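The new dev_tweaks argument is an opaque dict forwarded to the pipeline's runtime configuration. Illustrative usage (not part of the diff; the "backfill_avoidance" key mirrors its use in feldera/testutils.py below and should be treated as internal/experimental):

    from feldera.runtime_config import RuntimeConfig

    # Hedged sketch: pass developer tweaks through RuntimeConfig.
    config = RuntimeConfig(
        provisioning_timeout_secs=60,
        dev_tweaks={"backfill_avoidance": True},
    )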
feldera-0.138.0/feldera/testutils.py (new file)
@@ -0,0 +1,372 @@
+ "Utility functions for writing tests against a Feldera instance."
+
+ import os
+ import re
+ import time
+ import json
+ import unittest
+ from typing import List, Optional, cast
+ from datetime import datetime
+
+ from feldera.enums import CompilationProfile
+ from feldera.pipeline import Pipeline
+ from feldera.pipeline_builder import PipelineBuilder
+ from feldera.runtime_config import Resources, RuntimeConfig
+ from feldera.rest import FelderaClient
+
+ API_KEY = os.environ.get("FELDERA_API_KEY")
+ BASE_URL = (
+     os.environ.get("FELDERA_HOST")
+     or os.environ.get("FELDERA_BASE_URL")
+     or "http://localhost:8080"
+ )
+ KAFKA_SERVER = os.environ.get("FELDERA_KAFKA_SERVER", "localhost:19092")
+ PIPELINE_TO_KAFKA_SERVER = os.environ.get(
+     "FELDERA_PIPELINE_TO_KAFKA_SERVER", "redpanda:9092"
+ )
+ FELDERA_TLS_INSECURE = True if os.environ.get("FELDERA_TLS_INSECURE") else False
+ FELDERA_HTTPS_TLS_CERT = os.environ.get("FELDERA_HTTPS_TLS_CERT")
+ if not FELDERA_TLS_INSECURE and FELDERA_HTTPS_TLS_CERT is not None:
+     FELDERA_REQUESTS_VERIFY = FELDERA_HTTPS_TLS_CERT
+ else:
+     FELDERA_REQUESTS_VERIFY = not FELDERA_TLS_INSECURE
+
+
+ class _LazyClient:
+     "Construct the FelderaClient only when accessed as opposed to when imported."
+
+     __slots__ = ("_client",)
+
+     def __init__(self):
+         self._client = None
+
+     def _ensure(self):
+         if self._client is None:
+             self._client = FelderaClient(
+                 connection_timeout=10,
+             )
+         return self._client
+
+     def __getattr__(self, name):
+         return getattr(self._ensure(), name)
+
+     def __call__(self, *a, **kw) -> FelderaClient:
+         return self._ensure()
+
+
+ TEST_CLIENT = cast(FelderaClient, _LazyClient())
+
+
+ # SQL index definition.
+ class IndexSpec:
+     def __init__(self, name: str, columns: List[str]):
+         self.name = name
+         self.columns = columns
+
+     def __repr__(self):
+         return f"IndexSpec(name={self.name!r},columns={self.columns!r})"
+
+
+ class ViewSpec:
+     """
+     SQL view definition consisting of a query that can run in Feldera or
+     datafusion, optional connector spec and aux SQL statements, e.g., indexes
+     and lateness clauses following view definition.
+     """
+
+     def __init__(
+         self,
+         name: str,
+         query: str,
+         indexes: List[IndexSpec] = [],
+         connectors: Optional[str] = None,
+         aux: Optional[str] = None,
+         expected_hash: Optional[str] = None,
+     ):
+         if not isinstance(query, str):
+             raise TypeError("query must be a string")
+         self.name = name
+         self.query = query
+         self.connectors = connectors
+         self.indexes = indexes
+         self.aux = aux
+         self.expected_hash = expected_hash
+
+     def __repr__(self):
+         return f"ViewSpec(name={self.name!r}, query={self.query!r}, indexes={self.indexes!r}, connectors={self.connectors!r}, aux={self.aux!r}, expected_hash={self.expected_hash!r})"
+
+     def clone(self):
+         return ViewSpec(
+             self.name,
+             self.query,
+             self.indexes,
+             self.connectors,
+             self.aux,
+             self.expected_hash,
+         )
+
+     def clone_with_name(self, name: str):
+         return ViewSpec(name, self.query, self.indexes, self.connectors, self.aux)
+
+     def sql(self) -> str:
+         sql = ""
+
+         if self.connectors:
+             with_clause = f"\nwith('connectors' = '{self.connectors}')\n"
+         else:
+             with_clause = ""
+
+         sql += (
+             f"create materialized view {self.name}{with_clause} as\n{self.query};\n\n"
+         )
+
+         for index in self.indexes:
+             columns = ",".join(index.columns)
+             sql += f"create index {index.name} on {self.name}({columns});\n"
+
+         if self.aux:
+             sql += f"{self.aux}\n"
+
+         sql += "\n"
+
+         return sql
+
+
+ def log(*args, **kwargs):
+     """Print like built-in print(), but prefix each line with current time."""
+     prefix = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
+     print(prefix, *args, **kwargs)
+
+
+ def unique_pipeline_name(base_name: str) -> str:
+     """
+     In CI, multiple tests of different runs can run against the same Feldera instance, we
+     make sure the pipeline names they use are unique by appending the first 5 characters
+     of the commit SHA or 'local' if not in CI.
+     """
+     ci_tag = os.getenv("GITHUB_SHA", "local")[:5]
+     return f"{ci_tag}_{base_name}"
+
+
+ def enterprise_only(fn):
+     fn._enterprise_only = True
+     return unittest.skipUnless(
+         TEST_CLIENT.get_config().edition.is_enterprise(),
+         f"{fn.__name__} is enterprise only, skipping",
+     )(fn)
+
+
+ def datafusionize(query: str) -> str:
+     sort_array_pattern = re.compile(re.escape("SORT_ARRAY"), re.IGNORECASE)
+     truncate_pattern = re.compile(re.escape("TRUNCATE"), re.IGNORECASE)
+     timestamp_trunc_pattern = re.compile(
+         r"TIMESTAMP_TRUNC\s*\(\s*MAKE_TIMESTAMP\s*\(\s*([^)]+)\s*\)\s*,\s*([A-Z]+)\s*\)",
+         re.IGNORECASE,
+     )
+
+     result = sort_array_pattern.sub("array_sort", query)
+     result = truncate_pattern.sub("trunc", result)
+     result = timestamp_trunc_pattern.sub(r"DATE_TRUNC('\2', TO_TIMESTAMP(\1))", result)
+     return result
+
+
+ def validate_view(pipeline: Pipeline, view: ViewSpec):
+     log(f"Validating view '{view.name}'")
+
+     # We have two modes to verify the view, either we run the same SQL as the view against datafusion
+     # by `datafusionizing` the query, or a weaker form where we pass a hash of what the result
+     # should look like and check that the hash hasn't changed
+     if view.expected_hash:
+         view_query = f"select * from {view.name}"
+         computed_hash = pipeline.query_hash(view_query)
+         if computed_hash != view.expected_hash:
+             raise AssertionError(
+                 f"View {view.name} hash {computed_hash} was but expected hash {view.expected_hash}"
+             )
+     else:
+         # TODO: count records
+         view_query = datafusionize(view.query)
+         try:
+             extra_rows = list(
+                 pipeline.query(f"(select * from {view.name}) except ({view_query})")
+             )
+             missing_rows = list(
+                 pipeline.query(f"({view_query}) except (select * from {view.name})")
+             )
+
+             if extra_rows:
+                 log("Extra rows in Feldera output, but not in the ad hoc query output")
+                 log(json.dumps(extra_rows))
+
+             if missing_rows:
+                 log("Extra rows in the ad hoc query output, but not in Feldera output")
+                 log(json.dumps(missing_rows))
+         except Exception as e:
+             log(f"Error querying view '{view.name}': {e}")
+             log(f"Ad-hoc Query: {view_query}")
+             raise
+
+         if extra_rows or missing_rows:
+             raise AssertionError(f"Validation failed for view {view.name}")
+
+
+ def generate_program(tables: dict, views: List[ViewSpec]) -> str:
+     sql = ""
+
+     for table_sql in tables.values():
+         sql += f"{table_sql}\n"
+
+     for view in views:
+         sql += view.sql()
+
+     return sql
+
+
+ def build_pipeline(
+     pipeline_name: str,
+     tables: dict,
+     views: List[ViewSpec],
+     resources: Optional[Resources] = None,
+ ) -> Pipeline:
+     sql = generate_program(tables, views)
+
+     pipeline = PipelineBuilder(
+         TEST_CLIENT,
+         pipeline_name,
+         sql=sql,
+         compilation_profile=CompilationProfile.OPTIMIZED,
+         runtime_config=RuntimeConfig(
+             provisioning_timeout_secs=60,
+             dev_tweaks={"backfill_avoidance": True},
+             resources=resources,
+         ),
+     ).create_or_replace()
+
+     return pipeline
+
+
+ def validate_outputs(pipeline: Pipeline, tables: dict, views: List[ViewSpec]):
+     for table in tables.keys():
+         row_count = list(pipeline.query(f"select count(*) from {table}"))
+         log(f"Table '{table}' count(*):\n{row_count}")
+
+     for view in views:
+         validate_view(pipeline, view)
+
+
+ def check_end_of_input(pipeline: Pipeline) -> bool:
+     return all(
+         input_endpoint.metrics.end_of_input
+         for input_endpoint in pipeline.stats().inputs
+     )
+
+
+ def wait_end_of_input(pipeline: Pipeline, timeout_s: Optional[int] = None):
+     start_time = time.monotonic()
+     while not check_end_of_input(pipeline):
+         if timeout_s is not None and time.monotonic() - start_time > timeout_s:
+             raise TimeoutError("Timeout waiting for end of input")
+         time.sleep(3)
+
+
+ def transaction(pipeline: Pipeline, duration_seconds: int):
+     """Run a transaction for a specified duration."""
+
+     log(f"Running transaction for {duration_seconds} seconds")
+     pipeline.start_transaction()
+     time.sleep(duration_seconds)
+     log("Committing transaction")
+     commit_start = time.monotonic()
+     pipeline.commit_transaction()
+     log(f"Transaction committed in {time.monotonic() - commit_start} seconds")
+
+
+ def checkpoint_pipeline(pipeline: Pipeline):
+     """Create a checkpoint and wait for it to complete."""
+
+     log("Creating checkpoint")
+     checkpoint_start = time.monotonic()
+     pipeline.checkpoint(wait=True)
+     log(f"Checkpoint complete in {time.monotonic() - checkpoint_start} seconds")
+
+
+ def check_for_endpoint_errors(pipeline: Pipeline):
+     """Check for errors on all input and output endpoints."""
+
+     for input_endpoint_status in pipeline.stats().inputs:
+         input_endpoint_status.metrics
+         if input_endpoint_status.metrics.num_transport_errors > 0:
+             raise RuntimeError(
+                 f"Transport errors detected on input endpoint: {input_endpoint_status.endpoint_name}"
+             )
+         if input_endpoint_status.metrics.num_parse_errors > 0:
+             raise RuntimeError(
+                 f"Parse errors on input endpoint: {input_endpoint_status.endpoint_name}"
+             )
+         log(f" Input endpoint {input_endpoint_status.endpoint_name} OK")
+
+     for output_endpoint_status in pipeline.stats().outputs:
+         output_endpoint_status.metrics
+         if output_endpoint_status.metrics.num_transport_errors > 0:
+             raise RuntimeError(
+                 f"Transport errors detected on output endpoint: {output_endpoint_status.endpoint_name}"
+             )
+         if output_endpoint_status.metrics.num_encode_errors > 0:
+             raise RuntimeError(
+                 f"Encode errors on output endpoint: {output_endpoint_status.endpoint_name}"
+             )
+         log(f" Output endpoint {output_endpoint_status.endpoint_name} OK")
+
+
+ def number_of_processed_records(pipeline: Pipeline) -> int:
+     """Get the total_processed_records metric."""
+
+     return pipeline.stats().global_metrics.total_processed_records
+
+
+ def run_workload(
+     pipeline_name: str, tables: dict, views: List[ViewSpec], transaction: bool = True
+ ):
+     """
+     Helper to run a pipeline to completion and validate the views afterwards using ad-hoc queries.
+
+     Use this for large-scale workload and standard benchmarks (like TPC-H etc.) where you plan to
+     ingest a lot of data and validate the results. For testing more specific functionality, see
+     frameworks in the `tests` directory.
+     """
+
+     pipeline = build_pipeline(pipeline_name, tables, views)
+
+     pipeline.start()
+     start_time = time.monotonic()
+
+     if transaction:
+         try:
+             pipeline.start_transaction()
+         except Exception as e:
+             log(f"Error starting transaction: {e}")
+
+     if transaction:
+         wait_end_of_input(pipeline, timeout_s=3600)
+     else:
+         pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
+
+     elapsed = time.monotonic() - start_time
+     log(f"Data ingested in {elapsed}")
+
+     if transaction:
+         start_time = time.monotonic()
+         try:
+             pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
+             log(f"Commit took {time.monotonic() - start_time}")
+         except Exception as e:
+             log(f"Error committing transaction: {e}")
+
+     log("Waiting for outputs to flush")
+     start_time = time.monotonic()
+     pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
+     log(f"Flushing outputs took {time.monotonic() - start_time}")
+
+     validate_outputs(pipeline, tables, views)
+
+     pipeline.stop(force=True)
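The rewritten testutils.py replaces the old `views: dict` convention with explicit ViewSpec/IndexSpec objects and a ViewSpec-aware run_workload(). Illustrative usage (not part of the diff; the table DDL and view query are placeholders, and the Feldera instance is taken from the FELDERA_HOST/FELDERA_BASE_URL environment variables as in the module above):

    # Hedged sketch of the new testutils API; table and view SQL are placeholders.
    from feldera.testutils import IndexSpec, ViewSpec, run_workload, unique_pipeline_name

    tables = {
        "orders": "create table orders (id int, amount double) with ('materialized' = 'true');",
    }
    views = [
        ViewSpec(
            name="big_orders",
            query="select * from orders where amount > 100",
            indexes=[IndexSpec("big_orders_by_id", ["id"])],
        ),
    ]

    run_workload(unique_pipeline_name("orders_workload"), tables, views, transaction=False)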
{feldera-0.136.0 → feldera-0.138.0}/feldera.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: feldera
- Version: 0.136.0
+ Version: 0.138.0
  Summary: The feldera python client
  Author-email: Feldera Team <dev@feldera.com>
  License: MIT
{feldera-0.136.0 → feldera-0.138.0}/pyproject.toml
@@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"
  name = "feldera"
  readme = "README.md"
  description = "The feldera python client"
- version = "0.136.0"
+ version = "0.138.0"
  license = { text = "MIT" }
  requires-python = ">=3.10"
  authors = [
feldera-0.136.0/feldera/testutils.py (deleted)
@@ -1,194 +0,0 @@
- "Utility functions for writing tests against a Feldera instance."
-
- import os
- import re
- import time
- import json
- import unittest
- from typing import cast
-
- from feldera.enums import CompilationProfile
- from feldera.pipeline import Pipeline
- from feldera.pipeline_builder import PipelineBuilder
- from feldera.runtime_config import RuntimeConfig
- from feldera.rest import FelderaClient
-
- API_KEY = os.environ.get("FELDERA_API_KEY")
- BASE_URL = (
-     os.environ.get("FELDERA_HOST")
-     or os.environ.get("FELDERA_BASE_URL")
-     or "http://localhost:8080"
- )
- KAFKA_SERVER = os.environ.get("FELDERA_KAFKA_SERVER", "localhost:19092")
- PIPELINE_TO_KAFKA_SERVER = os.environ.get(
-     "FELDERA_PIPELINE_TO_KAFKA_SERVER", "redpanda:9092"
- )
- FELDERA_TLS_INSECURE = True if os.environ.get("FELDERA_TLS_INSECURE") else False
- FELDERA_HTTPS_TLS_CERT = os.environ.get("FELDERA_HTTPS_TLS_CERT")
- if not FELDERA_TLS_INSECURE and FELDERA_HTTPS_TLS_CERT is not None:
-     FELDERA_REQUESTS_VERIFY = FELDERA_HTTPS_TLS_CERT
- else:
-     FELDERA_REQUESTS_VERIFY = not FELDERA_TLS_INSECURE
-
-
- class _LazyClient:
-     "Construct the FelderaClient only when accessed as opposed to when imported."
-
-     __slots__ = ("_client",)
-
-     def __init__(self):
-         self._client = None
-
-     def _ensure(self):
-         if self._client is None:
-             self._client = FelderaClient(
-                 connection_timeout=10,
-             )
-         return self._client
-
-     def __getattr__(self, name):
-         return getattr(self._ensure(), name)
-
-     def __call__(self, *a, **kw) -> FelderaClient:
-         return self._ensure()
-
-
- TEST_CLIENT = cast(FelderaClient, _LazyClient())
-
-
- def unique_pipeline_name(base_name: str) -> str:
-     """
-     In CI, multiple tests of different runs can run against the same Feldera instance, we
-     make sure the pipeline names they use are unique by appending the first 5 characters
-     of the commit SHA or 'local' if not in CI.
-     """
-     ci_tag = os.getenv("GITHUB_SHA", "local")[:5]
-     return f"{ci_tag}_{base_name}"
-
-
- def enterprise_only(fn):
-     fn._enterprise_only = True
-     return unittest.skipUnless(
-         TEST_CLIENT.get_config().edition.is_enterprise(),
-         f"{fn.__name__} is enterprise only, skipping",
-     )(fn)
-
-
- def datafusionize(query: str) -> str:
-     sort_array_pattern = re.compile(re.escape("SORT_ARRAY"), re.IGNORECASE)
-     truncate_pattern = re.compile(re.escape("TRUNCATE"), re.IGNORECASE)
-     timestamp_trunc_pattern = re.compile(
-         r"TIMESTAMP_TRUNC\s*\(\s*MAKE_TIMESTAMP\s*\(\s*([^)]+)\s*\)\s*,\s*([A-Z]+)\s*\)",
-         re.IGNORECASE,
-     )
-
-     result = sort_array_pattern.sub("array_sort", query)
-     result = truncate_pattern.sub("trunc", result)
-     result = timestamp_trunc_pattern.sub(r"DATE_TRUNC('\2', TO_TIMESTAMP(\1))", result)
-     return result
-
-
- def validate_view(
-     pipeline: Pipeline, view_name: str, view_query: str | tuple[str, str]
- ):
-     print(f"Validating view '{view_name}'")
-
-     # We have two modes to verify the view, either we run the same SQL as the view against datafusion
-     # by `datafusionizing` the query, or a weaker form where we pass a hash of what the result
-     # should look like and check that the hash hasn't changed
-     if isinstance(view_query, tuple):
-         _view_definition, original_hash = view_query
-         view_query = f"select * from {view_name}"
-         computed_hash = pipeline.query_hash(view_query)
-         if computed_hash != original_hash:
-             raise AssertionError(
-                 f"View {view_name} hash {computed_hash} was but expected hash {original_hash}"
-             )
-     else:
-         # TODO: count records
-         view_query = datafusionize(view_query)
-         try:
-             extra_rows = list(
-                 pipeline.query(f"(select * from {view_name}) except ({view_query})")
-             )
-             missing_rows = list(
-                 pipeline.query(f"({view_query}) except (select * from {view_name})")
-             )
-
-             if extra_rows:
-                 print(
-                     "Extra rows in Feldera output, but not in the ad hoc query output"
-                 )
-                 print(json.dumps(extra_rows))
-
-             if missing_rows:
-                 print(
-                     "Extra rows in the ad hoc query output, but not in Feldera output"
-                 )
-                 print(json.dumps(missing_rows))
-         except Exception as e:
-             print(f"Error querying view '{view_name}': {e}")
-             print(f"Ad-hoc Query: {view_query}")
-             raise
-
-         if extra_rows or missing_rows:
-             raise AssertionError(f"Validation failed for view {view_name}")
-
-
- def run_workload(pipeline_name: str, tables: dict, views: dict):
-     """
-     Helper to run a pipeline to completion and validate the views afterwards using ad-hoc queries.
-
-     Use this for large-scale workload and standard benchmarks (like TPC-H etc.) where you plan to
-     ingest a lot of data and validate the results. For testing more specific functionality, see
-     frameworks in the `tests` directory.
-     """
-
-     sql = ""
-     for table_sql in tables.values():
-         sql += f"{table_sql}\n"
-
-     for view_name, view in views.items():
-         if isinstance(view, tuple):
-             view_query, _hash = view
-             sql += f"create materialized view {view_name} as {view_query};\n\n"
-         else:
-             sql += f"create materialized view {view_name} as {view};\n\n"
-
-     pipeline = PipelineBuilder(
-         TEST_CLIENT,
-         unique_pipeline_name(pipeline_name),
-         sql=sql,
-         compilation_profile=CompilationProfile.OPTIMIZED,
-         runtime_config=RuntimeConfig(provisioning_timeout_secs=60),
-     ).create_or_replace()
-
-     pipeline.start()
-     start_time = time.monotonic()
-
-     try:
-         pipeline.start_transaction()
-     except Exception as e:
-         print(f"Error starting transaction: {e}")
-
-     pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
-     elapsed = time.monotonic() - start_time
-     print(f"Data ingested in {elapsed}")
-
-     try:
-         start_time = time.monotonic()
-         pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
-     except Exception as e:
-         print(f"Error committing transaction: {e}")
-     finally:
-         elapsed = time.monotonic() - start_time
-         print(f"Commit took {elapsed}")
-
-     for table in tables.keys():
-         row_count = list(pipeline.query(f"select count(*) from {table}"))
-         print(f"Table '{table}' count(*):\n{row_count}")
-
-     for view_name, view_query in views.items():
-         validate_view(pipeline, view_name, view_query)
-
-     pipeline.stop(force=True)