feldera 0.136.0__py3-none-any.whl → 0.138.0__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their public registries, and is provided for informational purposes only.


feldera/pipeline.py CHANGED
@@ -326,7 +326,7 @@ class Pipeline:
                     f" time {elapsed}s, timeout: {timeout_s}s"
                 )
 
-            pipeline_complete: bool = self.stats().global_metrics.pipeline_complete
+            pipeline_complete: bool = self.is_complete()
             if pipeline_complete is None:
                 raise RuntimeError(
                     "received unknown metrics from the pipeline, pipeline_complete is None"
@@ -339,6 +339,19 @@ class Pipeline:
         if force_stop:
             self.stop(force=True)
 
+    def is_complete(self) -> bool:
+        """
+        Check if the pipeline has completed processing all input records.
+
+        Returns True if (1) all input connectors attached to the
+        pipeline have finished reading their input data sources and issued
+        end-of-input notifications to the pipeline, and (2) all inputs received
+        from these connectors have been fully processed and corresponding
+        outputs have been sent out through the output connectors.
+        """
+
+        return self.stats().global_metrics.pipeline_complete
+
     def start(self, wait: bool = True, timeout_s: Optional[float] = None):
         """
         .. _start:
@@ -625,6 +638,8 @@ metrics"""
         :param timeout_s: The maximum time (in seconds) to wait for the
             checkpoint to complete.
 
+        :return: The checkpoint sequence number.
+
         :raises FelderaAPIError: If enterprise features are not enabled.
         """
 
@@ -647,9 +662,7 @@ pipeline '{self.name}' to make checkpoint '{seq}'"""
                 time.sleep(0.1)
                 continue
 
-            return status
-
-        return seq
+        return seq
 
     def checkpoint_status(self, seq: int) -> CheckpointStatus:
         """
@@ -889,6 +902,38 @@ pipeline '{self.name}' to sync checkpoint '{uuid}'"""
         self.refresh()
         return self._inner.program_code
 
+    def modify(
+        self,
+        sql: Optional[str] = None,
+        udf_rust: Optional[str] = None,
+        udf_toml: Optional[str] = None,
+        program_config: Optional[Mapping[str, Any]] = None,
+        runtime_config: Optional[Mapping[str, Any]] = None,
+        description: Optional[str] = None,
+    ):
+        """
+        Modify the pipeline.
+
+        Modify the values of pipeline attributes: SQL code, UDF Rust code,
+        UDF Rust dependencies (TOML), program config, runtime config, and
+        description. Only the provided attributes will be modified. Other
+        attributes will remain unchanged.
+
+        The pipeline must be in the STOPPED state to be modified.
+
+        :raises FelderaAPIError: If the pipeline is not in a STOPPED state.
+        """
+
+        self.client.patch_pipeline(
+            name=self._inner.name,
+            sql=sql,
+            udf_rust=udf_rust,
+            udf_toml=udf_toml,
+            program_config=program_config,
+            runtime_config=runtime_config,
+            description=description,
+        )
+
     def storage_status(self) -> StorageStatus:
         """
         Return the storage status of the pipeline.
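
Usage note (not part of the diff): the pipeline.py changes add two public helpers, is_complete(), which wraps the pipeline_complete global metric, and modify(), which patches selected pipeline attributes and requires the STOPPED state. A minimal sketch of how they might be called follows; the client URL, pipeline name, and SQL are illustrative placeholders, and the FelderaClient constructor arguments are assumed from common SDK usage rather than shown in this diff.

import time

from feldera.rest import FelderaClient
from feldera.pipeline_builder import PipelineBuilder

# Hypothetical local Feldera instance; URL is a placeholder.
client = FelderaClient("http://localhost:8080")

# Build a throwaway pipeline; the SQL is a placeholder program.
pipeline = PipelineBuilder(
    client,
    "example_pipeline",
    sql="create table t (x int); create materialized view v as select x from t;",
).create_or_replace()

pipeline.start()
# New in this release: poll the dedicated helper instead of reading
# stats().global_metrics.pipeline_complete directly.
while not pipeline.is_complete():
    time.sleep(1)
pipeline.stop(force=True)

# New in this release: modify() patches only the attributes passed;
# everything else is left unchanged. The pipeline must be STOPPED.
pipeline.modify(description="tweaked after the first run")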
feldera/runtime_config.py CHANGED
@@ -80,6 +80,7 @@ class RuntimeConfig:
         resources: Optional[Resources] = None,
         fault_tolerance_model: Optional[FaultToleranceModel] = None,
         checkpoint_interval_secs: Optional[int] = None,
+        dev_tweaks: Optional[dict] = None,
     ):
         self.workers = workers
         self.tracing = tracing
@@ -103,6 +104,7 @@ class RuntimeConfig:
             self.storage = storage.__dict__
         else:
             raise ValueError(f"Unknown value '{storage}' for storage")
+        self.dev_tweaks = dev_tweaks
 
     @staticmethod
     def default() -> "RuntimeConfig":
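
Usage note (not part of the diff): RuntimeConfig now accepts an optional dev_tweaks dict that is stored on the config. The only key exercised by this release's own code is "backfill_avoidance" (used by testutils.build_pipeline below); treat any other keys as assumptions about what the backend accepts. A minimal sketch:

from feldera.runtime_config import RuntimeConfig

# dev_tweaks is an opaque dict forwarded with the runtime configuration.
# "backfill_avoidance" is the only key visible in this diff.
config = RuntimeConfig(
    provisioning_timeout_secs=60,
    dev_tweaks={"backfill_avoidance": True},
)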
feldera/testutils.py CHANGED
@@ -5,12 +5,13 @@ import re
 import time
 import json
 import unittest
-from typing import cast
+from typing import List, Optional, cast
+from datetime import datetime
 
 from feldera.enums import CompilationProfile
 from feldera.pipeline import Pipeline
 from feldera.pipeline_builder import PipelineBuilder
-from feldera.runtime_config import RuntimeConfig
+from feldera.runtime_config import Resources, RuntimeConfig
 from feldera.rest import FelderaClient
 
 API_KEY = os.environ.get("FELDERA_API_KEY")
@@ -56,6 +57,87 @@ class _LazyClient:
 TEST_CLIENT = cast(FelderaClient, _LazyClient())
 
 
+# SQL index definition.
+class IndexSpec:
+    def __init__(self, name: str, columns: List[str]):
+        self.name = name
+        self.columns = columns
+
+    def __repr__(self):
+        return f"IndexSpec(name={self.name!r},columns={self.columns!r})"
+
+
+class ViewSpec:
+    """
+    SQL view definition consisting of a query that can run in Feldera or
+    datafusion, optional connector spec and aux SQL statements, e.g., indexes
+    and lateness clauses following view definition.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        query: str,
+        indexes: List[IndexSpec] = [],
+        connectors: Optional[str] = None,
+        aux: Optional[str] = None,
+        expected_hash: Optional[str] = None,
+    ):
+        if not isinstance(query, str):
+            raise TypeError("query must be a string")
+        self.name = name
+        self.query = query
+        self.connectors = connectors
+        self.indexes = indexes
+        self.aux = aux
+        self.expected_hash = expected_hash
+
+    def __repr__(self):
+        return f"ViewSpec(name={self.name!r}, query={self.query!r}, indexes={self.indexes!r}, connectors={self.connectors!r}, aux={self.aux!r}, expected_hash={self.expected_hash!r})"
+
+    def clone(self):
+        return ViewSpec(
+            self.name,
+            self.query,
+            self.indexes,
+            self.connectors,
+            self.aux,
+            self.expected_hash,
+        )
+
+    def clone_with_name(self, name: str):
+        return ViewSpec(name, self.query, self.indexes, self.connectors, self.aux)
+
+    def sql(self) -> str:
+        sql = ""
+
+        if self.connectors:
+            with_clause = f"\nwith('connectors' = '{self.connectors}')\n"
+        else:
+            with_clause = ""
+
+        sql += (
+            f"create materialized view {self.name}{with_clause} as\n{self.query};\n\n"
+        )
+
+        for index in self.indexes:
+            columns = ",".join(index.columns)
+            sql += f"create index {index.name} on {self.name}({columns});\n"
+
+        if self.aux:
+            sql += f"{self.aux}\n"
+
+        sql += "\n"
+
+        return sql
+
+
+def log(*args, **kwargs):
+    """Print like built-in print(), but prefix each line with current time."""
+    prefix = datetime.now().strftime("[%Y-%m-%d %H:%M:%S]")
+    print(prefix, *args, **kwargs)
+
+
 def unique_pipeline_name(base_name: str) -> str:
     """
     In CI, multiple tests of different runs can run against the same Feldera instance, we
@@ -88,107 +170,203 @@ def datafusionize(query: str) -> str:
     return result
 
 
-def validate_view(
-    pipeline: Pipeline, view_name: str, view_query: str | tuple[str, str]
-):
-    print(f"Validating view '{view_name}'")
+def validate_view(pipeline: Pipeline, view: ViewSpec):
+    log(f"Validating view '{view.name}'")
 
     # We have two modes to verify the view, either we run the same SQL as the view against datafusion
     # by `datafusionizing` the query, or a weaker form where we pass a hash of what the result
     # should look like and check that the hash hasn't changed
-    if isinstance(view_query, tuple):
-        _view_definition, original_hash = view_query
-        view_query = f"select * from {view_name}"
+    if view.expected_hash:
+        view_query = f"select * from {view.name}"
         computed_hash = pipeline.query_hash(view_query)
-        if computed_hash != original_hash:
+        if computed_hash != view.expected_hash:
             raise AssertionError(
-                f"View {view_name} hash {computed_hash} was but expected hash {original_hash}"
+                f"View {view.name} hash {computed_hash} was but expected hash {view.expected_hash}"
             )
     else:
         # TODO: count records
-        view_query = datafusionize(view_query)
+        view_query = datafusionize(view.query)
         try:
             extra_rows = list(
-                pipeline.query(f"(select * from {view_name}) except ({view_query})")
+                pipeline.query(f"(select * from {view.name}) except ({view_query})")
             )
             missing_rows = list(
-                pipeline.query(f"({view_query}) except (select * from {view_name})")
+                pipeline.query(f"({view_query}) except (select * from {view.name})")
            )
 
             if extra_rows:
-                print(
-                    "Extra rows in Feldera output, but not in the ad hoc query output"
-                )
-                print(json.dumps(extra_rows))
+                log("Extra rows in Feldera output, but not in the ad hoc query output")
+                log(json.dumps(extra_rows))
 
             if missing_rows:
-                print(
-                    "Extra rows in the ad hoc query output, but not in Feldera output"
-                )
-                print(json.dumps(missing_rows))
+                log("Extra rows in the ad hoc query output, but not in Feldera output")
+                log(json.dumps(missing_rows))
         except Exception as e:
-            print(f"Error querying view '{view_name}': {e}")
-            print(f"Ad-hoc Query: {view_query}")
+            log(f"Error querying view '{view.name}': {e}")
+            log(f"Ad-hoc Query: {view_query}")
             raise
 
         if extra_rows or missing_rows:
-            raise AssertionError(f"Validation failed for view {view_name}")
-
-
-def run_workload(pipeline_name: str, tables: dict, views: dict):
-    """
-    Helper to run a pipeline to completion and validate the views afterwards using ad-hoc queries.
+            raise AssertionError(f"Validation failed for view {view.name}")
 
-    Use this for large-scale workload and standard benchmarks (like TPC-H etc.) where you plan to
-    ingest a lot of data and validate the results. For testing more specific functionality, see
-    frameworks in the `tests` directory.
-    """
 
+def generate_program(tables: dict, views: List[ViewSpec]) -> str:
     sql = ""
+
     for table_sql in tables.values():
         sql += f"{table_sql}\n"
 
-    for view_name, view in views.items():
-        if isinstance(view, tuple):
-            view_query, _hash = view
-            sql += f"create materialized view {view_name} as {view_query};\n\n"
-        else:
-            sql += f"create materialized view {view_name} as {view};\n\n"
+    for view in views:
+        sql += view.sql()
+
+    return sql
+
+
+def build_pipeline(
+    pipeline_name: str,
+    tables: dict,
+    views: List[ViewSpec],
+    resources: Optional[Resources] = None,
+) -> Pipeline:
+    sql = generate_program(tables, views)
 
     pipeline = PipelineBuilder(
         TEST_CLIENT,
-        unique_pipeline_name(pipeline_name),
+        pipeline_name,
         sql=sql,
         compilation_profile=CompilationProfile.OPTIMIZED,
-        runtime_config=RuntimeConfig(provisioning_timeout_secs=60),
+        runtime_config=RuntimeConfig(
+            provisioning_timeout_secs=60,
+            dev_tweaks={"backfill_avoidance": True},
+            resources=resources,
+        ),
     ).create_or_replace()
 
+    return pipeline
+
+
+def validate_outputs(pipeline: Pipeline, tables: dict, views: List[ViewSpec]):
+    for table in tables.keys():
+        row_count = list(pipeline.query(f"select count(*) from {table}"))
+        log(f"Table '{table}' count(*):\n{row_count}")
+
+    for view in views:
+        validate_view(pipeline, view)
+
+
+def check_end_of_input(pipeline: Pipeline) -> bool:
+    return all(
+        input_endpoint.metrics.end_of_input
+        for input_endpoint in pipeline.stats().inputs
+    )
+
+
+def wait_end_of_input(pipeline: Pipeline, timeout_s: Optional[int] = None):
+    start_time = time.monotonic()
+    while not check_end_of_input(pipeline):
+        if timeout_s is not None and time.monotonic() - start_time > timeout_s:
+            raise TimeoutError("Timeout waiting for end of input")
+        time.sleep(3)
+
+
+def transaction(pipeline: Pipeline, duration_seconds: int):
+    """Run a transaction for a specified duration."""
+
+    log(f"Running transaction for {duration_seconds} seconds")
+    pipeline.start_transaction()
+    time.sleep(duration_seconds)
+    log("Committing transaction")
+    commit_start = time.monotonic()
+    pipeline.commit_transaction()
+    log(f"Transaction committed in {time.monotonic() - commit_start} seconds")
+
+
+def checkpoint_pipeline(pipeline: Pipeline):
+    """Create a checkpoint and wait for it to complete."""
+
+    log("Creating checkpoint")
+    checkpoint_start = time.monotonic()
+    pipeline.checkpoint(wait=True)
+    log(f"Checkpoint complete in {time.monotonic() - checkpoint_start} seconds")
+
+
+def check_for_endpoint_errors(pipeline: Pipeline):
+    """Check for errors on all input and output endpoints."""
+
+    for input_endpoint_status in pipeline.stats().inputs:
+        input_endpoint_status.metrics
+        if input_endpoint_status.metrics.num_transport_errors > 0:
+            raise RuntimeError(
+                f"Transport errors detected on input endpoint: {input_endpoint_status.endpoint_name}"
+            )
+        if input_endpoint_status.metrics.num_parse_errors > 0:
+            raise RuntimeError(
+                f"Parse errors on input endpoint: {input_endpoint_status.endpoint_name}"
            )
+        log(f"  Input endpoint {input_endpoint_status.endpoint_name} OK")
+
+    for output_endpoint_status in pipeline.stats().outputs:
+        output_endpoint_status.metrics
+        if output_endpoint_status.metrics.num_transport_errors > 0:
+            raise RuntimeError(
+                f"Transport errors detected on output endpoint: {output_endpoint_status.endpoint_name}"
+            )
+        if output_endpoint_status.metrics.num_encode_errors > 0:
+            raise RuntimeError(
+                f"Encode errors on output endpoint: {output_endpoint_status.endpoint_name}"
            )
+        log(f"  Output endpoint {output_endpoint_status.endpoint_name} OK")
+
+
+def number_of_processed_records(pipeline: Pipeline) -> int:
+    """Get the total_processed_records metric."""
+
+    return pipeline.stats().global_metrics.total_processed_records
+
+
+def run_workload(
+    pipeline_name: str, tables: dict, views: List[ViewSpec], transaction: bool = True
+):
+    """
+    Helper to run a pipeline to completion and validate the views afterwards using ad-hoc queries.
+
+    Use this for large-scale workload and standard benchmarks (like TPC-H etc.) where you plan to
+    ingest a lot of data and validate the results. For testing more specific functionality, see
+    frameworks in the `tests` directory.
+    """
+
+    pipeline = build_pipeline(pipeline_name, tables, views)
+
     pipeline.start()
     start_time = time.monotonic()
 
-    try:
-        pipeline.start_transaction()
-    except Exception as e:
-        print(f"Error starting transaction: {e}")
+    if transaction:
+        try:
+            pipeline.start_transaction()
+        except Exception as e:
+            log(f"Error starting transaction: {e}")
+
+    if transaction:
+        wait_end_of_input(pipeline, timeout_s=3600)
+    else:
+        pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
 
-    pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
     elapsed = time.monotonic() - start_time
-    print(f"Data ingested in {elapsed}")
+    log(f"Data ingested in {elapsed}")
 
-    try:
+    if transaction:
         start_time = time.monotonic()
-        pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
-    except Exception as e:
-        print(f"Error committing transaction: {e}")
-    finally:
-        elapsed = time.monotonic() - start_time
-        print(f"Commit took {elapsed}")
+        try:
+            pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
+            log(f"Commit took {time.monotonic() - start_time}")
+        except Exception as e:
+            log(f"Error committing transaction: {e}")
 
-    for table in tables.keys():
-        row_count = list(pipeline.query(f"select count(*) from {table}"))
-        print(f"Table '{table}' count(*):\n{row_count}")
+    log("Waiting for outputs to flush")
+    start_time = time.monotonic()
+    pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
+    log(f"Flushing outputs took {time.monotonic() - start_time}")
 
-    for view_name, view_query in views.items():
-        validate_view(pipeline, view_name, view_query)
+    validate_outputs(pipeline, tables, views)
 
     pipeline.stop(force=True)
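
Usage note (not part of the diff): testutils.py replaces the old view-name-to-query dict convention with explicit ViewSpec/IndexSpec objects and factors run_workload into reusable helpers (build_pipeline, wait_end_of_input, validate_outputs, check_for_endpoint_errors, and so on). A sketch of the new calling convention follows; the table DDL, view query, and names are placeholders, not taken from the package.

from feldera.testutils import IndexSpec, ViewSpec, run_workload, unique_pipeline_name

# Placeholder schema and query; real workloads would attach input connectors
# to the tables (or to the views via ViewSpec.connectors).
tables = {
    "orders": "create table orders (id int, amount decimal(10, 2));",
}
views = [
    ViewSpec(
        name="order_totals",
        query="select id, sum(amount) as total from orders group by id",
        indexes=[IndexSpec("order_totals_idx", ["id"])],
    ),
]

# With transaction=True (the default), run_workload ingests inside a transaction,
# waits for all inputs to reach end-of-input, commits, waits for outputs to flush,
# and then validates each view against a datafusion-ized ad hoc query.
run_workload(unique_pipeline_name("example_workload"), tables, views)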
feldera-0.136.0.dist-info/METADATA → feldera-0.138.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: feldera
-Version: 0.136.0
+Version: 0.138.0
 Summary: The feldera python client
 Author-email: Feldera Team <dev@feldera.com>
 License: MIT
feldera-0.136.0.dist-info/RECORD → feldera-0.138.0.dist-info/RECORD CHANGED
@@ -3,11 +3,11 @@ feldera/_callback_runner.py,sha256=v3PD2DcT190ObYWoZDtWfS2zF9KU63gVKpguvAAEtJk,4
 feldera/_helpers.py,sha256=TQnDQW19fpljD19ppd5dASy1gUC4y8GNnnJjXxbaUmM,3019
 feldera/enums.py,sha256=MTHBojVANsdRnjbrzCyIOniDIUaH8nTYRfxB7QvajEE,9570
 feldera/output_handler.py,sha256=64J3ljhOaKIhxdjOKYi-BUz_HnMwROfmN8eE-btYygU,1930
-feldera/pipeline.py,sha256=P2yRzAxzxSCjiQpy8aVF9KBLKLNrsXhJQP35OHmZYag,42164
+feldera/pipeline.py,sha256=KIAdKzh0Mol5NTn3KzePoELd9lHsPOU-YJCV0xPSmXo,43788
 feldera/pipeline_builder.py,sha256=a750hp5SgTmlyrobTHFh1fTaK9Ed4A5qnXaYRctRM-8,4250
-feldera/runtime_config.py,sha256=MuYJPd5G_hnu_eDz4ge4BfYvSBSOvOEtv4NYh5sEwqU,4452
+feldera/runtime_config.py,sha256=DcJ44EN6Dt1X1wW-1kUvFbSkIDKDLi4-GqaDdUzPtTQ,4532
 feldera/stats.py,sha256=1qDlWhI-ORx3FktxH3b93mXWwtCOb4XuP0iJePHJTrE,5030
-feldera/testutils.py,sha256=4rDn1DfquV_Q4c0wNgV1RPXL6WGd_NZeHvL2WGs_kK4,6608
+feldera/testutils.py,sha256=yfQYhI1LglmsBsfsghFI36EUa015cuH1izjPsvVuNiQ,12172
 feldera/rest/__init__.py,sha256=Eg-EKUU3RSTDcdxTR_7wNDnCly8VpXEzsZCQUmf-y2M,308
 feldera/rest/_helpers.py,sha256=q7jWInKp9IiIli8N5o31lDG3hNUbcsJqufZXYHG04ps,222
 feldera/rest/_httprequests.py,sha256=-jYIt7fTnZf1CNqAsWvU0XVZt4exsLTOKqf9PXLrAKU,8117
@@ -19,7 +19,7 @@ feldera/rest/pipeline.py,sha256=Rmbflbwjvd86iZ5aSJ5b_bTSs6vgvEKQFwMZDtm0nxE,2835
 feldera/rest/sql_table.py,sha256=qrw-YwMzx5T81zDefNO1KOx7EyypFz1vPwGBzSUB7kc,652
 feldera/rest/sql_view.py,sha256=hN12mPM0mvwLCIPYywpb12s9Hd2Ws31IlTMXPriMisw,644
 feldera/tests/test_datafusionize.py,sha256=NGriTaTWf_WnXFud1wmpFwLFa_-XGjfCh6La3dWc3QA,1337
-feldera-0.136.0.dist-info/METADATA,sha256=Gmkq19v6uTYQlfsQh_wHMn3zFxZSRWt-yjdr01nhSFY,2368
-feldera-0.136.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-feldera-0.136.0.dist-info/top_level.txt,sha256=fB6yTqrQiO6RCbY1xP2T_mpPoTjDFtJvkJJodiee7d0,8
-feldera-0.136.0.dist-info/RECORD,,
+feldera-0.138.0.dist-info/METADATA,sha256=W-i6CyFsXvpXl6PZK9Tx6epbZ4a9XUhFvmDy6tetFmw,2368
+feldera-0.138.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+feldera-0.138.0.dist-info/top_level.txt,sha256=fB6yTqrQiO6RCbY1xP2T_mpPoTjDFtJvkJJodiee7d0,8
+feldera-0.138.0.dist-info/RECORD,,