feldera 0.131.0__tar.gz → 0.133.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of feldera might be problematic. Click here for more details.
- {feldera-0.131.0 → feldera-0.133.0}/PKG-INFO +1 -1
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/errors.py +16 -0
- feldera-0.133.0/feldera/tests/test_datafusionize.py +38 -0
- feldera-0.133.0/feldera/testutils.py +188 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera.egg-info/PKG-INFO +1 -1
- {feldera-0.131.0 → feldera-0.133.0}/feldera.egg-info/SOURCES.txt +2 -0
- {feldera-0.131.0 → feldera-0.133.0}/pyproject.toml +1 -1
- {feldera-0.131.0 → feldera-0.133.0}/README.md +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/__init__.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/_callback_runner.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/_helpers.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/enums.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/output_handler.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/pipeline.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/pipeline_builder.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/__init__.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/_helpers.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/_httprequests.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/config.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/feldera_client.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/feldera_config.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/pipeline.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/sql_table.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/rest/sql_view.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/runtime_config.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera/stats.py +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera.egg-info/dependency_links.txt +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera.egg-info/requires.txt +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/feldera.egg-info/top_level.txt +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/setup.cfg +0 -0
- {feldera-0.131.0 → feldera-0.133.0}/tests/test_uda.py +0 -0
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from requests import Response
|
|
2
2
|
import json
|
|
3
|
+
from urllib.parse import urlparse
|
|
3
4
|
|
|
4
5
|
|
|
5
6
|
class FelderaError(Exception):
|
|
@@ -40,6 +41,21 @@ class FelderaAPIError(FelderaError):
|
|
|
40
41
|
self.details = json_data.get("details")
|
|
41
42
|
except Exception:
|
|
42
43
|
self.message = request.text
|
|
44
|
+
err_msg += request.text
|
|
45
|
+
|
|
46
|
+
err_msg += f"\nResponse Status: {request.status_code}"
|
|
47
|
+
|
|
48
|
+
if int(request.status_code) == 401:
|
|
49
|
+
parsed = urlparse(request.request.url)
|
|
50
|
+
|
|
51
|
+
auth_err = f"\nAuthorization error: Failed to connect to '{parsed.scheme}://{parsed.hostname}': "
|
|
52
|
+
auth = request.request.headers.get("Authorization")
|
|
53
|
+
if auth is None:
|
|
54
|
+
err_msg += f"{auth_err} API key not set"
|
|
55
|
+
else:
|
|
56
|
+
err_msg += f"{auth_err} invalid API key"
|
|
57
|
+
|
|
58
|
+
err_msg = err_msg.strip()
|
|
43
59
|
|
|
44
60
|
super().__init__(err_msg)
|
|
45
61
|
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
import unittest

from feldera.testutils import datafusionize


class TestDatafusionize(unittest.TestCase):
    """Unit tests for the Feldera-SQL -> DataFusion-SQL rewriting helper."""

    def test_datafusionize(self):
        # Use unittest's assertIn (rather than bare `assert`) so failures
        # report both the needle and the actual rewritten query.

        # SORT_ARRAY is rewritten to DataFusion's array_sort.
        result = datafusionize("SELECT SORT_ARRAY(col1) FROM table1")
        self.assertIn("array_sort(col1)", result)

        # TRUNCATE is rewritten to trunc.
        result = datafusionize("SELECT TRUNCATE(value, 2) FROM table2")
        self.assertIn("trunc(value, 2)", result)

        # TIMESTAMP_TRUNC(MAKE_TIMESTAMP(args), unit) becomes
        # DATE_TRUNC('unit', TO_TIMESTAMP(args)).
        result = datafusionize(
            "SELECT TIMESTAMP_TRUNC(MAKE_TIMESTAMP(2023, 1, 15, 10, 30, 0), DAY) FROM table3"
        )
        self.assertIn(
            "DATE_TRUNC('DAY', TO_TIMESTAMP(2023, 1, 15, 10, 30, 0))", result
        )

        result = datafusionize(
            "TIMESTAMP_TRUNC(MAKE_TIMESTAMP(order_group_last_activity_time), hour) AS window_start_time,"
        )
        self.assertIn(
            "DATE_TRUNC('hour', TO_TIMESTAMP(order_group_last_activity_time)) AS window_start_time,",
            result,
        )

        # Matching is case-insensitive.
        result = datafusionize("SELECT sort_array(col) FROM table WHERE truncate(val) > 0")
        self.assertIn("array_sort(col)", result)
        self.assertIn("trunc(val)", result)


if __name__ == "__main__":
    unittest.main()
|
|
@@ -0,0 +1,188 @@
|
|
|
1
|
+
"Utility functions for writing tests against a Feldera instance."
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import re
|
|
5
|
+
import time
|
|
6
|
+
import json
|
|
7
|
+
import unittest
|
|
8
|
+
from typing import cast
|
|
9
|
+
|
|
10
|
+
from feldera.enums import CompilationProfile
|
|
11
|
+
from feldera.pipeline import Pipeline
|
|
12
|
+
from feldera.pipeline_builder import PipelineBuilder
|
|
13
|
+
from feldera.runtime_config import RuntimeConfig
|
|
14
|
+
from feldera.rest import FelderaClient
|
|
15
|
+
|
|
16
|
+
# Connection settings for the Feldera instance under test. Everything is
# overridable via environment variables so the same tests run locally and
# in CI.
_env = os.environ.get

API_KEY = _env("FELDERA_API_KEY")
# FELDERA_HOST wins over the legacy FELDERA_BASE_URL; empty strings count
# as unset because of the `or` chain. Defaults to a local instance.
BASE_URL = _env("FELDERA_HOST") or _env("FELDERA_BASE_URL") or "http://localhost:8080"
# Kafka broker address as seen from the test process ...
KAFKA_SERVER = _env("FELDERA_KAFKA_SERVER", "localhost:19092")
# ... and as seen from inside the Feldera pipeline (container network).
PIPELINE_TO_KAFKA_SERVER = _env("FELDERA_PIPELINE_TO_KAFKA_SERVER", "redpanda:9092")
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class _LazyClient:
    """Defer constructing the FelderaClient until it is actually used.

    Importing this module must not attempt a connection; the real client is
    built on the first attribute access or call and then reused.
    """

    __slots__ = ("_client",)

    def __init__(self):
        # The real FelderaClient, created on demand by _ensure().
        self._client = None

    def _ensure(self):
        """Create the underlying client on first use; return it thereafter."""
        client = self._client
        if client is None:
            client = FelderaClient(
                BASE_URL, api_key=API_KEY, connection_timeout=10, requests_verify=False
            )
            self._client = client
        return client

    def __getattr__(self, name):
        # Transparently proxy unknown attributes to the lazily-built client.
        return getattr(self._ensure(), name)

    def __call__(self, *a, **kw) -> FelderaClient:
        return self._ensure()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
TEST_CLIENT = cast(FelderaClient, _LazyClient())
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def unique_pipeline_name(base_name: str) -> str:
    """
    Derive a pipeline name that is unique per CI run.

    In CI, tests from different runs can share one Feldera instance, so the
    name is prefixed with the first 5 characters of the commit SHA (or
    'local' when not running in CI) to keep pipelines from colliding.
    """
    sha = os.getenv("GITHUB_SHA", "local")
    prefix = sha[:5]
    return "{}_{}".format(prefix, base_name)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def enterprise_only(fn):
    """Skip the decorated test unless the instance is an enterprise edition.

    Also tags the function with ``_enterprise_only`` so tooling can identify
    enterprise-only tests without executing them. Note the edition check
    queries the Feldera instance at decoration (import) time.
    """
    fn._enterprise_only = True
    skip_unless_enterprise = unittest.skipUnless(
        TEST_CLIENT.get_config().edition.is_enterprise(),
        f"{fn.__name__} is enterprise only, skipping",
    )
    return skip_unless_enterprise(fn)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def datafusionize(query: str) -> str:
    r"""
    Rewrite a Feldera SQL query into equivalent DataFusion SQL.

    Feldera and DataFusion (used for ad-hoc validation queries) spell a few
    functions differently:

    - ``SORT_ARRAY(x)``  -> ``array_sort(x)``
    - ``TRUNCATE(x, n)`` -> ``trunc(x, n)``
    - ``TIMESTAMP_TRUNC(MAKE_TIMESTAMP(args), unit)``
      -> ``DATE_TRUNC('unit', TO_TIMESTAMP(args))``

    Matching is case-insensitive. ``\b`` word boundaries keep the rewrite
    from corrupting identifiers that merely contain a function name (e.g. a
    column called ``truncated_at`` previously became ``truncd_at``).
    """
    sort_array_pattern = re.compile(r"\bSORT_ARRAY\b", re.IGNORECASE)
    truncate_pattern = re.compile(r"\bTRUNCATE\b", re.IGNORECASE)
    # NOTE: ``[^)]+`` stops at the first ')' — nested calls inside
    # MAKE_TIMESTAMP's argument list are not supported.
    timestamp_trunc_pattern = re.compile(
        r"TIMESTAMP_TRUNC\s*\(\s*MAKE_TIMESTAMP\s*\(\s*([^)]+)\s*\)\s*,\s*([A-Z]+)\s*\)",
        re.IGNORECASE,
    )

    result = sort_array_pattern.sub("array_sort", query)
    result = truncate_pattern.sub("trunc", result)
    result = timestamp_trunc_pattern.sub(r"DATE_TRUNC('\2', TO_TIMESTAMP(\1))", result)
    return result
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def validate_view(
|
|
86
|
+
pipeline: Pipeline, view_name: str, view_query: str | tuple[str, str]
|
|
87
|
+
):
|
|
88
|
+
print(f"Validating view '{view_name}'")
|
|
89
|
+
|
|
90
|
+
# We have two modes to verify the view, either we run the same SQL as the view against datafusion
|
|
91
|
+
# by `datafusionizing` the query, or a weaker form where we pass a hash of what the result
|
|
92
|
+
# should look like and check that the hash hasn't changed
|
|
93
|
+
if isinstance(view_query, tuple):
|
|
94
|
+
_view_definition, original_hash = view_query
|
|
95
|
+
view_query = f"select * from {view_name}"
|
|
96
|
+
computed_hash = pipeline.query_hash(view_query)
|
|
97
|
+
if computed_hash != original_hash:
|
|
98
|
+
raise AssertionError(
|
|
99
|
+
f"View {view_name} hash {computed_hash} was but expected hash {original_hash}"
|
|
100
|
+
)
|
|
101
|
+
else:
|
|
102
|
+
# TODO: count records
|
|
103
|
+
view_query = datafusionize(view_query)
|
|
104
|
+
try:
|
|
105
|
+
extra_rows = list(
|
|
106
|
+
pipeline.query(f"(select * from {view_name}) except ({view_query})")
|
|
107
|
+
)
|
|
108
|
+
missing_rows = list(
|
|
109
|
+
pipeline.query(f"({view_query}) except (select * from {view_name})")
|
|
110
|
+
)
|
|
111
|
+
|
|
112
|
+
if extra_rows:
|
|
113
|
+
print(
|
|
114
|
+
"Extra rows in Feldera output, but not in the ad hoc query output"
|
|
115
|
+
)
|
|
116
|
+
print(json.dumps(extra_rows))
|
|
117
|
+
|
|
118
|
+
if missing_rows:
|
|
119
|
+
print(
|
|
120
|
+
"Extra rows in the ad hoc query output, but not in Feldera output"
|
|
121
|
+
)
|
|
122
|
+
print(json.dumps(missing_rows))
|
|
123
|
+
except Exception as e:
|
|
124
|
+
print(f"Error querying view '{view_name}': {e}")
|
|
125
|
+
print(f"Ad-hoc Query: {view_query}")
|
|
126
|
+
raise
|
|
127
|
+
|
|
128
|
+
if extra_rows or missing_rows:
|
|
129
|
+
raise AssertionError(f"Validation failed for view {view_name}")
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def run_workload(pipeline_name: str, tables: dict, views: dict):
    """
    Helper to run a pipeline to completion and validate the views afterwards
    using ad-hoc queries.

    Use this for large-scale workload and standard benchmarks (like TPC-H
    etc.) where you plan to ingest a lot of data and validate the results.
    For testing more specific functionality, see frameworks in the `tests`
    directory.
    """
    # Assemble the program: all table DDL first, then one materialized view
    # per entry. A view value is either a plain query string or a
    # (query, expected_hash) tuple; the hash only matters at validation time.
    parts = [f"{table_sql}\n" for table_sql in tables.values()]
    for view_name, view in views.items():
        if isinstance(view, tuple):
            view_query, _hash = view
        else:
            view_query = view
        parts.append(f"create materialized view {view_name} as {view_query};\n\n")
    sql = "".join(parts)

    pipeline = PipelineBuilder(
        TEST_CLIENT,
        unique_pipeline_name(pipeline_name),
        sql=sql,
        compilation_profile=CompilationProfile.OPTIMIZED,
        runtime_config=RuntimeConfig(provisioning_timeout_secs=60),
    ).create_or_replace()

    pipeline.start()
    ingest_started = time.monotonic()

    # Transactions may not be supported everywhere; this is best-effort.
    try:
        pipeline.start_transaction()
    except Exception as e:
        print(f"Error starting transaction: {e}")

    pipeline.wait_for_completion(force_stop=False, timeout_s=3600)
    print(f"Data ingested in {time.monotonic() - ingest_started}")

    try:
        commit_started = time.monotonic()
        pipeline.commit_transaction(transaction_id=None, wait=True, timeout_s=None)
    except Exception as e:
        print(f"Error committing transaction: {e}")
    finally:
        print(f"Commit took {time.monotonic() - commit_started}")

    # Sanity output: per-table row counts.
    for table in tables:
        row_count = list(pipeline.query(f"select count(*) from {table}"))
        print(f"Table '{table}' count(*):\n{row_count}")

    for view_name, view_query in views.items():
        validate_view(pipeline, view_name, view_query)

    pipeline.stop(force=True)
|
|
@@ -9,6 +9,7 @@ feldera/pipeline.py
|
|
|
9
9
|
feldera/pipeline_builder.py
|
|
10
10
|
feldera/runtime_config.py
|
|
11
11
|
feldera/stats.py
|
|
12
|
+
feldera/testutils.py
|
|
12
13
|
feldera.egg-info/PKG-INFO
|
|
13
14
|
feldera.egg-info/SOURCES.txt
|
|
14
15
|
feldera.egg-info/dependency_links.txt
|
|
@@ -24,4 +25,5 @@ feldera/rest/feldera_config.py
|
|
|
24
25
|
feldera/rest/pipeline.py
|
|
25
26
|
feldera/rest/sql_table.py
|
|
26
27
|
feldera/rest/sql_view.py
|
|
28
|
+
feldera/tests/test_datafusionize.py
|
|
27
29
|
tests/test_uda.py
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|