opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/catalog/compaction.py +15 -8
- opteryx_catalog/catalog/dataset.py +449 -111
- opteryx_catalog/catalog/manifest.py +390 -330
- opteryx_catalog/catalog/metadata.py +3 -0
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +82 -54
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- tests/test_collections.py +37 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
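The new `opteryx_catalog/maki_nage/` package vendors the Maki Nage `distogram` streaming-histogram library, plausibly feeding the `histogram_counts`/`histogram_bins` columns that appear in the manifest schema below. A minimal sketch of the API, assuming the vendored module keeps the upstream `distogram` surface that its bundled tests (`test_update`, `test_bounds`, `test_count`, `test_quantile`) exercise:

```python
# Sketch only: assumes the vendored module mirrors the upstream distogram API.
from opteryx_catalog.maki_nage import distogram

h = distogram.Distogram()           # fixed bin budget, constant memory
for value in [1.0, 2.0, 2.0, 5.0, 8.0]:
    h = distogram.update(h, value)  # streaming insert; raw values are not retained

print(distogram.count(h))           # 5 values seen
print(distogram.bounds(h))          # (1.0, 8.0)
print(distogram.quantile(h, 0.5))   # approximate median
```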
tests/test_describe_uncompressed.py
ADDED

@@ -0,0 +1,127 @@
+import io
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from opteryx_catalog.catalog.dataset import SimpleDataset
+from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+
+
+class _MemInput:
+    def __init__(self, data: bytes):
+        self._data = data
+
+    def open(self):
+        # Provide a file-like BytesIO whose .read() returns the bytes
+        return io.BytesIO(self._data)
+
+
+class _MemIO:
+    def __init__(self, mapping: dict):
+        self._mapping = mapping
+
+    def new_input(self, path: str):
+        return _MemInput(self._mapping[path])
+
+
+def _build_manifest_bytes():
+    # Construct a parquet manifest with two entries, two columns per file
+    schema = pa.schema(
+        [
+            ("file_path", pa.string()),
+            ("file_format", pa.string()),
+            ("record_count", pa.int64()),
+            ("file_size_in_bytes", pa.int64()),
+            ("uncompressed_size_in_bytes", pa.int64()),
+            ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+            ("null_counts", pa.list_(pa.int64())),
+            ("min_k_hashes", pa.list_(pa.int64())),
+            ("histogram_counts", pa.list_(pa.int64())),
+            ("histogram_bins", pa.int64()),
+            ("min_values", pa.list_(pa.int64())),
+            ("max_values", pa.list_(pa.int64())),
+            ("min_values_display", pa.list_(pa.string())),
+            ("max_values_display", pa.list_(pa.string())),
+        ]
+    )
+
+    file_path = pa.array(["f1.parquet", "f2.parquet"], type=pa.string())
+    file_format = pa.array(["parquet", "parquet"], type=pa.string())
+    record_count = pa.array([10, 20], type=pa.int64())
+    file_size_in_bytes = pa.array([100, 200], type=pa.int64())
+    uncompressed_size_in_bytes = pa.array([1000, 2000], type=pa.int64())
+    column_uncompressed_sizes_in_bytes = pa.array(
+        [[100, 400], [300, 200]], type=pa.list_(pa.int64())
+    )
+    null_counts = pa.array([[0, 0], [0, 0]], type=pa.list_(pa.int64()))
+    min_k_hashes = pa.array([[1, 2], [1]], type=pa.list_(pa.int64()))
+    histogram_counts = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64()))
+    histogram_bins = pa.array([32, 32], type=pa.int64())
+    min_values = pa.array([[10, 20], [5, 30]], type=pa.list_(pa.int64()))
+    max_values = pa.array([[100, 400], [300, 200]], type=pa.list_(pa.int64()))
+    min_values_display = pa.array([[None, None], [None, None]], type=pa.list_(pa.string()))
+    max_values_display = pa.array([[None, None], [None, None]], type=pa.list_(pa.string()))
+
+    table = pa.Table.from_arrays(
+        [
+            file_path,
+            file_format,
+            record_count,
+            file_size_in_bytes,
+            uncompressed_size_in_bytes,
+            column_uncompressed_sizes_in_bytes,
+            null_counts,
+            min_k_hashes,
+            histogram_counts,
+            histogram_bins,
+            min_values,
+            max_values,
+            min_values_display,
+            max_values_display,
+        ],
+        schema=schema,
+    )
+
+    buf = io.BytesIO()
+    pq.write_table(table, buf)
+    return buf.getvalue()
+
+
+def test_describe_includes_uncompressed_bytes():
+    manifest_bytes = _build_manifest_bytes()
+    manifest_path = "mem://manifest"
+
+    meta = DatasetMetadata(
+        dataset_identifier="tests_temp.test",
+        location="mem://",
+        schema=None,
+        properties={},
+    )
+
+    # Add a schema with two columns so describe() can map names -> indices
+    meta.schemas.append({"schema_id": "s1", "columns": [{"name": "a"}, {"name": "b"}]})
+    meta.current_schema_id = "s1"
+
+    # Prepare snapshot referencing our in-memory manifest
+    snap = Snapshot(
+        snapshot_id=1,
+        timestamp_ms=1,
+        manifest_list=manifest_path,
+    )
+    meta.snapshots.append(snap)
+    meta.current_snapshot_id = 1
+
+    ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+
+    # Inject our in-memory IO mapping
+    ds.io = _MemIO({manifest_path: manifest_bytes})
+
+    desc = ds.describe()
+
+    assert "a" in desc
+    assert "b" in desc
+
+    # Column 'a' should have uncompressed bytes = 100 + 300 = 400
+    assert desc["a"]["uncompressed_bytes"] == 400
+    # Column 'b' should have uncompressed bytes = 400 + 200 = 600
+    assert desc["b"]["uncompressed_bytes"] == 600
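The assertions pin down the aggregation rule: `describe()` evidently sums `column_uncompressed_sizes_in_bytes` position-wise across manifest entries, using the current schema's column order as the list index. A hypothetical reduction consistent with the fixture above (the real implementation lives in `dataset.py` and may differ):

```python
import io

import pyarrow.parquet as pq


def column_uncompressed_bytes(manifest_bytes: bytes, column_names: list) -> dict:
    # Sum the i-th per-file size into the i-th schema column.
    table = pq.read_table(io.BytesIO(manifest_bytes))
    totals = {name: 0 for name in column_names}
    for per_file in table.column("column_uncompressed_sizes_in_bytes").to_pylist():
        for i, name in enumerate(column_names):
            totals[name] += per_file[i]
    return totals


# With the fixture above: {'a': 100 + 300, 'b': 400 + 200} == {'a': 400, 'b': 600}
```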
tests/test_refresh_manifest.py
ADDED

@@ -0,0 +1,275 @@
+import io
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+import os
+import sys
+
+# Add local paths to sys.path to use local code instead of installed packages
+sys.path.insert(0, os.path.join(sys.path[0], ".."))  # Add parent dir for pyiceberg_firestore_gcs
+sys.path.insert(1, os.path.join(sys.path[0], "../opteryx-core"))
+sys.path.insert(1, os.path.join(sys.path[0], "../pyiceberg-firestore-gcs"))
+
+
+from opteryx_catalog.catalog.dataset import SimpleDataset
+from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+from opteryx_catalog.catalog.manifest import (
+    build_parquet_manifest_entry_from_bytes,
+    get_manifest_metrics,
+    reset_manifest_metrics,
+)
+from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+import pytest
+
+
+def test_min_k_hashes_for_string_and_binary():
+    try:
+        import opteryx.draken  # type: ignore  # noqa: F401
+    except Exception:
+        pytest.skip("opteryx.draken not available")
+
+    import pyarrow as pa
+
+    # short binary and short string columns should get min-k
+    t = _make_parquet_table(
+        [("bin", pa.binary()), ("s", pa.string())], [(b"a", "x"), (b"b", "y"), (b"c", "z")]
+    )
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+    e = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+    assert len(e.min_k_hashes[0]) > 0
+    assert len(e.min_k_hashes[1]) > 0
+
+
+# Step 1: Create a local catalog
+catalog = OpteryxCatalog(
+    "opteryx",
+    firestore_project="mabeldev",
+    firestore_database="catalogs",
+    gcs_bucket="opteryx_data",
+)
+
+# print(catalog.load_dataset("ops.stdout_log").describe())
+
+
+class _MemInput:
+    def __init__(self, data: bytes):
+        self._data = data
+
+    def open(self):
+        return io.BytesIO(self._data)
+
+
+class _MemIO:
+    def __init__(self, mapping: dict):
+        self._mapping = mapping
+
+    def new_input(self, path: str):
+        return _MemInput(self._mapping[path])
+
+    def new_output(self, path: str):
+        class Out:
+            def __init__(self, mapping, path):
+                self._buf = io.BytesIO()
+                self._mapping = mapping
+                self._path = path
+
+            def write(self, data: bytes):
+                self._buf.write(data)
+
+            def close(self):
+                self._mapping[self._path] = self._buf.getvalue()
+
+            def create(self):
+                return self
+
+        return Out(self._mapping, path)
+
+
+class _FakeCatalog:
+    def __init__(self, io):
+        self.io = io
+
+    def write_parquet_manifest(
+        self, snapshot_id: int, entries: list[dict], dataset_location: str
+    ) -> str:
+        # Minimal manifest writer using same schema as production
+        schema = pa.schema(
+            [
+                ("file_path", pa.string()),
+                ("file_format", pa.string()),
+                ("record_count", pa.int64()),
+                ("file_size_in_bytes", pa.int64()),
+                ("uncompressed_size_in_bytes", pa.int64()),
+                ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+                ("null_counts", pa.list_(pa.int64())),
+                ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
+                ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
+                ("histogram_bins", pa.int32()),
+                ("min_values", pa.list_(pa.int64())),
+                ("max_values", pa.list_(pa.int64())),
+                ("min_values_display", pa.list_(pa.string())),
+                ("max_values_display", pa.list_(pa.string())),
+            ]
+        )
+        normalized = []
+        for ent in entries:
+            if not isinstance(ent, dict):
+                normalized.append(ent)
+                continue
+            e = dict(ent)
+            e.setdefault("min_k_hashes", [])
+            e.setdefault("histogram_counts", [])
+            e.setdefault("histogram_bins", 0)
+            e.setdefault("column_uncompressed_sizes_in_bytes", [])
+            e.setdefault("null_counts", [])
+            e.setdefault("min_values_display", [])
+            e.setdefault("max_values_display", [])
+            mv = e.get("min_values") or []
+            xv = e.get("max_values") or []
+            mv_disp = e.get("min_values_display") or []
+            xv_disp = e.get("max_values_display") or []
+            e["min_values"] = [int(v) if v is not None else None for v in mv]
+            e["max_values"] = [int(v) if v is not None else None for v in xv]
+            e["min_values_display"] = [str(v) if v is not None else None for v in mv_disp]
+            e["max_values_display"] = [str(v) if v is not None else None for v in xv_disp]
+            normalized.append(e)
+
+        table = pa.Table.from_pylist(normalized, schema=schema)
+        buf = pa.BufferOutputStream()
+        pq.write_table(table, buf, compression="zstd")
+        data = buf.getvalue().to_pybytes()
+        path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"
+        out = self.io.new_output(path).create()
+        out.write(data)
+        out.close()
+        return path
+
+
+def _make_parquet_table(columns: list[tuple[str, pa.DataType]], rows: list[tuple]):
+    arrays = []
+    for i, (name, dtype) in enumerate(columns):
+        col_vals = [r[i] for r in rows]
+        arrays.append(pa.array(col_vals, type=dtype))
+    return pa.Table.from_arrays(arrays, names=[c[0] for c in columns])
+
+
+def test_build_manifest_from_bytes_matches_table():
+    # ensure the bytes-based builder matches the table-based one
+    t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+
+    e_bytes = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+    # basic sanity checks (parity is enforced by using orig_table when available)
+    assert e_bytes.record_count == 2
+    assert e_bytes.file_size_in_bytes == len(data)
+
+
+def test_manifest_metrics_increments():
+    reset_manifest_metrics()
+    t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+
+    _ = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+    m = get_manifest_metrics()
+    assert m.get("files_read", 0) >= 1
+    assert m.get("hash_calls", 0) >= 1
+    assert m.get("compress_calls", 0) >= 1
+
+
+def test_table_based_builder_is_removed():
+    from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry
+
+    t = _make_parquet_table([("a", pa.int64())], [(1,)])
+    with pytest.raises(RuntimeError):
+        _ = build_parquet_manifest_entry(t, "mem://f", 0)
+
+
+def test_manifest_uses_rugo_for_sizes():
+    # Ensure the bytes-based builder uses rugo metadata to compute per-column sizes
+    reset_manifest_metrics()
+    t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+
+    entry = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data))
+    m = get_manifest_metrics()
+
+    # rugo should report sizes (non-zero) for these synthetic files
+    assert m.get("sizes_from_rugo", 0) >= 1 or m.get("sizes_from_rugo_missing", 0) == 0
+    assert entry.uncompressed_size_in_bytes >= 0
+    assert isinstance(entry.column_uncompressed_sizes_in_bytes, list)
+    assert len(entry.column_uncompressed_sizes_in_bytes) == 2
+    # column sizes may be non-zero when metadata is available
+    assert all(isinstance(x, int) for x in entry.column_uncompressed_sizes_in_bytes)
+
+
+def test_refresh_manifest_with_single_file():
+    # single file with columns a,b for quick iteration
+    t1 = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+
+    # Write parquet file to mem
+    buf = pa.BufferOutputStream()
+    pq.write_table(t1, buf, compression="zstd")
+    d1 = buf.getvalue().to_pybytes()
+
+    f1 = "mem://data/f1.parquet"
+    manifest_path = "mem://manifest-old"
+
+    # Build initial manifest entry for single file (bytes-based builder)
+    e1 = build_parquet_manifest_entry_from_bytes(d1, f1, len(d1), orig_table=t1).to_dict()
+
+    # Create in-memory IO mapping including manifest and data file
+    mapping = {f1: d1}
+
+    # Write initial manifest with the single entry using the same writer as the catalog
+    fake_writer = _FakeCatalog(_MemIO(mapping))
+    manifest_path = fake_writer.write_parquet_manifest(1, [e1], "mem://")
+    # Ensure the manifest bytes are present in the mapping
+    assert manifest_path in mapping
+
+    # Persist the single-file manifest as JSON for quick inspection during
+    # iterative debugging (writes to repo `artifacts/` so you can open it).
+    import os
+    import json
+
+    artifacts_dir = os.path.join(os.getcwd(), "artifacts")
+    os.makedirs(artifacts_dir, exist_ok=True)
+    with open(
+        os.path.join(artifacts_dir, "single_file_manifest.json"), "w", encoding="utf-8"
+    ) as fh:
+        json.dump(e1, fh, indent=2, default=str)
+
+    # Create metadata and snapshot
+    meta = DatasetMetadata(
+        dataset_identifier="tests_temp.test", location="mem://", schema=None, properties={}
+    )
+    meta.schemas.append({"schema_id": "s1", "columns": [{"name": "a"}, {"name": "b"}]})
+    meta.current_schema_id = "s1"
+    snap = Snapshot(snapshot_id=1, timestamp_ms=1, manifest_list=manifest_path)
+    meta.snapshots.append(snap)
+    meta.current_snapshot_id = 1
+
+    ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+    ds.io = _MemIO(mapping)
+    ds.catalog = _FakeCatalog(ds.io)
+
+    # Refresh manifest (should re-read f1 and write a new manifest)
+    new_snap_id = ds.refresh_manifest(agent="test-agent", author="tester")
+    assert new_snap_id is not None
+
+    # Describe should include both columns and count bytes appropriately
+    desc = ds.describe()
+    assert "a" in desc
+    assert "b" in desc
+
+    # ensure uncompressed bytes are present and non-zero for both cols
+    assert desc["a"]["uncompressed_bytes"] > 0
+    assert desc["b"]["uncompressed_bytes"] > 0
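As a standalone sanity check (not part of the package), the fake writer's output round-trips through pyarrow, since `_MemIO.new_output` publishes the finished bytes into the shared mapping on `close()`:

```python
# Illustrative only; assumes _FakeCatalog and _MemIO from the test above are in scope.
import io

import pyarrow.parquet as pq

mapping: dict = {}
writer = _FakeCatalog(_MemIO(mapping))
path = writer.write_parquet_manifest(snapshot_id=7, entries=[], dataset_location="mem://")

assert path == "mem:///metadata/manifest-7.parquet"
table = pq.read_table(io.BytesIO(mapping[path]))
assert "column_uncompressed_sizes_in_bytes" in table.schema.names
```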
tests/test_webhooks.py
ADDED
@@ -0,0 +1,177 @@
+"""Tests for the webhook system."""
+
+import os
+from unittest.mock import MagicMock
+from unittest.mock import patch
+
+import pytest
+
+
+def test_webhook_manager_disabled_without_domain():
+    """Test that webhook manager is disabled when no domain is configured."""
+    from opteryx_catalog.webhooks import WebhookManager
+
+    # Clear any existing env vars
+    os.environ.pop("OPTERYX_WEBHOOK_DOMAIN", None)
+    os.environ.pop("OPTERYX_WEBHOOK_QUEUE", None)
+
+    manager = WebhookManager()
+    assert not manager.enabled
+
+    # Should return False without making any HTTP calls
+    result = manager.send(
+        action="create",
+        workspace="test",
+        collection="test",
+        resource_type="dataset",
+        resource_name="test",
+    )
+    assert result is False
+
+
+def test_webhook_manager_direct_http():
+    """Test that webhooks are sent via direct HTTP when queue is not configured."""
+    from opteryx_catalog.webhooks import WebhookManager
+
+    with patch("opteryx_catalog.webhooks.requests.post") as mock_post:
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_post.return_value = mock_response
+
+        manager = WebhookManager(domain="router.example.com", queue_path=None)
+        assert manager.enabled
+        assert manager._tasks_client is None
+
+        result = manager.send(
+            action="create",
+            workspace="test-workspace",
+            collection="test-collection",
+            resource_type="dataset",
+            resource_name="test-dataset",
+            payload={"location": "gs://bucket/path"},
+        )
+
+        assert result is True
+        mock_post.assert_called_once()
+
+        # Verify the call arguments
+        call_args = mock_post.call_args
+        assert call_args.args[0] == "https://router.example.com/event"
+        assert call_args.kwargs["json"]["event"]["action"] == "create"
+        assert call_args.kwargs["json"]["event"]["resource_type"] == "dataset"
+        assert call_args.kwargs["json"]["event"]["resource_name"] == "test-dataset"
+        assert call_args.kwargs["json"]["data"]["location"] == "gs://bucket/path"
+
+
+def test_webhook_manager_payload_building():
+    """Test that webhook payloads are built correctly."""
+    from opteryx_catalog.webhooks import WebhookManager
+
+    manager = WebhookManager(domain="hook.example.com")
+
+    payload = manager._build_payload(
+        action="update",
+        workspace="ws",
+        collection="col",
+        resource_type="dataset",
+        resource_name="ds",
+        additional={"description": "New description"},
+    )
+
+    assert payload["event"]["action"] == "update"
+    assert payload["event"]["workspace"] == "ws"
+    assert payload["event"]["collection"] == "col"
+    assert payload["event"]["resource_type"] == "dataset"
+    assert payload["event"]["resource_name"] == "ds"
+    assert "timestamp" in payload["event"]
+    assert payload["data"]["description"] == "New description"
+
+
+def test_webhook_http_failure_returns_false():
+    """Test that HTTP failures return False without raising exceptions."""
+    from opteryx_catalog.webhooks import WebhookManager
+
+    with patch("opteryx_catalog.webhooks.requests.post") as mock_post:
+        # Simulate HTTP error
+        mock_post.side_effect = Exception("Connection failed")
+
+        manager = WebhookManager(domain="router.example.com")
+        result = manager.send(
+            action="create",
+            workspace="test",
+            collection="test",
+            resource_type="dataset",
+            resource_name="test",
+        )
+
+        assert result is False
+
+
+def test_send_webhook_convenience_function():
+    """Test the convenience send_webhook function."""
+    from opteryx_catalog.webhooks import send_webhook
+
+    with patch("opteryx_catalog.webhooks.requests.post") as mock_post:
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_post.return_value = mock_response
+
+        os.environ["OPTERYX_WEBHOOK_DOMAIN"] = "router.example.com"
+        os.environ.pop("OPTERYX_WEBHOOK_QUEUE", None)
+
+        # Reset the global manager to pick up new env vars
+        import opteryx_catalog.webhooks as webhook_module
+
+        webhook_module._webhook_manager = None
+
+        result = send_webhook(
+            action="create",
+            workspace="test",
+            collection="test",
+            resource_type="dataset",
+            resource_name="test",
+            payload={"snapshot_id": 123},
+        )
+
+        assert result is True
+        mock_post.assert_called_once()
+
+        # Clean up
+        os.environ.pop("OPTERYX_WEBHOOK_DOMAIN", None)
+
+
+def test_event_payload_builders():
+    """Test the event payload builder functions."""
+    from opteryx_catalog.webhooks.events import dataset_commit_payload
+    from opteryx_catalog.webhooks.events import dataset_created_payload
+    from opteryx_catalog.webhooks.events import view_created_payload
+    from opteryx_catalog.webhooks.events import view_executed_payload
+
+    # Test dataset created
+    payload = dataset_created_payload(
+        schema=None, location="gs://bucket/path", properties={"key": "value"}
+    )
+    assert payload["location"] == "gs://bucket/path"
+    assert payload["properties"]["key"] == "value"
+
+    # Test dataset commit
+    payload = dataset_commit_payload(
+        snapshot_id=123, sequence_number=5, record_count=1000, file_count=2
+    )
+    assert payload["snapshot_id"] == 123
+    assert payload["sequence_number"] == 5
+    assert payload["record_count"] == 1000
+    assert payload["file_count"] == 2
+
+    # Test view created
+    payload = view_created_payload(definition="SELECT * FROM table", properties={})
+    assert payload["definition"] == "SELECT * FROM table"
+
+    # Test view executed
+    payload = view_executed_payload(execution_time_ms=1500, row_count=100)
+    assert payload["execution_time_ms"] == 1500
+    assert payload["row_count"] == 100
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
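Read together, the assertions imply the body POSTed to `https://<OPTERYX_WEBHOOK_DOMAIN>/event` has the shape below; fields not asserted on (for example the exact `timestamp` format) are unknown from the tests alone:

```python
# Inferred from the test assertions; hypothetical beyond the asserted fields.
body = {
    "event": {
        "action": "create",             # also "update", etc.
        "workspace": "test-workspace",
        "collection": "test-collection",
        "resource_type": "dataset",
        "resource_name": "test-dataset",
        "timestamp": "...",             # set by WebhookManager; format not asserted
    },
    "data": {"location": "gs://bucket/path"},  # the caller-supplied payload
}
```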
opteryx_catalog-0.4.11.dist-info/RECORD
REMOVED

@@ -1,25 +0,0 @@
-opteryx_catalog/__init__.py,sha256=cqGY7bl6iMBIqY_x6VTc5fAFH23M3XQeJYrHPX6FglY,902
-opteryx_catalog/exceptions.py,sha256=ZEaXmrrn030V8pfy8YMaLwzBWFms9OgZG21zVRGKlxM,652
-opteryx_catalog/opteryx_catalog.py,sha256=sgcCiBbIv8mUZQbNb34JwEf6Wq0iByoEvpqkoD_F1bc,39111
-opteryx_catalog/catalog/__init__.py,sha256=yD7egf-dLd1z_CNXunz3ldLyLMMkSNbS3aKjGp3dKQY,119
-opteryx_catalog/catalog/compaction.py,sha256=HGkDnlVBv5GjRiZhdGubxCVxRLScL9N667a19U01g1I,19100
-opteryx_catalog/catalog/dataset.py,sha256=3Q_lLZ1Y0I1E_R47pMFgql81Y1dy955NKlsgk9edfJE,46796
-opteryx_catalog/catalog/manifest.py,sha256=xTV3u_i8s7jxulLvATyBoP9FHTdxOB8b0__SabqhH6g,17045
-opteryx_catalog/catalog/metadata.py,sha256=a4UFj5xUqjqtuLu2_mYQaBHRWtjjX3KU2Ufp63Uo2AM,2870
-opteryx_catalog/catalog/metastore.py,sha256=mS4qaaOMzcIu730Jm0K_Nq-4sNI8kIX3UscevYO5E08,1997
-opteryx_catalog/catalog/view.py,sha256=mUzfRGYqLRx_9BfZdGY5HNz6na9VMEPITrYKiI5m694,219
-opteryx_catalog/iops/__init__.py,sha256=_CxR-hg8XUD2cIFucb3aHyTFqwi41QmEDf9gXzXt3ZU,171
-opteryx_catalog/iops/base.py,sha256=1IW9qjDkQEMXvrA2J73VSBCdzkf2W5xVsWVnpNglL1U,1206
-opteryx_catalog/iops/fileio.py,sha256=cjBl9fN-vutvXskzZkwJjjbBcUlE0O1WrQe5Ryx7pIg,4315
-opteryx_catalog/iops/gcs.py,sha256=aB6hvSAQhbKTSyaLbAPgpXtSnvkI7fndXCRjaAZ1Dxo,8155
-opteryx_catalog-0.4.11.dist-info/licenses/LICENSE,sha256=mc5l20siqdcNQM54xALIWJhyaWsmQJ-NZt81UjgJejo,11351
-scripts/create_dataset.py,sha256=K8zmQo3xbwc_yz2BxNK0IKj-DkDt3pFf13ycI6rgTHo,7798
-scripts/read_dataset.py,sha256=hpBa8Qv1Oj6ffVIUmELGSri2eYHPpdqLnWFKgKpG-FM,9610
-tests/test_compaction.py,sha256=7MLnfbGi3j17ZON8Qi9oq4i1UWkW0JigX46BBFWecMk,7871
-tests/test_dataset_metadata.py,sha256=bMzX2HiUnzFTyU3VkFuW5xjmFEP8cJSYPt1XF6IS0Qk,1019
-tests/test_import.py,sha256=ZvoHW-rmcYqkW6TJKD_brgeePqHHbz2iTyRWKIBHGHk,137
-tests/test_pyproject.py,sha256=o3rS_GOems1oYQDH3UATfqc6XUwDTKZF2Q4cspU-NYc,206
-opteryx_catalog-0.4.11.dist-info/METADATA,sha256=BV0mk_GugipH7BAhKWIJtJq_55ML4kipL8RbF-Cm7t4,22384
-opteryx_catalog-0.4.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-opteryx_catalog-0.4.11.dist-info/top_level.txt,sha256=HWATr4Wgxbg3c1X3EcsJ6cnHoR6ZAdTe1LQ2VssIBUo,30
-opteryx_catalog-0.4.11.dist-info/RECORD,,
{opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE
File without changes

{opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt
File without changes