opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (32)
  1. opteryx_catalog/catalog/compaction.py +15 -8
  2. opteryx_catalog/catalog/dataset.py +449 -111
  3. opteryx_catalog/catalog/manifest.py +390 -330
  4. opteryx_catalog/catalog/metadata.py +3 -0
  5. opteryx_catalog/iops/fileio.py +13 -0
  6. opteryx_catalog/maki_nage/__init__.py +8 -0
  7. opteryx_catalog/maki_nage/distogram.py +558 -0
  8. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  9. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  10. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  11. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  12. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  13. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  14. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  15. opteryx_catalog/opteryx_catalog.py +82 -54
  16. opteryx_catalog/webhooks/__init__.py +230 -0
  17. opteryx_catalog/webhooks/events.py +177 -0
  18. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  19. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  20. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  21. scripts/collect_byte_counts.py +42 -0
  22. scripts/emit_full_single_file.py +81 -0
  23. scripts/inspect_manifest_dryrun.py +322 -0
  24. scripts/inspect_single_file.py +147 -0
  25. scripts/inspect_single_file_gcs.py +124 -0
  26. tests/test_collections.py +37 -0
  27. tests/test_describe_uncompressed.py +127 -0
  28. tests/test_refresh_manifest.py +275 -0
  29. tests/test_webhooks.py +177 -0
  30. opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
  31. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  32. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
tests/test_describe_uncompressed.py ADDED
@@ -0,0 +1,127 @@
+import io
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from opteryx_catalog.catalog.dataset import SimpleDataset
+from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+
+
+class _MemInput:
+    def __init__(self, data: bytes):
+        self._data = data
+
+    def open(self):
+        # Provide a file-like BytesIO which .read() returns the bytes
+        return io.BytesIO(self._data)
+
+
+class _MemIO:
+    def __init__(self, mapping: dict):
+        self._mapping = mapping
+
+    def new_input(self, path: str):
+        return _MemInput(self._mapping[path])
+
+
+def _build_manifest_bytes():
+    # Construct a parquet manifest with two entries, two columns per file
+    schema = pa.schema(
+        [
+            ("file_path", pa.string()),
+            ("file_format", pa.string()),
+            ("record_count", pa.int64()),
+            ("file_size_in_bytes", pa.int64()),
+            ("uncompressed_size_in_bytes", pa.int64()),
+            ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+            ("null_counts", pa.list_(pa.int64())),
+            ("min_k_hashes", pa.list_(pa.int64())),
+            ("histogram_counts", pa.list_(pa.int64())),
+            ("histogram_bins", pa.int64()),
+            ("min_values", pa.list_(pa.int64())),
+            ("max_values", pa.list_(pa.int64())),
+            ("min_values_display", pa.list_(pa.string())),
+            ("max_values_display", pa.list_(pa.string())),
+        ]
+    )
+
+    file_path = pa.array(["f1.parquet", "f2.parquet"], type=pa.string())
+    file_format = pa.array(["parquet", "parquet"], type=pa.string())
+    record_count = pa.array([10, 20], type=pa.int64())
+    file_size_in_bytes = pa.array([100, 200], type=pa.int64())
+    uncompressed_size_in_bytes = pa.array([1000, 2000], type=pa.int64())
+    column_uncompressed_sizes_in_bytes = pa.array(
+        [[100, 400], [300, 200]], type=pa.list_(pa.int64())
+    )
+    null_counts = pa.array([[0, 0], [0, 0]], type=pa.list_(pa.int64()))
+    min_k_hashes = pa.array([[1, 2], [1]], type=pa.list_(pa.int64()))
+    histogram_counts = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64()))
+    histogram_bins = pa.array([32, 32], type=pa.int64())
+    min_values = pa.array([[10, 20], [5, 30]], type=pa.list_(pa.int64()))
+    max_values = pa.array([[100, 400], [300, 200]], type=pa.list_(pa.int64()))
+    min_values_display = pa.array([[None, None], [None, None]], type=pa.list_(pa.string()))
+    max_values_display = pa.array([[None, None], [None, None]], type=pa.list_(pa.string()))
+
+    table = pa.Table.from_arrays(
+        [
+            file_path,
+            file_format,
+            record_count,
+            file_size_in_bytes,
+            uncompressed_size_in_bytes,
+            column_uncompressed_sizes_in_bytes,
+            null_counts,
+            min_k_hashes,
+            histogram_counts,
+            histogram_bins,
+            min_values,
+            max_values,
+            min_values_display,
+            max_values_display,
+        ],
+        schema=schema,
+    )
+
+    buf = io.BytesIO()
+    pq.write_table(table, buf)
+    return buf.getvalue()
+
+
+def test_describe_includes_uncompressed_bytes():
+    manifest_bytes = _build_manifest_bytes()
+    manifest_path = "mem://manifest"
+
+    meta = DatasetMetadata(
+        dataset_identifier="tests_temp.test",
+        location="mem://",
+        schema=None,
+        properties={},
+    )
+
+    # Add a schema with two columns so describe() can map names -> indices
+    meta.schemas.append({"schema_id": "s1", "columns": [{"name": "a"}, {"name": "b"}]})
+    meta.current_schema_id = "s1"
+
+    # Prepare snapshot referencing our in-memory manifest
+    snap = Snapshot(
+        snapshot_id=1,
+        timestamp_ms=1,
+        manifest_list=manifest_path,
+    )
+    meta.snapshots.append(snap)
+    meta.current_snapshot_id = 1
+
+    ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+
+    # Inject our in-memory IO mapping
+    ds.io = _MemIO({manifest_path: manifest_bytes})
+
+    desc = ds.describe()
+
+    assert "a" in desc
+    assert "b" in desc
+
+    # Column 'a' should have uncompressed bytes = 100 + 300 = 400
+    assert desc["a"]["uncompressed_bytes"] == 400
+    # Column 'b' should have uncompressed bytes = 400 + 200 = 600
+    assert desc["b"]["uncompressed_bytes"] == 600
tests/test_refresh_manifest.py ADDED
@@ -0,0 +1,275 @@
+import io
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+import os
+import sys
+
+# Add local paths to sys.path to use local code instead of installed packages
+sys.path.insert(0, os.path.join(sys.path[0], ".."))  # Add parent dir for pyiceberg_firestore_gcs
+sys.path.insert(1, os.path.join(sys.path[0], "../opteryx-core"))
+sys.path.insert(1, os.path.join(sys.path[0], "../pyiceberg-firestore-gcs"))
+
+
+from opteryx_catalog.catalog.dataset import SimpleDataset
+from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+from opteryx_catalog.catalog.manifest import (
+    build_parquet_manifest_entry_from_bytes,
+    get_manifest_metrics,
+    reset_manifest_metrics,
+)
+from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+import pytest
+
+
+def test_min_k_hashes_for_string_and_binary():
+    try:
+        pass  # type: ignore
+    except Exception:
+        pytest.skip("opteryx.draken not available")
+
+    import pyarrow as pa
+
+    # short binary and short string columns should get min-k
+    t = _make_parquet_table(
+        [("bin", pa.binary()), ("s", pa.string())], [(b"a", "x"), (b"b", "y"), (b"c", "z")]
+    )
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+    e = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+    assert len(e.min_k_hashes[0]) > 0
+    assert len(e.min_k_hashes[1]) > 0
+
+
+# Step 1: Create a local catalog
+catalog = OpteryxCatalog(
+    "opteryx",
+    firestore_project="mabeldev",
+    firestore_database="catalogs",
+    gcs_bucket="opteryx_data",
+)
+
+# print(catalog.load_dataset("ops.stdout_log").describe())
+
+
+class _MemInput:
+    def __init__(self, data: bytes):
+        self._data = data
+
+    def open(self):
+        return io.BytesIO(self._data)
+
+
+class _MemIO:
+    def __init__(self, mapping: dict):
+        self._mapping = mapping
+
+    def new_input(self, path: str):
+        return _MemInput(self._mapping[path])
+
+    def new_output(self, path: str):
+        class Out:
+            def __init__(self, mapping, path):
+                self._buf = io.BytesIO()
+                self._mapping = mapping
+                self._path = path
+
+            def write(self, data: bytes):
+                self._buf.write(data)
+
+            def close(self):
+                self._mapping[self._path] = self._buf.getvalue()
+
+            def create(self):
+                return self
+
+        return Out(self._mapping, path)
+
+
+class _FakeCatalog:
+    def __init__(self, io):
+        self.io = io
+
+    def write_parquet_manifest(
+        self, snapshot_id: int, entries: list[dict], dataset_location: str
+    ) -> str:
+        # Minimal manifest writer using same schema as production
+        schema = pa.schema(
+            [
+                ("file_path", pa.string()),
+                ("file_format", pa.string()),
+                ("record_count", pa.int64()),
+                ("file_size_in_bytes", pa.int64()),
+                ("uncompressed_size_in_bytes", pa.int64()),
+                ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+                ("null_counts", pa.list_(pa.int64())),
+                ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
+                ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
+                ("histogram_bins", pa.int32()),
+                ("min_values", pa.list_(pa.int64())),
+                ("max_values", pa.list_(pa.int64())),
+                ("min_values_display", pa.list_(pa.string())),
+                ("max_values_display", pa.list_(pa.string())),
+            ]
+        )
+        normalized = []
+        for ent in entries:
+            if not isinstance(ent, dict):
+                normalized.append(ent)
+                continue
+            e = dict(ent)
+            e.setdefault("min_k_hashes", [])
+            e.setdefault("histogram_counts", [])
+            e.setdefault("histogram_bins", 0)
+            e.setdefault("column_uncompressed_sizes_in_bytes", [])
+            e.setdefault("null_counts", [])
+            e.setdefault("min_values_display", [])
+            e.setdefault("max_values_display", [])
+            mv = e.get("min_values") or []
+            xv = e.get("max_values") or []
+            mv_disp = e.get("min_values_display") or []
+            xv_disp = e.get("max_values_display") or []
+            e["min_values"] = [int(v) if v is not None else None for v in mv]
+            e["max_values"] = [int(v) if v is not None else None for v in xv]
+            e["min_values_display"] = [str(v) if v is not None else None for v in mv_disp]
+            e["max_values_display"] = [str(v) if v is not None else None for v in xv_disp]
+            normalized.append(e)
+
+        table = pa.Table.from_pylist(normalized, schema=schema)
+        buf = pa.BufferOutputStream()
+        pq.write_table(table, buf, compression="zstd")
+        data = buf.getvalue().to_pybytes()
+        path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"
+        out = self.io.new_output(path).create()
+        out.write(data)
+        out.close()
+        return path
+
+
+def _make_parquet_table(columns: list[tuple[str, pa.DataType]], rows: list[tuple]):
+    arrays = []
+    for i, (name, dtype) in enumerate(columns):
+        col_vals = [r[i] for r in rows]
+        arrays.append(pa.array(col_vals, type=dtype))
+    return pa.Table.from_arrays(arrays, names=[c[0] for c in columns])
+
+
+def test_build_manifest_from_bytes_matches_table():
+    # ensure the bytes-based builder matches the table-based one
+    t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+
+    e_bytes = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+    # basic sanity checks (parity is enforced by using orig_table when available)
+    assert e_bytes.record_count == 2
+    assert e_bytes.file_size_in_bytes == len(data)
+
+
+def test_manifest_metrics_increments():
+    reset_manifest_metrics()
+    t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+
+    _ = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+    m = get_manifest_metrics()
+    assert m.get("files_read", 0) >= 1
+    assert m.get("hash_calls", 0) >= 1
+    assert m.get("compress_calls", 0) >= 1
+
+
+def test_table_based_builder_is_removed():
+    from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry
+
+    t = _make_parquet_table([("a", pa.int64())], [(1,)])
+    with pytest.raises(RuntimeError):
+        _ = build_parquet_manifest_entry(t, "mem://f", 0)
+
+
+def test_manifest_uses_rugo_for_sizes():
+    # Ensure the bytes-based builder uses rugo metadata to compute per-column sizes
+    reset_manifest_metrics()
+    t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+
+    entry = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data))
+    m = get_manifest_metrics()
+
+    # rugo should report sizes (non-zero) for these synthetic files
+    assert m.get("sizes_from_rugo", 0) >= 1 or m.get("sizes_from_rugo_missing", 0) == 0
+    assert entry.uncompressed_size_in_bytes >= 0
+    assert isinstance(entry.column_uncompressed_sizes_in_bytes, list)
+    assert len(entry.column_uncompressed_sizes_in_bytes) == 2
+    # column sizes may be non-zero when metadata is available
+    assert all(isinstance(x, int) for x in entry.column_uncompressed_sizes_in_bytes)
+
+
+def test_refresh_manifest_with_single_file():
+    # single file with columns a,b for quick iteration
+    t1 = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+
+    # Write parquet file to mem
+    buf = pa.BufferOutputStream()
+    pq.write_table(t1, buf, compression="zstd")
+    d1 = buf.getvalue().to_pybytes()
+
+    f1 = "mem://data/f1.parquet"
+    manifest_path = "mem://manifest-old"
+
+    # Build initial manifest entry for single file (bytes-based builder)
+    e1 = build_parquet_manifest_entry_from_bytes(d1, f1, len(d1), orig_table=t1).to_dict()
+
+    # Create in-memory IO mapping including manifest and data file
+    mapping = {f1: d1}
+
+    # Write initial manifest with the single entry using the same writer as the catalog
+    fake_writer = _FakeCatalog(_MemIO(mapping))
+    manifest_path = fake_writer.write_parquet_manifest(1, [e1], "mem://")
+    # Ensure the manifest bytes are present in the mapping
+    mapping[manifest_path] = mapping[manifest_path]
+
+    # Persist the single-file manifest as JSON for quick inspection during
+    # iterative debugging (writes to repo `artifacts/` so you can open it).
+    import os
+    import json
+
+    artifacts_dir = os.path.join(os.getcwd(), "artifacts")
+    os.makedirs(artifacts_dir, exist_ok=True)
+    with open(
+        os.path.join(artifacts_dir, "single_file_manifest.json"), "w", encoding="utf-8"
+    ) as fh:
+        json.dump(e1, fh, indent=2, default=str)
+
+    # Create metadata and snapshot
+    meta = DatasetMetadata(
+        dataset_identifier="tests_temp.test", location="mem://", schema=None, properties={}
+    )
+    meta.schemas.append({"schema_id": "s1", "columns": [{"name": "a"}, {"name": "b"}]})
+    meta.current_schema_id = "s1"
+    snap = Snapshot(snapshot_id=1, timestamp_ms=1, manifest_list=manifest_path)
+    meta.snapshots.append(snap)
+    meta.current_snapshot_id = 1
+
+    ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+    ds.io = _MemIO(mapping)
+    ds.catalog = _FakeCatalog(ds.io)
+
+    # Refresh manifest (should re-read f1 and write a new manifest)
+    new_snap_id = ds.refresh_manifest(agent="test-agent", author="tester")
+    assert new_snap_id is not None
+
+    # Describe should include both columns and count bytes appropriately
+    desc = ds.describe()
+    assert "a" in desc
+    assert "b" in desc
+
+    # ensure uncompressed bytes are present and non-zero for both cols
+    assert desc["a"]["uncompressed_bytes"] > 0
+    assert desc["b"]["uncompressed_bytes"] > 0
tests/test_webhooks.py ADDED
@@ -0,0 +1,177 @@
+"""Tests for the webhook system."""
+
+import os
+from unittest.mock import MagicMock
+from unittest.mock import patch
+
+import pytest
+
+
+def test_webhook_manager_disabled_without_domain():
+    """Test that webhook manager is disabled when no domain is configured."""
+    from opteryx_catalog.webhooks import WebhookManager
+
+    # Clear any existing env vars
+    os.environ.pop("OPTERYX_WEBHOOK_DOMAIN", None)
+    os.environ.pop("OPTERYX_WEBHOOK_QUEUE", None)
+
+    manager = WebhookManager()
+    assert not manager.enabled
+
+    # Should return False without making any HTTP calls
+    result = manager.send(
+        action="create",
+        workspace="test",
+        collection="test",
+        resource_type="dataset",
+        resource_name="test",
+    )
+    assert result is False
+
+
+def test_webhook_manager_direct_http():
+    """Test that webhooks are sent via direct HTTP when queue is not configured."""
+    from opteryx_catalog.webhooks import WebhookManager
+
+    with patch("opteryx_catalog.webhooks.requests.post") as mock_post:
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_post.return_value = mock_response
+
+        manager = WebhookManager(domain="router.example.com", queue_path=None)
+        assert manager.enabled
+        assert manager._tasks_client is None
+
+        result = manager.send(
+            action="create",
+            workspace="test-workspace",
+            collection="test-collection",
+            resource_type="dataset",
+            resource_name="test-dataset",
+            payload={"location": "gs://bucket/path"},
+        )
+
+        assert result is True
+        mock_post.assert_called_once()
+
+        # Verify the call arguments
+        call_args = mock_post.call_args
+        assert call_args.args[0] == "https://router.example.com/event"
+        assert call_args.kwargs["json"]["event"]["action"] == "create"
+        assert call_args.kwargs["json"]["event"]["resource_type"] == "dataset"
+        assert call_args.kwargs["json"]["event"]["resource_name"] == "test-dataset"
+        assert call_args.kwargs["json"]["data"]["location"] == "gs://bucket/path"
+
+
+def test_webhook_manager_payload_building():
+    """Test that webhook payloads are built correctly."""
+    from opteryx_catalog.webhooks import WebhookManager
+
+    manager = WebhookManager(domain="hook.example.com")
+
+    payload = manager._build_payload(
+        action="update",
+        workspace="ws",
+        collection="col",
+        resource_type="dataset",
+        resource_name="ds",
+        additional={"description": "New description"},
+    )
+
+    assert payload["event"]["action"] == "update"
+    assert payload["event"]["workspace"] == "ws"
+    assert payload["event"]["collection"] == "col"
+    assert payload["event"]["resource_type"] == "dataset"
+    assert payload["event"]["resource_name"] == "ds"
+    assert "timestamp" in payload["event"]
+    assert payload["data"]["description"] == "New description"
+
+
+def test_webhook_http_failure_returns_false():
+    """Test that HTTP failures return False without raising exceptions."""
+    from opteryx_catalog.webhooks import WebhookManager
+
+    with patch("opteryx_catalog.webhooks.requests.post") as mock_post:
+        # Simulate HTTP error
+        mock_post.side_effect = Exception("Connection failed")
+
+        manager = WebhookManager(domain="router.example.com")
+        result = manager.send(
+            action="create",
+            workspace="test",
+            collection="test",
+            resource_type="dataset",
+            resource_name="test",
+        )
+
+        assert result is False
+
+
+def test_send_webhook_convenience_function():
+    """Test the convenience send_webhook function."""
+    from opteryx_catalog.webhooks import send_webhook
+
+    with patch("opteryx_catalog.webhooks.requests.post") as mock_post:
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_post.return_value = mock_response
+
+        os.environ["OPTERYX_WEBHOOK_DOMAIN"] = "router.example.com"
+        os.environ.pop("OPTERYX_WEBHOOK_QUEUE", None)
+
+        # Reset the global manager to pick up new env vars
+        import opteryx_catalog.webhooks as webhook_module
+
+        webhook_module._webhook_manager = None
+
+        result = send_webhook(
+            action="create",
+            workspace="test",
+            collection="test",
+            resource_type="dataset",
+            resource_name="test",
+            payload={"snapshot_id": 123},
+        )
+
+        assert result is True
+        mock_post.assert_called_once()
+
+        # Clean up
+        os.environ.pop("OPTERYX_WEBHOOK_DOMAIN", None)
+
+
+def test_event_payload_builders():
+    """Test the event payload builder functions."""
+    from opteryx_catalog.webhooks.events import dataset_commit_payload
+    from opteryx_catalog.webhooks.events import dataset_created_payload
+    from opteryx_catalog.webhooks.events import view_created_payload
+    from opteryx_catalog.webhooks.events import view_executed_payload
+
+    # Test dataset created
+    payload = dataset_created_payload(
+        schema=None, location="gs://bucket/path", properties={"key": "value"}
+    )
+    assert payload["location"] == "gs://bucket/path"
+    assert payload["properties"]["key"] == "value"
+
+    # Test dataset commit
+    payload = dataset_commit_payload(
+        snapshot_id=123, sequence_number=5, record_count=1000, file_count=2
+    )
+    assert payload["snapshot_id"] == 123
+    assert payload["sequence_number"] == 5
+    assert payload["record_count"] == 1000
+    assert payload["file_count"] == 2
+
+    # Test view created
+    payload = view_created_payload(definition="SELECT * FROM table", properties={})
+    assert payload["definition"] == "SELECT * FROM table"
+
+    # Test view executed
+    payload = view_executed_payload(execution_time_ms=1500, row_count=100)
+    assert payload["execution_time_ms"] == 1500
+    assert payload["row_count"] == 100
+
+
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])
opteryx_catalog-0.4.11.dist-info/RECORD DELETED
@@ -1,25 +0,0 @@
-opteryx_catalog/__init__.py,sha256=cqGY7bl6iMBIqY_x6VTc5fAFH23M3XQeJYrHPX6FglY,902
-opteryx_catalog/exceptions.py,sha256=ZEaXmrrn030V8pfy8YMaLwzBWFms9OgZG21zVRGKlxM,652
-opteryx_catalog/opteryx_catalog.py,sha256=sgcCiBbIv8mUZQbNb34JwEf6Wq0iByoEvpqkoD_F1bc,39111
-opteryx_catalog/catalog/__init__.py,sha256=yD7egf-dLd1z_CNXunz3ldLyLMMkSNbS3aKjGp3dKQY,119
-opteryx_catalog/catalog/compaction.py,sha256=HGkDnlVBv5GjRiZhdGubxCVxRLScL9N667a19U01g1I,19100
-opteryx_catalog/catalog/dataset.py,sha256=3Q_lLZ1Y0I1E_R47pMFgql81Y1dy955NKlsgk9edfJE,46796
-opteryx_catalog/catalog/manifest.py,sha256=xTV3u_i8s7jxulLvATyBoP9FHTdxOB8b0__SabqhH6g,17045
-opteryx_catalog/catalog/metadata.py,sha256=a4UFj5xUqjqtuLu2_mYQaBHRWtjjX3KU2Ufp63Uo2AM,2870
-opteryx_catalog/catalog/metastore.py,sha256=mS4qaaOMzcIu730Jm0K_Nq-4sNI8kIX3UscevYO5E08,1997
-opteryx_catalog/catalog/view.py,sha256=mUzfRGYqLRx_9BfZdGY5HNz6na9VMEPITrYKiI5m694,219
-opteryx_catalog/iops/__init__.py,sha256=_CxR-hg8XUD2cIFucb3aHyTFqwi41QmEDf9gXzXt3ZU,171
-opteryx_catalog/iops/base.py,sha256=1IW9qjDkQEMXvrA2J73VSBCdzkf2W5xVsWVnpNglL1U,1206
-opteryx_catalog/iops/fileio.py,sha256=cjBl9fN-vutvXskzZkwJjjbBcUlE0O1WrQe5Ryx7pIg,4315
-opteryx_catalog/iops/gcs.py,sha256=aB6hvSAQhbKTSyaLbAPgpXtSnvkI7fndXCRjaAZ1Dxo,8155
-opteryx_catalog-0.4.11.dist-info/licenses/LICENSE,sha256=mc5l20siqdcNQM54xALIWJhyaWsmQJ-NZt81UjgJejo,11351
-scripts/create_dataset.py,sha256=K8zmQo3xbwc_yz2BxNK0IKj-DkDt3pFf13ycI6rgTHo,7798
-scripts/read_dataset.py,sha256=hpBa8Qv1Oj6ffVIUmELGSri2eYHPpdqLnWFKgKpG-FM,9610
-tests/test_compaction.py,sha256=7MLnfbGi3j17ZON8Qi9oq4i1UWkW0JigX46BBFWecMk,7871
-tests/test_dataset_metadata.py,sha256=bMzX2HiUnzFTyU3VkFuW5xjmFEP8cJSYPt1XF6IS0Qk,1019
-tests/test_import.py,sha256=ZvoHW-rmcYqkW6TJKD_brgeePqHHbz2iTyRWKIBHGHk,137
-tests/test_pyproject.py,sha256=o3rS_GOems1oYQDH3UATfqc6XUwDTKZF2Q4cspU-NYc,206
-opteryx_catalog-0.4.11.dist-info/METADATA,sha256=BV0mk_GugipH7BAhKWIJtJq_55ML4kipL8RbF-Cm7t4,22384
-opteryx_catalog-0.4.11.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-opteryx_catalog-0.4.11.dist-info/top_level.txt,sha256=HWATr4Wgxbg3c1X3EcsJ6cnHoR6ZAdTe1LQ2VssIBUo,30
-opteryx_catalog-0.4.11.dist-info/RECORD,,