opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their public registry. The information is provided for informational purposes only.
Files changed (41)
  1. opteryx_catalog/__init__.py +1 -1
  2. opteryx_catalog/catalog/__init__.py +2 -1
  3. opteryx_catalog/catalog/compaction.py +536 -0
  4. opteryx_catalog/catalog/dataset.py +840 -520
  5. opteryx_catalog/catalog/manifest.py +475 -0
  6. opteryx_catalog/catalog/metadata.py +5 -2
  7. opteryx_catalog/catalog/metastore.py +2 -2
  8. opteryx_catalog/exceptions.py +1 -1
  9. opteryx_catalog/iops/fileio.py +13 -0
  10. opteryx_catalog/iops/gcs.py +35 -5
  11. opteryx_catalog/maki_nage/__init__.py +8 -0
  12. opteryx_catalog/maki_nage/distogram.py +558 -0
  13. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  14. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  15. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  16. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  17. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  18. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  19. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  20. opteryx_catalog/opteryx_catalog.py +296 -242
  21. opteryx_catalog/webhooks/__init__.py +230 -0
  22. opteryx_catalog/webhooks/events.py +177 -0
  23. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  24. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  25. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  26. scripts/collect_byte_counts.py +42 -0
  27. scripts/create_dataset.py +1 -1
  28. scripts/emit_full_single_file.py +81 -0
  29. scripts/inspect_manifest_dryrun.py +322 -0
  30. scripts/inspect_single_file.py +147 -0
  31. scripts/inspect_single_file_gcs.py +124 -0
  32. scripts/read_dataset.py +1 -1
  33. tests/test_collections.py +37 -0
  34. tests/test_compaction.py +233 -0
  35. tests/test_dataset_metadata.py +14 -0
  36. tests/test_describe_uncompressed.py +127 -0
  37. tests/test_refresh_manifest.py +275 -0
  38. tests/test_webhooks.py +177 -0
  39. opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
  40. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  41. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
tests/test_compaction.py
@@ -0,0 +1,233 @@
+ """
+ Test script for compaction functionality.
+
+ This tests the DatasetCompactor class with both brute and performance strategies.
+ """
+
+ from unittest.mock import Mock
+
+ import pyarrow as pa
+
+ from opteryx_catalog.catalog.compaction import DatasetCompactor
+ from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+
+
+ def create_test_table(num_rows: int, value_range: tuple = (0, 100)) -> pa.Table:
+     """Create a simple test table with a timestamp column for sorting."""
+     import random
+
+     timestamps = sorted([random.randint(value_range[0], value_range[1]) for _ in range(num_rows)])
+     values = [f"value_{i}" for i in range(num_rows)]
+
+     return pa.table({"timestamp": timestamps, "value": values})
+
+
+ def test_brute_compaction():
+     """Test brute force compaction strategy."""
+     print("Testing brute force compaction...")
+
+     # Create mock dataset
+     dataset = Mock()
+     dataset.metadata = DatasetMetadata(
+         dataset_identifier="test_dataset",
+         location="/tmp/test_data",
+     )
+     dataset.metadata.sort_orders = []  # No sort order for brute
+     dataset.metadata.snapshots = []
+     dataset.metadata.current_snapshot = None
+
+     # Create mock entries - small files that should be combined
+     mock_entries = [
+         {
+             "file_path": "/tmp/file1.parquet",
+             "file_size_in_bytes": 30 * 1024 * 1024,  # 30MB compressed
+             "uncompressed_size_in_bytes": 40 * 1024 * 1024,  # 40MB uncompressed
+             "record_count": 1000,
+         },
+         {
+             "file_path": "/tmp/file2.parquet",
+             "file_size_in_bytes": 35 * 1024 * 1024,  # 35MB compressed
+             "uncompressed_size_in_bytes": 50 * 1024 * 1024,  # 50MB uncompressed
+             "record_count": 1200,
+         },
+         {
+             "file_path": "/tmp/file3.parquet",
+             "file_size_in_bytes": 110 * 1024 * 1024,  # 110MB compressed (acceptable)
+             "uncompressed_size_in_bytes": 130 * 1024 * 1024,  # 130MB uncompressed
+             "record_count": 3000,
+         },
+     ]
+
+     # Create current snapshot with manifest
+     dataset.metadata.current_snapshot = Snapshot(
+         snapshot_id=1000,
+         timestamp_ms=1000,
+         manifest_list="/tmp/manifest.parquet",
+     )
+
+     # Mock IO and catalog
+     dataset.io = Mock()
+     dataset.catalog = Mock()
+
+     # Create compactor
+     compactor = DatasetCompactor(dataset, strategy="brute", author="test", agent="test-agent")
+
+     # Verify strategy selection
+     assert compactor.strategy == "brute", "Strategy should be brute"
+     assert compactor.decision == "user", "Decision should be user"
+
+     # Test selection logic directly
+     plan = compactor._select_brute_compaction(mock_entries)
+
+     assert plan is not None, "Should find files to compact"
+     assert plan["type"] == "combine", "Should plan to combine small files"
+     assert len(plan["files"]) == 2, "Should select 2 small files"
+
+     print("✓ Brute force compaction test passed")
+
+
+ def test_performance_compaction():
+     """Test performance compaction strategy."""
+     print("Testing performance compaction...")
+
+     # Create mock dataset with sort order
+     dataset = Mock()
+     dataset.metadata = DatasetMetadata(
+         dataset_identifier="test_dataset",
+         location="/tmp/test_data",
+     )
+     dataset.metadata.sort_orders = [0]  # Sort by first column
+     dataset.metadata.schema = Mock()
+     dataset.metadata.schema.fields = [Mock(name="timestamp")]
+     dataset.metadata.snapshots = []
+     dataset.metadata.current_snapshot = None
+
+     # Create mock entries with overlapping ranges
+     mock_entries = [
+         {
+             "file_path": "/tmp/file1.parquet",
+             "file_size_in_bytes": 30 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 40 * 1024 * 1024,
+             "record_count": 1000,
+             "lower_bounds": {"timestamp": 1},
+             "upper_bounds": {"timestamp": 100},
+         },
+         {
+             "file_path": "/tmp/file2.parquet",
+             "file_size_in_bytes": 35 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 50 * 1024 * 1024,
+             "record_count": 1200,
+             "lower_bounds": {"timestamp": 50},  # Overlaps with file1
+             "upper_bounds": {"timestamp": 150},
+         },
+         {
+             "file_path": "/tmp/file3.parquet",
+             "file_size_in_bytes": 110 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 130 * 1024 * 1024,
+             "record_count": 3000,
+             "lower_bounds": {"timestamp": 200},  # No overlap
+             "upper_bounds": {"timestamp": 300},
+         },
+     ]
+
+     dataset.metadata.current_snapshot = Snapshot(
+         snapshot_id=1000,
+         timestamp_ms=1000,
+         manifest_list="/tmp/manifest.parquet",
+     )
+
+     # Mock IO and catalog
+     dataset.io = Mock()
+     dataset.catalog = Mock()
+
+     # Create compactor (auto-detect should choose performance)
+     compactor = DatasetCompactor(dataset, strategy=None, author="test", agent="test-agent")
+
+     # Verify strategy selection
+     assert compactor.strategy == "performance", "Should auto-select performance strategy"
+     assert compactor.decision == "auto", "Decision should be auto"
+
+     # Test selection logic directly
+     plan = compactor._select_performance_compaction(mock_entries)
+
+     assert plan is not None, "Should find overlapping files"
+     assert plan["type"] == "combine-split", "Should plan to combine and split"
+     assert len(plan["files"]) == 2, "Should select 2 overlapping files"
+     assert plan["sort_column"] == "timestamp", "Should identify sort column"
+
+     print("✓ Performance compaction test passed")
+
+
+ def test_large_file_splitting():
+     """Test that large files are identified for splitting."""
+     print("Testing large file splitting...")
+
+     dataset = Mock()
+     dataset.metadata = DatasetMetadata(
+         dataset_identifier="test_dataset",
+         location="/tmp/test_data",
+     )
+     dataset.metadata.sort_orders = []
+
+     # Create entry for a large file
+     mock_entries = [
+         {
+             "file_path": "/tmp/large_file.parquet",
+             "file_size_in_bytes": 180 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 200 * 1024 * 1024,  # 200MB > 196MB threshold
+             "record_count": 5000,
+         }
+     ]
+
+     compactor = DatasetCompactor(dataset, strategy="brute")
+     plan = compactor._select_brute_compaction(mock_entries)
+
+     assert plan is not None, "Should identify large file"
+     assert plan["type"] == "split", "Should plan to split"
+     assert plan["reason"] == "file-too-large", "Reason should be file too large"
+
+     print("✓ Large file splitting test passed")
+
+
+ def test_no_compaction_needed():
+     """Test when no compaction is needed."""
+     print("Testing no compaction scenario...")
+
+     dataset = Mock()
+     dataset.metadata = DatasetMetadata(
+         dataset_identifier="test_dataset",
+         location="/tmp/test_data",
+     )
+     dataset.metadata.sort_orders = []
+
+     # All files are in acceptable range
+     mock_entries = [
+         {
+             "file_path": "/tmp/file1.parquet",
+             "file_size_in_bytes": 100 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 110 * 1024 * 1024,
+             "record_count": 2000,
+         },
+         {
+             "file_path": "/tmp/file2.parquet",
+             "file_size_in_bytes": 120 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 135 * 1024 * 1024,
+             "record_count": 2500,
+         },
+     ]
+
+     compactor = DatasetCompactor(dataset, strategy="brute")
+     plan = compactor._select_brute_compaction(mock_entries)
+
+     assert plan is None, "Should not find anything to compact"
+
+     print("✓ No compaction test passed")
+
+
+ if __name__ == "__main__":
+     print("Running compaction tests...\n")
+     test_brute_compaction()
+     test_performance_compaction()
+     test_large_file_splitting()
+     test_no_compaction_needed()
+     print("\n✅ All tests passed!")
tests/test_dataset_metadata.py
@@ -13,3 +13,17 @@ def test_dataset_metadata_and_simpledataset():
      assert ds.metadata.dataset_identifier == "tests_temp.test"
      assert ds.snapshot() is None
      assert list(ds.snapshots()) == []
+
+
+ def test_sequence_number_requires_history():
+     """Test that _next_sequence_number works with empty snapshots."""
+     meta = DatasetMetadata(
+         dataset_identifier="tests_temp.test",
+         location="gs://bucket/ws/tests_temp/test",
+         schema=None,
+         properties={},
+     )
+     ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+
+     # Should return 1 when no snapshots are loaded (first snapshot)
+     assert ds._next_sequence_number() == 1
tests/test_describe_uncompressed.py
@@ -0,0 +1,127 @@
+ import io
+
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ from opteryx_catalog.catalog.dataset import SimpleDataset
+ from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+
+
+ class _MemInput:
+     def __init__(self, data: bytes):
+         self._data = data
+
+     def open(self):
+         # Provide a file-like BytesIO which .read() returns the bytes
+         return io.BytesIO(self._data)
+
+
+ class _MemIO:
+     def __init__(self, mapping: dict):
+         self._mapping = mapping
+
+     def new_input(self, path: str):
+         return _MemInput(self._mapping[path])
+
+
+ def _build_manifest_bytes():
+     # Construct a parquet manifest with two entries, two columns per file
+     schema = pa.schema(
+         [
+             ("file_path", pa.string()),
+             ("file_format", pa.string()),
+             ("record_count", pa.int64()),
+             ("file_size_in_bytes", pa.int64()),
+             ("uncompressed_size_in_bytes", pa.int64()),
+             ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+             ("null_counts", pa.list_(pa.int64())),
+             ("min_k_hashes", pa.list_(pa.int64())),
+             ("histogram_counts", pa.list_(pa.int64())),
+             ("histogram_bins", pa.int64()),
+             ("min_values", pa.list_(pa.int64())),
+             ("max_values", pa.list_(pa.int64())),
+             ("min_values_display", pa.list_(pa.string())),
+             ("max_values_display", pa.list_(pa.string())),
+         ]
+     )
+
+     file_path = pa.array(["f1.parquet", "f2.parquet"], type=pa.string())
+     file_format = pa.array(["parquet", "parquet"], type=pa.string())
+     record_count = pa.array([10, 20], type=pa.int64())
+     file_size_in_bytes = pa.array([100, 200], type=pa.int64())
+     uncompressed_size_in_bytes = pa.array([1000, 2000], type=pa.int64())
+     column_uncompressed_sizes_in_bytes = pa.array(
+         [[100, 400], [300, 200]], type=pa.list_(pa.int64())
+     )
+     null_counts = pa.array([[0, 0], [0, 0]], type=pa.list_(pa.int64()))
+     min_k_hashes = pa.array([[1, 2], [1]], type=pa.list_(pa.int64()))
+     histogram_counts = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64()))
+     histogram_bins = pa.array([32, 32], type=pa.int64())
+     min_values = pa.array([[10, 20], [5, 30]], type=pa.list_(pa.int64()))
+     max_values = pa.array([[100, 400], [300, 200]], type=pa.list_(pa.int64()))
+     min_values_display = pa.array([[None, None], [None, None]], type=pa.list_(pa.string()))
+     max_values_display = pa.array([[None, None], [None, None]], type=pa.list_(pa.string()))
+
+     table = pa.Table.from_arrays(
+         [
+             file_path,
+             file_format,
+             record_count,
+             file_size_in_bytes,
+             uncompressed_size_in_bytes,
+             column_uncompressed_sizes_in_bytes,
+             null_counts,
+             min_k_hashes,
+             histogram_counts,
+             histogram_bins,
+             min_values,
+             max_values,
+             min_values_display,
+             max_values_display,
+         ],
+         schema=schema,
+     )
+
+     buf = io.BytesIO()
+     pq.write_table(table, buf)
+     return buf.getvalue()
+
+
+ def test_describe_includes_uncompressed_bytes():
+     manifest_bytes = _build_manifest_bytes()
+     manifest_path = "mem://manifest"
+
+     meta = DatasetMetadata(
+         dataset_identifier="tests_temp.test",
+         location="mem://",
+         schema=None,
+         properties={},
+     )
+
+     # Add a schema with two columns so describe() can map names -> indices
+     meta.schemas.append({"schema_id": "s1", "columns": [{"name": "a"}, {"name": "b"}]})
+     meta.current_schema_id = "s1"
+
+     # Prepare snapshot referencing our in-memory manifest
+     snap = Snapshot(
+         snapshot_id=1,
+         timestamp_ms=1,
+         manifest_list=manifest_path,
+     )
+     meta.snapshots.append(snap)
+     meta.current_snapshot_id = 1
+
+     ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+
+     # Inject our in-memory IO mapping
+     ds.io = _MemIO({manifest_path: manifest_bytes})
+
+     desc = ds.describe()
+
+     assert "a" in desc
+     assert "b" in desc
+
+     # Column 'a' should have uncompressed bytes = 100 + 300 = 400
+     assert desc["a"]["uncompressed_bytes"] == 400
+     # Column 'b' should have uncompressed bytes = 400 + 200 = 600
+     assert desc["b"]["uncompressed_bytes"] == 600
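The closing assertions pin down how describe() aggregates the manifest: each entry carries a column_uncompressed_sizes_in_bytes list aligned with the schema's column order, and the per-column total is the sum of that position across all entries. A self-contained sketch of that arithmetic follows; the helper is illustrative only and is not part of opteryx_catalog.

# Illustrative helper reproducing the arithmetic the assertions above check (not library code).
def sum_column_uncompressed(entries, column_names):
    totals = {name: 0 for name in column_names}
    for entry in entries:
        for idx, size in enumerate(entry["column_uncompressed_sizes_in_bytes"]):
            totals[column_names[idx]] += size
    return totals

manifest_entries = [
    {"column_uncompressed_sizes_in_bytes": [100, 400]},  # f1.parquet
    {"column_uncompressed_sizes_in_bytes": [300, 200]},  # f2.parquet
]
assert sum_column_uncompressed(manifest_entries, ["a", "b"]) == {"a": 400, "b": 600}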
tests/test_refresh_manifest.py
@@ -0,0 +1,275 @@
+ import io
+
+ import pyarrow as pa
+ import pyarrow.parquet as pq
+
+ import os
+ import sys
+
+ # Add local paths to sys.path to use local code instead of installed packages
+ sys.path.insert(0, os.path.join(sys.path[0], ".."))  # Add parent dir for pyiceberg_firestore_gcs
+ sys.path.insert(1, os.path.join(sys.path[0], "../opteryx-core"))
+ sys.path.insert(1, os.path.join(sys.path[0], "../pyiceberg-firestore-gcs"))
+
+
+ from opteryx_catalog.catalog.dataset import SimpleDataset
+ from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+ from opteryx_catalog.catalog.manifest import (
+     build_parquet_manifest_entry_from_bytes,
+     get_manifest_metrics,
+     reset_manifest_metrics,
+ )
+ from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+ import pytest
+
+
+ def test_min_k_hashes_for_string_and_binary():
+     try:
+         pass  # type: ignore
+     except Exception:
+         pytest.skip("opteryx.draken not available")
+
+     import pyarrow as pa
+
+     # short binary and short string columns should get min-k
+     t = _make_parquet_table(
+         [("bin", pa.binary()), ("s", pa.string())], [(b"a", "x"), (b"b", "y"), (b"c", "z")]
+     )
+     buf = pa.BufferOutputStream()
+     pq.write_table(t, buf, compression="zstd")
+     data = buf.getvalue().to_pybytes()
+     e = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+     assert len(e.min_k_hashes[0]) > 0
+     assert len(e.min_k_hashes[1]) > 0
+
+
+ # Step 1: Create a local catalog
+ catalog = OpteryxCatalog(
+     "opteryx",
+     firestore_project="mabeldev",
+     firestore_database="catalogs",
+     gcs_bucket="opteryx_data",
+ )
+
+ # print(catalog.load_dataset("ops.stdout_log").describe())
+
+
+ class _MemInput:
+     def __init__(self, data: bytes):
+         self._data = data
+
+     def open(self):
+         return io.BytesIO(self._data)
+
+
+ class _MemIO:
+     def __init__(self, mapping: dict):
+         self._mapping = mapping
+
+     def new_input(self, path: str):
+         return _MemInput(self._mapping[path])
+
+     def new_output(self, path: str):
+         class Out:
+             def __init__(self, mapping, path):
+                 self._buf = io.BytesIO()
+                 self._mapping = mapping
+                 self._path = path
+
+             def write(self, data: bytes):
+                 self._buf.write(data)
+
+             def close(self):
+                 self._mapping[self._path] = self._buf.getvalue()
+
+             def create(self):
+                 return self
+
+         return Out(self._mapping, path)
+
+
+ class _FakeCatalog:
+     def __init__(self, io):
+         self.io = io
+
+     def write_parquet_manifest(
+         self, snapshot_id: int, entries: list[dict], dataset_location: str
+     ) -> str:
+         # Minimal manifest writer using same schema as production
+         schema = pa.schema(
+             [
+                 ("file_path", pa.string()),
+                 ("file_format", pa.string()),
+                 ("record_count", pa.int64()),
+                 ("file_size_in_bytes", pa.int64()),
+                 ("uncompressed_size_in_bytes", pa.int64()),
+                 ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+                 ("null_counts", pa.list_(pa.int64())),
+                 ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
+                 ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
+                 ("histogram_bins", pa.int32()),
+                 ("min_values", pa.list_(pa.int64())),
+                 ("max_values", pa.list_(pa.int64())),
+                 ("min_values_display", pa.list_(pa.string())),
+                 ("max_values_display", pa.list_(pa.string())),
+             ]
+         )
+         normalized = []
+         for ent in entries:
+             if not isinstance(ent, dict):
+                 normalized.append(ent)
+                 continue
+             e = dict(ent)
+             e.setdefault("min_k_hashes", [])
+             e.setdefault("histogram_counts", [])
+             e.setdefault("histogram_bins", 0)
+             e.setdefault("column_uncompressed_sizes_in_bytes", [])
+             e.setdefault("null_counts", [])
+             e.setdefault("min_values_display", [])
+             e.setdefault("max_values_display", [])
+             mv = e.get("min_values") or []
+             xv = e.get("max_values") or []
+             mv_disp = e.get("min_values_display") or []
+             xv_disp = e.get("max_values_display") or []
+             e["min_values"] = [int(v) if v is not None else None for v in mv]
+             e["max_values"] = [int(v) if v is not None else None for v in xv]
+             e["min_values_display"] = [str(v) if v is not None else None for v in mv_disp]
+             e["max_values_display"] = [str(v) if v is not None else None for v in xv_disp]
+             normalized.append(e)
+
+         table = pa.Table.from_pylist(normalized, schema=schema)
+         buf = pa.BufferOutputStream()
+         pq.write_table(table, buf, compression="zstd")
+         data = buf.getvalue().to_pybytes()
+         path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"
+         out = self.io.new_output(path).create()
+         out.write(data)
+         out.close()
+         return path
+
+
+ def _make_parquet_table(columns: list[tuple[str, pa.DataType]], rows: list[tuple]):
+     arrays = []
+     for i, (name, dtype) in enumerate(columns):
+         col_vals = [r[i] for r in rows]
+         arrays.append(pa.array(col_vals, type=dtype))
+     return pa.Table.from_arrays(arrays, names=[c[0] for c in columns])
+
+
+ def test_build_manifest_from_bytes_matches_table():
+     # ensure the bytes-based builder matches the table-based one
+     t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+     buf = pa.BufferOutputStream()
+     pq.write_table(t, buf, compression="zstd")
+     data = buf.getvalue().to_pybytes()
+
+     e_bytes = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+     # basic sanity checks (parity is enforced by using orig_table when available)
+     assert e_bytes.record_count == 2
+     assert e_bytes.file_size_in_bytes == len(data)
+
+
+ def test_manifest_metrics_increments():
+     reset_manifest_metrics()
+     t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+     buf = pa.BufferOutputStream()
+     pq.write_table(t, buf, compression="zstd")
+     data = buf.getvalue().to_pybytes()
+
+     _ = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+     m = get_manifest_metrics()
+     assert m.get("files_read", 0) >= 1
+     assert m.get("hash_calls", 0) >= 1
+     assert m.get("compress_calls", 0) >= 1
+
+
+ def test_table_based_builder_is_removed():
+     from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry
+
+     t = _make_parquet_table([("a", pa.int64())], [(1,)])
+     with pytest.raises(RuntimeError):
+         _ = build_parquet_manifest_entry(t, "mem://f", 0)
+
+
+ def test_manifest_uses_rugo_for_sizes():
+     # Ensure the bytes-based builder uses rugo metadata to compute per-column sizes
+     reset_manifest_metrics()
+     t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+     buf = pa.BufferOutputStream()
+     pq.write_table(t, buf, compression="zstd")
+     data = buf.getvalue().to_pybytes()
+
+     entry = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data))
+     m = get_manifest_metrics()
+
+     # rugo should report sizes (non-zero) for these synthetic files
+     assert m.get("sizes_from_rugo", 0) >= 1 or m.get("sizes_from_rugo_missing", 0) == 0
+     assert entry.uncompressed_size_in_bytes >= 0
+     assert isinstance(entry.column_uncompressed_sizes_in_bytes, list)
+     assert len(entry.column_uncompressed_sizes_in_bytes) == 2
+     # column sizes may be non-zero when metadata is available
+     assert all(isinstance(x, int) for x in entry.column_uncompressed_sizes_in_bytes)
+
+
+ def test_refresh_manifest_with_single_file():
+     # single file with columns a,b for quick iteration
+     t1 = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+
+     # Write parquet file to mem
+     buf = pa.BufferOutputStream()
+     pq.write_table(t1, buf, compression="zstd")
+     d1 = buf.getvalue().to_pybytes()
+
+     f1 = "mem://data/f1.parquet"
+     manifest_path = "mem://manifest-old"
+
+     # Build initial manifest entry for single file (bytes-based builder)
+     e1 = build_parquet_manifest_entry_from_bytes(d1, f1, len(d1), orig_table=t1).to_dict()
+
+     # Create in-memory IO mapping including manifest and data file
+     mapping = {f1: d1}
+
+     # Write initial manifest with the single entry using the same writer as the catalog
+     fake_writer = _FakeCatalog(_MemIO(mapping))
+     manifest_path = fake_writer.write_parquet_manifest(1, [e1], "mem://")
+     # Ensure the manifest bytes are present in the mapping
+     mapping[manifest_path] = mapping[manifest_path]
+
+     # Persist the single-file manifest as JSON for quick inspection during
+     # iterative debugging (writes to repo `artifacts/` so you can open it).
+     import os
+     import json
+
+     artifacts_dir = os.path.join(os.getcwd(), "artifacts")
+     os.makedirs(artifacts_dir, exist_ok=True)
+     with open(
+         os.path.join(artifacts_dir, "single_file_manifest.json"), "w", encoding="utf-8"
+     ) as fh:
+         json.dump(e1, fh, indent=2, default=str)
+
+     # Create metadata and snapshot
+     meta = DatasetMetadata(
+         dataset_identifier="tests_temp.test", location="mem://", schema=None, properties={}
+     )
+     meta.schemas.append({"schema_id": "s1", "columns": [{"name": "a"}, {"name": "b"}]})
+     meta.current_schema_id = "s1"
+     snap = Snapshot(snapshot_id=1, timestamp_ms=1, manifest_list=manifest_path)
+     meta.snapshots.append(snap)
+     meta.current_snapshot_id = 1
+
+     ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+     ds.io = _MemIO(mapping)
+     ds.catalog = _FakeCatalog(ds.io)
+
+     # Refresh manifest (should re-read f1 and write a new manifest)
+     new_snap_id = ds.refresh_manifest(agent="test-agent", author="tester")
+     assert new_snap_id is not None
+
+     # Describe should include both columns and count bytes appropriately
+     desc = ds.describe()
+     assert "a" in desc
+     assert "b" in desc
+
+     # ensure uncompressed bytes are present and non-zero for both cols
+     assert desc["a"]["uncompressed_bytes"] > 0
+     assert desc["b"]["uncompressed_bytes"] > 0
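The call these tests revolve around is build_parquet_manifest_entry_from_bytes, introduced in catalog/manifest.py, which builds a manifest entry directly from serialized parquet bytes. A minimal sketch using only the signature and entry attributes shown above (record_count, file_size_in_bytes, column_uncompressed_sizes_in_bytes); the optional orig_table argument is passed for parity with the original table, as the tests do, and the mem:// path is just an illustrative label.

import pyarrow as pa
import pyarrow.parquet as pq

from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes

# Serialize a small table, then build a manifest entry from the raw bytes.
table = pa.table({"a": [1, 2], "b": [10, 20]})
buf = pa.BufferOutputStream()
pq.write_table(table, buf, compression="zstd")
data = buf.getvalue().to_pybytes()

entry = build_parquet_manifest_entry_from_bytes(
    data, "mem://example.parquet", len(data), orig_table=table
)
print(entry.record_count, entry.file_size_in_bytes, entry.column_uncompressed_sizes_in_bytes)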