opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff shows the changes between publicly released versions of the package as they appear in their respective public registries, and is provided for informational purposes only.
- opteryx_catalog/__init__.py +1 -1
- opteryx_catalog/catalog/__init__.py +2 -1
- opteryx_catalog/catalog/compaction.py +536 -0
- opteryx_catalog/catalog/dataset.py +840 -520
- opteryx_catalog/catalog/manifest.py +475 -0
- opteryx_catalog/catalog/metadata.py +5 -2
- opteryx_catalog/catalog/metastore.py +2 -2
- opteryx_catalog/exceptions.py +1 -1
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/iops/gcs.py +35 -5
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +296 -242
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/create_dataset.py +1 -1
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- scripts/read_dataset.py +1 -1
- tests/test_collections.py +37 -0
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +14 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
tests/test_compaction.py
ADDED
@@ -0,0 +1,233 @@
+"""
+Test script for compaction functionality.
+
+This tests the DatasetCompactor class with both brute and performance strategies.
+"""
+
+from unittest.mock import Mock
+
+import pyarrow as pa
+
+from opteryx_catalog.catalog.compaction import DatasetCompactor
+from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+
+
+def create_test_table(num_rows: int, value_range: tuple = (0, 100)) -> pa.Table:
+    """Create a simple test table with a timestamp column for sorting."""
+    import random
+
+    timestamps = sorted([random.randint(value_range[0], value_range[1]) for _ in range(num_rows)])
+    values = [f"value_{i}" for i in range(num_rows)]
+
+    return pa.table({"timestamp": timestamps, "value": values})
+
+
+def test_brute_compaction():
+    """Test brute force compaction strategy."""
+    print("Testing brute force compaction...")
+
+    # Create mock dataset
+    dataset = Mock()
+    dataset.metadata = DatasetMetadata(
+        dataset_identifier="test_dataset",
+        location="/tmp/test_data",
+    )
+    dataset.metadata.sort_orders = []  # No sort order for brute
+    dataset.metadata.snapshots = []
+    dataset.metadata.current_snapshot = None
+
+    # Create mock entries - small files that should be combined
+    mock_entries = [
+        {
+            "file_path": "/tmp/file1.parquet",
+            "file_size_in_bytes": 30 * 1024 * 1024,  # 30MB compressed
+            "uncompressed_size_in_bytes": 40 * 1024 * 1024,  # 40MB uncompressed
+            "record_count": 1000,
+        },
+        {
+            "file_path": "/tmp/file2.parquet",
+            "file_size_in_bytes": 35 * 1024 * 1024,  # 35MB compressed
+            "uncompressed_size_in_bytes": 50 * 1024 * 1024,  # 50MB uncompressed
+            "record_count": 1200,
+        },
+        {
+            "file_path": "/tmp/file3.parquet",
+            "file_size_in_bytes": 110 * 1024 * 1024,  # 110MB compressed (acceptable)
+            "uncompressed_size_in_bytes": 130 * 1024 * 1024,  # 130MB uncompressed
+            "record_count": 3000,
+        },
+    ]
+
+    # Create current snapshot with manifest
+    dataset.metadata.current_snapshot = Snapshot(
+        snapshot_id=1000,
+        timestamp_ms=1000,
+        manifest_list="/tmp/manifest.parquet",
+    )
+
+    # Mock IO and catalog
+    dataset.io = Mock()
+    dataset.catalog = Mock()
+
+    # Create compactor
+    compactor = DatasetCompactor(dataset, strategy="brute", author="test", agent="test-agent")
+
+    # Verify strategy selection
+    assert compactor.strategy == "brute", "Strategy should be brute"
+    assert compactor.decision == "user", "Decision should be user"
+
+    # Test selection logic directly
+    plan = compactor._select_brute_compaction(mock_entries)
+
+    assert plan is not None, "Should find files to compact"
+    assert plan["type"] == "combine", "Should plan to combine small files"
+    assert len(plan["files"]) == 2, "Should select 2 small files"
+
+    print("✓ Brute force compaction test passed")
+
+
+def test_performance_compaction():
+    """Test performance compaction strategy."""
+    print("Testing performance compaction...")
+
+    # Create mock dataset with sort order
+    dataset = Mock()
+    dataset.metadata = DatasetMetadata(
+        dataset_identifier="test_dataset",
+        location="/tmp/test_data",
+    )
+    dataset.metadata.sort_orders = [0]  # Sort by first column
+    dataset.metadata.schema = Mock()
+    dataset.metadata.schema.fields = [Mock(name="timestamp")]
+    dataset.metadata.snapshots = []
+    dataset.metadata.current_snapshot = None
+
+    # Create mock entries with overlapping ranges
+    mock_entries = [
+        {
+            "file_path": "/tmp/file1.parquet",
+            "file_size_in_bytes": 30 * 1024 * 1024,
+            "uncompressed_size_in_bytes": 40 * 1024 * 1024,
+            "record_count": 1000,
+            "lower_bounds": {"timestamp": 1},
+            "upper_bounds": {"timestamp": 100},
+        },
+        {
+            "file_path": "/tmp/file2.parquet",
+            "file_size_in_bytes": 35 * 1024 * 1024,
+            "uncompressed_size_in_bytes": 50 * 1024 * 1024,
+            "record_count": 1200,
+            "lower_bounds": {"timestamp": 50},  # Overlaps with file1
+            "upper_bounds": {"timestamp": 150},
+        },
+        {
+            "file_path": "/tmp/file3.parquet",
+            "file_size_in_bytes": 110 * 1024 * 1024,
+            "uncompressed_size_in_bytes": 130 * 1024 * 1024,
+            "record_count": 3000,
+            "lower_bounds": {"timestamp": 200},  # No overlap
+            "upper_bounds": {"timestamp": 300},
+        },
+    ]
+
+    dataset.metadata.current_snapshot = Snapshot(
+        snapshot_id=1000,
+        timestamp_ms=1000,
+        manifest_list="/tmp/manifest.parquet",
+    )
+
+    # Mock IO and catalog
+    dataset.io = Mock()
+    dataset.catalog = Mock()
+
+    # Create compactor (auto-detect should choose performance)
+    compactor = DatasetCompactor(dataset, strategy=None, author="test", agent="test-agent")
+
+    # Verify strategy selection
+    assert compactor.strategy == "performance", "Should auto-select performance strategy"
+    assert compactor.decision == "auto", "Decision should be auto"
+
+    # Test selection logic directly
+    plan = compactor._select_performance_compaction(mock_entries)
+
+    assert plan is not None, "Should find overlapping files"
+    assert plan["type"] == "combine-split", "Should plan to combine and split"
+    assert len(plan["files"]) == 2, "Should select 2 overlapping files"
+    assert plan["sort_column"] == "timestamp", "Should identify sort column"
+
+    print("✓ Performance compaction test passed")
+
+
+def test_large_file_splitting():
+    """Test that large files are identified for splitting."""
+    print("Testing large file splitting...")
+
+    dataset = Mock()
+    dataset.metadata = DatasetMetadata(
+        dataset_identifier="test_dataset",
+        location="/tmp/test_data",
+    )
+    dataset.metadata.sort_orders = []
+
+    # Create entry for a large file
+    mock_entries = [
+        {
+            "file_path": "/tmp/large_file.parquet",
+            "file_size_in_bytes": 180 * 1024 * 1024,
+            "uncompressed_size_in_bytes": 200 * 1024 * 1024,  # 200MB > 196MB threshold
+            "record_count": 5000,
+        }
+    ]
+
+    compactor = DatasetCompactor(dataset, strategy="brute")
+    plan = compactor._select_brute_compaction(mock_entries)
+
+    assert plan is not None, "Should identify large file"
+    assert plan["type"] == "split", "Should plan to split"
+    assert plan["reason"] == "file-too-large", "Reason should be file too large"
+
+    print("✓ Large file splitting test passed")
+
+
+def test_no_compaction_needed():
+    """Test when no compaction is needed."""
+    print("Testing no compaction scenario...")
+
+    dataset = Mock()
+    dataset.metadata = DatasetMetadata(
+        dataset_identifier="test_dataset",
+        location="/tmp/test_data",
+    )
+    dataset.metadata.sort_orders = []
+
+    # All files are in acceptable range
+    mock_entries = [
+        {
+            "file_path": "/tmp/file1.parquet",
+            "file_size_in_bytes": 100 * 1024 * 1024,
+            "uncompressed_size_in_bytes": 110 * 1024 * 1024,
+            "record_count": 2000,
+        },
+        {
+            "file_path": "/tmp/file2.parquet",
+            "file_size_in_bytes": 120 * 1024 * 1024,
+            "uncompressed_size_in_bytes": 135 * 1024 * 1024,
+            "record_count": 2500,
+        },
+    ]
+
+    compactor = DatasetCompactor(dataset, strategy="brute")
+    plan = compactor._select_brute_compaction(mock_entries)
+
+    assert plan is None, "Should not find anything to compact"
+
+    print("✓ No compaction test passed")
+
+
+if __name__ == "__main__":
+    print("Running compaction tests...\n")
+    test_brute_compaction()
+    test_performance_compaction()
+    test_large_file_splitting()
+    test_no_compaction_needed()
+    print("\n✅ All tests passed!")
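As a quick orientation (not part of the package diff), the sketch below drives the same selection logic the tests above exercise. It is a minimal, hedged example: the DatasetCompactor constructor, the strategy/decision attributes, and the private _select_brute_compaction helper are taken from the tests; the mocked dataset, the entry values, and the expected plan contents are illustrative assumptions rather than package documentation.

# Illustrative sketch only: mirrors how tests/test_compaction.py drives the
# compaction selection logic. Values here are assumptions for demonstration.
from unittest.mock import Mock

from opteryx_catalog.catalog.compaction import DatasetCompactor
from opteryx_catalog.catalog.metadata import DatasetMetadata

dataset = Mock()
dataset.metadata = DatasetMetadata(dataset_identifier="example", location="/tmp/example")
dataset.metadata.sort_orders = []  # no sort order, so the "brute" strategy applies

entries = [
    {"file_path": "/tmp/a.parquet", "file_size_in_bytes": 30 * 1024 * 1024,
     "uncompressed_size_in_bytes": 40 * 1024 * 1024, "record_count": 1000},
    {"file_path": "/tmp/b.parquet", "file_size_in_bytes": 35 * 1024 * 1024,
     "uncompressed_size_in_bytes": 50 * 1024 * 1024, "record_count": 1200},
]

compactor = DatasetCompactor(dataset, strategy="brute", author="me", agent="example-agent")
plan = compactor._select_brute_compaction(entries)  # returns None when nothing qualifies
if plan is not None:
    print(plan["type"], len(plan["files"]))  # e.g. "combine" with 2 files selected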
tests/test_dataset_metadata.py
CHANGED
@@ -13,3 +13,17 @@ def test_dataset_metadata_and_simpledataset():
     assert ds.metadata.dataset_identifier == "tests_temp.test"
     assert ds.snapshot() is None
     assert list(ds.snapshots()) == []
+
+
+def test_sequence_number_requires_history():
+    """Test that _next_sequence_number works with empty snapshots."""
+    meta = DatasetMetadata(
+        dataset_identifier="tests_temp.test",
+        location="gs://bucket/ws/tests_temp/test",
+        schema=None,
+        properties={},
+    )
+    ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+
+    # Should return 1 when no snapshots are loaded (first snapshot)
+    assert ds._next_sequence_number() == 1
tests/test_describe_uncompressed.py
ADDED
@@ -0,0 +1,127 @@
+import io
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from opteryx_catalog.catalog.dataset import SimpleDataset
+from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+
+
+class _MemInput:
+    def __init__(self, data: bytes):
+        self._data = data
+
+    def open(self):
+        # Provide a file-like BytesIO whose .read() returns the bytes
+        return io.BytesIO(self._data)
+
+
+class _MemIO:
+    def __init__(self, mapping: dict):
+        self._mapping = mapping
+
+    def new_input(self, path: str):
+        return _MemInput(self._mapping[path])
+
+
+def _build_manifest_bytes():
+    # Construct a parquet manifest with two entries, two columns per file
+    schema = pa.schema(
+        [
+            ("file_path", pa.string()),
+            ("file_format", pa.string()),
+            ("record_count", pa.int64()),
+            ("file_size_in_bytes", pa.int64()),
+            ("uncompressed_size_in_bytes", pa.int64()),
+            ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+            ("null_counts", pa.list_(pa.int64())),
+            ("min_k_hashes", pa.list_(pa.int64())),
+            ("histogram_counts", pa.list_(pa.int64())),
+            ("histogram_bins", pa.int64()),
+            ("min_values", pa.list_(pa.int64())),
+            ("max_values", pa.list_(pa.int64())),
+            ("min_values_display", pa.list_(pa.string())),
+            ("max_values_display", pa.list_(pa.string())),
+        ]
+    )
+
+    file_path = pa.array(["f1.parquet", "f2.parquet"], type=pa.string())
+    file_format = pa.array(["parquet", "parquet"], type=pa.string())
+    record_count = pa.array([10, 20], type=pa.int64())
+    file_size_in_bytes = pa.array([100, 200], type=pa.int64())
+    uncompressed_size_in_bytes = pa.array([1000, 2000], type=pa.int64())
+    column_uncompressed_sizes_in_bytes = pa.array(
+        [[100, 400], [300, 200]], type=pa.list_(pa.int64())
+    )
+    null_counts = pa.array([[0, 0], [0, 0]], type=pa.list_(pa.int64()))
+    min_k_hashes = pa.array([[1, 2], [1]], type=pa.list_(pa.int64()))
+    histogram_counts = pa.array([[1, 2], [3, 4]], type=pa.list_(pa.int64()))
+    histogram_bins = pa.array([32, 32], type=pa.int64())
+    min_values = pa.array([[10, 20], [5, 30]], type=pa.list_(pa.int64()))
+    max_values = pa.array([[100, 400], [300, 200]], type=pa.list_(pa.int64()))
+    min_values_display = pa.array([[None, None], [None, None]], type=pa.list_(pa.string()))
+    max_values_display = pa.array([[None, None], [None, None]], type=pa.list_(pa.string()))
+
+    table = pa.Table.from_arrays(
+        [
+            file_path,
+            file_format,
+            record_count,
+            file_size_in_bytes,
+            uncompressed_size_in_bytes,
+            column_uncompressed_sizes_in_bytes,
+            null_counts,
+            min_k_hashes,
+            histogram_counts,
+            histogram_bins,
+            min_values,
+            max_values,
+            min_values_display,
+            max_values_display,
+        ],
+        schema=schema,
+    )
+
+    buf = io.BytesIO()
+    pq.write_table(table, buf)
+    return buf.getvalue()
+
+
+def test_describe_includes_uncompressed_bytes():
+    manifest_bytes = _build_manifest_bytes()
+    manifest_path = "mem://manifest"
+
+    meta = DatasetMetadata(
+        dataset_identifier="tests_temp.test",
+        location="mem://",
+        schema=None,
+        properties={},
+    )
+
+    # Add a schema with two columns so describe() can map names -> indices
+    meta.schemas.append({"schema_id": "s1", "columns": [{"name": "a"}, {"name": "b"}]})
+    meta.current_schema_id = "s1"
+
+    # Prepare snapshot referencing our in-memory manifest
+    snap = Snapshot(
+        snapshot_id=1,
+        timestamp_ms=1,
+        manifest_list=manifest_path,
+    )
+    meta.snapshots.append(snap)
+    meta.current_snapshot_id = 1
+
+    ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+
+    # Inject our in-memory IO mapping
+    ds.io = _MemIO({manifest_path: manifest_bytes})
+
+    desc = ds.describe()
+
+    assert "a" in desc
+    assert "b" in desc
+
+    # Column 'a' should have uncompressed bytes = 100 + 300 = 400
+    assert desc["a"]["uncompressed_bytes"] == 400
+    # Column 'b' should have uncompressed bytes = 400 + 200 = 600
+    assert desc["b"]["uncompressed_bytes"] == 600
tests/test_refresh_manifest.py
ADDED
@@ -0,0 +1,275 @@
+import io
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+import os
+import sys
+
+# Add local paths to sys.path to use local code instead of installed packages
+sys.path.insert(0, os.path.join(sys.path[0], ".."))  # Add parent dir for pyiceberg_firestore_gcs
+sys.path.insert(1, os.path.join(sys.path[0], "../opteryx-core"))
+sys.path.insert(1, os.path.join(sys.path[0], "../pyiceberg-firestore-gcs"))
+
+
+from opteryx_catalog.catalog.dataset import SimpleDataset
+from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+from opteryx_catalog.catalog.manifest import (
+    build_parquet_manifest_entry_from_bytes,
+    get_manifest_metrics,
+    reset_manifest_metrics,
+)
+from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+import pytest
+
+
+def test_min_k_hashes_for_string_and_binary():
+    try:
+        pass  # type: ignore
+    except Exception:
+        pytest.skip("opteryx.draken not available")
+
+    import pyarrow as pa
+
+    # short binary and short string columns should get min-k
+    t = _make_parquet_table(
+        [("bin", pa.binary()), ("s", pa.string())], [(b"a", "x"), (b"b", "y"), (b"c", "z")]
+    )
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+    e = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+    assert len(e.min_k_hashes[0]) > 0
+    assert len(e.min_k_hashes[1]) > 0
+
+
+# Step 1: Create a local catalog
+catalog = OpteryxCatalog(
+    "opteryx",
+    firestore_project="mabeldev",
+    firestore_database="catalogs",
+    gcs_bucket="opteryx_data",
+)
+
+# print(catalog.load_dataset("ops.stdout_log").describe())
+
+
+class _MemInput:
+    def __init__(self, data: bytes):
+        self._data = data
+
+    def open(self):
+        return io.BytesIO(self._data)
+
+
+class _MemIO:
+    def __init__(self, mapping: dict):
+        self._mapping = mapping
+
+    def new_input(self, path: str):
+        return _MemInput(self._mapping[path])
+
+    def new_output(self, path: str):
+        class Out:
+            def __init__(self, mapping, path):
+                self._buf = io.BytesIO()
+                self._mapping = mapping
+                self._path = path
+
+            def write(self, data: bytes):
+                self._buf.write(data)
+
+            def close(self):
+                self._mapping[self._path] = self._buf.getvalue()
+
+            def create(self):
+                return self
+
+        return Out(self._mapping, path)
+
+
+class _FakeCatalog:
+    def __init__(self, io):
+        self.io = io
+
+    def write_parquet_manifest(
+        self, snapshot_id: int, entries: list[dict], dataset_location: str
+    ) -> str:
+        # Minimal manifest writer using same schema as production
+        schema = pa.schema(
+            [
+                ("file_path", pa.string()),
+                ("file_format", pa.string()),
+                ("record_count", pa.int64()),
+                ("file_size_in_bytes", pa.int64()),
+                ("uncompressed_size_in_bytes", pa.int64()),
+                ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+                ("null_counts", pa.list_(pa.int64())),
+                ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
+                ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
+                ("histogram_bins", pa.int32()),
+                ("min_values", pa.list_(pa.int64())),
+                ("max_values", pa.list_(pa.int64())),
+                ("min_values_display", pa.list_(pa.string())),
+                ("max_values_display", pa.list_(pa.string())),
+            ]
+        )
+        normalized = []
+        for ent in entries:
+            if not isinstance(ent, dict):
+                normalized.append(ent)
+                continue
+            e = dict(ent)
+            e.setdefault("min_k_hashes", [])
+            e.setdefault("histogram_counts", [])
+            e.setdefault("histogram_bins", 0)
+            e.setdefault("column_uncompressed_sizes_in_bytes", [])
+            e.setdefault("null_counts", [])
+            e.setdefault("min_values_display", [])
+            e.setdefault("max_values_display", [])
+            mv = e.get("min_values") or []
+            xv = e.get("max_values") or []
+            mv_disp = e.get("min_values_display") or []
+            xv_disp = e.get("max_values_display") or []
+            e["min_values"] = [int(v) if v is not None else None for v in mv]
+            e["max_values"] = [int(v) if v is not None else None for v in xv]
+            e["min_values_display"] = [str(v) if v is not None else None for v in mv_disp]
+            e["max_values_display"] = [str(v) if v is not None else None for v in xv_disp]
+            normalized.append(e)
+
+        table = pa.Table.from_pylist(normalized, schema=schema)
+        buf = pa.BufferOutputStream()
+        pq.write_table(table, buf, compression="zstd")
+        data = buf.getvalue().to_pybytes()
+        path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"
+        out = self.io.new_output(path).create()
+        out.write(data)
+        out.close()
+        return path
+
+
+def _make_parquet_table(columns: list[tuple[str, pa.DataType]], rows: list[tuple]):
+    arrays = []
+    for i, (name, dtype) in enumerate(columns):
+        col_vals = [r[i] for r in rows]
+        arrays.append(pa.array(col_vals, type=dtype))
+    return pa.Table.from_arrays(arrays, names=[c[0] for c in columns])
+
+
+def test_build_manifest_from_bytes_matches_table():
+    # ensure the bytes-based builder matches the table-based one
+    t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+
+    e_bytes = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+    # basic sanity checks (parity is enforced by using orig_table when available)
+    assert e_bytes.record_count == 2
+    assert e_bytes.file_size_in_bytes == len(data)
+
+
+def test_manifest_metrics_increments():
+    reset_manifest_metrics()
+    t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+
+    _ = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data), orig_table=t)
+    m = get_manifest_metrics()
+    assert m.get("files_read", 0) >= 1
+    assert m.get("hash_calls", 0) >= 1
+    assert m.get("compress_calls", 0) >= 1
+
+
+def test_table_based_builder_is_removed():
+    from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry
+
+    t = _make_parquet_table([("a", pa.int64())], [(1,)])
+    with pytest.raises(RuntimeError):
+        _ = build_parquet_manifest_entry(t, "mem://f", 0)
+
+
+def test_manifest_uses_rugo_for_sizes():
+    # Ensure the bytes-based builder uses rugo metadata to compute per-column sizes
+    reset_manifest_metrics()
+    t = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+    buf = pa.BufferOutputStream()
+    pq.write_table(t, buf, compression="zstd")
+    data = buf.getvalue().to_pybytes()
+
+    entry = build_parquet_manifest_entry_from_bytes(data, "mem://f", len(data))
+    m = get_manifest_metrics()
+
+    # rugo should report sizes (non-zero) for these synthetic files
+    assert m.get("sizes_from_rugo", 0) >= 1 or m.get("sizes_from_rugo_missing", 0) == 0
+    assert entry.uncompressed_size_in_bytes >= 0
+    assert isinstance(entry.column_uncompressed_sizes_in_bytes, list)
+    assert len(entry.column_uncompressed_sizes_in_bytes) == 2
+    # column sizes may be non-zero when metadata is available
+    assert all(isinstance(x, int) for x in entry.column_uncompressed_sizes_in_bytes)
+
+
+def test_refresh_manifest_with_single_file():
+    # single file with columns a,b for quick iteration
+    t1 = _make_parquet_table([("a", pa.int64()), ("b", pa.int64())], [(1, 10), (2, 20)])
+
+    # Write parquet file to mem
+    buf = pa.BufferOutputStream()
+    pq.write_table(t1, buf, compression="zstd")
+    d1 = buf.getvalue().to_pybytes()
+
+    f1 = "mem://data/f1.parquet"
+    manifest_path = "mem://manifest-old"
+
+    # Build initial manifest entry for single file (bytes-based builder)
+    e1 = build_parquet_manifest_entry_from_bytes(d1, f1, len(d1), orig_table=t1).to_dict()
+
+    # Create in-memory IO mapping including manifest and data file
+    mapping = {f1: d1}
+
+    # Write initial manifest with the single entry using the same writer as the catalog
+    fake_writer = _FakeCatalog(_MemIO(mapping))
+    manifest_path = fake_writer.write_parquet_manifest(1, [e1], "mem://")
+    # Ensure the manifest bytes are present in the mapping
+    mapping[manifest_path] = mapping[manifest_path]
+
+    # Persist the single-file manifest as JSON for quick inspection during
+    # iterative debugging (writes to repo `artifacts/` so you can open it).
+    import os
+    import json
+
+    artifacts_dir = os.path.join(os.getcwd(), "artifacts")
+    os.makedirs(artifacts_dir, exist_ok=True)
+    with open(
+        os.path.join(artifacts_dir, "single_file_manifest.json"), "w", encoding="utf-8"
+    ) as fh:
+        json.dump(e1, fh, indent=2, default=str)
+
+    # Create metadata and snapshot
+    meta = DatasetMetadata(
+        dataset_identifier="tests_temp.test", location="mem://", schema=None, properties={}
+    )
+    meta.schemas.append({"schema_id": "s1", "columns": [{"name": "a"}, {"name": "b"}]})
+    meta.current_schema_id = "s1"
+    snap = Snapshot(snapshot_id=1, timestamp_ms=1, manifest_list=manifest_path)
+    meta.snapshots.append(snap)
+    meta.current_snapshot_id = 1
+
+    ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+    ds.io = _MemIO(mapping)
+    ds.catalog = _FakeCatalog(ds.io)
+
+    # Refresh manifest (should re-read f1 and write a new manifest)
+    new_snap_id = ds.refresh_manifest(agent="test-agent", author="tester")
+    assert new_snap_id is not None
+
+    # Describe should include both columns and count bytes appropriately
+    desc = ds.describe()
+    assert "a" in desc
+    assert "b" in desc
+
+    # ensure uncompressed bytes are present and non-zero for both cols
+    assert desc["a"]["uncompressed_bytes"] > 0
+    assert desc["b"]["uncompressed_bytes"] > 0