opteryx-catalog 0.4.8__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of opteryx-catalog might be problematic.

@@ -0,0 +1,233 @@
+ """
+ Test script for compaction functionality.
+
+ This tests the DatasetCompactor class with both brute and performance strategies.
+ """
+
+ from unittest.mock import Mock
+
+ import pyarrow as pa
+
+ from opteryx_catalog.catalog.compaction import DatasetCompactor
+ from opteryx_catalog.catalog.metadata import DatasetMetadata, Snapshot
+
+
+ def create_test_table(num_rows: int, value_range: tuple = (0, 100)) -> pa.Table:
+     """Create a simple test table with a timestamp column for sorting."""
+     import random
+
+     timestamps = sorted([random.randint(value_range[0], value_range[1]) for _ in range(num_rows)])
+     values = [f"value_{i}" for i in range(num_rows)]
+
+     return pa.table({"timestamp": timestamps, "value": values})
+
+
+ def test_brute_compaction():
+     """Test brute force compaction strategy."""
+     print("Testing brute force compaction...")
+
+     # Create mock dataset
+     dataset = Mock()
+     dataset.metadata = DatasetMetadata(
+         dataset_identifier="test_dataset",
+         location="/tmp/test_data",
+     )
+     dataset.metadata.sort_orders = []  # No sort order for brute
+     dataset.metadata.snapshots = []
+     dataset.metadata.current_snapshot = None
+
+     # Create mock entries - small files that should be combined
+     mock_entries = [
+         {
+             "file_path": "/tmp/file1.parquet",
+             "file_size_in_bytes": 30 * 1024 * 1024,  # 30MB compressed
+             "uncompressed_size_in_bytes": 40 * 1024 * 1024,  # 40MB uncompressed
+             "record_count": 1000,
+         },
+         {
+             "file_path": "/tmp/file2.parquet",
+             "file_size_in_bytes": 35 * 1024 * 1024,  # 35MB compressed
+             "uncompressed_size_in_bytes": 50 * 1024 * 1024,  # 50MB uncompressed
+             "record_count": 1200,
+         },
+         {
+             "file_path": "/tmp/file3.parquet",
+             "file_size_in_bytes": 110 * 1024 * 1024,  # 110MB compressed (acceptable)
+             "uncompressed_size_in_bytes": 130 * 1024 * 1024,  # 130MB uncompressed
+             "record_count": 3000,
+         },
+     ]
+
+     # Create current snapshot with manifest
+     dataset.metadata.current_snapshot = Snapshot(
+         snapshot_id=1000,
+         timestamp_ms=1000,
+         manifest_list="/tmp/manifest.parquet",
+     )
+
+     # Mock IO and catalog
+     dataset.io = Mock()
+     dataset.catalog = Mock()
+
+     # Create compactor
+     compactor = DatasetCompactor(dataset, strategy="brute", author="test", agent="test-agent")
+
+     # Verify strategy selection
+     assert compactor.strategy == "brute", "Strategy should be brute"
+     assert compactor.decision == "user", "Decision should be user"
+
+     # Test selection logic directly
+     plan = compactor._select_brute_compaction(mock_entries)
+
+     assert plan is not None, "Should find files to compact"
+     assert plan["type"] == "combine", "Should plan to combine small files"
+     assert len(plan["files"]) == 2, "Should select 2 small files"
+
+     print("✓ Brute force compaction test passed")
+
+
+ def test_performance_compaction():
+     """Test performance compaction strategy."""
+     print("Testing performance compaction...")
+
+     # Create mock dataset with sort order
+     dataset = Mock()
+     dataset.metadata = DatasetMetadata(
+         dataset_identifier="test_dataset",
+         location="/tmp/test_data",
+     )
+     dataset.metadata.sort_orders = [0]  # Sort by first column
+     dataset.metadata.schema = Mock()
+     dataset.metadata.schema.fields = [Mock(name="timestamp")]
+     dataset.metadata.snapshots = []
+     dataset.metadata.current_snapshot = None
+
+     # Create mock entries with overlapping ranges
+     mock_entries = [
+         {
+             "file_path": "/tmp/file1.parquet",
+             "file_size_in_bytes": 30 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 40 * 1024 * 1024,
+             "record_count": 1000,
+             "lower_bounds": {"timestamp": 1},
+             "upper_bounds": {"timestamp": 100},
+         },
+         {
+             "file_path": "/tmp/file2.parquet",
+             "file_size_in_bytes": 35 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 50 * 1024 * 1024,
+             "record_count": 1200,
+             "lower_bounds": {"timestamp": 50},  # Overlaps with file1
+             "upper_bounds": {"timestamp": 150},
+         },
+         {
+             "file_path": "/tmp/file3.parquet",
+             "file_size_in_bytes": 110 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 130 * 1024 * 1024,
+             "record_count": 3000,
+             "lower_bounds": {"timestamp": 200},  # No overlap
+             "upper_bounds": {"timestamp": 300},
+         },
+     ]
+
+     dataset.metadata.current_snapshot = Snapshot(
+         snapshot_id=1000,
+         timestamp_ms=1000,
+         manifest_list="/tmp/manifest.parquet",
+     )
+
+     # Mock IO and catalog
+     dataset.io = Mock()
+     dataset.catalog = Mock()
+
+     # Create compactor (auto-detect should choose performance)
+     compactor = DatasetCompactor(dataset, strategy=None, author="test", agent="test-agent")
+
+     # Verify strategy selection
+     assert compactor.strategy == "performance", "Should auto-select performance strategy"
+     assert compactor.decision == "auto", "Decision should be auto"
+
+     # Test selection logic directly
+     plan = compactor._select_performance_compaction(mock_entries)
+
+     assert plan is not None, "Should find overlapping files"
+     assert plan["type"] == "combine-split", "Should plan to combine and split"
+     assert len(plan["files"]) == 2, "Should select 2 overlapping files"
+     assert plan["sort_column"] == "timestamp", "Should identify sort column"
+
+     print("✓ Performance compaction test passed")
+
+
+ def test_large_file_splitting():
+     """Test that large files are identified for splitting."""
+     print("Testing large file splitting...")
+
+     dataset = Mock()
+     dataset.metadata = DatasetMetadata(
+         dataset_identifier="test_dataset",
+         location="/tmp/test_data",
+     )
+     dataset.metadata.sort_orders = []
+
+     # Create entry for a large file
+     mock_entries = [
+         {
+             "file_path": "/tmp/large_file.parquet",
+             "file_size_in_bytes": 180 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 200 * 1024 * 1024,  # 200MB > 196MB threshold
+             "record_count": 5000,
+         }
+     ]
+
+     compactor = DatasetCompactor(dataset, strategy="brute")
+     plan = compactor._select_brute_compaction(mock_entries)
+
+     assert plan is not None, "Should identify large file"
+     assert plan["type"] == "split", "Should plan to split"
+     assert plan["reason"] == "file-too-large", "Reason should be file too large"
+
+     print("✓ Large file splitting test passed")
+
+
+ def test_no_compaction_needed():
+     """Test when no compaction is needed."""
+     print("Testing no compaction scenario...")
+
+     dataset = Mock()
+     dataset.metadata = DatasetMetadata(
+         dataset_identifier="test_dataset",
+         location="/tmp/test_data",
+     )
+     dataset.metadata.sort_orders = []
+
+     # All files are in acceptable range
+     mock_entries = [
+         {
+             "file_path": "/tmp/file1.parquet",
+             "file_size_in_bytes": 100 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 110 * 1024 * 1024,
+             "record_count": 2000,
+         },
+         {
+             "file_path": "/tmp/file2.parquet",
+             "file_size_in_bytes": 120 * 1024 * 1024,
+             "uncompressed_size_in_bytes": 135 * 1024 * 1024,
+             "record_count": 2500,
+         },
+     ]
+
+     compactor = DatasetCompactor(dataset, strategy="brute")
+     plan = compactor._select_brute_compaction(mock_entries)
+
+     assert plan is None, "Should not find anything to compact"
+
+     print("✓ No compaction test passed")
+
+
+ if __name__ == "__main__":
+     print("Running compaction tests...\n")
+     test_brute_compaction()
+     test_performance_compaction()
+     test_large_file_splitting()
+     test_no_compaction_needed()
+     print("\n✅ All tests passed!")
@@ -0,0 +1,29 @@
+ from opteryx_catalog.catalog.metadata import DatasetMetadata
+ from opteryx_catalog.catalog.dataset import SimpleDataset
+
+
+ def test_dataset_metadata_and_simpledataset():
+     meta = DatasetMetadata(
+         dataset_identifier="tests_temp.test",
+         location="gs://bucket/ws/tests_temp/test",
+         schema=None,
+         properties={},
+     )
+     ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+     assert ds.metadata.dataset_identifier == "tests_temp.test"
+     assert ds.snapshot() is None
+     assert list(ds.snapshots()) == []
+
+
+ def test_sequence_number_requires_history():
+     """Test that _next_sequence_number works with empty snapshots."""
+     meta = DatasetMetadata(
+         dataset_identifier="tests_temp.test",
+         location="gs://bucket/ws/tests_temp/test",
+         schema=None,
+         properties={},
+     )
+     ds = SimpleDataset(identifier="tests_temp.test", _metadata=meta)
+
+     # Should return 1 when no snapshots are loaded (first snapshot)
+     assert ds._next_sequence_number() == 1
tests/test_import.py ADDED
@@ -0,0 +1,5 @@
+ def test_import_opteryx_catalog():
+     import importlib
+
+     mod = importlib.import_module("opteryx_catalog")
+     assert mod is not None
@@ -0,0 +1,8 @@
+ import pathlib
+ import tomllib
+
+
+ def test_pyproject_name():
+     p = pathlib.Path("pyproject.toml")
+     data = tomllib.loads(p.read_text())
+     assert data.get("project", {}).get("name") == "opteryx-catalog"