opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/__init__.py +1 -1
- opteryx_catalog/catalog/__init__.py +2 -1
- opteryx_catalog/catalog/compaction.py +536 -0
- opteryx_catalog/catalog/dataset.py +840 -520
- opteryx_catalog/catalog/manifest.py +475 -0
- opteryx_catalog/catalog/metadata.py +5 -2
- opteryx_catalog/catalog/metastore.py +2 -2
- opteryx_catalog/exceptions.py +1 -1
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/iops/gcs.py +35 -5
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +296 -242
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/create_dataset.py +1 -1
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- scripts/read_dataset.py +1 -1
- tests/test_collections.py +37 -0
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +14 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
opteryx_catalog/catalog/manifest.py CHANGED

@@ -1,9 +1,17 @@
 from __future__ import annotations
 
+import logging
+import time
+from collections import Counter
 from dataclasses import dataclass
 from dataclasses import field
+from typing import Any
 from typing import Dict
 
+NULL_FLAG = -(1 << 63)
+MIN_K_HASHES = 32
+HISTOGRAM_BINS = 32
+
 
 @dataclass
 class DataFile:

@@ -21,3 +29,470 @@ class ManifestEntry:
     snapshot_id: int
     data_file: DataFile
     status: str = "added"  # 'added' | 'deleted'
+
+
+@dataclass
+class ParquetManifestEntry:
+    """Represents a single entry in a Parquet manifest with statistics."""
+
+    file_path: str
+    file_format: str
+    record_count: int
+    file_size_in_bytes: int
+    uncompressed_size_in_bytes: int
+    column_uncompressed_sizes_in_bytes: list[int]
+    null_counts: list[int]
+    min_k_hashes: list[list[int]]
+    histogram_counts: list[list[int]]
+    histogram_bins: int
+    min_values: list
+    max_values: list
+    min_values_display: list
+    max_values_display: list
+
+    def to_dict(self) -> dict:
+        return {
+            "file_path": self.file_path,
+            "file_format": self.file_format,
+            "record_count": self.record_count,
+            "file_size_in_bytes": self.file_size_in_bytes,
+            "uncompressed_size_in_bytes": self.uncompressed_size_in_bytes,
+            "column_uncompressed_sizes_in_bytes": self.column_uncompressed_sizes_in_bytes,
+            "null_counts": self.null_counts,
+            "min_k_hashes": self.min_k_hashes,
+            "histogram_counts": self.histogram_counts,
+            "histogram_bins": self.histogram_bins,
+            "min_values": self.min_values,
+            "max_values": self.max_values,
+            "min_values_display": self.min_values_display,
+            "max_values_display": self.max_values_display,
+        }
+
+
+logger = logging.getLogger(__name__)
+_manifest_metrics = Counter()
+
+
+def _compute_stats_for_arrow_column(col, field_type, file_path: str):
+    """Compute statistics for a single PyArrow column (Array or ChunkedArray).
+
+    Returns a tuple: (col_min_k, col_hist, col_min, col_max, min_display, max_display, null_count)
+    """
+    import heapq
+
+    import opteryx.draken as draken  # type: ignore
+    import pyarrow as pa
+
+    # Ensure single contiguous array when possible
+    if hasattr(col, "combine_chunks"):
+        try:
+            col = col.combine_chunks()
+        except Exception:
+            # leave as-is
+            pass
+
+    # Record compress/hash usage
+    _manifest_metrics["hash_calls"] += 1
+    _manifest_metrics["compress_calls"] += 1
+
+    col_py = None
+    try:
+        vec = draken.Vector.from_arrow(col)
+    except Exception:  # pragma: no cover - be robust
+        raise
+
+    hashes = set(vec.hash())
+
+    # Decide whether to compute min-k/histogram for this column
+    compute_min_k = False
+    if (
+        pa.types.is_integer(field_type)
+        or pa.types.is_floating(field_type)
+        or pa.types.is_decimal(field_type)
+    ):
+        compute_min_k = True
+    elif (
+        pa.types.is_timestamp(field_type)
+        or pa.types.is_date(field_type)
+        or pa.types.is_time(field_type)
+    ):
+        compute_min_k = True
+    elif (
+        pa.types.is_string(field_type)
+        or pa.types.is_large_string(field_type)
+        or pa.types.is_binary(field_type)
+        or pa.types.is_large_binary(field_type)
+    ):
+        # For strings/binary we may need pylist for display
+        try:
+            col_py = col.to_pylist()
+        except Exception:
+            col_py = None
+        compute_min_k = True
+
+    if compute_min_k:
+        smallest = heapq.nsmallest(MIN_K_HASHES, hashes)
+        col_min_k = sorted(smallest)
+    else:
+        col_min_k = []
+
+    import pyarrow as pa  # local import for types
+
+    compute_hist = compute_min_k
+    if pa.types.is_boolean(field_type):
+        compute_hist = True
+
+    # Use draken.compress() to get canonical int64 per value
+    compressed = list(vec.compress())
+    null_count = sum(1 for m in compressed if m == NULL_FLAG)
+
+    non_nulls_compressed = [m for m in compressed if m != NULL_FLAG]
+    if non_nulls_compressed:
+        vmin = min(non_nulls_compressed)
+        vmax = max(non_nulls_compressed)
+        col_min = int(vmin)
+        col_max = int(vmax)
+        if compute_hist:
+            # Special-case boolean histograms
+            if pa.types.is_boolean(field_type):
+                try:
+                    if col_py is None:
+                        try:
+                            col_py = col.to_pylist()
+                        except Exception:
+                            col_py = None
+                    if col_py is not None:
+                        non_nulls_bool = [v for v in col_py if v is not None]
+                        false_count = sum(1 for v in non_nulls_bool if v is False)
+                        true_count = sum(1 for v in non_nulls_bool if v is True)
+                    else:
+                        # Fallback: infer from compressed mapping (assume 0/1)
+                        false_count = sum(1 for m in non_nulls_compressed if m == 0)
+                        true_count = sum(1 for m in non_nulls_compressed if m != 0)
+                except Exception:
+                    false_count = 0
+                    true_count = 0
+
+                col_hist = [int(true_count), int(false_count)]
+            else:
+                if vmin == vmax:
+                    col_hist = []
+                else:
+                    col_hist = [0] * HISTOGRAM_BINS
+                    span = float(vmax - vmin)
+                    for m in non_nulls_compressed:
+                        b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
+                        if b < 0:
+                            b = 0
+                        if b >= HISTOGRAM_BINS:
+                            b = HISTOGRAM_BINS - 1
+                        col_hist[b] += 1
+        else:
+            col_hist = []
+    else:
+        # no non-null values
+        col_min = NULL_FLAG
+        col_max = NULL_FLAG
+        col_hist = []
+
+    # display values
+    try:
+        if pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
+            if col_py is None:
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+            if col_py is not None:
+                non_nulls_str = [x for x in col_py if x is not None]
+                if non_nulls_str:
+                    min_value = min(non_nulls_str)
+                    max_value = max(non_nulls_str)
+                    if len(min_value) > 16:
+                        min_value = min_value[:16] + "..."
+                    if len(max_value) > 16:
+                        max_value = max_value[:16] + "..."
+                    min_display = min_value
+                    max_display = max_value
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+        elif pa.types.is_binary(field_type) or pa.types.is_large_binary(field_type):
+            if col_py is None:
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+            if col_py is not None:
+                non_nulls = [x for x in col_py if x is not None]
+                if non_nulls:
+                    min_value = min(non_nulls)
+                    max_value = max(non_nulls)
+                    if len(min_value) > 16:
+                        min_value = min_value[:16] + "..."
+                    if len(max_value) > 16:
+                        max_value = max_value[:16] + "..."
+                    if any(ord(b) < 32 or ord(b) > 126 for b in min_value):
+                        min_value = min_value.hex()
+                        min_value = min_value[:16] + "..."
+                    if any(ord(b) < 32 or ord(b) > 126 for b in max_value):
+                        max_value = max_value.hex()
+                        max_value = max_value[:16] + "..."
+                    min_display = min_value
+                    max_display = max_value
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+        else:
+            if col_py is None:
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+            if col_py is not None:
+                non_nulls = [x for x in col_py if x is not None]
+                if non_nulls:
+                    min_display = min(non_nulls)
+                    max_display = max(non_nulls)
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+    except Exception:
+        min_display = None
+        max_display = None
+
+    return (
+        col_min_k,
+        col_hist,
+        int(col_min),
+        int(col_max),
+        min_display,
+        max_display,
+        int(null_count),
+    )
+
+
+def build_parquet_manifest_entry_from_bytes(
+    data_bytes: bytes,
+    file_path: str,
+    file_size_in_bytes: int | None = None,
+    orig_table: Any | None = None,
+) -> ParquetManifestEntry:
+    """Build a manifest entry by reading a parquet file as bytes and scanning column-by-column.
+
+    This reads the compressed file once and materializes one full column at a time
+    (combine_chunks) which keeps peak memory low while letting per-column
+    stat calculation (draken) operate on contiguous arrays.
+    """
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    t_start = time.perf_counter()
+    _manifest_metrics["files_read"] += 1
+    _manifest_metrics["bytes_read"] += len(data_bytes)
+
+    buf = pa.BufferReader(data_bytes)
+    pf = pq.ParquetFile(buf)
+    meta = pf.metadata
+
+    # Try to read rugo metadata early so we can compute sizes without
+    # materializing the table later. This is zero-copy and fast.
+    try:
+        from opteryx.rugo.parquet import read_metadata_from_memoryview
+
+        rmeta = read_metadata_from_memoryview(memoryview(data_bytes))
+    except Exception:
+        rmeta = None
+
+    # Prepare result containers
+    min_k_hashes: list[list[int]] = []
+    histograms: list[list[int]] = []
+    min_values: list[int] = []
+    null_counts: list[int] = []
+    max_values: list[int] = []
+    min_values_display: list = []
+    max_values_display: list = []
+
+    # iterate schema fields and process each column independently
+    schema = pf.schema_arrow
+    for col_idx, field in enumerate(schema):
+        col_name = field.name
+        try:
+            col_table = pf.read(columns=[col_name])
+            col = col_table.column(0).combine_chunks()
+        except Exception:
+            # fallback: try reading the row group column (more granular)
+            try:
+                tbl = pf.read_row_group(0, columns=[col_name])
+                col = tbl.column(0).combine_chunks()
+            except Exception:
+                # Last resort: read entire file and then take the column
+                tbl = pf.read()
+                col = tbl.column(col_idx).combine_chunks()
+
+        # compute stats using existing logic encapsulated in helper
+        (
+            col_min_k,
+            col_hist,
+            col_min,
+            col_max,
+            col_min_display,
+            col_max_display,
+            null_count,
+        ) = _compute_stats_for_arrow_column(col, field.type, file_path)
+
+        # free the table-level reference if present so memory can be reclaimed
+        try:
+            del col_table
+        except Exception:
+            pass
+        try:
+            del tbl
+        except Exception:
+            pass
+
+        min_k_hashes.append(col_min_k)
+        histograms.append(col_hist)
+        min_values.append(col_min)
+        max_values.append(col_max)
+        min_values_display.append(col_min_display)
+        max_values_display.append(col_max_display)
+        null_counts.append(null_count)
+
+    # Calculate uncompressed sizes. When the original in-memory table is
+    # available (we just wrote it), prefer using it so sizes match the
+    # table-based builder exactly. Otherwise materialize the table from
+    # bytes and compute sizes the same way.
+    import pyarrow as pa
+    import pyarrow.parquet as pq
+
+    column_uncompressed: list[int] = []
+    uncompressed_size = 0
+
+    # Free references to large objects we no longer need so memory can be reclaimed
+    try:
+        del buf
+    except Exception:
+        pass
+    try:
+        del pf
+    except Exception:
+        pass
+    try:
+        del data_bytes
+    except Exception:
+        pass
+
+    if orig_table is not None:
+        # Use the original table buffers so results match the table-based route
+        for col in orig_table.columns:
+            col_total = 0
+            for chunk in col.chunks:
+                try:
+                    buffs = chunk.buffers()
+                except Exception as exc:
+                    raise RuntimeError(
+                        f"Unable to access chunk buffers to calculate uncompressed size for {file_path}: {exc}"
+                    ) from exc
+                for buffer in buffs:
+                    if buffer is not None:
+                        col_total += buffer.size
+            column_uncompressed.append(int(col_total))
+            uncompressed_size += col_total
+    else:
+        # Use rugo metadata (if available) to compute per-column uncompressed sizes
+        if rmeta:
+            rgs = rmeta.get("row_groups", [])
+            if rgs:
+                ncols = len(rgs[0].get("columns", []))
+                for cidx in range(ncols):
+                    col_total = 0
+                    for rg in rgs:
+                        cols = rg.get("columns", [])
+                        if cidx < len(cols):
+                            col_total += int(cols[cidx].get("total_byte_size", 0) or 0)
+                    column_uncompressed.append(int(col_total))
+                    uncompressed_size += col_total
+                _manifest_metrics["sizes_from_rugo"] += 1
+            else:
+                column_uncompressed = [0] * len(schema)
+                uncompressed_size = 0
+                _manifest_metrics["sizes_from_rugo_missing"] += 1
+        else:
+            # If rugo metadata isn't available, avoid materializing the table;
+            # emit zero sizes (safe and memory-light) and track that we lacked
+            # metadata for sizes.
+            column_uncompressed = [0] * len(schema)
+            uncompressed_size = 0
+            _manifest_metrics["sizes_from_rugo_unavailable"] += 1
+            logger.debug(
+                "rugo metadata unavailable for %s; emitting zero column sizes to avoid materializing table",
+                file_path,
+            )
+
+    entry = ParquetManifestEntry(
+        file_path=file_path,
+        file_format="parquet",
+        record_count=int(meta.num_rows),
+        file_size_in_bytes=int(file_size_in_bytes or len(data_bytes)),
+        uncompressed_size_in_bytes=uncompressed_size,
+        column_uncompressed_sizes_in_bytes=column_uncompressed,
+        null_counts=null_counts,
+        min_k_hashes=min_k_hashes,
+        histogram_counts=histograms,
+        histogram_bins=HISTOGRAM_BINS,
+        min_values=min_values,
+        max_values=max_values,
+        min_values_display=min_values_display,
+        max_values_display=max_values_display,
+    )
+
+    logger.debug(
+        "build_parquet_manifest_entry_from_bytes %s files=%d dur=%.3fs",
+        file_path,
+        _manifest_metrics["files_read"],
+        time.perf_counter() - t_start,
+    )
+    return entry
+
+
+# Backwards-compatible wrapper that keeps the original calling convention
+# when a pyarrow Table is already provided (tests and some scripts rely on it).
+def build_parquet_manifest_entry(
+    table: Any, file_path: str, file_size_in_bytes: int | None = None
+) -> ParquetManifestEntry:
+    """DEPRECATED: explicit table-based manifest building is removed.
+
+    The implementation previously accepted a PyArrow ``table`` and performed
+    the same per-column statistics calculation. That behavior hid a different
+    IO/scan path and led to inconsistent performance characteristics.
+
+    Use ``build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=None)``
+    instead. If you have an in-memory table you can serialize it and call the
+    bytes-based builder, or pass ``orig_table`` to preserve exact uncompressed
+    size calculations.
+
+    This function now fails fast to avoid silently using the removed path.
+    """
+    raise RuntimeError(
+        "table-based manifest builder removed: use build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=table) instead"
+    )
+
+
+def get_manifest_metrics() -> dict:
+    """Return a snapshot of manifest instrumentation counters (for tests/benchmarks)."""
+    return dict(_manifest_metrics)
+
+
+def reset_manifest_metrics() -> None:
+    """Reset the manifest metrics counters to zero."""
+    _manifest_metrics.clear()
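A hedged sketch of how the new bytes-based builder might be driven from an in-memory table, following the guidance in the deprecation docstring above. The module path and the buffer round-trip are assumptions for illustration, not part of this diff, and the function internally imports opteryx.draken, so it only runs where opteryx is installed:

# Sketch (assumed usage): serialize an in-memory PyArrow table, then build the
# manifest entry from the resulting bytes, passing orig_table so uncompressed
# sizes come from the table buffers rather than rugo metadata.
import pyarrow as pa
import pyarrow.parquet as pq

from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes  # path assumed

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

sink = pa.BufferOutputStream()
pq.write_table(table, sink)
data_bytes = sink.getvalue().to_pybytes()

entry = build_parquet_manifest_entry_from_bytes(
    data_bytes,
    file_path="gs://bucket/dataset/data/part-000.parquet",  # illustrative path
    file_size_in_bytes=len(data_bytes),
    orig_table=table,
)
print(entry.to_dict()["record_count"], entry.to_dict()["null_counts"])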
opteryx_catalog/catalog/metadata.py CHANGED

@@ -46,12 +46,12 @@ class DatasetMetadata:
     location: str = ""
     schema: Any = None
     properties: dict = field(default_factory=dict)
-    #
+    # Dataset-level created/updated metadata
     timestamp_ms: Optional[int] = None
     author: Optional[str] = None
     description: Optional[str] = None
     describer: Optional[str] = None
-    sort_orders: List[
+    sort_orders: List[int] = field(default_factory=list)
     # Maintenance policy: retention settings grouped under a single block
     maintenance_policy: dict = field(
         default_factory=lambda: {

@@ -68,6 +68,9 @@ class DatasetMetadata:
     # Each schema dict may also include `timestamp-ms` and `author`.
     schemas: List[dict] = field(default_factory=list)
     current_schema_id: Optional[str] = None
+    # Annotations: list of annotation objects attached to this dataset
+    # Each annotation is a dict with keys like 'key' and 'value'.
+    annotations: List[dict] = field(default_factory=list)
 
     def current_snapshot(self) -> Optional[Snapshot]:
         if self.current_snapshot_id is None:
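A short sketch of the new annotations field in use. The import path and constructing DatasetMetadata with only location are assumptions; other required constructor fields, if any, are not visible in this hunk:

# Sketch: attaching key/value annotations to a dataset's metadata.
from opteryx_catalog.catalog.metadata import DatasetMetadata  # path assumed

meta = DatasetMetadata(location="gs://bucket/datasets/orders")  # assumes remaining fields default
meta.annotations.append({"key": "owner", "value": "data-platform"})
meta.annotations.append({"key": "pii", "value": "false"})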
opteryx_catalog/catalog/metastore.py CHANGED

@@ -15,12 +15,12 @@ class Metastore:
     implementations to ease future compatibility.
     """
 
-    def load_dataset(self, identifier: str) -> "
+    def load_dataset(self, identifier: str) -> "Dataset":
        raise NotImplementedError()
 
     def create_dataset(
         self, identifier: str, schema: Any, properties: dict | None = None
-    ) -> "
+    ) -> "Dataset":
        raise NotImplementedError()
 
     def drop_dataset(self, identifier: str) -> None:
opteryx_catalog/exceptions.py CHANGED

@@ -1,7 +1,7 @@
 """Catalog-specific exceptions for opteryx_catalog.
 
 Exceptions mirror previous behavior (they subclass KeyError where callers
-may expect KeyError) but provide explicit types for
+may expect KeyError) but provide explicit types for datasets, views and
 namespaces.
 """
 
opteryx_catalog/iops/fileio.py CHANGED

@@ -123,3 +123,16 @@ class GcsFileIO(FileIO):
             return True
         except Exception:
             return False
+
+
+# Centralized Parquet write options used across the codebase when writing
+# parquet files. Exported here so all writers share the same configuration.
+WRITE_PARQUET_OPTIONS = {
+    "compression": "ZSTD",
+    "compression_level": 3,
+    "use_dictionary": True,
+    "dictionary_pagesize_limit": 1024 * 1024,
+    "data_page_size": 1024 * 1024,
+    "version": "2.6",
+    "write_statistics": True,
+}
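These keys match pyarrow.parquet.write_table keyword arguments, so one plausible way a writer could consume the shared options is to unpack them directly; a minimal sketch (the output filename is illustrative):

# Sketch: reuse the shared write options when writing a parquet file.
import pyarrow as pa
import pyarrow.parquet as pq

from opteryx_catalog.iops.fileio import WRITE_PARQUET_OPTIONS

table = pa.table({"id": [1, 2, 3]})
pq.write_table(table, "part-000.parquet", **WRITE_PARQUET_OPTIONS)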
opteryx_catalog/iops/gcs.py CHANGED

@@ -1,14 +1,12 @@
 """
 Optimized GCS FileIO for opteryx_catalog.iops
-
-Adapted from pyiceberg_firestore_gcs.fileio.gcs_fileio to provide a fast
-HTTP-backed GCS implementation without depending on pyiceberg types.
 """
 
 import io
 import logging
 import os
 import urllib.parse
+from collections import OrderedDict
 from typing import Callable
 from typing import Union
 

@@ -20,6 +18,9 @@ from .base import FileIO
 from .base import InputFile
 from .base import OutputFile
 
+# we keep a local cache of recently read files
+MAX_CACHE_SIZE: int = 32
+
 logger = logging.getLogger(__name__)
 
 

@@ -116,12 +117,32 @@ class _GcsOutputStream(io.BytesIO):
 
 class _GcsInputFile(InputFile):
     def __init__(
-        self,
+        self,
+        location: str,
+        session: requests.Session,
+        access_token_getter: Callable[[], str],
+        cache: OrderedDict = None,
     ):
+        # Check cache first
+        if cache is not None and location in cache:
+            # Move to end (most recently used)
+            cache.move_to_end(location)
+            data = cache[location]
+            super().__init__(location, data)
+            return
+
         # read entire bytes via optimized session
         try:
             stream = _GcsInputStream(location, session, access_token_getter)
             data = stream.read()
+
+            # Add to cache
+            if cache is not None:
+                cache[location] = data
+                # Evict oldest if cache exceeds MAX_CACHE_SIZE entries
+                if len(cache) > MAX_CACHE_SIZE:
+                    cache.popitem(last=False)
+
             super().__init__(location, data)
         except FileNotFoundError:
             super().__init__(location, None)

@@ -152,6 +173,9 @@ class GcsFileIO(FileIO):
         self.manifest_paths: list[str] = []
         self.captured_manifests: list[tuple[str, bytes]] = []
 
+        # LRU cache for read operations (MAX_CACHE_SIZE files max)
+        self._read_cache: OrderedDict = OrderedDict()
+
         # Prepare requests session and set up credential refresh helper (token may expire)
         self._credentials = _get_storage_credentials()
         self._access_token = None

@@ -180,17 +204,23 @@ class GcsFileIO(FileIO):
         self._session.mount("https://", adapter)
 
     def new_input(self, location: str) -> InputFile:
-        return _GcsInputFile(location, self._session, self.get_access_token)
+        return _GcsInputFile(location, self._session, self.get_access_token, self._read_cache)
 
     def new_output(self, location: str) -> OutputFile:
         logger.info(f"new_output -> {location}")
 
+        # Invalidate cache entry if present
+        self._read_cache.pop(location, None)
+
         return _GcsOutputFile(location, self._session, self.get_access_token)
 
     def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
         if isinstance(location, (InputFile, OutputFile)):
             location = location.location
 
+        # Invalidate cache entry if present
+        self._read_cache.pop(location, None)
+
         path = location
         if path.startswith("gs://"):
             path = path[5:]
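The additions above implement a small LRU read cache with a plain OrderedDict: cache hits are moved to the end, and once the dict grows past MAX_CACHE_SIZE the oldest entry is evicted with popitem(last=False). A self-contained sketch of the same pattern, separate from the GCS classes:

# Sketch: the OrderedDict LRU pattern used by the read cache above.
from collections import OrderedDict

MAX_CACHE_SIZE = 32

cache: OrderedDict[str, bytes] = OrderedDict()

def cache_get(location: str) -> bytes | None:
    if location in cache:
        cache.move_to_end(location)  # mark as most recently used
        return cache[location]
    return None

def cache_put(location: str, data: bytes) -> None:
    cache[location] = data
    if len(cache) > MAX_CACHE_SIZE:
        cache.popitem(last=False)  # evict the least recently used entry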
opteryx_catalog/maki_nage/__init__.py

@@ -0,0 +1,8 @@
+# Lightweight package shim so `opteryx.third_party.maki_nage` is importable
+from .distogram import Distogram
+from .distogram import histogram
+from .distogram import load
+from .distogram import merge
+from .distogram import quantile
+
+__all__ = ["Distogram", "load", "merge", "histogram", "quantile"]
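A hedged sketch of how the re-exported distogram API might be used, assuming the vendored module follows the upstream maki-nage distogram conventions (module-level functions that take and return a Distogram value); the update helper is assumed to live in opteryx_catalog.maki_nage.distogram even though the shim does not re-export it:

# Sketch: build two streaming sketches, merge them, and query a quantile.
from opteryx_catalog.maki_nage import Distogram, merge, quantile
from opteryx_catalog.maki_nage.distogram import update  # assumed: not re-exported by the shim

h1 = Distogram()
h2 = Distogram()
for value in (5.0, 1.0, 3.0):
    h1 = update(h1, value)
for value in (9.0, 2.0, 7.0):
    h2 = update(h2, value)

combined = merge(h1, h2)
print(quantile(combined, 0.5))  # approximate median of the combined stream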