opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/__init__.py +1 -1
- opteryx_catalog/catalog/__init__.py +2 -1
- opteryx_catalog/catalog/compaction.py +529 -0
- opteryx_catalog/catalog/dataset.py +433 -451
- opteryx_catalog/catalog/manifest.py +415 -0
- opteryx_catalog/catalog/metadata.py +2 -2
- opteryx_catalog/catalog/metastore.py +2 -2
- opteryx_catalog/exceptions.py +1 -1
- opteryx_catalog/iops/gcs.py +35 -5
- opteryx_catalog/opteryx_catalog.py +257 -231
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/METADATA +1 -1
- opteryx_catalog-0.4.11.dist-info/RECORD +25 -0
- scripts/create_dataset.py +1 -1
- scripts/read_dataset.py +1 -1
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +14 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/WHEEL +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/top_level.txt +0 -0
opteryx_catalog/catalog/manifest.py
CHANGED

@@ -2,8 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 from dataclasses import field
+from typing import Any
 from typing import Dict
 
+NULL_FLAG = -(1 << 63)
+MIN_K_HASHES = 32
+HISTOGRAM_BINS = 32
+
 
 @dataclass
 class DataFile:
@@ -21,3 +26,413 @@ class ManifestEntry:
     snapshot_id: int
     data_file: DataFile
     status: str = "added"  # 'added' | 'deleted'
+
+
+@dataclass
+class ParquetManifestEntry:
+    """Represents a single entry in a Parquet manifest with statistics."""
+
+    file_path: str
+    file_format: str
+    record_count: int
+    file_size_in_bytes: int
+    uncompressed_size_in_bytes: int
+    column_uncompressed_sizes_in_bytes: list[int]
+    null_counts: list[int]
+    min_k_hashes: list[list[int]]
+    histogram_counts: list[list[int]]
+    histogram_bins: int
+    min_values: list
+    max_values: list
+
+    def to_dict(self) -> dict:
+        return {
+            "file_path": self.file_path,
+            "file_format": self.file_format,
+            "record_count": self.record_count,
+            "file_size_in_bytes": self.file_size_in_bytes,
+            "uncompressed_size_in_bytes": self.uncompressed_size_in_bytes,
+            "column_uncompressed_sizes_in_bytes": self.column_uncompressed_sizes_in_bytes,
+            "null_counts": self.null_counts,
+            "min_k_hashes": self.min_k_hashes,
+            "histogram_counts": self.histogram_counts,
+            "histogram_bins": self.histogram_bins,
+            "min_values": self.min_values,
+            "max_values": self.max_values,
+        }
+
+
+def build_parquet_manifest_entry(
+    table: Any, file_path: str, file_size_in_bytes: int
+) -> ParquetManifestEntry:
+    """Build a Parquet manifest entry with statistics for a PyArrow table.
+
+    Args:
+        table: PyArrow table to analyze
+        file_path: Path where the file is stored
+        file_size_in_bytes: Size of the parquet file in bytes
+
+    Returns:
+        ParquetManifestEntry with computed statistics
+    """
+    import pyarrow as pa
+
+    min_k_hashes: list[list[int]] = []
+    histograms: list[list[int]] = []
+    min_values: list[int] = []
+    null_counts: list[int] = []
+    max_values: list[int] = []
+
+    # Use draken for efficient hashing and compression when available.
+    import heapq
+
+    # Try to compute additional per-column statistics when draken is available.
+    try:
+        import opteryx.draken as draken  # type: ignore
+
+        for col_idx, col in enumerate(table.columns):
+            # hash column values to 64-bit via draken (new cpdef API)
+            vec = draken.Vector.from_arrow(col)
+            hashes = list(vec.hash())
+
+            # Decide whether to compute min-k/histogram for this column based
+            # on field type and, for strings, average length of values.
+            field_type = table.schema.field(col_idx).type
+            compute_min_k = False
+            if (
+                pa.types.is_integer(field_type)
+                or pa.types.is_floating(field_type)
+                or pa.types.is_decimal(field_type)
+            ):
+                compute_min_k = True
+            elif (
+                pa.types.is_timestamp(field_type)
+                or pa.types.is_date(field_type)
+                or pa.types.is_time(field_type)
+            ):
+                compute_min_k = True
+            elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
+                # compute average length from non-null values; only allow
+                # min-k/histogram for short strings (avg <= 16)
+                col_py = None
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+
+                if col_py is not None:
+                    lens = [len(x) for x in col_py if x is not None]
+                    if lens:
+                        avg_len = sum(lens) / len(lens)
+                        if avg_len <= 16:
+                            compute_min_k = True
+
+            # KMV: take K smallest unique hashes when allowed; otherwise
+            # store an empty list for this column. Deduplicate hashes so
+            # the KMV sketch contains unique hashes (avoids duplicates
+            # skewing cardinality estimates).
+            if compute_min_k:
+                unique_hashes = set(hashes)
+                smallest = heapq.nsmallest(MIN_K_HASHES, unique_hashes)
+                col_min_k = sorted(smallest)
+            else:
+                col_min_k = []
+
+            # For histogram decisions follow the same rule as min-k
+            compute_hist = compute_min_k
+
+            # Use draken.compress() to get canonical int64 per value
+            mapped = list(vec.compress())
+            # Compute null count from compressed representation
+            null_count = sum(1 for m in mapped if m == NULL_FLAG)
+            null_counts.append(int(null_count))
+            non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
+            if non_nulls_mapped:
+                vmin = min(non_nulls_mapped)
+                vmax = max(non_nulls_mapped)
+                col_min = int(vmin)
+                col_max = int(vmax)
+                if compute_hist:
+                    if vmin == vmax:
+                        col_hist = [0] * HISTOGRAM_BINS
+                        col_hist[-1] = len(non_nulls_mapped)
+                    else:
+                        col_hist = [0] * HISTOGRAM_BINS
+                        span = float(vmax - vmin)
+                        for m in non_nulls_mapped:
+                            b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
+                            if b < 0:
+                                b = 0
+                            if b >= HISTOGRAM_BINS:
+                                b = HISTOGRAM_BINS - 1
+                            col_hist[b] += 1
+                else:
+                    col_hist = [0] * HISTOGRAM_BINS
+            else:
+                # no non-null values; histogram via hash buckets
+                col_min = NULL_FLAG
+                col_max = NULL_FLAG
+                if compute_hist:
+                    col_hist = [0] * HISTOGRAM_BINS
+                    for h in hashes:
+                        b = (h >> (64 - 5)) & 0x1F
+                        col_hist[b] += 1
+                else:
+                    col_hist = [0] * HISTOGRAM_BINS
+
+            min_k_hashes.append(col_min_k)
+            histograms.append(col_hist)
+            min_values.append(col_min)
+            max_values.append(col_max)
+        # end for
+    except Exception:
+        # Draken not available or failed; leave min_k_hashes/histograms empty
+        min_k_hashes = [[] for _ in table.columns]
+        histograms = [[] for _ in table.columns]
+        # Attempt to compute per-column min/max from the table directly
+        try:
+            for col in table.columns:
+                try:
+                    col_py = col.to_pylist()
+                    non_nulls = [v for v in col_py if v is not None]
+                    null_count = len(col_py) - len(non_nulls)
+                    null_counts.append(int(null_count))
+                    if non_nulls:
+                        try:
+                            min_values.append(min(non_nulls))
+                            max_values.append(max(non_nulls))
+                        except Exception:
+                            min_values.append(None)
+                            max_values.append(None)
+                    else:
+                        min_values.append(None)
+                        max_values.append(None)
+                except Exception:
+                    min_values.append(None)
+                    max_values.append(None)
+                    # If we couldn't introspect column values, assume 0 nulls
+                    null_counts.append(0)
+        except Exception:
+            # If even direct inspection fails, ensure lists lengths match
+            min_values = [None] * len(table.columns)
+            max_values = [None] * len(table.columns)
+            null_counts = [0] * len(table.columns)
+
+    # Calculate uncompressed size from table buffers — must be accurate.
+    column_uncompressed: list[int] = []
+    uncompressed_size = 0
+    for col in table.columns:
+        col_total = 0
+        for chunk in col.chunks:
+            try:
+                buffs = chunk.buffers()
+            except Exception as exc:
+                raise RuntimeError(
+                    f"Unable to access chunk buffers to calculate uncompressed size for {file_path}: {exc}"
+                ) from exc
+            for buffer in buffs:
+                if buffer is not None:
+                    col_total += buffer.size
+        column_uncompressed.append(int(col_total))
+        uncompressed_size += col_total
+
+    return ParquetManifestEntry(
+        file_path=file_path,
+        file_format="parquet",
+        record_count=int(table.num_rows),
+        file_size_in_bytes=file_size_in_bytes,
+        uncompressed_size_in_bytes=uncompressed_size,
+        column_uncompressed_sizes_in_bytes=column_uncompressed,
+        null_counts=null_counts,
+        min_k_hashes=min_k_hashes,
+        histogram_counts=histograms,
+        histogram_bins=HISTOGRAM_BINS,
+        min_values=min_values,
+        max_values=max_values,
+    )
+
+
+def build_parquet_manifest_minmax_entry(data: bytes, file_path: str) -> ParquetManifestEntry:
+    """Build a Parquet manifest entry with min/max statistics using fast rugo reader.
+
+    This is much faster than build_parquet_manifest_entry (microseconds per file)
+    and is suitable for bulk file operations where full statistics are not needed.
+
+    Args:
+        data: Raw parquet file bytes
+        file_path: Path where the file is stored
+
+    Returns:
+        ParquetManifestEntry with min/max statistics only (no histograms or k-hashes)
+    """
+    file_size = len(data)
+
+    # Prefer rugo fast metadata reader when available, otherwise fall back
+    # to pyarrow ParquetFile to extract row-group statistics.
+    try:
+        import opteryx.rugo.parquet as parquet_meta
+        from opteryx.compiled.structures.relation_statistics import to_int
+
+        if isinstance(data, memoryview):
+            metadata = parquet_meta.read_metadata_from_memoryview(data, include_statistics=True)
+        else:
+            metadata = parquet_meta.read_metadata_from_memoryview(
+                memoryview(data), include_statistics=True
+            )
+
+        record_count = metadata["num_rows"]
+    except ImportError:
+        # Fallback: use pyarrow to read Parquet metadata
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        pf = pq.ParquetFile(pa.BufferReader(data))
+        record_count = int(pf.metadata.num_rows or 0)
+
+        # Construct minimal metadata structure compatible with expected shape
+        metadata = {"num_rows": record_count, "row_groups": []}
+        for rg in range(pf.num_row_groups):
+            rg_entry = {"columns": []}
+            for ci in range(pf.metadata.num_columns):
+                col_meta = pf.metadata.row_group(rg).column(ci)
+                col_entry = {"name": pf.schema.names[ci]}
+                stats = getattr(col_meta, "statistics", None)
+                if stats:
+                    col_entry["min"] = getattr(stats, "min", None)
+                    col_entry["max"] = getattr(stats, "max", None)
+                rg_entry["columns"].append(col_entry)
+            # total_byte_size may not be available; leave out to trigger full-table calculation later
+            metadata["row_groups"].append(rg_entry)
+
+        # Define a simple to_int fallback for the pyarrow path
+        def to_int(v: object) -> int:
+            try:
+                return int(v)
+            except Exception:
+                try:
+                    if isinstance(v, (bytes, bytearray)):
+                        s = v.decode("utf-8", errors="ignore")
+                        return int(float(s)) if s else 0
+                    return int(float(v))
+                except Exception:
+                    return 0
+
+    # Gather min/max per column across all row groups
+    column_stats = {}
+    for row_group in metadata["row_groups"]:
+        for column in row_group["columns"]:
+            column_name = column["name"]
+
+            if column_name not in column_stats:
+                column_stats[column_name] = {"min": None, "max": None}
+
+            min_value = column.get("min")
+            if min_value is not None:
+                # Compress value to int using to_int
+                min_compressed = to_int(min_value)
+                if column_stats[column_name]["min"] is None:
+                    column_stats[column_name]["min"] = min_compressed
+                else:
+                    column_stats[column_name]["min"] = min(
+                        column_stats[column_name]["min"], min_compressed
+                    )
+
+            max_value = column.get("max")
+            if max_value is not None:
+                # Compress value to int using to_int
+                max_compressed = to_int(max_value)
+                if column_stats[column_name]["max"] is None:
+                    column_stats[column_name]["max"] = max_compressed
+                else:
+                    column_stats[column_name]["max"] = max(
+                        column_stats[column_name]["max"], max_compressed
+                    )
+
+    # Extract min/max values (filter out None)
+    min_values = [stats["min"] for stats in column_stats.values() if stats["min"] is not None]
+    max_values = [stats["max"] for stats in column_stats.values() if stats["max"] is not None]
+
+    # Attempt to gather null counts from metadata row groups if available
+    column_nulls: dict = {}
+    for row_group in metadata["row_groups"]:
+        for column in row_group["columns"]:
+            cname = column["name"]
+            if cname not in column_nulls:
+                column_nulls[cname] = 0
+            nc = column.get("null_count")
+            if nc is not None:
+                try:
+                    column_nulls[cname] += int(nc)
+                except Exception:
+                    pass
+
+    if column_nulls:
+        null_counts = [column_nulls.get(n, 0) for n in column_stats.keys()]
+    else:
+        null_counts = []
+
+    # Get uncompressed size from metadata; if missing, read full table and
+    # compute accurate uncompressed size from buffers. Also attempt to
+    # compute per-column uncompressed byte counts when reading the table.
+    uncompressed_size = 0
+    column_uncompressed: list[int] = []
+    missing = False
+    for row_group in metadata["row_groups"]:
+        v = row_group.get("total_byte_size", None)
+        if v is None:
+            missing = True
+            break
+        uncompressed_size += v
+
+    if missing or uncompressed_size == 0:
+        try:
+            import pyarrow as pa
+            import pyarrow.parquet as pq
+
+            table = pq.read_table(pa.BufferReader(data))
+            uncompressed_size = 0
+            # Compute per-column uncompressed sizes and null counts from the table
+            for col in table.columns:
+                col_total = 0
+                null_total = 0
+                for chunk in col.chunks:
+                    for buffer in chunk.buffers():
+                        if buffer is not None:
+                            col_total += buffer.size
+                    try:
+                        null_total += int(chunk.null_count)
+                    except Exception:
+                        # Fallback to slow python inspection
+                        try:
+                            col_py = col.to_pylist()
+                            null_total = len(col_py) - len([v for v in col_py if v is not None])
+                        except Exception:
+                            null_total = 0
+
+                column_uncompressed.append(int(col_total))
+                uncompressed_size += col_total
+                null_counts = null_counts or []
+                null_counts.append(int(null_total))
+        except Exception as exc:
+            raise RuntimeError(
+                f"Unable to determine uncompressed size for {file_path}: {exc}"
+            ) from exc
+    else:
+        # If we didn't read the table and null_counts is still empty, default to zeros
+        if not null_counts:
+            null_counts = [0] * len(column_stats)
+
+    return ParquetManifestEntry(
+        file_path=file_path,
+        file_format="parquet",
+        record_count=int(record_count),
+        file_size_in_bytes=file_size,
+        uncompressed_size_in_bytes=uncompressed_size,
+        column_uncompressed_sizes_in_bytes=column_uncompressed,
+        null_counts=null_counts,
+        min_k_hashes=[],
+        histogram_counts=[],
+        histogram_bins=0,
+        min_values=min_values,
+        max_values=max_values,
+    )
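Note: the new manifest builders are plain module-level functions, so they can be exercised directly against an in-memory PyArrow table. The sketch below is illustrative only; it assumes the hunk above lands in opteryx_catalog.catalog.manifest (as the file list suggests), the sample table and gs:// path are made up, and the draken-backed statistics silently fall back to the pure pyarrow path when opteryx.draken is not installed.

import io

import pyarrow as pa
import pyarrow.parquet as pq

from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry
from opteryx_catalog.catalog.manifest import build_parquet_manifest_minmax_entry

table = pa.table({"id": [1, 2, 3, None], "label": ["a", "b", "c", "d"]})

# Write to an in-memory parquet file so the on-disk size is known.
buffer = io.BytesIO()
pq.write_table(table, buffer)
data = buffer.getvalue()

# Full statistics: null counts, buffer sizes, and (when draken is present)
# KMV min-k hashes plus 32-bin histograms per column.
entry = build_parquet_manifest_entry(table, "gs://bucket/part-0.parquet", len(data))
print(entry.record_count, entry.null_counts)

# Fast path: min/max only, driven by the parquet footer metadata.
fast = build_parquet_manifest_minmax_entry(data, "gs://bucket/part-0.parquet")
print(fast.min_values, fast.max_values)

The min_k_hashes field is a KMV (k-minimum-values) sketch, which is what the deduplication comment in the hunk alludes to. A hedged sketch of how such a list could be turned into a distinct-count estimate, assuming the stored hashes behave as uniform unsigned 64-bit values (the actual draken hash semantics are not shown in this diff):

MIN_K_HASHES = 32
HASH_SPACE = 2**64  # assumption: hashes are uniform over the unsigned 64-bit range


def estimate_distinct(min_k: list[int]) -> float:
    """Standard KMV estimator: (k - 1) * |hash space| / k-th smallest hash."""
    if len(min_k) < MIN_K_HASHES:
        # The sketch never filled up, so every distinct hash was retained.
        return float(len(min_k))
    kth = min_k[-1] % HASH_SPACE  # normalise a possibly signed hash
    return (MIN_K_HASHES - 1) * HASH_SPACE / (kth + 1)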
opteryx_catalog/catalog/metadata.py
CHANGED

@@ -46,12 +46,12 @@ class DatasetMetadata:
     location: str = ""
     schema: Any = None
     properties: dict = field(default_factory=dict)
-    #
+    # Dataset-level created/updated metadata
     timestamp_ms: Optional[int] = None
     author: Optional[str] = None
     description: Optional[str] = None
     describer: Optional[str] = None
-    sort_orders: List[
+    sort_orders: List[int] = field(default_factory=list)
     # Maintenance policy: retention settings grouped under a single block
     maintenance_policy: dict = field(
         default_factory=lambda: {
opteryx_catalog/catalog/metastore.py
CHANGED

@@ -15,12 +15,12 @@ class Metastore:
     implementations to ease future compatibility.
     """
 
-    def load_dataset(self, identifier: str) -> "
+    def load_dataset(self, identifier: str) -> "Dataset":
         raise NotImplementedError()
 
     def create_dataset(
         self, identifier: str, schema: Any, properties: dict | None = None
-    ) -> "
+    ) -> "Dataset":
         raise NotImplementedError()
 
     def drop_dataset(self, identifier: str) -> None:
opteryx_catalog/exceptions.py
CHANGED

@@ -1,7 +1,7 @@
 """Catalog-specific exceptions for opteryx_catalog.
 
 Exceptions mirror previous behavior (they subclass KeyError where callers
-may expect KeyError) but provide explicit types for
+may expect KeyError) but provide explicit types for datasets, views and
 namespaces.
 """
 
opteryx_catalog/iops/gcs.py
CHANGED

@@ -1,14 +1,12 @@
 """
 Optimized GCS FileIO for opteryx_catalog.iops
-
-Adapted from pyiceberg_firestore_gcs.fileio.gcs_fileio to provide a fast
-HTTP-backed GCS implementation without depending on pyiceberg types.
 """
 
 import io
 import logging
 import os
 import urllib.parse
+from collections import OrderedDict
 from typing import Callable
 from typing import Union
 
@@ -20,6 +18,9 @@ from .base import FileIO
 from .base import InputFile
 from .base import OutputFile
 
+# we keep a local cache of recently read files
+MAX_CACHE_SIZE: int = 32
+
 logger = logging.getLogger(__name__)
 
 
@@ -116,12 +117,32 @@ class _GcsOutputStream(io.BytesIO):
 
 class _GcsInputFile(InputFile):
     def __init__(
-        self,
+        self,
+        location: str,
+        session: requests.Session,
+        access_token_getter: Callable[[], str],
+        cache: OrderedDict = None,
     ):
+        # Check cache first
+        if cache is not None and location in cache:
+            # Move to end (most recently used)
+            cache.move_to_end(location)
+            data = cache[location]
+            super().__init__(location, data)
+            return
+
         # read entire bytes via optimized session
         try:
             stream = _GcsInputStream(location, session, access_token_getter)
             data = stream.read()
+
+            # Add to cache
+            if cache is not None:
+                cache[location] = data
+                # Evict oldest if cache exceeds MAX_CACHE_SIZE entries
+                if len(cache) > MAX_CACHE_SIZE:
+                    cache.popitem(last=False)
+
             super().__init__(location, data)
         except FileNotFoundError:
             super().__init__(location, None)
@@ -152,6 +173,9 @@ class GcsFileIO(FileIO):
         self.manifest_paths: list[str] = []
         self.captured_manifests: list[tuple[str, bytes]] = []
 
+        # LRU cache for read operations (MAX_CACHE_SIZE files max)
+        self._read_cache: OrderedDict = OrderedDict()
+
         # Prepare requests session and set up credential refresh helper (token may expire)
         self._credentials = _get_storage_credentials()
         self._access_token = None
@@ -180,17 +204,23 @@
         self._session.mount("https://", adapter)
 
     def new_input(self, location: str) -> InputFile:
-        return _GcsInputFile(location, self._session, self.get_access_token)
+        return _GcsInputFile(location, self._session, self.get_access_token, self._read_cache)
 
     def new_output(self, location: str) -> OutputFile:
         logger.info(f"new_output -> {location}")
 
+        # Invalidate cache entry if present
+        self._read_cache.pop(location, None)
+
         return _GcsOutputFile(location, self._session, self.get_access_token)
 
     def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
         if isinstance(location, (InputFile, OutputFile)):
             location = location.location
 
+        # Invalidate cache entry if present
+        self._read_cache.pop(location, None)
+
         path = location
         if path.startswith("gs://"):
             path = path[5:]