opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -2,8 +2,13 @@ from __future__ import annotations
 
 from dataclasses import dataclass
 from dataclasses import field
+from typing import Any
 from typing import Dict
 
+NULL_FLAG = -(1 << 63)
+MIN_K_HASHES = 32
+HISTOGRAM_BINS = 32
+
 
 @dataclass
 class DataFile:
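
Reviewer note: NULL_FLAG marks nulls in the compressed int64 representation, while MIN_K_HASHES and HISTOGRAM_BINS size the per-column sketches built by build_parquet_manifest_entry further down this diff. As a rough illustration of why the k smallest unique hashes are worth keeping, the sketch below shows a standard KMV distinct-count estimate. It is not part of the package, estimate_distinct is a hypothetical helper, and it assumes the hashes are uniform unsigned 64-bit values stored sorted ascending.

# Hypothetical illustration (not in opteryx-catalog): a standard KMV estimate of the
# number of distinct values in a column from its MIN_K_HASHES smallest unique hashes.
MIN_K_HASHES = 32

def estimate_distinct(min_k_hashes: list[int]) -> float:
    # Assumes uniform unsigned 64-bit hashes, sorted ascending as stored.
    if len(min_k_hashes) < MIN_K_HASHES:
        # Fewer than k unique hashes were seen, so the list length is the exact count.
        return float(len(min_k_hashes))
    kth_smallest = min_k_hashes[-1]
    # The k-th order statistic of k uniform draws estimates the sampling density.
    return (MIN_K_HASHES - 1) * (2**64) / (kth_smallest + 1)
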
@@ -21,3 +26,413 @@ class ManifestEntry:
     snapshot_id: int
     data_file: DataFile
     status: str = "added"  # 'added' | 'deleted'
+
+
+@dataclass
+class ParquetManifestEntry:
+    """Represents a single entry in a Parquet manifest with statistics."""
+
+    file_path: str
+    file_format: str
+    record_count: int
+    file_size_in_bytes: int
+    uncompressed_size_in_bytes: int
+    column_uncompressed_sizes_in_bytes: list[int]
+    null_counts: list[int]
+    min_k_hashes: list[list[int]]
+    histogram_counts: list[list[int]]
+    histogram_bins: int
+    min_values: list
+    max_values: list
+
+    def to_dict(self) -> dict:
+        return {
+            "file_path": self.file_path,
+            "file_format": self.file_format,
+            "record_count": self.record_count,
+            "file_size_in_bytes": self.file_size_in_bytes,
+            "uncompressed_size_in_bytes": self.uncompressed_size_in_bytes,
+            "column_uncompressed_sizes_in_bytes": self.column_uncompressed_sizes_in_bytes,
+            "null_counts": self.null_counts,
+            "min_k_hashes": self.min_k_hashes,
+            "histogram_counts": self.histogram_counts,
+            "histogram_bins": self.histogram_bins,
+            "min_values": self.min_values,
+            "max_values": self.max_values,
+        }
+
+
+def build_parquet_manifest_entry(
+    table: Any, file_path: str, file_size_in_bytes: int
+) -> ParquetManifestEntry:
+    """Build a Parquet manifest entry with statistics for a PyArrow table.
+
+    Args:
+        table: PyArrow table to analyze
+        file_path: Path where the file is stored
+        file_size_in_bytes: Size of the parquet file in bytes
+
+    Returns:
+        ParquetManifestEntry with computed statistics
+    """
+    import pyarrow as pa
+
+    min_k_hashes: list[list[int]] = []
+    histograms: list[list[int]] = []
+    min_values: list[int] = []
+    null_counts: list[int] = []
+    max_values: list[int] = []
+
+    # Use draken for efficient hashing and compression when available.
+    import heapq
+
+    # Try to compute additional per-column statistics when draken is available.
+    try:
+        import opteryx.draken as draken  # type: ignore
+
+        for col_idx, col in enumerate(table.columns):
+            # hash column values to 64-bit via draken (new cpdef API)
+            vec = draken.Vector.from_arrow(col)
+            hashes = list(vec.hash())
+
+            # Decide whether to compute min-k/histogram for this column based
+            # on field type and, for strings, average length of values.
+            field_type = table.schema.field(col_idx).type
+            compute_min_k = False
+            if (
+                pa.types.is_integer(field_type)
+                or pa.types.is_floating(field_type)
+                or pa.types.is_decimal(field_type)
+            ):
+                compute_min_k = True
+            elif (
+                pa.types.is_timestamp(field_type)
+                or pa.types.is_date(field_type)
+                or pa.types.is_time(field_type)
+            ):
+                compute_min_k = True
+            elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
+                # compute average length from non-null values; only allow
+                # min-k/histogram for short strings (avg <= 16)
+                col_py = None
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+
+                if col_py is not None:
+                    lens = [len(x) for x in col_py if x is not None]
+                    if lens:
+                        avg_len = sum(lens) / len(lens)
+                        if avg_len <= 16:
+                            compute_min_k = True
+
+            # KMV: take K smallest unique hashes when allowed; otherwise
+            # store an empty list for this column. Deduplicate hashes so
+            # the KMV sketch contains unique hashes (avoids duplicates
+            # skewing cardinality estimates).
+            if compute_min_k:
+                unique_hashes = set(hashes)
+                smallest = heapq.nsmallest(MIN_K_HASHES, unique_hashes)
+                col_min_k = sorted(smallest)
+            else:
+                col_min_k = []
+
+            # For histogram decisions follow the same rule as min-k
+            compute_hist = compute_min_k
+
+            # Use draken.compress() to get canonical int64 per value
+            mapped = list(vec.compress())
+            # Compute null count from compressed representation
+            null_count = sum(1 for m in mapped if m == NULL_FLAG)
+            null_counts.append(int(null_count))
+            non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
+            if non_nulls_mapped:
+                vmin = min(non_nulls_mapped)
+                vmax = max(non_nulls_mapped)
+                col_min = int(vmin)
+                col_max = int(vmax)
+                if compute_hist:
+                    if vmin == vmax:
+                        col_hist = [0] * HISTOGRAM_BINS
+                        col_hist[-1] = len(non_nulls_mapped)
+                    else:
+                        col_hist = [0] * HISTOGRAM_BINS
+                        span = float(vmax - vmin)
+                        for m in non_nulls_mapped:
+                            b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
+                            if b < 0:
+                                b = 0
+                            if b >= HISTOGRAM_BINS:
+                                b = HISTOGRAM_BINS - 1
+                            col_hist[b] += 1
+                else:
+                    col_hist = [0] * HISTOGRAM_BINS
+            else:
+                # no non-null values; histogram via hash buckets
+                col_min = NULL_FLAG
+                col_max = NULL_FLAG
+                if compute_hist:
+                    col_hist = [0] * HISTOGRAM_BINS
+                    for h in hashes:
+                        b = (h >> (64 - 5)) & 0x1F
+                        col_hist[b] += 1
+                else:
+                    col_hist = [0] * HISTOGRAM_BINS
+
+            min_k_hashes.append(col_min_k)
+            histograms.append(col_hist)
+            min_values.append(col_min)
+            max_values.append(col_max)
+        # end for
+    except Exception:
+        # Draken not available or failed; leave min_k_hashes/histograms empty
+        min_k_hashes = [[] for _ in table.columns]
+        histograms = [[] for _ in table.columns]
+        # Attempt to compute per-column min/max from the table directly
+        try:
+            for col in table.columns:
+                try:
+                    col_py = col.to_pylist()
+                    non_nulls = [v for v in col_py if v is not None]
+                    null_count = len(col_py) - len(non_nulls)
+                    null_counts.append(int(null_count))
+                    if non_nulls:
+                        try:
+                            min_values.append(min(non_nulls))
+                            max_values.append(max(non_nulls))
+                        except Exception:
+                            min_values.append(None)
+                            max_values.append(None)
+                    else:
+                        min_values.append(None)
+                        max_values.append(None)
+                except Exception:
+                    min_values.append(None)
+                    max_values.append(None)
+                    # If we couldn't introspect column values, assume 0 nulls
+                    null_counts.append(0)
+        except Exception:
+            # If even direct inspection fails, ensure lists lengths match
+            min_values = [None] * len(table.columns)
+            max_values = [None] * len(table.columns)
+            null_counts = [0] * len(table.columns)
+
+    # Calculate uncompressed size from table buffers — must be accurate.
+    column_uncompressed: list[int] = []
+    uncompressed_size = 0
+    for col in table.columns:
+        col_total = 0
+        for chunk in col.chunks:
+            try:
+                buffs = chunk.buffers()
+            except Exception as exc:
+                raise RuntimeError(
+                    f"Unable to access chunk buffers to calculate uncompressed size for {file_path}: {exc}"
+                ) from exc
+            for buffer in buffs:
+                if buffer is not None:
+                    col_total += buffer.size
+        column_uncompressed.append(int(col_total))
+        uncompressed_size += col_total
+
+    return ParquetManifestEntry(
+        file_path=file_path,
+        file_format="parquet",
+        record_count=int(table.num_rows),
+        file_size_in_bytes=file_size_in_bytes,
+        uncompressed_size_in_bytes=uncompressed_size,
+        column_uncompressed_sizes_in_bytes=column_uncompressed,
+        null_counts=null_counts,
+        min_k_hashes=min_k_hashes,
+        histogram_counts=histograms,
+        histogram_bins=HISTOGRAM_BINS,
+        min_values=min_values,
+        max_values=max_values,
+    )
+
+
+def build_parquet_manifest_minmax_entry(data: bytes, file_path: str) -> ParquetManifestEntry:
+    """Build a Parquet manifest entry with min/max statistics using the fast rugo reader.
+
+    This is much faster than build_parquet_manifest_entry (microseconds per file)
+    and is suitable for bulk file operations where full statistics are not needed.
+
+    Args:
+        data: Raw parquet file bytes
+        file_path: Path where the file is stored
+
+    Returns:
+        ParquetManifestEntry with min/max statistics only (no histograms or k-hashes)
+    """
+    file_size = len(data)
+
+    # Prefer rugo fast metadata reader when available, otherwise fall back
+    # to pyarrow ParquetFile to extract row-group statistics.
+    try:
+        import opteryx.rugo.parquet as parquet_meta
+        from opteryx.compiled.structures.relation_statistics import to_int
+
+        if isinstance(data, memoryview):
+            metadata = parquet_meta.read_metadata_from_memoryview(data, include_statistics=True)
+        else:
+            metadata = parquet_meta.read_metadata_from_memoryview(
+                memoryview(data), include_statistics=True
+            )
+
+        record_count = metadata["num_rows"]
+    except ImportError:
+        # Fallback: use pyarrow to read Parquet metadata
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        pf = pq.ParquetFile(pa.BufferReader(data))
+        record_count = int(pf.metadata.num_rows or 0)
+
+        # Construct minimal metadata structure compatible with expected shape
+        metadata = {"num_rows": record_count, "row_groups": []}
+        for rg in range(pf.num_row_groups):
+            rg_entry = {"columns": []}
+            for ci in range(pf.metadata.num_columns):
+                col_meta = pf.metadata.row_group(rg).column(ci)
+                col_entry = {"name": pf.schema.names[ci]}
+                stats = getattr(col_meta, "statistics", None)
+                if stats:
+                    col_entry["min"] = getattr(stats, "min", None)
+                    col_entry["max"] = getattr(stats, "max", None)
+                rg_entry["columns"].append(col_entry)
+            # total_byte_size may not be available; leave out to trigger full-table calculation later
+            metadata["row_groups"].append(rg_entry)
+
+        # Define a simple to_int fallback for the pyarrow path
+        def to_int(v: object) -> int:
+            try:
+                return int(v)
+            except Exception:
+                try:
+                    if isinstance(v, (bytes, bytearray)):
+                        s = v.decode("utf-8", errors="ignore")
+                        return int(float(s)) if s else 0
+                    return int(float(v))
+                except Exception:
+                    return 0
+
+    # Gather min/max per column across all row groups
+    column_stats = {}
+    for row_group in metadata["row_groups"]:
+        for column in row_group["columns"]:
+            column_name = column["name"]
+
+            if column_name not in column_stats:
+                column_stats[column_name] = {"min": None, "max": None}
+
+            min_value = column.get("min")
+            if min_value is not None:
+                # Compress value to int using to_int
+                min_compressed = to_int(min_value)
+                if column_stats[column_name]["min"] is None:
+                    column_stats[column_name]["min"] = min_compressed
+                else:
+                    column_stats[column_name]["min"] = min(
+                        column_stats[column_name]["min"], min_compressed
+                    )
+
+            max_value = column.get("max")
+            if max_value is not None:
+                # Compress value to int using to_int
+                max_compressed = to_int(max_value)
+                if column_stats[column_name]["max"] is None:
+                    column_stats[column_name]["max"] = max_compressed
+                else:
+                    column_stats[column_name]["max"] = max(
+                        column_stats[column_name]["max"], max_compressed
+                    )
+
+    # Extract min/max values (filter out None)
+    min_values = [stats["min"] for stats in column_stats.values() if stats["min"] is not None]
+    max_values = [stats["max"] for stats in column_stats.values() if stats["max"] is not None]
+
+    # Attempt to gather null counts from metadata row groups if available
+    column_nulls: dict = {}
+    for row_group in metadata["row_groups"]:
+        for column in row_group["columns"]:
+            cname = column["name"]
+            if cname not in column_nulls:
+                column_nulls[cname] = 0
+            nc = column.get("null_count")
+            if nc is not None:
+                try:
+                    column_nulls[cname] += int(nc)
+                except Exception:
+                    pass
+
+    if column_nulls:
+        null_counts = [column_nulls.get(n, 0) for n in column_stats.keys()]
+    else:
+        null_counts = []
+
+    # Get uncompressed size from metadata; if missing, read full table and
+    # compute accurate uncompressed size from buffers. Also attempt to
+    # compute per-column uncompressed byte counts when reading the table.
+    uncompressed_size = 0
+    column_uncompressed: list[int] = []
+    missing = False
+    for row_group in metadata["row_groups"]:
+        v = row_group.get("total_byte_size", None)
+        if v is None:
+            missing = True
+            break
+        uncompressed_size += v
+
+    if missing or uncompressed_size == 0:
+        try:
+            import pyarrow as pa
+            import pyarrow.parquet as pq
+
+            table = pq.read_table(pa.BufferReader(data))
+            uncompressed_size = 0
+            # Compute per-column uncompressed sizes and null counts from the table
+            for col in table.columns:
+                col_total = 0
+                null_total = 0
+                for chunk in col.chunks:
+                    for buffer in chunk.buffers():
+                        if buffer is not None:
+                            col_total += buffer.size
+                    try:
+                        null_total += int(chunk.null_count)
+                    except Exception:
+                        # Fallback to slow python inspection
+                        try:
+                            col_py = col.to_pylist()
+                            null_total = len(col_py) - len([v for v in col_py if v is not None])
+                        except Exception:
+                            null_total = 0
+
+                column_uncompressed.append(int(col_total))
+                uncompressed_size += col_total
+                null_counts = null_counts or []
+                null_counts.append(int(null_total))
+        except Exception as exc:
+            raise RuntimeError(
+                f"Unable to determine uncompressed size for {file_path}: {exc}"
+            ) from exc
+    else:
+        # If we didn't read the table and null_counts is still empty, default to zeros
+        if not null_counts:
+            null_counts = [0] * len(column_stats)
+
+    return ParquetManifestEntry(
+        file_path=file_path,
+        file_format="parquet",
+        record_count=int(record_count),
+        file_size_in_bytes=file_size,
+        uncompressed_size_in_bytes=uncompressed_size,
+        column_uncompressed_sizes_in_bytes=column_uncompressed,
+        null_counts=null_counts,
+        min_k_hashes=[],
+        histogram_counts=[],
+        histogram_bins=0,
+        min_values=min_values,
+        max_values=max_values,
+    )
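
Reviewer note: the two builders added above trade accuracy for speed. build_parquet_manifest_entry scans an in-memory PyArrow table for full statistics, while build_parquet_manifest_minmax_entry only inspects Parquet metadata. A minimal usage sketch follows, assuming an in-memory table and a hypothetical gs:// path; the write-to-buffer step is plain pyarrow, not catalog code, and this is not how the catalog necessarily invokes these functions.

# Hypothetical usage sketch; the path is illustrative and not taken from the package.
import io

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", None]})
buffer = io.BytesIO()
pq.write_table(table, buffer)
data = buffer.getvalue()

# Full statistics (null counts, min-k hashes, histograms, min/max) from the table itself.
full_entry = build_parquet_manifest_entry(table, "gs://bucket/part-0000.parquet", len(data))

# Cheap min/max-only statistics read from the Parquet footer bytes.
fast_entry = build_parquet_manifest_minmax_entry(data, "gs://bucket/part-0000.parquet")

assert full_entry.record_count == fast_entry.record_count == 3
print(full_entry.to_dict()["histogram_bins"], fast_entry.histogram_bins)  # 32 vs 0
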
@@ -46,12 +46,12 @@ class DatasetMetadata:
     location: str = ""
     schema: Any = None
     properties: dict = field(default_factory=dict)
-    # Table-level created/updated metadata
+    # Dataset-level created/updated metadata
     timestamp_ms: Optional[int] = None
     author: Optional[str] = None
    description: Optional[str] = None
     describer: Optional[str] = None
-    sort_orders: List[dict] = field(default_factory=list)
+    sort_orders: List[int] = field(default_factory=list)
     # Maintenance policy: retention settings grouped under a single block
     maintenance_policy: dict = field(
         default_factory=lambda: {
@@ -15,12 +15,12 @@ class Metastore:
     implementations to ease future compatibility.
     """
 
-    def load_dataset(self, identifier: str) -> "Table":
+    def load_dataset(self, identifier: str) -> "Dataset":
         raise NotImplementedError()
 
     def create_dataset(
         self, identifier: str, schema: Any, properties: dict | None = None
-    ) -> "Table":
+    ) -> "Dataset":
         raise NotImplementedError()
 
     def drop_dataset(self, identifier: str) -> None:
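
Reviewer note: the abstract signatures above now advertise "Dataset" rather than "Table". A minimal sketch of what a concrete implementation might look like, assuming only the three methods visible in this hunk; InMemoryMetastore is hypothetical and a plain dict stands in for a real Dataset object.

# Hypothetical sketch (not in the package): an in-memory Metastore covering the
# three methods shown in this hunk. A plain dict stands in for a Dataset object.
from typing import Any


class InMemoryMetastore(Metastore):
    def __init__(self) -> None:
        self._datasets: dict[str, Any] = {}

    def load_dataset(self, identifier: str) -> Any:
        if identifier not in self._datasets:
            raise KeyError(identifier)  # real catalogs raise their own dataset-not-found error
        return self._datasets[identifier]

    def create_dataset(self, identifier: str, schema: Any, properties: dict | None = None) -> Any:
        dataset = {"schema": schema, "properties": properties or {}}
        self._datasets[identifier] = dataset
        return dataset

    def drop_dataset(self, identifier: str) -> None:
        self._datasets.pop(identifier, None)
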
@@ -1,7 +1,7 @@
 """Catalog-specific exceptions for opteryx_catalog.
 
 Exceptions mirror previous behavior (they subclass KeyError where callers
-may expect KeyError) but provide explicit types for tables, views and
+may expect KeyError) but provide explicit types for datasets, views and
 namespaces.
 """
 
@@ -1,14 +1,12 @@
 """
 Optimized GCS FileIO for opteryx_catalog.iops
-
-Adapted from pyiceberg_firestore_gcs.fileio.gcs_fileio to provide a fast
-HTTP-backed GCS implementation without depending on pyiceberg types.
 """
 import io
 import logging
 import os
 import urllib.parse
 
+from collections import OrderedDict
 from typing import Callable
 from typing import Union
 
@@ -20,6 +18,9 @@ from .base import FileIO
 from .base import InputFile
 from .base import OutputFile
 
+# we keep a local cache of recently read files
+MAX_CACHE_SIZE: int = 32
+
 logger = logging.getLogger(__name__)
 
 
@@ -116,12 +117,32 @@ class _GcsOutputStream(io.BytesIO):
 
 class _GcsInputFile(InputFile):
     def __init__(
-        self, location: str, session: requests.Session, access_token_getter: Callable[[], str]
+        self,
+        location: str,
+        session: requests.Session,
+        access_token_getter: Callable[[], str],
+        cache: OrderedDict = None,
     ):
+        # Check cache first
+        if cache is not None and location in cache:
+            # Move to end (most recently used)
+            cache.move_to_end(location)
+            data = cache[location]
+            super().__init__(location, data)
+            return
+
         # read entire bytes via optimized session
         try:
             stream = _GcsInputStream(location, session, access_token_getter)
             data = stream.read()
+
+            # Add to cache
+            if cache is not None:
+                cache[location] = data
+                # Evict oldest if cache exceeds MAX_CACHE_SIZE entries
+                if len(cache) > MAX_CACHE_SIZE:
+                    cache.popitem(last=False)
+
             super().__init__(location, data)
         except FileNotFoundError:
             super().__init__(location, None)
@@ -152,6 +173,9 @@ class GcsFileIO(FileIO):
         self.manifest_paths: list[str] = []
         self.captured_manifests: list[tuple[str, bytes]] = []
 
+        # LRU cache for read operations (MAX_CACHE_SIZE files max)
+        self._read_cache: OrderedDict = OrderedDict()
+
         # Prepare requests session and set up credential refresh helper (token may expire)
         self._credentials = _get_storage_credentials()
         self._access_token = None
@@ -180,17 +204,23 @@ class GcsFileIO(FileIO):
         self._session.mount("https://", adapter)
 
     def new_input(self, location: str) -> InputFile:
-        return _GcsInputFile(location, self._session, self.get_access_token)
+        return _GcsInputFile(location, self._session, self.get_access_token, self._read_cache)
 
     def new_output(self, location: str) -> OutputFile:
         logger.info(f"new_output -> {location}")
 
+        # Invalidate cache entry if present
+        self._read_cache.pop(location, None)
+
         return _GcsOutputFile(location, self._session, self.get_access_token)
 
     def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
         if isinstance(location, (InputFile, OutputFile)):
             location = location.location
 
+        # Invalidate cache entry if present
+        self._read_cache.pop(location, None)
+
         path = location
         if path.startswith("gs://"):
             path = path[5:]
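
Reviewer note: the read cache threaded through GcsFileIO and _GcsInputFile above is a small OrderedDict-based LRU (move_to_end on a hit, popitem(last=False) on overflow, pop on write and delete). The same pattern in isolation, as a minimal sketch; LRUBytesCache is a hypothetical name and not a class in the package, which keeps the equivalent logic inline.

# Minimal sketch of the OrderedDict-based LRU pattern used by the new read cache.
from collections import OrderedDict


class LRUBytesCache:
    def __init__(self, max_size: int = 32):
        self._max_size = max_size
        self._items: OrderedDict[str, bytes] = OrderedDict()

    def get(self, key: str) -> bytes | None:
        if key in self._items:
            self._items.move_to_end(key)  # mark as most recently used
            return self._items[key]
        return None

    def put(self, key: str, value: bytes) -> None:
        self._items[key] = value
        self._items.move_to_end(key)
        if len(self._items) > self._max_size:
            self._items.popitem(last=False)  # evict the least recently used entry

    def invalidate(self, key: str) -> None:
        self._items.pop(key, None)  # mirror the pop() done on new_output/delete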