opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their public registries.
Files changed (41)
  1. opteryx_catalog/__init__.py +1 -1
  2. opteryx_catalog/catalog/__init__.py +2 -1
  3. opteryx_catalog/catalog/compaction.py +536 -0
  4. opteryx_catalog/catalog/dataset.py +840 -520
  5. opteryx_catalog/catalog/manifest.py +475 -0
  6. opteryx_catalog/catalog/metadata.py +5 -2
  7. opteryx_catalog/catalog/metastore.py +2 -2
  8. opteryx_catalog/exceptions.py +1 -1
  9. opteryx_catalog/iops/fileio.py +13 -0
  10. opteryx_catalog/iops/gcs.py +35 -5
  11. opteryx_catalog/maki_nage/__init__.py +8 -0
  12. opteryx_catalog/maki_nage/distogram.py +558 -0
  13. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  14. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  15. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  16. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  17. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  18. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  19. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  20. opteryx_catalog/opteryx_catalog.py +296 -242
  21. opteryx_catalog/webhooks/__init__.py +230 -0
  22. opteryx_catalog/webhooks/events.py +177 -0
  23. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  24. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  25. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  26. scripts/collect_byte_counts.py +42 -0
  27. scripts/create_dataset.py +1 -1
  28. scripts/emit_full_single_file.py +81 -0
  29. scripts/inspect_manifest_dryrun.py +322 -0
  30. scripts/inspect_single_file.py +147 -0
  31. scripts/inspect_single_file_gcs.py +124 -0
  32. scripts/read_dataset.py +1 -1
  33. tests/test_collections.py +37 -0
  34. tests/test_compaction.py +233 -0
  35. tests/test_dataset_metadata.py +14 -0
  36. tests/test_describe_uncompressed.py +127 -0
  37. tests/test_refresh_manifest.py +275 -0
  38. tests/test_webhooks.py +177 -0
  39. opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
  40. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  41. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
@@ -1,9 +1,17 @@
  from __future__ import annotations

+ import logging
+ import time
+ from collections import Counter
  from dataclasses import dataclass
  from dataclasses import field
+ from typing import Any
  from typing import Dict

+ NULL_FLAG = -(1 << 63)
+ MIN_K_HASHES = 32
+ HISTOGRAM_BINS = 32
+

  @dataclass
  class DataFile:
@@ -21,3 +29,470 @@ class ManifestEntry:
      snapshot_id: int
      data_file: DataFile
      status: str = "added" # 'added' | 'deleted'
+
+
+ @dataclass
+ class ParquetManifestEntry:
+     """Represents a single entry in a Parquet manifest with statistics."""
+
+     file_path: str
+     file_format: str
+     record_count: int
+     file_size_in_bytes: int
+     uncompressed_size_in_bytes: int
+     column_uncompressed_sizes_in_bytes: list[int]
+     null_counts: list[int]
+     min_k_hashes: list[list[int]]
+     histogram_counts: list[list[int]]
+     histogram_bins: int
+     min_values: list
+     max_values: list
+     min_values_display: list
+     max_values_display: list
+
+     def to_dict(self) -> dict:
+         return {
+             "file_path": self.file_path,
+             "file_format": self.file_format,
+             "record_count": self.record_count,
+             "file_size_in_bytes": self.file_size_in_bytes,
+             "uncompressed_size_in_bytes": self.uncompressed_size_in_bytes,
+             "column_uncompressed_sizes_in_bytes": self.column_uncompressed_sizes_in_bytes,
+             "null_counts": self.null_counts,
+             "min_k_hashes": self.min_k_hashes,
+             "histogram_counts": self.histogram_counts,
+             "histogram_bins": self.histogram_bins,
+             "min_values": self.min_values,
+             "max_values": self.max_values,
+             "min_values_display": self.min_values_display,
+             "max_values_display": self.max_values_display,
+         }
+
+
+ logger = logging.getLogger(__name__)
+ _manifest_metrics = Counter()
+
+
+ def _compute_stats_for_arrow_column(col, field_type, file_path: str):
+     """Compute statistics for a single PyArrow column (Array or ChunkedArray).
+
+     Returns a tuple: (col_min_k, col_hist, col_min, col_max, min_display, max_display, null_count)
+     """
+     import heapq
+
+     import opteryx.draken as draken # type: ignore
+     import pyarrow as pa
+
+     # Ensure single contiguous array when possible
+     if hasattr(col, "combine_chunks"):
+         try:
+             col = col.combine_chunks()
+         except Exception:
+             # leave as-is
+             pass
+
+     # Record compress/hash usage
+     _manifest_metrics["hash_calls"] += 1
+     _manifest_metrics["compress_calls"] += 1
+
+     col_py = None
+     try:
+         vec = draken.Vector.from_arrow(col)
+     except Exception: # pragma: no cover - be robust
+         raise
+
+     hashes = set(vec.hash())
+
+     # Decide whether to compute min-k/histogram for this column
+     compute_min_k = False
+     if (
+         pa.types.is_integer(field_type)
+         or pa.types.is_floating(field_type)
+         or pa.types.is_decimal(field_type)
+     ):
+         compute_min_k = True
+     elif (
+         pa.types.is_timestamp(field_type)
+         or pa.types.is_date(field_type)
+         or pa.types.is_time(field_type)
+     ):
+         compute_min_k = True
+     elif (
+         pa.types.is_string(field_type)
+         or pa.types.is_large_string(field_type)
+         or pa.types.is_binary(field_type)
+         or pa.types.is_large_binary(field_type)
+     ):
+         # For strings/binary we may need pylist for display
+         try:
+             col_py = col.to_pylist()
+         except Exception:
+             col_py = None
+         compute_min_k = True
+
+     if compute_min_k:
+         smallest = heapq.nsmallest(MIN_K_HASHES, hashes)
+         col_min_k = sorted(smallest)
+     else:
+         col_min_k = []
+
+     import pyarrow as pa # local import for types
+
+     compute_hist = compute_min_k
+     if pa.types.is_boolean(field_type):
+         compute_hist = True
+
+     # Use draken.compress() to get canonical int64 per value
+     compressed = list(vec.compress())
+     null_count = sum(1 for m in compressed if m == NULL_FLAG)
+
+     non_nulls_compressed = [m for m in compressed if m != NULL_FLAG]
+     if non_nulls_compressed:
+         vmin = min(non_nulls_compressed)
+         vmax = max(non_nulls_compressed)
+         col_min = int(vmin)
+         col_max = int(vmax)
+         if compute_hist:
+             # Special-case boolean histograms
+             if pa.types.is_boolean(field_type):
+                 try:
+                     if col_py is None:
+                         try:
+                             col_py = col.to_pylist()
+                         except Exception:
+                             col_py = None
+                     if col_py is not None:
+                         non_nulls_bool = [v for v in col_py if v is not None]
+                         false_count = sum(1 for v in non_nulls_bool if v is False)
+                         true_count = sum(1 for v in non_nulls_bool if v is True)
+                     else:
+                         # Fallback: infer from compressed mapping (assume 0/1)
+                         false_count = sum(1 for m in non_nulls_compressed if m == 0)
+                         true_count = sum(1 for m in non_nulls_compressed if m != 0)
+                 except Exception:
+                     false_count = 0
+                     true_count = 0
+
+                 col_hist = [int(true_count), int(false_count)]
+             else:
+                 if vmin == vmax:
+                     col_hist = []
+                 else:
+                     col_hist = [0] * HISTOGRAM_BINS
+                     span = float(vmax - vmin)
+                     for m in non_nulls_compressed:
+                         b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
+                         if b < 0:
+                             b = 0
+                         if b >= HISTOGRAM_BINS:
+                             b = HISTOGRAM_BINS - 1
+                         col_hist[b] += 1
+         else:
+             col_hist = []
+     else:
+         # no non-null values
+         col_min = NULL_FLAG
+         col_max = NULL_FLAG
+         col_hist = []
+
+     # display values
+     try:
+         if pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
+             if col_py is None:
+                 try:
+                     col_py = col.to_pylist()
+                 except Exception:
+                     col_py = None
+             if col_py is not None:
+                 non_nulls_str = [x for x in col_py if x is not None]
+                 if non_nulls_str:
+                     min_value = min(non_nulls_str)
+                     max_value = max(non_nulls_str)
+                     if len(min_value) > 16:
+                         min_value = min_value[:16] + "..."
+                     if len(max_value) > 16:
+                         max_value = max_value[:16] + "..."
+                     min_display = min_value
+                     max_display = max_value
+                 else:
+                     min_display = None
+                     max_display = None
+             else:
+                 min_display = None
+                 max_display = None
+         elif pa.types.is_binary(field_type) or pa.types.is_large_binary(field_type):
+             if col_py is None:
+                 try:
+                     col_py = col.to_pylist()
+                 except Exception:
+                     col_py = None
+             if col_py is not None:
+                 non_nulls = [x for x in col_py if x is not None]
+                 if non_nulls:
+                     min_value = min(non_nulls)
+                     max_value = max(non_nulls)
+                     if len(min_value) > 16:
+                         min_value = min_value[:16] + "..."
+                     if len(max_value) > 16:
+                         max_value = max_value[:16] + "..."
+                     if any(ord(b) < 32 or ord(b) > 126 for b in min_value):
+                         min_value = min_value.hex()
+                         min_value = min_value[:16] + "..."
+                     if any(ord(b) < 32 or ord(b) > 126 for b in max_value):
+                         max_value = max_value.hex()
+                         max_value = max_value[:16] + "..."
+                     min_display = min_value
+                     max_display = max_value
+                 else:
+                     min_display = None
+                     max_display = None
+             else:
+                 min_display = None
+                 max_display = None
+         else:
+             if col_py is None:
+                 try:
+                     col_py = col.to_pylist()
+                 except Exception:
+                     col_py = None
+             if col_py is not None:
+                 non_nulls = [x for x in col_py if x is not None]
+                 if non_nulls:
+                     min_display = min(non_nulls)
+                     max_display = max(non_nulls)
+                 else:
+                     min_display = None
+                     max_display = None
+             else:
+                 min_display = None
+                 max_display = None
+     except Exception:
+         min_display = None
+         max_display = None
+
+     return (
+         col_min_k,
+         col_hist,
+         int(col_min),
+         int(col_max),
+         min_display,
+         max_display,
+         int(null_count),
+     )
+
+
+ def build_parquet_manifest_entry_from_bytes(
+     data_bytes: bytes,
+     file_path: str,
+     file_size_in_bytes: int | None = None,
+     orig_table: Any | None = None,
+ ) -> ParquetManifestEntry:
+     """Build a manifest entry by reading a parquet file as bytes and scanning column-by-column.
+
+     This reads the compressed file once and materializes one full column at a time
+     (combine_chunks) which keeps peak memory low while letting per-column
+     stat calculation (draken) operate on contiguous arrays.
+     """
+     import pyarrow as pa
+     import pyarrow.parquet as pq
+
+     t_start = time.perf_counter()
+     _manifest_metrics["files_read"] += 1
+     _manifest_metrics["bytes_read"] += len(data_bytes)
+
+     buf = pa.BufferReader(data_bytes)
+     pf = pq.ParquetFile(buf)
+     meta = pf.metadata
+
+     # Try to read rugo metadata early so we can compute sizes without
+     # materializing the table later. This is zero-copy and fast.
+     try:
+         from opteryx.rugo.parquet import read_metadata_from_memoryview
+
+         rmeta = read_metadata_from_memoryview(memoryview(data_bytes))
+     except Exception:
+         rmeta = None
+
+     # Prepare result containers
+     min_k_hashes: list[list[int]] = []
+     histograms: list[list[int]] = []
+     min_values: list[int] = []
+     null_counts: list[int] = []
+     max_values: list[int] = []
+     min_values_display: list = []
+     max_values_display: list = []
+
+     # iterate schema fields and process each column independently
+     schema = pf.schema_arrow
+     for col_idx, field in enumerate(schema):
+         col_name = field.name
+         try:
+             col_table = pf.read(columns=[col_name])
+             col = col_table.column(0).combine_chunks()
+         except Exception:
+             # fallback: try reading the row group column (more granular)
+             try:
+                 tbl = pf.read_row_group(0, columns=[col_name])
+                 col = tbl.column(0).combine_chunks()
+             except Exception:
+                 # Last resort: read entire file and then take the column
+                 tbl = pf.read()
+                 col = tbl.column(col_idx).combine_chunks()
+
+         # compute stats using existing logic encapsulated in helper
+         (
+             col_min_k,
+             col_hist,
+             col_min,
+             col_max,
+             col_min_display,
+             col_max_display,
+             null_count,
+         ) = _compute_stats_for_arrow_column(col, field.type, file_path)
+
+         # free the table-level reference if present so memory can be reclaimed
+         try:
+             del col_table
+         except Exception:
+             pass
+         try:
+             del tbl
+         except Exception:
+             pass
+
+         min_k_hashes.append(col_min_k)
+         histograms.append(col_hist)
+         min_values.append(col_min)
+         max_values.append(col_max)
+         min_values_display.append(col_min_display)
+         max_values_display.append(col_max_display)
+         null_counts.append(null_count)
+
+     # Calculate uncompressed sizes. When the original in-memory table is
+     # available (we just wrote it), prefer using it so sizes match the
+     # table-based builder exactly. Otherwise materialize the table from
+     # bytes and compute sizes the same way.
+     import pyarrow as pa
+     import pyarrow.parquet as pq
+
+     column_uncompressed: list[int] = []
+     uncompressed_size = 0
+
+     # Free references to large objects we no longer need so memory can be reclaimed
+     try:
+         del buf
+     except Exception:
+         pass
+     try:
+         del pf
+     except Exception:
+         pass
+     try:
+         del data_bytes
+     except Exception:
+         pass
+
+     if orig_table is not None:
+         # Use the original table buffers so results match the table-based route
+         for col in orig_table.columns:
+             col_total = 0
+             for chunk in col.chunks:
+                 try:
+                     buffs = chunk.buffers()
+                 except Exception as exc:
+                     raise RuntimeError(
+                         f"Unable to access chunk buffers to calculate uncompressed size for {file_path}: {exc}"
+                     ) from exc
+                 for buffer in buffs:
+                     if buffer is not None:
+                         col_total += buffer.size
+             column_uncompressed.append(int(col_total))
+             uncompressed_size += col_total
+     else:
+         # Use rugo metadata (if available) to compute per-column uncompressed sizes
+         if rmeta:
+             rgs = rmeta.get("row_groups", [])
+             if rgs:
+                 ncols = len(rgs[0].get("columns", []))
+                 for cidx in range(ncols):
+                     col_total = 0
+                     for rg in rgs:
+                         cols = rg.get("columns", [])
+                         if cidx < len(cols):
+                             col_total += int(cols[cidx].get("total_byte_size", 0) or 0)
+                     column_uncompressed.append(int(col_total))
+                     uncompressed_size += col_total
+                 _manifest_metrics["sizes_from_rugo"] += 1
+             else:
+                 column_uncompressed = [0] * len(schema)
+                 uncompressed_size = 0
+                 _manifest_metrics["sizes_from_rugo_missing"] += 1
+         else:
+             # If rugo metadata isn't available, avoid materializing the table;
+             # emit zero sizes (safe and memory-light) and track that we lacked
+             # metadata for sizes.
+             column_uncompressed = [0] * len(schema)
+             uncompressed_size = 0
+             _manifest_metrics["sizes_from_rugo_unavailable"] += 1
+             logger.debug(
+                 "rugo metadata unavailable for %s; emitting zero column sizes to avoid materializing table",
+                 file_path,
+             )
+
+     entry = ParquetManifestEntry(
+         file_path=file_path,
+         file_format="parquet",
+         record_count=int(meta.num_rows),
+         file_size_in_bytes=int(file_size_in_bytes or len(data_bytes)),
+         uncompressed_size_in_bytes=uncompressed_size,
+         column_uncompressed_sizes_in_bytes=column_uncompressed,
+         null_counts=null_counts,
+         min_k_hashes=min_k_hashes,
+         histogram_counts=histograms,
+         histogram_bins=HISTOGRAM_BINS,
+         min_values=min_values,
+         max_values=max_values,
+         min_values_display=min_values_display,
+         max_values_display=max_values_display,
+     )
+
+     logger.debug(
+         "build_parquet_manifest_entry_from_bytes %s files=%d dur=%.3fs",
+         file_path,
+         _manifest_metrics["files_read"],
+         time.perf_counter() - t_start,
+     )
+     return entry
+
+
+ # Backwards-compatible wrapper that keeps the original calling convention
+ # when a pyarrow Table is already provided (tests and some scripts rely on it).
+ def build_parquet_manifest_entry(
+     table: Any, file_path: str, file_size_in_bytes: int | None = None
+ ) -> ParquetManifestEntry:
+     """DEPRECATED: explicit table-based manifest building is removed.
+
+     The implementation previously accepted a PyArrow ``table`` and performed
+     the same per-column statistics calculation. That behavior hid a different
+     IO/scan path and led to inconsistent performance characteristics.
+
+     Use ``build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=None)``
+     instead. If you have an in-memory table you can serialize it and call the
+     bytes-based builder, or pass ``orig_table`` to preserve exact uncompressed
+     size calculations.
+
+     This function now fails fast to avoid silently using the removed path.
+     """
+     raise RuntimeError(
+         "table-based manifest builder removed: use build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=table) instead"
+     )
+
+
+ def get_manifest_metrics() -> dict:
+     """Return a snapshot of manifest instrumentation counters (for tests/benchmarks)."""
+     return dict(_manifest_metrics)
+
+
+ def reset_manifest_metrics() -> None:
+     """Reset the manifest metrics counters to zero."""
+     _manifest_metrics.clear()
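
Usage sketch (not part of the diff): the deprecation docstring above directs callers to serialize an in-memory table and use the bytes-based builder. Assuming these additions land in opteryx_catalog.catalog.manifest (which the file list shows gaining exactly 475 lines), a call might look like the following; the table contents and GCS path are made up.

# Sketch: build a manifest entry from serialized Parquet bytes, passing orig_table so
# the uncompressed-size calculation matches the old table-based route (per the docstring).
import pyarrow as pa
import pyarrow.parquet as pq

from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})  # hypothetical data

sink = pa.BufferOutputStream()
pq.write_table(table, sink)
data_bytes = sink.getvalue().to_pybytes()

entry = build_parquet_manifest_entry_from_bytes(
    data_bytes,
    file_path="gs://bucket/dataset/data/file.parquet",  # illustrative path
    file_size_in_bytes=len(data_bytes),
    orig_table=table,
)
print(entry.to_dict()["record_count"])  # 3
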
@@ -46,12 +46,12 @@ class DatasetMetadata:
      location: str = ""
      schema: Any = None
      properties: dict = field(default_factory=dict)
-     # Table-level created/updated metadata
+     # Dataset-level created/updated metadata
      timestamp_ms: Optional[int] = None
      author: Optional[str] = None
      description: Optional[str] = None
      describer: Optional[str] = None
-     sort_orders: List[dict] = field(default_factory=list)
+     sort_orders: List[int] = field(default_factory=list)
      # Maintenance policy: retention settings grouped under a single block
      maintenance_policy: dict = field(
          default_factory=lambda: {
@@ -68,6 +68,9 @@ class DatasetMetadata:
      # Each schema dict may also include `timestamp-ms` and `author`.
      schemas: List[dict] = field(default_factory=list)
      current_schema_id: Optional[str] = None
+     # Annotations: list of annotation objects attached to this dataset
+     # Each annotation is a dict with keys like 'key' and 'value'.
+     annotations: List[dict] = field(default_factory=list)

      def current_snapshot(self) -> Optional[Snapshot]:
          if self.current_snapshot_id is None:
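
Illustrative only: the new `annotations` field stores plain dicts in the shape the comment describes. In a sketch where `metadata` is a hypothetical DatasetMetadata instance, attaching one annotation would be:

# `metadata` and the annotation values are hypothetical; the {'key': ..., 'value': ...}
# shape follows the comment added in the diff.
metadata.annotations.append({"key": "owner", "value": "data-platform"})
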
@@ -15,12 +15,12 @@ class Metastore:
      implementations to ease future compatibility.
      """

-     def load_dataset(self, identifier: str) -> "Table":
+     def load_dataset(self, identifier: str) -> "Dataset":
          raise NotImplementedError()

      def create_dataset(
          self, identifier: str, schema: Any, properties: dict | None = None
-     ) -> "Table":
+     ) -> "Dataset":
          raise NotImplementedError()

      def drop_dataset(self, identifier: str) -> None:
@@ -1,7 +1,7 @@
  """Catalog-specific exceptions for opteryx_catalog.

  Exceptions mirror previous behavior (they subclass KeyError where callers
- may expect KeyError) but provide explicit types for tables, views and
+ may expect KeyError) but provide explicit types for datasets, views and
  namespaces.
  """

@@ -123,3 +123,16 @@ class GcsFileIO(FileIO):
              return True
          except Exception:
              return False
+
+
+ # Centralized Parquet write options used across the codebase when writing
+ # parquet files. Exported here so all writers share the same configuration.
+ WRITE_PARQUET_OPTIONS = {
+     "compression": "ZSTD",
+     "compression_level": 3,
+     "use_dictionary": True,
+     "dictionary_pagesize_limit": 1024 * 1024,
+     "data_page_size": 1024 * 1024,
+     "version": "2.6",
+     "write_statistics": True,
+ }
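
The keys in WRITE_PARQUET_OPTIONS all correspond to pyarrow.parquet.write_table keyword arguments, so a writer can splat the dict directly. A minimal sketch, assuming the constant is importable from opteryx_catalog.iops.fileio as the file list suggests:

# Sketch: shared write options splatted into pyarrow's Parquet writer.
import pyarrow as pa
import pyarrow.parquet as pq

from opteryx_catalog.iops.fileio import WRITE_PARQUET_OPTIONS  # assumed import path

table = pa.table({"id": [1, 2, 3]})  # hypothetical data
sink = pa.BufferOutputStream()
pq.write_table(table, sink, **WRITE_PARQUET_OPTIONS)  # ZSTD level 3, 1 MiB pages, format 2.6
parquet_bytes = sink.getvalue().to_pybytes()
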
@@ -1,14 +1,12 @@
  """
  Optimized GCS FileIO for opteryx_catalog.iops
-
- Adapted from pyiceberg_firestore_gcs.fileio.gcs_fileio to provide a fast
- HTTP-backed GCS implementation without depending on pyiceberg types.
  """

  import io
  import logging
  import os
  import urllib.parse
+ from collections import OrderedDict
  from typing import Callable
  from typing import Union

@@ -20,6 +18,9 @@ from .base import FileIO
  from .base import InputFile
  from .base import OutputFile

+ # we keep a local cache of recently read files
+ MAX_CACHE_SIZE: int = 32
+
  logger = logging.getLogger(__name__)


@@ -116,12 +117,32 @@ class _GcsOutputStream(io.BytesIO):

  class _GcsInputFile(InputFile):
      def __init__(
-         self, location: str, session: requests.Session, access_token_getter: Callable[[], str]
+         self,
+         location: str,
+         session: requests.Session,
+         access_token_getter: Callable[[], str],
+         cache: OrderedDict = None,
      ):
+         # Check cache first
+         if cache is not None and location in cache:
+             # Move to end (most recently used)
+             cache.move_to_end(location)
+             data = cache[location]
+             super().__init__(location, data)
+             return
+
          # read entire bytes via optimized session
          try:
              stream = _GcsInputStream(location, session, access_token_getter)
              data = stream.read()
+
+             # Add to cache
+             if cache is not None:
+                 cache[location] = data
+                 # Evict oldest if cache exceeds MAX_CACHE_SIZE entries
+                 if len(cache) > MAX_CACHE_SIZE:
+                     cache.popitem(last=False)
+
              super().__init__(location, data)
          except FileNotFoundError:
              super().__init__(location, None)
@@ -152,6 +173,9 @@ class GcsFileIO(FileIO):
          self.manifest_paths: list[str] = []
          self.captured_manifests: list[tuple[str, bytes]] = []

+         # LRU cache for read operations (MAX_CACHE_SIZE files max)
+         self._read_cache: OrderedDict = OrderedDict()
+
          # Prepare requests session and set up credential refresh helper (token may expire)
          self._credentials = _get_storage_credentials()
          self._access_token = None
@@ -180,17 +204,23 @@ class GcsFileIO(FileIO):
          self._session.mount("https://", adapter)

      def new_input(self, location: str) -> InputFile:
-         return _GcsInputFile(location, self._session, self.get_access_token)
+         return _GcsInputFile(location, self._session, self.get_access_token, self._read_cache)

      def new_output(self, location: str) -> OutputFile:
          logger.info(f"new_output -> {location}")

+         # Invalidate cache entry if present
+         self._read_cache.pop(location, None)
+
          return _GcsOutputFile(location, self._session, self.get_access_token)

      def delete(self, location: Union[str, InputFile, OutputFile]) -> None:
          if isinstance(location, (InputFile, OutputFile)):
              location = location.location

+         # Invalidate cache entry if present
+         self._read_cache.pop(location, None)
+
          path = location
          if path.startswith("gs://"):
              path = path[5:]
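
The read cache added to GcsFileIO is a small OrderedDict-based LRU keyed by object location: hits are moved to the end, writes and deletes invalidate the entry, and inserts evict the oldest item once the cache exceeds MAX_CACHE_SIZE. A standalone sketch of the same pattern (function names here are illustrative, not part of the package):

# Standalone LRU sketch mirroring the behaviour added to GcsFileIO._read_cache.
from collections import OrderedDict

MAX_CACHE_SIZE = 32  # same cap as the diff

cache: OrderedDict[str, bytes] = OrderedDict()

def cache_get(location: str) -> bytes | None:
    if location in cache:
        cache.move_to_end(location)      # mark as most recently used
        return cache[location]
    return None

def cache_put(location: str, data: bytes) -> None:
    cache[location] = data
    if len(cache) > MAX_CACHE_SIZE:
        cache.popitem(last=False)        # evict the least recently used entry

def cache_invalidate(location: str) -> None:
    cache.pop(location, None)            # mirrors new_output() / delete()
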
@@ -0,0 +1,8 @@
+ # Lightweight package shim so `opteryx.third_party.maki_nage` is importable
+ from .distogram import Distogram
+ from .distogram import histogram
+ from .distogram import load
+ from .distogram import merge
+ from .distogram import quantile
+
+ __all__ = ["Distogram", "load", "merge", "histogram", "quantile"]
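
A hedged usage sketch: this shim re-exports the vendored distogram sketch, and the bundled tests (test_update.py, test_count.py, test_quantile.py) suggest the submodule keeps the upstream maki-nage distogram API, where `update(h, value)` folds a sample into the sketch and returns it. Assuming that API:

# Assumed API: distogram.update / distogram.count / distogram.quantile exist in the
# vendored submodule even though the shim's __all__ re-exports only a subset.
from opteryx_catalog.maki_nage import distogram

h = distogram.Distogram()            # streaming histogram sketch
for value in [1.0, 2.5, 2.5, 7.0, 9.0]:
    h = distogram.update(h, value)   # fold each sample into the sketch

print(distogram.count(h))            # number of samples seen
print(distogram.quantile(h, 0.5))    # approximate median
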