opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (41)
  1. opteryx_catalog/__init__.py +1 -1
  2. opteryx_catalog/catalog/__init__.py +2 -1
  3. opteryx_catalog/catalog/compaction.py +536 -0
  4. opteryx_catalog/catalog/dataset.py +840 -520
  5. opteryx_catalog/catalog/manifest.py +475 -0
  6. opteryx_catalog/catalog/metadata.py +5 -2
  7. opteryx_catalog/catalog/metastore.py +2 -2
  8. opteryx_catalog/exceptions.py +1 -1
  9. opteryx_catalog/iops/fileio.py +13 -0
  10. opteryx_catalog/iops/gcs.py +35 -5
  11. opteryx_catalog/maki_nage/__init__.py +8 -0
  12. opteryx_catalog/maki_nage/distogram.py +558 -0
  13. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  14. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  15. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  16. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  17. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  18. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  19. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  20. opteryx_catalog/opteryx_catalog.py +296 -242
  21. opteryx_catalog/webhooks/__init__.py +230 -0
  22. opteryx_catalog/webhooks/events.py +177 -0
  23. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  24. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  25. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  26. scripts/collect_byte_counts.py +42 -0
  27. scripts/create_dataset.py +1 -1
  28. scripts/emit_full_single_file.py +81 -0
  29. scripts/inspect_manifest_dryrun.py +322 -0
  30. scripts/inspect_single_file.py +147 -0
  31. scripts/inspect_single_file_gcs.py +124 -0
  32. scripts/read_dataset.py +1 -1
  33. tests/test_collections.py +37 -0
  34. tests/test_compaction.py +233 -0
  35. tests/test_dataset_metadata.py +14 -0
  36. tests/test_describe_uncompressed.py +127 -0
  37. tests/test_refresh_manifest.py +275 -0
  38. tests/test_webhooks.py +177 -0
  39. opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
  40. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  41. {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
@@ -1,6 +1,5 @@
  from __future__ import annotations

- import os
  import time
  from typing import Any
  from typing import Iterable
@@ -21,6 +20,9 @@ from .exceptions import DatasetNotFound
  from .exceptions import ViewAlreadyExists
  from .exceptions import ViewNotFound
  from .iops.base import FileIO
+ from .webhooks import send_webhook
+ from .webhooks.events import dataset_created_payload
+ from .webhooks.events import view_created_payload


  class OpteryxCatalog(Metastore):
@@ -28,7 +30,7 @@ class OpteryxCatalog(Metastore):

  Terminology: catalog -> workspace -> collection -> dataset|view

- Stores table documents under the configured workspace in Firestore.
+ Stores dataset documents under the configured workspace in Firestore.
  Snapshots are stored in a `snapshots` subcollection under each
  dataset's document. Parquet manifests are written to GCS under the
  dataset location's `metadata/manifest-<snapshot_id>.parquet` path.
@@ -57,12 +59,8 @@ class OpteryxCatalog(Metastore):
  props_ref = self._catalog_ref.document("$properties")
  if not props_ref.get().exists:
  now_ms = int(time.time() * 1000)
- billing = (
- os.environ.get("BILLING_ACCOUNT_ID")
- or os.environ.get("BILLING_ACCOUNT")
- or None
- )
- owner = os.environ.get("WORKSPACE_OWNER") or None
+ billing = None
+ owner = None
  props_ref.set(
  {
  "timestamp-ms": now_ms,
@@ -81,12 +79,9 @@ class OpteryxCatalog(Metastore):
  self.io = io
  else:
  if gcs_bucket:
- try:
- from .iops.gcs import GcsFileIO
+ from .iops.gcs import GcsFileIO

- self.io = GcsFileIO()
- except Exception:
- self.io = FileIO()
+ self.io = GcsFileIO()
  else:
  self.io = FileIO()

@@ -109,7 +104,7 @@ class OpteryxCatalog(Metastore):
  return self._dataset_doc_ref(collection, dataset_name).collection("snapshots")

  def _views_collection(self, collection: str):
- return self._namespace_ref(collection).collection("views")
+ return self._collection_ref(collection).collection("views")

  def _view_doc_ref(self, collection: str, view_name: str):
  return self._views_collection(collection).document(view_name)
@@ -125,7 +120,7 @@ class OpteryxCatalog(Metastore):
  if doc_ref.get().exists:
  raise DatasetAlreadyExists(f"Dataset already exists: {identifier}")

- # Build default table metadata
+ # Build default dataset metadata
  location = f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}"
  metadata = DatasetMetadata(
  dataset_identifier=identifier,
@@ -149,11 +144,10 @@ class OpteryxCatalog(Metastore):
  "timestamp-ms": now_ms,
  "author": author,
  "maintenance-policy": metadata.maintenance_policy,
+ "annotations": metadata.annotations,
  }
  )

- # Persisted in primary `datasets` collection only.
-
  # Persist initial schema into `schemas` subcollection if provided
  if schema is not None:
  schema_id = self._write_schema(collection, dataset_name, schema, author=author)
@@ -175,13 +169,41 @@ class OpteryxCatalog(Metastore):
  metadata.schemas = [
  {"schema_id": schema_id, "columns": self._schema_to_columns(schema)}
  ]
- # update table doc to reference current schema
+ # update dataset doc to reference current schema
  doc_ref.update({"current-schema-id": metadata.current_schema_id})

+ # Send webhook notification
+ send_webhook(
+ action="create",
+ workspace=self.workspace,
+ collection=collection,
+ resource_type="dataset",
+ resource_name=dataset_name,
+ payload=dataset_created_payload(
+ schema=schema,
+ location=location,
+ properties=properties,
+ ),
+ )
+
  # Return SimpleDataset (attach this catalog so append() can persist)
  return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)

- def load_dataset(self, identifier: str) -> SimpleDataset:
+ def load_dataset(self, identifier: str, load_history: bool = False) -> SimpleDataset:
+ """Load a dataset from Firestore.
+
+ Args:
+ identifier: Dataset identifier in format 'collection.dataset_name'
+ load_history: If True, load all snapshots from Firestore (expensive for
+ large histories). If False (default), only load the current snapshot,
+ which is sufficient for most write operations.
+
+ Returns:
+ SimpleDataset instance with metadata loaded from Firestore.
+
+ Raises:
+ DatasetNotFound: If the dataset does not exist in Firestore.
+ """
  collection, dataset_name = identifier.split(".")
  doc_ref = self._dataset_doc_ref(collection, dataset_name)
  doc = doc_ref.get()
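
A minimal usage sketch for the new load_history flag, assuming an already-constructed OpteryxCatalog instance and a hypothetical "demo.events" dataset:

    ds = catalog.load_dataset("demo.events")                          # current snapshot and schema only
    ds_full = catalog.load_dataset("demo.events", load_history=True)  # full snapshot/schema history
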
@@ -197,37 +219,69 @@ class OpteryxCatalog(Metastore):
  properties=data.get("properties") or {},
  )

- # Load table-level timestamp/author and collection/workspace
+ # Load dataset-level timestamp/author and collection/workspace
  metadata.timestamp_ms = data.get("timestamp-ms")
  metadata.author = data.get("author")
- # note: Firestore table doc stores the original collection and workspace
- # under keys `collection` and `workspace`.
+ metadata.description = data.get("description")
+ metadata.describer = data.get("describer")
+ metadata.annotations = data.get("annotations") or []

- # Load snapshots
+ # Load snapshots based on load_history flag
  snaps = []
- for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
- sd = snap_doc.to_dict() or {}
- snap = Snapshot(
- snapshot_id=sd.get("snapshot-id"),
- timestamp_ms=sd.get("timestamp-ms"),
- author=sd.get("author"),
- sequence_number=sd.get("sequence-number"),
- user_created=sd.get("user-created"),
- manifest_list=sd.get("manifest"),
- schema_id=sd.get("schema-id"),
- summary=sd.get("summary", {}),
- operation_type=sd.get("operation-type"),
- parent_snapshot_id=sd.get("parent-snapshot-id"),
- )
- snaps.append(snap)
+ if load_history:
+ # Load all snapshots from Firestore (expensive for large histories)
+ for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
+ sd = snap_doc.to_dict() or {}
+ snap = Snapshot(
+ snapshot_id=sd.get("snapshot-id"),
+ timestamp_ms=sd.get("timestamp-ms"),
+ author=sd.get("author"),
+ sequence_number=sd.get("sequence-number"),
+ user_created=sd.get("user-created"),
+ manifest_list=sd.get("manifest"),
+ schema_id=sd.get("schema-id"),
+ summary=sd.get("summary", {}),
+ operation_type=sd.get("operation-type"),
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
+ )
+ snaps.append(snap)
+ if snaps:
+ metadata.current_snapshot_id = snaps[-1].snapshot_id
+ else:
+ # Load only the current snapshot (efficient single read)
+ current_snap_id = data.get("current-snapshot-id")
+ if current_snap_id:
+ try:
+ snap_doc = (
+ self._snapshots_collection(collection, dataset_name)
+ .document(str(current_snap_id))
+ .get()
+ )
+ if snap_doc.exists:
+ sd = snap_doc.to_dict() or {}
+ snap = Snapshot(
+ snapshot_id=sd.get("snapshot-id"),
+ timestamp_ms=sd.get("timestamp-ms"),
+ author=sd.get("author"),
+ sequence_number=sd.get("sequence-number"),
+ user_created=sd.get("user-created"),
+ manifest_list=sd.get("manifest"),
+ schema_id=sd.get("schema-id"),
+ summary=sd.get("summary", {}),
+ operation_type=sd.get("operation-type"),
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
+ )
+ snaps.append(snap)
+ metadata.current_snapshot_id = current_snap_id
+ except Exception:
+ pass
  metadata.snapshots = snaps
- if snaps:
- metadata.current_snapshot_id = snaps[-1].snapshot_id

  # Load schemas subcollection
- try:
+ schemas_coll = doc_ref.collection("schemas")
+ # Load all schemas if requested; otherwise load only current schema
+ if load_history:
  schemas = []
- schemas_coll = doc_ref.collection("schemas")
  for sdoc in schemas_coll.stream():
  sd = sdoc.to_dict() or {}
  schemas.append(
@@ -241,9 +295,23 @@ class OpteryxCatalog(Metastore):
  )
  metadata.schemas = schemas
  metadata.current_schema_id = doc.to_dict().get("current-schema-id")
- except Exception:
- pass
-
+ else:
+ # Only load the current schema document for efficiency
+ current_schema_id = doc.to_dict().get("current-schema-id")
+ if current_schema_id:
+ sdoc = schemas_coll.document(str(current_schema_id)).get()
+ if sdoc.exists:
+ sd = sdoc.to_dict() or {}
+ metadata.schemas = [
+ {
+ "schema_id": sdoc.id,
+ "columns": sd.get("columns", []),
+ "timestamp-ms": sd.get("timestamp-ms"),
+ "author": sd.get("author"),
+ "sequence-number": sd.get("sequence-number"),
+ }
+ ]
+ metadata.current_schema_id = current_schema_id
  return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)

  def drop_dataset(self, identifier: str) -> None:
@@ -259,6 +327,13 @@ class OpteryxCatalog(Metastore):
  coll = self._datasets_collection(collection)
  return [doc.id for doc in coll.stream()]

+ def list_collections(self) -> Iterable[str]:
+ """List top-level collections (documents) in this workspace."""
+ try:
+ return [col.id for col in self._catalog_ref.list_documents() if col.id[0] != "$"]
+ except:
+ return []
+
  def create_collection(
  self,
  collection: str,
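
A minimal usage sketch for the new list_collections() helper, assuming an already-constructed OpteryxCatalog instance; internal documents whose ids start with "$" (such as "$properties") are excluded:

    for name in catalog.list_collections():
        print(name)  # one entry per collection document in the workspace
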
@@ -270,7 +345,7 @@ class OpteryxCatalog(Metastore):

  If `exists_ok` is False and the collection already exists, a KeyError is raised.
  """
- doc_ref = self._namespace_ref(collection)
+ doc_ref = self._collection_ref(collection)
  if doc_ref.get().exists:
  if exists_ok:
  return
@@ -285,6 +360,7 @@ class OpteryxCatalog(Metastore):
  "properties": properties or {},
  "timestamp-ms": now_ms,
  "author": author,
+ "annotations": [],
  }
  )

@@ -292,11 +368,7 @@ class OpteryxCatalog(Metastore):
  self, collection: str, properties: dict | None = None, author: Optional[str] = None
  ) -> None:
  """Convenience wrapper that creates the collection only if missing."""
- try:
- self.create_collection(collection, properties=properties, exists_ok=True, author=author)
- except Exception:
- # Be conservative: surface caller-level warnings rather than failing
- return
+ self.create_collection(collection, properties=properties, exists_ok=True, author=author)

  def dataset_exists(
  self, identifier_or_collection: str, dataset_name: Optional[str] = None
@@ -309,12 +381,14 @@ class OpteryxCatalog(Metastore):
  """
  # Normalize inputs
  if dataset_name is None:
- # Expect a single collection like 'collection.table'
+ # Expect a single collection like 'collection.dataset'
  if "." not in identifier_or_collection:
  raise ValueError(
- "collection must be 'collection.table' or pass dataset_name separately"
+ "collection must be 'collection.dataset' or pass dataset_name separately"
  )
  collection, dataset_name = identifier_or_collection.rsplit(".", 1)
+ else:
+ collection = identifier_or_collection

  try:
  doc_ref = self._dataset_doc_ref(collection, dataset_name)
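
The added else branch makes the two-argument form usable (previously collection was never assigned on that path). A minimal sketch of both calling styles, with a hypothetical "demo.events" dataset:

    catalog.dataset_exists("demo.events")      # single 'collection.dataset' identifier
    catalog.dataset_exists("demo", "events")   # collection and dataset_name passed separately
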
@@ -334,6 +408,7 @@ class OpteryxCatalog(Metastore):
  author: str = None,
  description: Optional[str] = None,
  properties: dict | None = None,
+ update_if_exists: bool = False,
  ) -> CatalogView:
  """Create a view document and a statement version in the `statement` subcollection.

@@ -347,7 +422,22 @@ class OpteryxCatalog(Metastore):

  doc_ref = self._view_doc_ref(collection, view_name)
  if doc_ref.get().exists:
- raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
+ if not update_if_exists:
+ raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
+ # Update existing view - get current sequence number
+ existing_doc = doc_ref.get().to_dict()
+ current_statement_id = existing_doc.get("statement-id")
+ if current_statement_id:
+ stmt_ref = doc_ref.collection("statement").document(current_statement_id)
+ stmt_doc = stmt_ref.get()
+ if stmt_doc.exists:
+ sequence_number = stmt_doc.to_dict().get("sequence-number", 0) + 1
+ else:
+ sequence_number = 1
+ else:
+ sequence_number = 1
+ else:
+ sequence_number = 1

  now_ms = int(time.time() * 1000)
  if author is None:
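
A minimal sketch of re-publishing a view with the new update_if_exists flag; the identifiers and SQL are placeholders, and the collection/view_name/sql keyword names are inferred from the diff body:

    # First call creates the view; its statement gets sequence-number 1.
    catalog.create_view(collection="demo", view_name="daily_totals", sql="SELECT 1", update_if_exists=True)
    # Re-running with the same name updates the view and bumps the statement's sequence-number.
    catalog.create_view(collection="demo", view_name="daily_totals", sql="SELECT 2", update_if_exists=True)
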
@@ -361,7 +451,7 @@ class OpteryxCatalog(Metastore):
  "sql": sql,
  "timestamp-ms": now_ms,
  "author": author,
- "sequence-number": 1,
+ "sequence-number": sequence_number,
  }
  )

@@ -383,12 +473,28 @@ class OpteryxCatalog(Metastore):
  }
  )

+ # Send webhook notification
+ send_webhook(
+ action="create" if not update_if_exists else "update",
+ workspace=self.workspace,
+ collection=collection,
+ resource_type="view",
+ resource_name=view_name,
+ payload=view_created_payload(
+ definition=sql,
+ properties=properties,
+ ),
+ )
+
  # Return a simple CatalogView wrapper
  v = CatalogView(name=view_name, definition=sql, properties=properties or {})
  # provide convenient attributes used by docs/examples
  setattr(v, "sql", sql)
  setattr(v, "metadata", type("M", (), {})())
  v.metadata.schema = schema
+ # Attach catalog and identifier for describe() method
+ setattr(v, "_catalog", self)
+ setattr(v, "_identifier", f"{collection}.{view_name}")
  return v

  def load_view(self, identifier: str | tuple) -> CatalogView:
@@ -410,27 +516,28 @@ class OpteryxCatalog(Metastore):
  stmt_id = data.get("statement-id")
  sql = None
  schema = data.get("schema")
- try:
- if stmt_id:
- sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
- if sdoc.exists:
- sql = (sdoc.to_dict() or {}).get("sql")
- # fallback: pick the most recent statement
- if not sql:
- for s in doc_ref.collection("statement").stream():
- sd = s.to_dict() or {}
- if sd.get("sql"):
- sql = sd.get("sql")
- break
- except Exception:
- pass
+
+ sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
+ sql = (sdoc.to_dict() or {}).get("sql")

  v = CatalogView(name=view_name, definition=sql or "", properties=data.get("properties", {}))
  setattr(v, "sql", sql or "")
  setattr(v, "metadata", type("M", (), {})())
  v.metadata.schema = schema
+ # Populate metadata fields from the stored view document so callers
+ # expecting attributes like `timestamp_ms` won't fail.
  v.metadata.author = data.get("author")
  v.metadata.description = data.get("description")
+ v.metadata.timestamp_ms = data.get("timestamp-ms")
+ # Execution/operational fields (may be None)
+ v.metadata.last_execution_ms = data.get("last-execution-ms")
+ v.metadata.last_execution_data_size = data.get("last-execution-data-size")
+ v.metadata.last_execution_records = data.get("last-execution-records")
+ # Optional describer (used to flag LLM-generated descriptions)
+ v.metadata.describer = data.get("describer")
+ # Attach catalog and identifier for describe() method
+ setattr(v, "_catalog", self)
+ setattr(v, "_identifier", f"{collection}.{view_name}")
  return v

  def drop_view(self, identifier: str | tuple) -> None:
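
A minimal sketch of the richer metadata now attached to a loaded view; the identifier is a placeholder and any of these fields may be None:

    view = catalog.load_view("demo.daily_totals")
    print(view.sql)
    print(view.metadata.timestamp_ms, view.metadata.author)
    print(view.metadata.last_execution_ms, view.metadata.last_execution_records)
    print(view.metadata.describer)  # set when the description was generated, e.g. by an LLM
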
@@ -441,11 +548,9 @@ class OpteryxCatalog(Metastore):

  doc_ref = self._view_doc_ref(collection, view_name)
  # delete statement subcollection
- try:
- for d in doc_ref.collection("statement").stream():
- doc_ref.collection("statement").document(d.id).delete()
- except Exception:
- pass
+ for d in doc_ref.collection("statement").stream():
+ doc_ref.collection("statement").document(d.id).delete()
+
  doc_ref.delete()

  def list_views(self, collection: str) -> Iterable[str]:
@@ -474,6 +579,8 @@ class OpteryxCatalog(Metastore):
  "identifier must be 'collection.view' or pass view_name separately"
  )
  collection, view_name = identifier_or_collection.rsplit(".", 1)
+ else:
+ collection = identifier_or_collection

  try:
  doc_ref = self._view_doc_ref(collection, view_name)
@@ -501,40 +608,82 @@ class OpteryxCatalog(Metastore):
  updates["last-execution-time-ms"] = int(execution_time * 1000)
  updates["last-execution-ms"] = now_ms
  if updates:
- try:
- doc_ref.update(updates)
- except Exception:
- pass
+ doc_ref.update(updates)
+
+ def update_view_description(
+ self,
+ identifier: str | tuple,
+ description: str,
+ describer: Optional[str] = None,
+ ) -> None:
+ """Update the description for a view.
+
+ Args:
+ identifier: View identifier ('collection.view' or tuple)
+ description: The new description text
+ describer: Optional identifier for who/what created the description
+ """
+ if isinstance(identifier, tuple) or isinstance(identifier, list):
+ collection, view_name = identifier[0], identifier[1]
+ else:
+ collection, view_name = identifier.split(".")
+
+ doc_ref = self._view_doc_ref(collection, view_name)
+ updates = {
+ "description": description,
+ }
+ if describer is not None:
+ updates["describer"] = describer
+ doc_ref.update(updates)
+
+ def update_dataset_description(
+ self,
+ identifier: str | tuple,
+ description: str,
+ describer: Optional[str] = None,
+ ) -> None:
+ """Update the description for a dataset.
+
+ Args:
+ identifier: Dataset identifier in format 'collection.dataset_name'
+ description: The new description text
+ describer: Optional identifier for who/what created the description
+ """
+
+ if isinstance(identifier, tuple) or isinstance(identifier, list):
+ collection, dataset_name = identifier[0], identifier[1]
+ else:
+ collection, dataset_name = identifier.split(".")
+
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
+ updates = {
+ "description": description,
+ }
+ if describer is not None:
+ updates["describer"] = describer
+ doc_ref.update(updates)

  def write_parquet_manifest(
- self, snapshot_id: int, entries: List[dict], table_location: str
+ self, snapshot_id: int, entries: List[dict], dataset_location: str
  ) -> Optional[str]:
  """Write a Parquet manifest for the given snapshot id and entries.

  Entries should be plain dicts convertible by pyarrow.Table.from_pylist.
- The manifest will be written to <table_location>/metadata/manifest-<snapshot_id>.parquet
+ The manifest will be written to <dataset_location>/metadata/manifest-<snapshot_id>.parquet
  """
  import pyarrow as pa
  import pyarrow.parquet as pq

+ from .iops.fileio import WRITE_PARQUET_OPTIONS
+
  # If entries is None we skip writing; if entries is empty list, write
- # an empty Parquet manifest (represents an empty table for this
+ # an empty Parquet manifest (represents an empty dataset for this
  # snapshot). This preserves previous manifests so older snapshots
  # remain readable.
  if entries is None:
  return None

- # Print manifest entries so users can inspect the manifest when created
- try:
- pass
-
- # print("[MANIFEST] Parquet manifest entries to write:")
- # print(json.dumps(entries, indent=2, default=str))
- except Exception:
- # print("[MANIFEST] Parquet manifest entries:", entries)
- pass
-
- parquet_path = f"{table_location}/metadata/manifest-{snapshot_id}.parquet"
+ parquet_path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"

  # Use provided FileIO if it supports writing; otherwise write to GCS
  try:
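
A minimal sketch of the new description-update helpers, using placeholder identifiers; both 'collection.name' strings and (collection, name) tuples are accepted:

    catalog.update_dataset_description("demo.events", "Raw click events, appended hourly.", describer="docs-bot")
    catalog.update_view_description(("demo", "daily_totals"), "Daily aggregate over demo.events.")
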
@@ -546,146 +695,63 @@ class OpteryxCatalog(Metastore):
  ("file_format", pa.string()),
  ("record_count", pa.int64()),
  ("file_size_in_bytes", pa.int64()),
+ ("uncompressed_size_in_bytes", pa.int64()),
+ ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+ ("null_counts", pa.list_(pa.int64())),
  ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
  ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
  ("histogram_bins", pa.int32()),
  ("min_values", pa.list_(pa.int64())),
  ("max_values", pa.list_(pa.int64())),
+ ("min_values_display", pa.list_(pa.string())),
+ ("max_values_display", pa.list_(pa.string())),
  ]
  )

- try:
- table = pa.Table.from_pylist(entries, schema=schema)
- except Exception:
- # Diagnostic output to help find malformed manifest entries
- try:
- print(
- "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
- )
- for i, ent in enumerate(entries):
- print(f" Entry {i}:")
- if isinstance(ent, dict):
- for k, v in ent.items():
- tname = type(v).__name__
- try:
- s = repr(v)
- except Exception:
- s = "<unreprable>"
- print(f" - {k}: type={tname} repr={s[:200]}")
- else:
- print(
- f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
- )
- except Exception:
- pass
+ # Normalize entries to match schema expectations:
+ normalized = []
+ for ent in entries:
+ if not isinstance(ent, dict):
+ normalized.append(ent)
+ continue
+ e = dict(ent)
+ # Ensure list fields exist
+ e.setdefault("min_k_hashes", [])
+ e.setdefault("histogram_counts", [])
+ e.setdefault("histogram_bins", 0)
+ e.setdefault("column_uncompressed_sizes_in_bytes", [])
+ e.setdefault("null_counts", [])
+ e.setdefault("min_values_display", [])
+ e.setdefault("max_values_display", [])
+
+ # min/max values are stored as compressed int64 values
+ # display values are string representations for human readability
+ mv = e.get("min_values") or []
+ xv = e.get("max_values") or []
+ mv_disp = e.get("min_values_display") or []
+ xv_disp = e.get("max_values_display") or []
+
+ def truncate_display(v, max_len=32):
+ """Truncate display value to max_len characters, adding '...' if longer."""
+ if v is None:
+ return None
+ s = str(v)
+ if len(s) > max_len:
+ return s[:max_len] + "..."
+ return s
+
+ # Ensure int64 values are properly typed for min/max
+ e["min_values"] = [int(v) if v is not None else None for v in mv]
+ e["max_values"] = [int(v) if v is not None else None for v in xv]
+ # Display values truncated to 32 chars with '...' suffix if longer
+ e["min_values_display"] = [truncate_display(v) for v in mv_disp]
+ e["max_values_display"] = [truncate_display(v) for v in xv_disp]
+ normalized.append(e)
+
+ table = pa.Table.from_pylist(normalized, schema=schema)

- # Attempt to sanitize entries and retry conversion.
- try:
- print("[MANIFEST DEBUG] Attempting to sanitize entries and retry")
- sanitized = []
- for ent in entries:
- if not isinstance(ent, dict):
- sanitized.append(ent)
- continue
- e2 = dict(ent) # copy
- # Ensure numeric fields
- for k in ("record_count", "file_size_in_bytes", "histogram_bins"):
- v = e2.get(k)
- try:
- e2[k] = int(v) if v is not None else 0
- except Exception:
- e2[k] = 0
- # Ensure min_k_hashes is list[list[int]]
- mk = e2.get("min_k_hashes")
- if not isinstance(mk, list):
- e2["min_k_hashes"] = []
- else:
- new_mk = []
- for sub in mk:
- if isinstance(sub, list):
- try:
- new_mk.append([int(x) for x in sub])
- except Exception:
- new_mk.append([])
- else:
- new_mk.append([])
- e2["min_k_hashes"] = new_mk
- # Ensure histogram_counts is list[list[int]]
- hc = e2.get("histogram_counts")
- if not isinstance(hc, list):
- e2["histogram_counts"] = []
- else:
- new_hc = []
- for sub in hc:
- if isinstance(sub, list):
- try:
- new_hc.append([int(x) for x in sub])
- except Exception:
- new_hc.append([])
- else:
- new_hc.append([])
- e2["histogram_counts"] = new_hc
- # Sanitize min_values / max_values: must be list[int] or None
- # Sanitize min_values / max_values: coerce to int64 using to_int() if available
- try:
- from opteryx.compiled.structures.relation_statistics import to_int
- except Exception:
-
- def to_int(val):
- # Best-effort fallback: handle numpy types, strings and numbers
- try:
- if val is None:
- return None
- if hasattr(val, "item"):
- val = val.item()
- if isinstance(val, (bytes, bytearray)):
- val = val.decode(errors="ignore")
- if isinstance(val, str):
- # empty strings are invalid
- if val == "":
- return None
- try:
- return int(val)
- except Exception:
- return None
- if isinstance(val, float):
- return int(val)
- return int(val)
- except Exception:
- return None
-
- for key in ("min_values", "max_values"):
- mv = e2.get(key)
- if not isinstance(mv, list):
- e2[key] = [None]
- else:
- new_mv = []
- for x in mv:
- try:
- if x is None:
- new_mv.append(None)
- continue
- # Use to_int to coerce into int64 semantics
- v = x
- if hasattr(v, "item"):
- v = v.item()
- coerced = to_int(v)
- # to_int may return None-like sentinel; accept ints only
- if coerced is None:
- new_mv.append(None)
- else:
- new_mv.append(int(coerced))
- except Exception:
- new_mv.append(None)
- e2[key] = new_mv
- sanitized.append(e2)
- table = pa.Table.from_pylist(sanitized, schema=schema)
- print("[MANIFEST DEBUG] Sanitized entries converted successfully")
- except Exception:
- print("[MANIFEST DEBUG] Sanitization failed; re-raising original exception")
- raise
  buf = pa.BufferOutputStream()
- pq.write_table(table, buf, compression="zstd")
+ pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
  data = buf.getvalue().to_pybytes()

  if self.io:
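
A standalone illustration of the display-value truncation applied above (copied out of the method for clarity; max_len defaults to 32):

    def truncate_display(v, max_len=32):
        if v is None:
            return None
        s = str(v)
        return s[:max_len] + "..." if len(s) > max_len else s

    truncate_display("short")   # -> "short"
    truncate_display("x" * 40)  # -> 32 "x" characters followed by "..."
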
@@ -696,15 +762,6 @@ class OpteryxCatalog(Metastore):
  out.close()
  except Exception:
  pass
- elif self._storage_client and self.gcs_bucket:
- # Write to GCS bucket
- bucket = self._storage_client.bucket(self.gcs_bucket)
- # object path: remove gs://bucket/ prefix
- parsed = parquet_path
- if parsed.startswith("gs://"):
- parsed = parsed[5 + len(self.gcs_bucket) + 1 :]
- blob = bucket.blob(parsed)
- blob.upload_from_string(data)

  return parquet_path
  except Exception as e:
@@ -713,7 +770,7 @@ class OpteryxCatalog(Metastore):
  raise e

  def save_snapshot(self, identifier: str, snapshot: Snapshot) -> None:
- """Persist a single snapshot document for a table."""
+ """Persist a single snapshot document for a dataset."""
  namespace, dataset_name = identifier.split(".")
  snaps = self._snapshots_collection(namespace, dataset_name)
  doc_id = str(snapshot.snapshot_id)
@@ -749,9 +806,9 @@ class OpteryxCatalog(Metastore):
  snaps.document(doc_id).set(data)

  def save_dataset_metadata(self, identifier: str, metadata: DatasetMetadata) -> None:
- """Persist table-level metadata and snapshots to Firestore.
+ """Persist dataset-level metadata and snapshots to Firestore.

- This writes the table document and upserts snapshot documents.
+ This writes the dataset document and upserts snapshot documents.
  """
  collection, dataset_name = identifier.split(".")
  doc_ref = self._dataset_doc_ref(collection, dataset_name)
@@ -763,6 +820,7 @@ class OpteryxCatalog(Metastore):
  "location": metadata.location,
  "properties": metadata.properties,
  "format-version": metadata.format_version,
+ "annotations": metadata.annotations,
  "current-snapshot-id": metadata.current_snapshot_id,
  "current-schema-id": metadata.current_schema_id,
  "timestamp-ms": metadata.timestamp_ms,
@@ -777,10 +835,9 @@ class OpteryxCatalog(Metastore):
  # Metadata persisted in primary `datasets` collection only.

  snaps_coll = self._snapshots_collection(collection, dataset_name)
- existing = {d.id for d in snaps_coll.stream()}
- new_ids = set()
+ # Upsert snapshot documents. Do NOT delete existing snapshot documents
+ # here to avoid accidental removal of historical snapshots on save.
  for snap in metadata.snapshots:
- new_ids.add(str(snap.snapshot_id))
  snaps_coll.document(str(snap.snapshot_id)).set(
  {
  "snapshot-id": snap.snapshot_id,
@@ -795,10 +852,6 @@ class OpteryxCatalog(Metastore):
  }
  )

- # Delete stale snapshots
- for stale in existing - new_ids:
- snaps_coll.document(stale).delete()
-
  # Persist schemas subcollection
  schemas_coll = doc_ref.collection("schemas")
  existing_schema_ids = {d.id for d in schemas_coll.stream()}
@@ -866,6 +919,7 @@ class OpteryxCatalog(Metastore):
  "scale": scale,
  "precision": precision,
  "expectation-policies": [],
+ "annotations": [],
  }

  cols.append(typed)
@@ -873,7 +927,7 @@ class OpteryxCatalog(Metastore):
  return cols

  def _write_schema(self, namespace: str, dataset_name: str, schema: Any, author: str) -> str:
- """Persist a schema document in the table's `schemas` subcollection and
+ """Persist a schema document in the dataset's `schemas` subcollection and
  return the new schema id.
  """
  import uuid