opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff compares publicly available package versions as published to their respective registries. It is provided for informational purposes only and reflects the changes between the two released versions as they appear in those registries.
@@ -1,6 +1,5 @@
  from __future__ import annotations

- import os
  import time
  from typing import Any
  from typing import Iterable
@@ -28,7 +27,7 @@ class OpteryxCatalog(Metastore):

  Terminology: catalog -> workspace -> collection -> dataset|view

- Stores table documents under the configured workspace in Firestore.
+ Stores dataset documents under the configured workspace in Firestore.
  Snapshots are stored in a `snapshots` subcollection under each
  dataset's document. Parquet manifests are written to GCS under the
  dataset location's `metadata/manifest-<snapshot_id>.parquet` path.
@@ -57,12 +56,8 @@ class OpteryxCatalog(Metastore):
  props_ref = self._catalog_ref.document("$properties")
  if not props_ref.get().exists:
  now_ms = int(time.time() * 1000)
- billing = (
- os.environ.get("BILLING_ACCOUNT_ID")
- or os.environ.get("BILLING_ACCOUNT")
- or None
- )
- owner = os.environ.get("WORKSPACE_OWNER") or None
+ billing = None
+ owner = None
  props_ref.set(
  {
  "timestamp-ms": now_ms,
@@ -81,12 +76,9 @@ class OpteryxCatalog(Metastore):
  self.io = io
  else:
  if gcs_bucket:
- try:
- from .iops.gcs import GcsFileIO
+ from .iops.gcs import GcsFileIO

- self.io = GcsFileIO()
- except Exception:
- self.io = FileIO()
+ self.io = GcsFileIO()
  else:
  self.io = FileIO()

@@ -109,7 +101,7 @@ class OpteryxCatalog(Metastore):
  return self._dataset_doc_ref(collection, dataset_name).collection("snapshots")

  def _views_collection(self, collection: str):
- return self._namespace_ref(collection).collection("views")
+ return self._collection_ref(collection).collection("views")

  def _view_doc_ref(self, collection: str, view_name: str):
  return self._views_collection(collection).document(view_name)
@@ -125,7 +117,7 @@ class OpteryxCatalog(Metastore):
  if doc_ref.get().exists:
  raise DatasetAlreadyExists(f"Dataset already exists: {identifier}")

- # Build default table metadata
+ # Build default dataset metadata
  location = f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}"
  metadata = DatasetMetadata(
  dataset_identifier=identifier,
@@ -152,8 +144,6 @@ class OpteryxCatalog(Metastore):
  }
  )

- # Persisted in primary `datasets` collection only.
-
  # Persist initial schema into `schemas` subcollection if provided
  if schema is not None:
  schema_id = self._write_schema(collection, dataset_name, schema, author=author)
@@ -175,13 +165,27 @@ class OpteryxCatalog(Metastore):
  metadata.schemas = [
  {"schema_id": schema_id, "columns": self._schema_to_columns(schema)}
  ]
- # update table doc to reference current schema
+ # update dataset doc to reference current schema
  doc_ref.update({"current-schema-id": metadata.current_schema_id})

  # Return SimpleDataset (attach this catalog so append() can persist)
  return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)

- def load_dataset(self, identifier: str) -> SimpleDataset:
+ def load_dataset(self, identifier: str, load_history: bool = False) -> SimpleDataset:
+ """Load a dataset from Firestore.
+
+ Args:
+ identifier: Dataset identifier in format 'collection.dataset_name'
+ load_history: If True, load all snapshots from Firestore (expensive for
+ large histories). If False (default), only load the current snapshot,
+ which is sufficient for most write operations.
+
+ Returns:
+ SimpleDataset instance with metadata loaded from Firestore.
+
+ Raises:
+ DatasetNotFound: If the dataset does not exist in Firestore.
+ """
  collection, dataset_name = identifier.split(".")
  doc_ref = self._dataset_doc_ref(collection, dataset_name)
  doc = doc_ref.get()
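
The new `load_history` flag on load_dataset controls how much snapshot and schema history is read. A minimal usage sketch, assuming an OpteryxCatalog instance named `catalog` already exists (its construction is not shown in this diff) and an illustrative identifier 'sales.orders':

    # Default: one Firestore read for the current snapshot and current schema,
    # which is enough for most write paths.
    dataset = catalog.load_dataset("sales.orders")

    # Opt in to the full history: every snapshot and schema document is streamed,
    # which can be expensive for datasets with long histories.
    dataset_full = catalog.load_dataset("sales.orders", load_history=True)
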
@@ -197,37 +201,68 @@ class OpteryxCatalog(Metastore):
  properties=data.get("properties") or {},
  )

- # Load table-level timestamp/author and collection/workspace
+ # Load dataset-level timestamp/author and collection/workspace
  metadata.timestamp_ms = data.get("timestamp-ms")
  metadata.author = data.get("author")
- # note: Firestore table doc stores the original collection and workspace
+ # note: Firestore dataset doc stores the original collection and workspace
  # under keys `collection` and `workspace`.

- # Load snapshots
+ # Load snapshots based on load_history flag
  snaps = []
- for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
- sd = snap_doc.to_dict() or {}
- snap = Snapshot(
- snapshot_id=sd.get("snapshot-id"),
- timestamp_ms=sd.get("timestamp-ms"),
- author=sd.get("author"),
- sequence_number=sd.get("sequence-number"),
- user_created=sd.get("user-created"),
- manifest_list=sd.get("manifest"),
- schema_id=sd.get("schema-id"),
- summary=sd.get("summary", {}),
- operation_type=sd.get("operation-type"),
- parent_snapshot_id=sd.get("parent-snapshot-id"),
- )
- snaps.append(snap)
+ if load_history:
+ # Load all snapshots from Firestore (expensive for large histories)
+ for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
+ sd = snap_doc.to_dict() or {}
+ snap = Snapshot(
+ snapshot_id=sd.get("snapshot-id"),
+ timestamp_ms=sd.get("timestamp-ms"),
+ author=sd.get("author"),
+ sequence_number=sd.get("sequence-number"),
+ user_created=sd.get("user-created"),
+ manifest_list=sd.get("manifest"),
+ schema_id=sd.get("schema-id"),
+ summary=sd.get("summary", {}),
+ operation_type=sd.get("operation-type"),
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
+ )
+ snaps.append(snap)
+ if snaps:
+ metadata.current_snapshot_id = snaps[-1].snapshot_id
+ else:
+ # Load only the current snapshot (efficient single read)
+ current_snap_id = data.get("current-snapshot-id")
+ if current_snap_id:
+ try:
+ snap_doc = (
+ self._snapshots_collection(collection, dataset_name)
+ .document(str(current_snap_id))
+ .get()
+ )
+ if snap_doc.exists:
+ sd = snap_doc.to_dict() or {}
+ snap = Snapshot(
+ snapshot_id=sd.get("snapshot-id"),
+ timestamp_ms=sd.get("timestamp-ms"),
+ author=sd.get("author"),
+ sequence_number=sd.get("sequence-number"),
+ user_created=sd.get("user-created"),
+ manifest_list=sd.get("manifest"),
+ schema_id=sd.get("schema-id"),
+ summary=sd.get("summary", {}),
+ operation_type=sd.get("operation-type"),
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
+ )
+ snaps.append(snap)
+ metadata.current_snapshot_id = current_snap_id
+ except Exception:
+ pass
  metadata.snapshots = snaps
- if snaps:
- metadata.current_snapshot_id = snaps[-1].snapshot_id

  # Load schemas subcollection
- try:
+ schemas_coll = doc_ref.collection("schemas")
+ # Load all schemas if requested; otherwise load only current schema
+ if load_history:
  schemas = []
- schemas_coll = doc_ref.collection("schemas")
  for sdoc in schemas_coll.stream():
  sd = sdoc.to_dict() or {}
  schemas.append(
@@ -241,9 +276,23 @@ class OpteryxCatalog(Metastore):
  )
  metadata.schemas = schemas
  metadata.current_schema_id = doc.to_dict().get("current-schema-id")
- except Exception:
- pass
-
+ else:
+ # Only load the current schema document for efficiency
+ current_schema_id = doc.to_dict().get("current-schema-id")
+ if current_schema_id:
+ sdoc = schemas_coll.document(str(current_schema_id)).get()
+ if sdoc.exists:
+ sd = sdoc.to_dict() or {}
+ metadata.schemas = [
+ {
+ "schema_id": sdoc.id,
+ "columns": sd.get("columns", []),
+ "timestamp-ms": sd.get("timestamp-ms"),
+ "author": sd.get("author"),
+ "sequence-number": sd.get("sequence-number"),
+ }
+ ]
+ metadata.current_schema_id = current_schema_id
  return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)

  def drop_dataset(self, identifier: str) -> None:
@@ -270,7 +319,7 @@ class OpteryxCatalog(Metastore):

  If `exists_ok` is False and the collection already exists, a KeyError is raised.
  """
- doc_ref = self._namespace_ref(collection)
+ doc_ref = self._collection_ref(collection)
  if doc_ref.get().exists:
  if exists_ok:
  return
@@ -292,11 +341,7 @@ class OpteryxCatalog(Metastore):
  self, collection: str, properties: dict | None = None, author: Optional[str] = None
  ) -> None:
  """Convenience wrapper that creates the collection only if missing."""
- try:
- self.create_collection(collection, properties=properties, exists_ok=True, author=author)
- except Exception:
- # Be conservative: surface caller-level warnings rather than failing
- return
+ self.create_collection(collection, properties=properties, exists_ok=True, author=author)

  def dataset_exists(
  self, identifier_or_collection: str, dataset_name: Optional[str] = None
@@ -309,12 +354,14 @@ class OpteryxCatalog(Metastore):
  """
  # Normalize inputs
  if dataset_name is None:
- # Expect a single collection like 'collection.table'
+ # Expect a single collection like 'collection.dataset'
  if "." not in identifier_or_collection:
  raise ValueError(
- "collection must be 'collection.table' or pass dataset_name separately"
+ "collection must be 'collection.dataset' or pass dataset_name separately"
  )
  collection, dataset_name = identifier_or_collection.rsplit(".", 1)
+ else:
+ collection = identifier_or_collection

  try:
  doc_ref = self._dataset_doc_ref(collection, dataset_name)
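
The added `else: collection = identifier_or_collection` branch fixes the two-argument form of dataset_exists, which previously left `collection` unbound. Both accepted call shapes, with illustrative names and assuming the same `catalog` instance as above:

    catalog.dataset_exists("sales.orders")      # single 'collection.dataset' identifier
    catalog.dataset_exists("sales", "orders")   # collection and dataset_name passed separately
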
@@ -334,6 +381,7 @@ class OpteryxCatalog(Metastore):
  author: str = None,
  description: Optional[str] = None,
  properties: dict | None = None,
+ update_if_exists: bool = False,
  ) -> CatalogView:
  """Create a view document and a statement version in the `statement` subcollection.

@@ -347,7 +395,22 @@ class OpteryxCatalog(Metastore):

  doc_ref = self._view_doc_ref(collection, view_name)
  if doc_ref.get().exists:
- raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
+ if not update_if_exists:
+ raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
+ # Update existing view - get current sequence number
+ existing_doc = doc_ref.get().to_dict()
+ current_statement_id = existing_doc.get("statement-id")
+ if current_statement_id:
+ stmt_ref = doc_ref.collection("statement").document(current_statement_id)
+ stmt_doc = stmt_ref.get()
+ if stmt_doc.exists:
+ sequence_number = stmt_doc.to_dict().get("sequence-number", 0) + 1
+ else:
+ sequence_number = 1
+ else:
+ sequence_number = 1
+ else:
+ sequence_number = 1

  now_ms = int(time.time() * 1000)
  if author is None:
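
With `update_if_exists=True`, create_view replaces an existing view instead of raising ViewAlreadyExists, and the new statement's `sequence-number` is one higher than the current statement's (or 1 when no prior statement is found). A hedged sketch of a call; only `update_if_exists`, `author`, `description` and `properties` are visible in this hunk, so the remaining parameter names and the identifier form are assumptions:

    catalog.create_view(
        "reporting.daily_totals",                                      # assumed identifier form
        sql="SELECT day, SUM(total) FROM sales.orders GROUP BY day",   # assumed keyword
        description="Daily totals derived from sales.orders",
        update_if_exists=True,  # bumps sequence-number instead of raising ViewAlreadyExists
    )
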
@@ -361,7 +424,7 @@ class OpteryxCatalog(Metastore):
  "sql": sql,
  "timestamp-ms": now_ms,
  "author": author,
- "sequence-number": 1,
+ "sequence-number": sequence_number,
  }
  )

@@ -389,6 +452,9 @@ class OpteryxCatalog(Metastore):
  setattr(v, "sql", sql)
  setattr(v, "metadata", type("M", (), {})())
  v.metadata.schema = schema
+ # Attach catalog and identifier for describe() method
+ setattr(v, "_catalog", self)
+ setattr(v, "_identifier", f"{collection}.{view_name}")
  return v

  def load_view(self, identifier: str | tuple) -> CatalogView:
@@ -410,27 +476,28 @@ class OpteryxCatalog(Metastore):
  stmt_id = data.get("statement-id")
  sql = None
  schema = data.get("schema")
- try:
- if stmt_id:
- sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
- if sdoc.exists:
- sql = (sdoc.to_dict() or {}).get("sql")
- # fallback: pick the most recent statement
- if not sql:
- for s in doc_ref.collection("statement").stream():
- sd = s.to_dict() or {}
- if sd.get("sql"):
- sql = sd.get("sql")
- break
- except Exception:
- pass
+
+ sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
+ sql = (sdoc.to_dict() or {}).get("sql")

  v = CatalogView(name=view_name, definition=sql or "", properties=data.get("properties", {}))
  setattr(v, "sql", sql or "")
  setattr(v, "metadata", type("M", (), {})())
  v.metadata.schema = schema
+ # Populate metadata fields from the stored view document so callers
+ # expecting attributes like `timestamp_ms` won't fail.
  v.metadata.author = data.get("author")
  v.metadata.description = data.get("description")
+ v.metadata.timestamp_ms = data.get("timestamp-ms")
+ # Execution/operational fields (may be None)
+ v.metadata.last_execution_ms = data.get("last-execution-ms")
+ v.metadata.last_execution_data_size = data.get("last-execution-data-size")
+ v.metadata.last_execution_records = data.get("last-execution-records")
+ # Optional describer (used to flag LLM-generated descriptions)
+ v.metadata.describer = data.get("describer")
+ # Attach catalog and identifier for describe() method
+ setattr(v, "_catalog", self)
+ setattr(v, "_identifier", f"{collection}.{view_name}")
  return v

  def drop_view(self, identifier: str | tuple) -> None:
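
load_view now populates timestamp and execution fields on `view.metadata` and attaches `_catalog`/`_identifier` for describe(), so callers can read these attributes without an AttributeError. A short sketch with illustrative names, assuming the same `catalog` instance:

    view = catalog.load_view("reporting.daily_totals")
    print(view.sql)                          # SQL of the current statement
    print(view.metadata.timestamp_ms)        # creation timestamp in milliseconds
    print(view.metadata.last_execution_ms)   # may be None if the view has never run
    print(view.metadata.describer)           # set when the description was generated, e.g. by an LLM
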
@@ -441,11 +508,9 @@ class OpteryxCatalog(Metastore):

  doc_ref = self._view_doc_ref(collection, view_name)
  # delete statement subcollection
- try:
- for d in doc_ref.collection("statement").stream():
- doc_ref.collection("statement").document(d.id).delete()
- except Exception:
- pass
+ for d in doc_ref.collection("statement").stream():
+ doc_ref.collection("statement").document(d.id).delete()
+
  doc_ref.delete()

  def list_views(self, collection: str) -> Iterable[str]:
@@ -474,6 +539,8 @@ class OpteryxCatalog(Metastore):
  "identifier must be 'collection.view' or pass view_name separately"
  )
  collection, view_name = identifier_or_collection.rsplit(".", 1)
+ else:
+ collection = identifier_or_collection

  try:
  doc_ref = self._view_doc_ref(collection, view_name)
@@ -501,40 +568,75 @@ class OpteryxCatalog(Metastore):
  updates["last-execution-time-ms"] = int(execution_time * 1000)
  updates["last-execution-ms"] = now_ms
  if updates:
- try:
- doc_ref.update(updates)
- except Exception:
- pass
+ doc_ref.update(updates)
+
+ def update_view_description(
+ self,
+ identifier: str | tuple,
+ description: str,
+ describer: Optional[str] = None,
+ ) -> None:
+ """Update the description for a view.
+
+ Args:
+ identifier: View identifier ('collection.view' or tuple)
+ description: The new description text
+ describer: Optional identifier for who/what created the description
+ """
+ if isinstance(identifier, tuple) or isinstance(identifier, list):
+ collection, view_name = identifier[0], identifier[1]
+ else:
+ collection, view_name = identifier.split(".")
+
+ doc_ref = self._view_doc_ref(collection, view_name)
+ updates = {
+ "description": description,
+ }
+ if describer is not None:
+ updates["describer"] = describer
+ doc_ref.update(updates)
+
+ def update_dataset_description(
+ self,
+ identifier: str,
+ description: str,
+ describer: Optional[str] = None,
+ ) -> None:
+ """Update the description for a dataset.
+
+ Args:
+ identifier: Dataset identifier in format 'collection.dataset_name'
+ description: The new description text
+ describer: Optional identifier for who/what created the description
+ """
+ collection, dataset_name = identifier.split(".")
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
+ updates = {
+ "description": description,
+ }
+ if describer is not None:
+ updates["describer"] = describer
+ doc_ref.update(updates)

  def write_parquet_manifest(
- self, snapshot_id: int, entries: List[dict], table_location: str
+ self, snapshot_id: int, entries: List[dict], dataset_location: str
  ) -> Optional[str]:
  """Write a Parquet manifest for the given snapshot id and entries.

  Entries should be plain dicts convertible by pyarrow.Table.from_pylist.
- The manifest will be written to <table_location>/metadata/manifest-<snapshot_id>.parquet
+ The manifest will be written to <dataset_location>/metadata/manifest-<snapshot_id>.parquet
  """
  import pyarrow as pa
  import pyarrow.parquet as pq

  # If entries is None we skip writing; if entries is empty list, write
- # an empty Parquet manifest (represents an empty table for this
+ # an empty Parquet manifest (represents an empty dataset for this
  # snapshot). This preserves previous manifests so older snapshots
  # remain readable.
  if entries is None:
  return None

- # Print manifest entries so users can inspect the manifest when created
- try:
- pass
-
- # print("[MANIFEST] Parquet manifest entries to write:")
- # print(json.dumps(entries, indent=2, default=str))
- except Exception:
- # print("[MANIFEST] Parquet manifest entries:", entries)
- pass
-
- parquet_path = f"{table_location}/metadata/manifest-{snapshot_id}.parquet"
+ parquet_path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"

  # Use provided FileIO if it supports writing; otherwise write to GCS
  try:
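
The new update_view_description and update_dataset_description methods write a description (and an optional `describer` tag) directly onto the view or dataset document. A minimal sketch with illustrative identifiers and describer value, assuming the same `catalog` instance:

    catalog.update_view_description(
        "reporting.daily_totals",
        description="Daily revenue totals by calendar day.",
        describer="llm:auto-describer",   # illustrative value
    )
    catalog.update_dataset_description(
        "sales.orders",
        description="Raw order events ingested from the web store.",
    )
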
@@ -546,144 +648,77 @@ class OpteryxCatalog(Metastore):
  ("file_format", pa.string()),
  ("record_count", pa.int64()),
  ("file_size_in_bytes", pa.int64()),
+ ("uncompressed_size_in_bytes", pa.int64()),
+ ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+ ("null_counts", pa.list_(pa.int64())),
  ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
  ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
  ("histogram_bins", pa.int32()),
- ("min_values", pa.list_(pa.int64())),
- ("max_values", pa.list_(pa.int64())),
+ ("min_values", pa.list_(pa.binary())),
+ ("max_values", pa.list_(pa.binary())),
  ]
  )

+ # Normalize entries to match schema expectations:
+ normalized = []
+ for ent in entries:
+ if not isinstance(ent, dict):
+ normalized.append(ent)
+ continue
+ e = dict(ent)
+ # Ensure list fields exist
+ e.setdefault("min_k_hashes", [])
+ e.setdefault("histogram_counts", [])
+ e.setdefault("histogram_bins", 0)
+ e.setdefault("column_uncompressed_sizes_in_bytes", [])
+ e.setdefault("null_counts", [])
+
+ # Process min/max values: truncate to 16 bytes with ellipsis marker if longer
+ mv = e.get("min_values") or []
+ xv = e.get("max_values") or []
+
+ def truncate_value(v):
+ """Convert value to binary and truncate to 16 bytes with marker if needed."""
+ if v is None:
+ return None
+ # Convert to bytes
+ if isinstance(v, bytes):
+ b = v
+ else:
+ b = str(v).encode('utf-8')
+ # Truncate if longer than 16 bytes, add 0xFF as 17th byte to indicate truncation
+ if len(b) > 16:
+ return b[:16] + b'\xff'
+ return b
+
+ e["min_values"] = [truncate_value(v) for v in mv]
+ e["max_values"] = [truncate_value(v) for v in xv]
+ normalized.append(e)
+
  try:
- table = pa.Table.from_pylist(entries, schema=schema)
- except Exception:
+ table = pa.Table.from_pylist(normalized, schema=schema)
+ except Exception as exc:
  # Diagnostic output to help find malformed manifest entries
- try:
- print(
- "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
- )
- for i, ent in enumerate(entries):
- print(f" Entry {i}:")
- if isinstance(ent, dict):
- for k, v in ent.items():
- tname = type(v).__name__
- try:
- s = repr(v)
- except Exception:
- s = "<unreprable>"
- print(f" - {k}: type={tname} repr={s[:200]}")
- else:
- print(
- f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
- )
- except Exception:
- pass

- # Attempt to sanitize entries and retry conversion.
- try:
- print("[MANIFEST DEBUG] Attempting to sanitize entries and retry")
- sanitized = []
- for ent in entries:
- if not isinstance(ent, dict):
- sanitized.append(ent)
- continue
- e2 = dict(ent) # copy
- # Ensure numeric fields
- for k in ("record_count", "file_size_in_bytes", "histogram_bins"):
- v = e2.get(k)
+ print(
+ "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
+ )
+ for i, ent in enumerate(entries):
+ print(f" Entry {i}:")
+ if isinstance(ent, dict):
+ for k, v in ent.items():
+ tname = type(v).__name__
  try:
- e2[k] = int(v) if v is not None else 0
+ s = repr(v)
  except Exception:
- e2[k] = 0
- # Ensure min_k_hashes is list[list[int]]
- mk = e2.get("min_k_hashes")
- if not isinstance(mk, list):
- e2["min_k_hashes"] = []
- else:
- new_mk = []
- for sub in mk:
- if isinstance(sub, list):
- try:
- new_mk.append([int(x) for x in sub])
- except Exception:
- new_mk.append([])
- else:
- new_mk.append([])
- e2["min_k_hashes"] = new_mk
- # Ensure histogram_counts is list[list[int]]
- hc = e2.get("histogram_counts")
- if not isinstance(hc, list):
- e2["histogram_counts"] = []
- else:
- new_hc = []
- for sub in hc:
- if isinstance(sub, list):
- try:
- new_hc.append([int(x) for x in sub])
- except Exception:
- new_hc.append([])
- else:
- new_hc.append([])
- e2["histogram_counts"] = new_hc
- # Sanitize min_values / max_values: must be list[int] or None
- # Sanitize min_values / max_values: coerce to int64 using to_int() if available
- try:
- from opteryx.compiled.structures.relation_statistics import to_int
- except Exception:
-
- def to_int(val):
- # Best-effort fallback: handle numpy types, strings and numbers
- try:
- if val is None:
- return None
- if hasattr(val, "item"):
- val = val.item()
- if isinstance(val, (bytes, bytearray)):
- val = val.decode(errors="ignore")
- if isinstance(val, str):
- # empty strings are invalid
- if val == "":
- return None
- try:
- return int(val)
- except Exception:
- return None
- if isinstance(val, float):
- return int(val)
- return int(val)
- except Exception:
- return None
-
- for key in ("min_values", "max_values"):
- mv = e2.get(key)
- if not isinstance(mv, list):
- e2[key] = [None]
- else:
- new_mv = []
- for x in mv:
- try:
- if x is None:
- new_mv.append(None)
- continue
- # Use to_int to coerce into int64 semantics
- v = x
- if hasattr(v, "item"):
- v = v.item()
- coerced = to_int(v)
- # to_int may return None-like sentinel; accept ints only
- if coerced is None:
- new_mv.append(None)
- else:
- new_mv.append(int(coerced))
- except Exception:
- new_mv.append(None)
- e2[key] = new_mv
- sanitized.append(e2)
- table = pa.Table.from_pylist(sanitized, schema=schema)
- print("[MANIFEST DEBUG] Sanitized entries converted successfully")
- except Exception:
- print("[MANIFEST DEBUG] Sanitization failed; re-raising original exception")
- raise
+ s = "<unreprable>"
+ print(f" - {k}: type={tname} repr={s[:200]}")
+ else:
+ print(
+ f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
+ )
+ raise exc
+
  buf = pa.BufferOutputStream()
  pq.write_table(table, buf, compression="zstd")
  data = buf.getvalue().to_pybytes()
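
Manifest `min_values`/`max_values` are now stored as binary rather than int64, truncated to 16 bytes with a trailing 0xFF marker when longer. A standalone sketch of the same rule as the `truncate_value` helper added above (the function name is reused here for illustration only):

    def truncate_value(v):
        if v is None:
            return None
        b = v if isinstance(v, bytes) else str(v).encode("utf-8")
        # keep the first 16 bytes and append 0xFF to flag truncation
        return b[:16] + b"\xff" if len(b) > 16 else b

    assert truncate_value("short") == b"short"
    assert truncate_value("a" * 20) == b"a" * 16 + b"\xff"   # 17 bytes total
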
@@ -696,15 +731,6 @@ class OpteryxCatalog(Metastore):
  out.close()
  except Exception:
  pass
- elif self._storage_client and self.gcs_bucket:
- # Write to GCS bucket
- bucket = self._storage_client.bucket(self.gcs_bucket)
- # object path: remove gs://bucket/ prefix
- parsed = parquet_path
- if parsed.startswith("gs://"):
- parsed = parsed[5 + len(self.gcs_bucket) + 1 :]
- blob = bucket.blob(parsed)
- blob.upload_from_string(data)

  return parquet_path
  except Exception as e:
@@ -713,7 +739,7 @@ class OpteryxCatalog(Metastore):
  raise e

  def save_snapshot(self, identifier: str, snapshot: Snapshot) -> None:
- """Persist a single snapshot document for a table."""
+ """Persist a single snapshot document for a dataset."""
  namespace, dataset_name = identifier.split(".")
  snaps = self._snapshots_collection(namespace, dataset_name)
  doc_id = str(snapshot.snapshot_id)
@@ -749,9 +775,9 @@ class OpteryxCatalog(Metastore):
  snaps.document(doc_id).set(data)

  def save_dataset_metadata(self, identifier: str, metadata: DatasetMetadata) -> None:
- """Persist table-level metadata and snapshots to Firestore.
+ """Persist dataset-level metadata and snapshots to Firestore.

- This writes the table document and upserts snapshot documents.
+ This writes the dataset document and upserts snapshot documents.
  """
  collection, dataset_name = identifier.split(".")
  doc_ref = self._dataset_doc_ref(collection, dataset_name)
@@ -873,7 +899,7 @@ class OpteryxCatalog(Metastore):
  return cols

  def _write_schema(self, namespace: str, dataset_name: str, schema: Any, author: str) -> str:
- """Persist a schema document in the table's `schemas` subcollection and
+ """Persist a schema document in the dataset's `schemas` subcollection and
  return the new schema id.
  """
  import uuid