opteryx_catalog-0.4.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,923 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import time
5
+ from typing import Any
6
+ from typing import Iterable
7
+ from typing import List
8
+ from typing import Optional
9
+
10
+ from google.cloud import firestore
11
+ from google.cloud import storage
12
+
13
+ from .catalog.dataset import SimpleDataset
14
+ from .catalog.metadata import DatasetMetadata
15
+ from .catalog.metadata import Snapshot
16
+ from .catalog.metastore import Metastore
17
+ from .catalog.view import View as CatalogView
18
+ from .exceptions import CollectionAlreadyExists
19
+ from .exceptions import DatasetAlreadyExists
20
+ from .exceptions import DatasetNotFound
21
+ from .exceptions import ViewAlreadyExists
22
+ from .exceptions import ViewNotFound
23
+ from .iops.base import FileIO
24
+
25
+
26
+ class OpteryxCatalog(Metastore):
27
+ """Firestore-backed Metastore implementation.
28
+
29
+ Terminology: catalog -> workspace -> collection -> dataset|view
30
+
31
+ Stores dataset documents under the configured workspace in Firestore.
32
+ Snapshots are stored in a `snapshots` subcollection under each
33
+ dataset's document. Parquet manifests are written to GCS under the
34
+ dataset location's `metadata/manifest-<snapshot_id>.parquet` path.
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ workspace: str,
40
+ firestore_project: Optional[str] = None,
41
+ firestore_database: Optional[str] = None,
42
+ gcs_bucket: Optional[str] = None,
43
+ io: Optional[FileIO] = None,
44
+ ):
45
+ # `workspace` is the configured catalog/workspace name
46
+ self.workspace = workspace
47
+ # Backwards-compatible alias: keep `catalog_name` for older code paths
48
+ self.catalog_name = workspace
49
+ self.firestore_client = firestore.Client(
50
+ project=firestore_project, database=firestore_database
51
+ )
52
+ self._catalog_ref = self.firestore_client.collection(workspace)
53
+ # Ensure workspace-level properties document exists in Firestore.
54
+ # The $properties doc records metadata for the workspace such as
55
+ # 'timestamp-ms', 'billing-account-id' and 'owner'.
56
+ try:
57
+ props_ref = self._catalog_ref.document("$properties")
58
+ if not props_ref.get().exists:
59
+ now_ms = int(time.time() * 1000)
60
+ billing = (
61
+ os.environ.get("BILLING_ACCOUNT_ID")
62
+ or os.environ.get("BILLING_ACCOUNT")
63
+ or None
64
+ )
65
+ owner = os.environ.get("WORKSPACE_OWNER") or None
66
+ props_ref.set(
67
+ {
68
+ "timestamp-ms": now_ms,
69
+ "billing-account-id": billing,
70
+ "owner": owner,
71
+ }
72
+ )
73
+ except Exception:
74
+ # Be conservative: don't fail catalog initialization on Firestore errors
75
+ pass
76
+ self.gcs_bucket = gcs_bucket
77
+ self._storage_client = storage.Client() if gcs_bucket else None
78
+ # Default to a GCS-backed FileIO when a GCS bucket is configured and
79
+ # no explicit `io` was provided.
80
+ if io is not None:
81
+ self.io = io
82
+ else:
83
+ if gcs_bucket:
84
+ try:
85
+ from .iops.gcs import GcsFileIO
86
+
87
+ self.io = GcsFileIO()
88
+ except Exception:
89
+ self.io = FileIO()
90
+ else:
91
+ self.io = FileIO()
92
+
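A minimal construction sketch for the class above, assuming default Google application credentials are available; the workspace, project and bucket names below are placeholders and the import path is assumed from the package layout, not confirmed by this diff:

    from opteryx_catalog import OpteryxCatalog  # assumed import path

    catalog = OpteryxCatalog(
        workspace="analytics",                # Firestore top-level collection for this workspace
        firestore_project="example-project",  # placeholder GCP project id
        gcs_bucket="example-bucket",          # enables the GCS-backed FileIO default
    )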
93
+ def _collection_ref(self, collection: str):
94
+ """Alias for `_namespace_ref` using the preferred term `collection`.
95
+
96
+ Do NOT change call signatures; this helper provides a clearer name
97
+ for new code paths while remaining backwards-compatible.
98
+ """
99
+ return self._catalog_ref.document(collection)
100
+
101
+ def _datasets_collection(self, collection: str):
102
+ # Primary subcollection for datasets.
103
+ return self._collection_ref(collection).collection("datasets")
104
+
105
+ def _dataset_doc_ref(self, collection: str, dataset_name: str):
106
+ return self._datasets_collection(collection).document(dataset_name)
107
+
108
+ def _snapshots_collection(self, collection: str, dataset_name: str):
109
+ return self._dataset_doc_ref(collection, dataset_name).collection("snapshots")
110
+
111
+ def _views_collection(self, collection: str):
112
+ return self._collection_ref(collection).collection("views")
113
+
114
+ def _view_doc_ref(self, collection: str, view_name: str):
115
+ return self._views_collection(collection).document(view_name)
116
+
117
+ def create_dataset(
118
+ self, identifier: str, schema: Any, properties: dict | None = None, author: Optional[str] = None
119
+ ) -> SimpleDataset:
120
+ if author is None:
121
+ raise ValueError("author must be provided when creating a dataset")
122
+ collection, dataset_name = identifier.split(".")
123
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
124
+ # Check primary `datasets` location
125
+ if doc_ref.get().exists:
126
+ raise DatasetAlreadyExists(f"Dataset already exists: {identifier}")
127
+
128
+ # Build default table metadata
129
+ location = f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}"
130
+ metadata = DatasetMetadata(
131
+ dataset_identifier=identifier,
132
+ schema=schema,
133
+ location=location,
134
+ properties=properties or {},
135
+ )
136
+
137
+ # Persist document with timestamp and author
138
+ now_ms = int(time.time() * 1000)
139
+ metadata.timestamp_ms = now_ms
140
+ metadata.author = author
141
+ doc_ref.set(
142
+ {
143
+ "name": dataset_name,
144
+ "collection": collection,
145
+ "workspace": self.workspace,
146
+ "location": location,
147
+ "properties": metadata.properties,
148
+ "format-version": metadata.format_version,
149
+ "timestamp-ms": now_ms,
150
+ "author": author,
151
+ "maintenance-policy": metadata.maintenance_policy,
152
+ }
153
+ )
154
+
155
+ # Persisted in primary `datasets` collection only.
156
+
157
+ # Persist initial schema into `schemas` subcollection if provided
158
+ if schema is not None:
159
+ schema_id = self._write_schema(collection, dataset_name, schema, author=author)
160
+ metadata.current_schema_id = schema_id
161
+ # Read back the schema doc to capture timestamp-ms, author, sequence-number
162
+ try:
163
+ sdoc = doc_ref.collection("schemas").document(schema_id).get()
164
+ sdata = sdoc.to_dict() or {}
165
+ metadata.schemas = [
166
+ {
167
+ "schema_id": schema_id,
168
+ "columns": sdata.get("columns", self._schema_to_columns(schema)),
169
+ "timestamp-ms": sdata.get("timestamp-ms"),
170
+ "author": sdata.get("author"),
171
+ "sequence-number": sdata.get("sequence-number"),
172
+ }
173
+ ]
174
+ except Exception:
175
+ metadata.schemas = [
176
+ {"schema_id": schema_id, "columns": self._schema_to_columns(schema)}
177
+ ]
178
+ # update table doc to reference current schema
179
+ doc_ref.update({"current-schema-id": metadata.current_schema_id})
180
+
181
+ # Return SimpleDataset (attach this catalog so append() can persist)
182
+ return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
183
+
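For illustration, creating a dataset through the method above might look like the sketch below, using the `catalog` instance from the earlier sketch; the identifier, properties and author are placeholders, and `author` is mandatory here:

    import pyarrow as pa

    schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
    dataset = catalog.create_dataset(
        "sales.orders",                       # 'collection.dataset' identifier
        schema=schema,
        properties={"retention-days": "90"},
        author="jane@example.com",
    )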
184
+ def load_dataset(self, identifier: str) -> SimpleDataset:
185
+ collection, dataset_name = identifier.split(".")
186
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
187
+ doc = doc_ref.get()
188
+ if not doc.exists:
189
+ raise DatasetNotFound(f"Dataset not found: {identifier}")
190
+
191
+ data = doc.to_dict() or {}
192
+ metadata = DatasetMetadata(
193
+ dataset_identifier=identifier,
194
+ location=data.get("location")
195
+ or f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}",
196
+ schema=data.get("schema"),
197
+ properties=data.get("properties") or {},
198
+ )
199
+
200
+ # Load table-level timestamp/author and collection/workspace
201
+ metadata.timestamp_ms = data.get("timestamp-ms")
202
+ metadata.author = data.get("author")
203
+ # note: Firestore table doc stores the original collection and workspace
204
+ # under keys `collection` and `workspace`.
205
+
206
+ # Load snapshots
207
+ snaps = []
208
+ for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
209
+ sd = snap_doc.to_dict() or {}
210
+ snap = Snapshot(
211
+ snapshot_id=sd.get("snapshot-id"),
212
+ timestamp_ms=sd.get("timestamp-ms"),
213
+ author=sd.get("author"),
214
+ sequence_number=sd.get("sequence-number"),
215
+ user_created=sd.get("user-created"),
216
+ manifest_list=sd.get("manifest"),
217
+ schema_id=sd.get("schema-id"),
218
+ summary=sd.get("summary", {}),
219
+ operation_type=sd.get("operation-type"),
220
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
221
+ )
222
+ snaps.append(snap)
223
+ # Firestore streams snapshot documents in id order; sort by timestamp
+ # so the newest snapshot is treated as current.
+ snaps.sort(key=lambda s: s.timestamp_ms or 0)
+ metadata.snapshots = snaps
224
+ if snaps:
225
+ metadata.current_snapshot_id = snaps[-1].snapshot_id
226
+
227
+ # Load schemas subcollection
228
+ try:
229
+ schemas = []
230
+ schemas_coll = doc_ref.collection("schemas")
231
+ for sdoc in schemas_coll.stream():
232
+ sd = sdoc.to_dict() or {}
233
+ schemas.append(
234
+ {
235
+ "schema_id": sdoc.id,
236
+ "columns": sd.get("columns", []),
237
+ "timestamp-ms": sd.get("timestamp-ms"),
238
+ "author": sd.get("author"),
239
+ "sequence-number": sd.get("sequence-number"),
240
+ }
241
+ )
242
+ metadata.schemas = schemas
243
+ metadata.current_schema_id = doc.to_dict().get("current-schema-id")
244
+ except Exception:
245
+ pass
246
+
247
+ return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
248
+
249
+ def drop_dataset(self, identifier: str) -> None:
250
+ collection, dataset_name = identifier.split(".")
251
+ # Delete snapshots
252
+ snaps_coll = self._snapshots_collection(collection, dataset_name)
253
+ for doc in snaps_coll.stream():
254
+ snaps_coll.document(doc.id).delete()
255
+ # Delete dataset doc
256
+ self._dataset_doc_ref(collection, dataset_name).delete()
257
+
258
+ def list_datasets(self, collection: str) -> Iterable[str]:
259
+ coll = self._datasets_collection(collection)
260
+ return [doc.id for doc in coll.stream()]
261
+
262
+ def create_collection(
263
+ self,
264
+ collection: str,
265
+ properties: dict | None = None,
266
+ exists_ok: bool = False,
267
+ author: Optional[str] = None,
268
+ ) -> None:
269
+ """Create a collection document under the catalog.
270
+
271
+ If `exists_ok` is False and the collection already exists, CollectionAlreadyExists is raised.
272
+ """
273
+ doc_ref = self._collection_ref(collection)
274
+ if doc_ref.get().exists:
275
+ if exists_ok:
276
+ return
277
+ raise CollectionAlreadyExists(f"Collection already exists: {collection}")
278
+
279
+ now_ms = int(time.time() * 1000)
280
+ if author is None:
281
+ raise ValueError("author must be provided when creating a collection")
282
+ doc_ref.set(
283
+ {
284
+ "name": collection,
285
+ "properties": properties or {},
286
+ "timestamp-ms": now_ms,
287
+ "author": author,
288
+ }
289
+ )
290
+
291
+ def create_collection_if_not_exists(
292
+ self, collection: str, properties: dict | None = None, author: Optional[str] = None
293
+ ) -> None:
294
+ """Convenience wrapper that creates the collection only if missing."""
295
+ try:
296
+ self.create_collection(collection, properties=properties, exists_ok=True, author=author)
297
+ except Exception:
298
+ # Be conservative: swallow errors here rather than failing the caller
299
+ return
300
+
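A short usage sketch for the collection helpers above (all names are placeholders):

    catalog.create_collection("sales", properties={"team": "analytics"}, author="jane@example.com")
    catalog.create_collection_if_not_exists("sales", author="jane@example.com")  # no-op if present
    print(catalog.list_datasets("sales"))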
301
+ def dataset_exists(
302
+ self, identifier_or_collection: str, dataset_name: Optional[str] = None
303
+ ) -> bool:
304
+ """Return True if the dataset exists.
305
+
306
+ Supports two call forms:
307
+ - dataset_exists("collection.dataset")
308
+ - dataset_exists("collection", "dataset")
309
+ """
310
+ # Normalize inputs
311
+ if dataset_name is None:
312
+ # Expect a single identifier like 'collection.dataset'
313
+ if "." not in identifier_or_collection:
314
+ raise ValueError(
315
+ "collection must be 'collection.table' or pass dataset_name separately"
316
+ )
317
+ collection, dataset_name = identifier_or_collection.rsplit(".", 1)
318
+
319
+ try:
320
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
321
+ return doc_ref.get().exists
322
+ except Exception:
323
+ # On any error, be conservative and return False
324
+ return False
325
+
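Both call forms described in the docstring resolve to the same Firestore document, for example:

    assert catalog.dataset_exists("sales.orders") == catalog.dataset_exists("sales", "orders")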
326
+ # Dataset API methods have been renamed to the preferred `dataset` terminology.
327
+
328
+ # --- View support -------------------------------------------------
329
+ def create_view(
330
+ self,
331
+ identifier: str | tuple,
332
+ sql: str,
333
+ schema: Any | None = None,
334
+ author: Optional[str] = None,
335
+ description: Optional[str] = None,
336
+ properties: dict | None = None,
337
+ ) -> CatalogView:
338
+ """Create a view document and a statement version in the `statement` subcollection.
339
+
340
+ `identifier` may be a string like 'collection.view' or a tuple ('collection', 'view').
341
+ """
342
+ # Normalize identifier
343
+ if isinstance(identifier, tuple) or isinstance(identifier, list):
344
+ collection, view_name = identifier[0], identifier[1]
345
+ else:
346
+ collection, view_name = identifier.split(".")
347
+
348
+ doc_ref = self._view_doc_ref(collection, view_name)
349
+ if doc_ref.get().exists:
350
+ raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
351
+
352
+ now_ms = int(time.time() * 1000)
353
+ if author is None:
354
+ raise ValueError("author must be provided when creating a view")
355
+
356
+ # Write statement version
357
+ statement_id = str(now_ms)
358
+ stmt_coll = doc_ref.collection("statement")
359
+ stmt_coll.document(statement_id).set(
360
+ {
361
+ "sql": sql,
362
+ "timestamp-ms": now_ms,
363
+ "author": author,
364
+ "sequence-number": 1,
365
+ }
366
+ )
367
+
368
+ # Persist root view doc referencing the statement id
369
+ doc_ref.set(
370
+ {
371
+ "name": view_name,
372
+ "collection": collection,
373
+ "workspace": self.workspace,
374
+ "timestamp-ms": now_ms,
375
+ "author": author,
376
+ "description": description,
377
+ "describer": author,
378
+ "last-execution-ms": None,
379
+ "last-execution-data-size": None,
380
+ "last-execution-records": None,
381
+ "statement-id": statement_id,
382
+ "properties": properties or {},
383
+ }
384
+ )
385
+
386
+ # Return a simple CatalogView wrapper
387
+ v = CatalogView(name=view_name, definition=sql, properties=properties or {})
388
+ # provide convenient attributes used by docs/examples
389
+ setattr(v, "sql", sql)
390
+ setattr(v, "metadata", type("M", (), {})())
391
+ v.metadata.schema = schema
392
+ return v
393
+
394
+ def load_view(self, identifier: str | tuple) -> CatalogView:
395
+ """Load a view by identifier. Returns a `CatalogView` with `.definition` and `.sql`.
396
+
397
+ Raises `ViewNotFound` if the view doc is missing.
398
+ """
399
+ if isinstance(identifier, tuple) or isinstance(identifier, list):
400
+ collection, view_name = identifier[0], identifier[1]
401
+ else:
402
+ collection, view_name = identifier.split(".")
403
+
404
+ doc_ref = self._view_doc_ref(collection, view_name)
405
+ doc = doc_ref.get()
406
+ if not doc.exists:
407
+ raise ViewNotFound(f"View not found: {collection}.{view_name}")
408
+
409
+ data = doc.to_dict() or {}
410
+ stmt_id = data.get("statement-id")
411
+ sql = None
412
+ schema = data.get("schema")
413
+ try:
414
+ if stmt_id:
415
+ sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
416
+ if sdoc.exists:
417
+ sql = (sdoc.to_dict() or {}).get("sql")
418
+ # fallback: pick the most recent statement
419
+ if not sql:
420
+ for s in doc_ref.collection("statement").stream():
421
+ sd = s.to_dict() or {}
422
+ if sd.get("sql"):
423
+ sql = sd.get("sql")
424
+ break
425
+ except Exception:
426
+ pass
427
+
428
+ v = CatalogView(name=view_name, definition=sql or "", properties=data.get("properties", {}))
429
+ setattr(v, "sql", sql or "")
430
+ setattr(v, "metadata", type("M", (), {})())
431
+ v.metadata.schema = schema
432
+ v.metadata.author = data.get("author")
433
+ v.metadata.description = data.get("description")
434
+ return v
435
+
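A sketch of the view round trip using the two methods above; the SQL, names and author are placeholders, and identifiers may be dotted strings or (collection, view) tuples:

    view = catalog.create_view(
        "sales.recent_orders",
        sql="SELECT * FROM sales.orders WHERE order_date > CURRENT_DATE - 7",
        author="jane@example.com",
        description="Orders from the last seven days",
    )
    same_view = catalog.load_view(("sales", "recent_orders"))
    print(same_view.sql)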
436
+ def drop_view(self, identifier: str | tuple) -> None:
437
+ if isinstance(identifier, tuple) or isinstance(identifier, list):
438
+ collection, view_name = identifier[0], identifier[1]
439
+ else:
440
+ collection, view_name = identifier.split(".")
441
+
442
+ doc_ref = self._view_doc_ref(collection, view_name)
443
+ # delete statement subcollection
444
+ try:
445
+ for d in doc_ref.collection("statement").stream():
446
+ doc_ref.collection("statement").document(d.id).delete()
447
+ except Exception:
448
+ pass
449
+ doc_ref.delete()
450
+
451
+ def list_views(self, collection: str) -> Iterable[str]:
452
+ coll = self._views_collection(collection)
453
+ return [doc.id for doc in coll.stream()]
454
+
455
+ def view_exists(
456
+ self, identifier_or_collection: str | tuple, view_name: Optional[str] = None
457
+ ) -> bool:
458
+ """Return True if the view exists.
459
+
460
+ Supports the following call forms:
461
+ - view_exists("collection.view")
462
+ - view_exists(("collection", "view"))
463
+ - view_exists("collection", "view")
464
+ """
465
+ # Normalize inputs
466
+ if view_name is None:
467
+ if isinstance(identifier_or_collection, tuple) or isinstance(
468
+ identifier_or_collection, list
469
+ ):
470
+ collection, view_name = identifier_or_collection[0], identifier_or_collection[1]
471
+ else:
472
+ if "." not in identifier_or_collection:
473
+ raise ValueError(
474
+ "identifier must be 'collection.view' or pass view_name separately"
475
+ )
476
+ collection, view_name = identifier_or_collection.rsplit(".", 1)
477
+
478
+ try:
479
+ doc_ref = self._view_doc_ref(collection, view_name)
480
+ return doc_ref.get().exists
481
+ except Exception:
482
+ return False
483
+
484
+ def update_view_execution_metadata(
485
+ self,
486
+ identifier: str | tuple,
487
+ row_count: Optional[int] = None,
488
+ execution_time: Optional[float] = None,
489
+ ) -> None:
490
+ if isinstance(identifier, tuple) or isinstance(identifier, list):
491
+ collection, view_name = identifier[0], identifier[1]
492
+ else:
493
+ collection, view_name = identifier.split(".")
494
+
495
+ doc_ref = self._view_doc_ref(collection, view_name)
496
+ updates = {}
497
+ now_ms = int(time.time() * 1000)
498
+ if row_count is not None:
499
+ updates["last-execution-records"] = row_count
500
+ if execution_time is not None:
501
+ updates["last-execution-time-ms"] = int(execution_time * 1000)
502
+ updates["last-execution-ms"] = now_ms
503
+ if updates:
504
+ try:
505
+ doc_ref.update(updates)
506
+ except Exception:
507
+ pass
508
+
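The execution-metadata helper above is fire-and-forget; a typical call after running a view might be:

    catalog.update_view_execution_metadata(("sales", "recent_orders"), row_count=42, execution_time=0.31)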
509
+ def write_parquet_manifest(
510
+ self, snapshot_id: int, entries: List[dict], table_location: str
511
+ ) -> Optional[str]:
512
+ """Write a Parquet manifest for the given snapshot id and entries.
513
+
514
+ Entries should be plain dicts convertible by pyarrow.Table.from_pylist.
515
+ The manifest will be written to <table_location>/metadata/manifest-<snapshot_id>.parquet
516
+ """
517
+ import pyarrow as pa
518
+ import pyarrow.parquet as pq
519
+
520
+ # If entries is None we skip writing; if entries is empty list, write
521
+ # an empty Parquet manifest (represents an empty table for this
522
+ # snapshot). This preserves previous manifests so older snapshots
523
+ # remain readable.
524
+ if entries is None:
525
+ return None
526
+
537
+ parquet_path = f"{table_location}/metadata/manifest-{snapshot_id}.parquet"
538
+
539
+ # Use provided FileIO if it supports writing; otherwise write to GCS
540
+ try:
541
+ # Use an explicit schema so PyArrow types (especially nested lists)
542
+ # are correct and we avoid integer overflow / inference issues.
543
+ schema = pa.schema(
544
+ [
545
+ ("file_path", pa.string()),
546
+ ("file_format", pa.string()),
547
+ ("record_count", pa.int64()),
548
+ ("file_size_in_bytes", pa.int64()),
549
+ ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
550
+ ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
551
+ ("histogram_bins", pa.int32()),
552
+ ("min_values", pa.list_(pa.int64())),
553
+ ("max_values", pa.list_(pa.int64())),
554
+ ]
555
+ )
556
+
557
+ try:
558
+ table = pa.Table.from_pylist(entries, schema=schema)
559
+ except Exception:
560
+ # Diagnostic output to help find malformed manifest entries
561
+ try:
562
+ print(
563
+ "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
564
+ )
565
+ for i, ent in enumerate(entries):
566
+ print(f" Entry {i}:")
567
+ if isinstance(ent, dict):
568
+ for k, v in ent.items():
569
+ tname = type(v).__name__
570
+ try:
571
+ s = repr(v)
572
+ except Exception:
573
+ s = "<unreprable>"
574
+ print(f" - {k}: type={tname} repr={s[:200]}")
575
+ else:
576
+ print(
577
+ f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
578
+ )
579
+ except Exception:
580
+ pass
581
+
582
+ # Attempt to sanitize entries and retry conversion.
583
+ try:
584
+ print("[MANIFEST DEBUG] Attempting to sanitize entries and retry")
585
+ sanitized = []
586
+ for ent in entries:
587
+ if not isinstance(ent, dict):
588
+ sanitized.append(ent)
589
+ continue
590
+ e2 = dict(ent) # copy
591
+ # Ensure numeric fields
592
+ for k in ("record_count", "file_size_in_bytes", "histogram_bins"):
593
+ v = e2.get(k)
594
+ try:
595
+ e2[k] = int(v) if v is not None else 0
596
+ except Exception:
597
+ e2[k] = 0
598
+ # Ensure min_k_hashes is list[list[int]]
599
+ mk = e2.get("min_k_hashes")
600
+ if not isinstance(mk, list):
601
+ e2["min_k_hashes"] = []
602
+ else:
603
+ new_mk = []
604
+ for sub in mk:
605
+ if isinstance(sub, list):
606
+ try:
607
+ new_mk.append([int(x) for x in sub])
608
+ except Exception:
609
+ new_mk.append([])
610
+ else:
611
+ new_mk.append([])
612
+ e2["min_k_hashes"] = new_mk
613
+ # Ensure histogram_counts is list[list[int]]
614
+ hc = e2.get("histogram_counts")
615
+ if not isinstance(hc, list):
616
+ e2["histogram_counts"] = []
617
+ else:
618
+ new_hc = []
619
+ for sub in hc:
620
+ if isinstance(sub, list):
621
+ try:
622
+ new_hc.append([int(x) for x in sub])
623
+ except Exception:
624
+ new_hc.append([])
625
+ else:
626
+ new_hc.append([])
627
+ e2["histogram_counts"] = new_hc
628
+ # Sanitize min_values / max_values: coerce entries to int64 via to_int() when available
630
+ try:
631
+ from opteryx.compiled.structures.relation_statistics import to_int
632
+ except Exception:
633
+
634
+ def to_int(val):
635
+ # Best-effort fallback: handle numpy types, strings and numbers
636
+ try:
637
+ if val is None:
638
+ return None
639
+ if hasattr(val, "item"):
640
+ val = val.item()
641
+ if isinstance(val, (bytes, bytearray)):
642
+ val = val.decode(errors="ignore")
643
+ if isinstance(val, str):
644
+ # empty strings are invalid
645
+ if val == "":
646
+ return None
647
+ try:
648
+ return int(val)
649
+ except Exception:
650
+ return None
651
+ if isinstance(val, float):
652
+ return int(val)
653
+ return int(val)
654
+ except Exception:
655
+ return None
656
+
657
+ for key in ("min_values", "max_values"):
658
+ mv = e2.get(key)
659
+ if not isinstance(mv, list):
660
+ e2[key] = [None]
661
+ else:
662
+ new_mv = []
663
+ for x in mv:
664
+ try:
665
+ if x is None:
666
+ new_mv.append(None)
667
+ continue
668
+ # Use to_int to coerce into int64 semantics
669
+ v = x
670
+ if hasattr(v, "item"):
671
+ v = v.item()
672
+ coerced = to_int(v)
673
+ # to_int may return None-like sentinel; accept ints only
674
+ if coerced is None:
675
+ new_mv.append(None)
676
+ else:
677
+ new_mv.append(int(coerced))
678
+ except Exception:
679
+ new_mv.append(None)
680
+ e2[key] = new_mv
681
+ sanitized.append(e2)
682
+ table = pa.Table.from_pylist(sanitized, schema=schema)
683
+ print("[MANIFEST DEBUG] Sanitized entries converted successfully")
684
+ except Exception:
685
+ print("[MANIFEST DEBUG] Sanitization failed; re-raising original exception")
686
+ raise
687
+ buf = pa.BufferOutputStream()
688
+ pq.write_table(table, buf, compression="zstd")
689
+ data = buf.getvalue().to_pybytes()
690
+
691
+ if self.io:
692
+ out = self.io.new_output(parquet_path).create()
693
+ out.write(data)
694
+ try:
695
+ # Some OutputFile implementations buffer and require close()
696
+ out.close()
697
+ except Exception:
698
+ pass
699
+ elif self._storage_client and self.gcs_bucket:
700
+ # Write to GCS bucket
701
+ bucket = self._storage_client.bucket(self.gcs_bucket)
702
+ # object path: remove gs://bucket/ prefix
703
+ parsed = parquet_path
704
+ if parsed.startswith("gs://"):
705
+ parsed = parsed[5 + len(self.gcs_bucket) + 1 :]
706
+ blob = bucket.blob(parsed)
707
+ blob.upload_from_string(data)
708
+
709
+ return parquet_path
710
+ except Exception as e:
711
+ # Surface manifest write failures to the caller
713
+ raise e
714
+
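Manifest entries must match the explicit Arrow schema declared above; a minimal hand-built entry might look like the sketch below (all values, and the per-column reading of the nested lists, are illustrative):

    entries = [
        {
            "file_path": "gs://example-bucket/analytics/sales/orders/data/part-0000.parquet",
            "file_format": "parquet",
            "record_count": 1000,
            "file_size_in_bytes": 123456,
            "min_k_hashes": [[1, 2, 3]],         # list of uint64 lists
            "histogram_counts": [[10, 20, 30]],  # list of int64 lists
            "histogram_bins": 3,
            "min_values": [1],                   # int64-coerced minimums
            "max_values": [1000],                # int64-coerced maximums
        }
    ]
    manifest_path = catalog.write_parquet_manifest(
        snapshot_id=1700000000000,
        entries=entries,
        table_location="gs://example-bucket/analytics/sales/orders",
    )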
715
+ def save_snapshot(self, identifier: str, snapshot: Snapshot) -> None:
716
+ """Persist a single snapshot document for a table."""
717
+ collection, dataset_name = identifier.split(".")
718
+ snaps = self._snapshots_collection(collection, dataset_name)
719
+ doc_id = str(snapshot.snapshot_id)
720
+ # Ensure summary contains all expected keys (zero defaults applied in dataclass)
721
+ summary = snapshot.summary or {}
722
+ # Provide explicit keys if missing
723
+ for k in [
724
+ "added-data-files",
725
+ "added-files-size",
726
+ "added-records",
727
+ "deleted-data-files",
728
+ "deleted-files-size",
729
+ "deleted-records",
730
+ "total-data-files",
731
+ "total-files-size",
732
+ "total-records",
733
+ ]:
734
+ summary.setdefault(k, 0)
735
+
736
+ data = {
737
+ "snapshot-id": snapshot.snapshot_id,
738
+ "timestamp-ms": snapshot.timestamp_ms,
739
+ "manifest": snapshot.manifest_list,
740
+ "commit-message": getattr(snapshot, "commit_message", ""),
741
+ "summary": summary,
742
+ "author": getattr(snapshot, "author", None),
743
+ "sequence-number": getattr(snapshot, "sequence_number", None),
744
+ "operation-type": getattr(snapshot, "operation_type", None),
745
+ "parent-snapshot-id": getattr(snapshot, "parent_snapshot_id", None),
746
+ }
747
+ if getattr(snapshot, "schema_id", None) is not None:
748
+ data["schema-id"] = snapshot.schema_id
749
+ snaps.document(doc_id).set(data)
750
+
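For completeness, a hedged sketch of persisting a snapshot, constructed with the same keyword arguments `load_dataset` uses to rebuild `Snapshot` objects; the import path, ids and manifest path are placeholders:

    import time
    from opteryx_catalog.catalog.metadata import Snapshot  # assumed import path

    snap = Snapshot(
        snapshot_id=1700000000000,
        timestamp_ms=int(time.time() * 1000),
        author="jane@example.com",
        sequence_number=1,
        user_created=True,
        manifest_list="gs://example-bucket/analytics/sales/orders/metadata/manifest-1700000000000.parquet",
        schema_id=None,
        summary={"added-records": 1000},
        operation_type="append",
        parent_snapshot_id=None,
    )
    catalog.save_snapshot("sales.orders", snap)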
751
+ def save_dataset_metadata(self, identifier: str, metadata: DatasetMetadata) -> None:
752
+ """Persist table-level metadata and snapshots to Firestore.
753
+
754
+ This writes the table document and upserts snapshot documents.
755
+ """
756
+ collection, dataset_name = identifier.split(".")
757
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
758
+ doc_ref.set(
759
+ {
760
+ "name": dataset_name,
761
+ "collection": collection,
762
+ "workspace": self.workspace,
763
+ "location": metadata.location,
764
+ "properties": metadata.properties,
765
+ "format-version": metadata.format_version,
766
+ "current-snapshot-id": metadata.current_snapshot_id,
767
+ "current-schema-id": metadata.current_schema_id,
768
+ "timestamp-ms": metadata.timestamp_ms,
769
+ "author": metadata.author,
770
+ "description": metadata.description,
771
+ "describer": metadata.describer,
772
+ "maintenance-policy": metadata.maintenance_policy,
773
+ "sort-orders": metadata.sort_orders,
774
+ }
775
+ )
776
+
777
+ # Metadata persisted in primary `datasets` collection only.
778
+
779
+ snaps_coll = self._snapshots_collection(collection, dataset_name)
780
+ existing = {d.id for d in snaps_coll.stream()}
781
+ new_ids = set()
782
+ for snap in metadata.snapshots:
783
+ new_ids.add(str(snap.snapshot_id))
784
+ snaps_coll.document(str(snap.snapshot_id)).set(
785
+ {
786
+ "snapshot-id": snap.snapshot_id,
787
+ "timestamp-ms": snap.timestamp_ms,
788
+ "manifest": snap.manifest_list,
789
+ "commit-message": getattr(snap, "commit_message", ""),
790
+ "schema-id": snap.schema_id,
791
+ "summary": snap.summary or {},
792
+ "author": getattr(snap, "author", None),
793
+ "sequence-number": getattr(snap, "sequence_number", None),
794
+ "user-created": getattr(snap, "user_created", None),
795
+ }
796
+ )
797
+
798
+ # Delete stale snapshots
799
+ for stale in existing - new_ids:
800
+ snaps_coll.document(stale).delete()
801
+
802
+ # Persist schemas subcollection
803
+ schemas_coll = doc_ref.collection("schemas")
804
+ existing_schema_ids = {d.id for d in schemas_coll.stream()}
805
+ new_schema_ids = set()
806
+ for s in metadata.schemas:
807
+ sid = s.get("schema_id")
808
+ if not sid:
809
+ continue
810
+ new_schema_ids.add(sid)
811
+ schemas_coll.document(sid).set(
812
+ {
813
+ "columns": s.get("columns", []),
814
+ "timestamp-ms": s.get("timestamp-ms"),
815
+ "author": s.get("author"),
816
+ "sequence-number": s.get("sequence-number"),
817
+ }
818
+ )
819
+ # Delete stale schema docs
820
+ for stale in existing_schema_ids - new_schema_ids:
821
+ schemas_coll.document(stale).delete()
822
+
823
+ def _schema_to_columns(self, schema: Any) -> list:
824
+ """Convert a pyarrow.Schema into a simple columns list for storage.
825
+
826
+ Each column is a dict with keys: id (1-based index), name, type, element-type, scale, precision and expectation-policies.
827
+ """
828
+ # Support pyarrow.Schema and Orso RelationSchema. Arrow schemas are
829
+ # converted to an Orso schema first so the stored columns carry Orso
830
+ # type information (type, element-type, scale, precision).
832
+ cols = []
833
+ # Orso is required to derive column type information
834
+ import orso
835
+ import pyarrow as pa
836
+
837
+ # If schema is an Orso RelationSchema, try to obtain a list of columns
838
+ columns = None
839
+ if isinstance(schema, orso.schema.RelationSchema):
840
+ columns = schema.columns
841
+ elif isinstance(schema, pa.Schema):
842
+ orso_schema = orso.schema.convert_arrow_schema_to_orso_schema(schema)
843
+ columns = orso_schema.columns
844
+ else:
846
+ raise ValueError(
847
+ "Unsupported schema type, expected pyarrow.Schema or orso.RelationSchema"
848
+ )
849
+
852
+ for idx, column in enumerate(columns, start=1):
853
+ # If f looks like a pyarrow.Field, use its name/type
854
+ name = column.name
855
+
856
+ # Extract expected attributes safely
857
+ ctype = column.type
858
+ element_type = column.element_type if column.element_type else None
859
+ scale = column.scale
860
+ precision = column.precision
861
+ typed = {
862
+ "id": idx,
863
+ "name": name,
864
+ "type": ctype,
865
+ "element-type": element_type,
866
+ "scale": scale,
867
+ "precision": precision,
868
+ "expectation-policies": [],
869
+ }
870
+
871
+ cols.append(typed)
872
+
873
+ return cols
874
+
875
+ def _write_schema(self, namespace: str, dataset_name: str, schema: Any, author: str) -> str:
876
+ """Persist a schema document in the table's `schemas` subcollection and
877
+ return the new schema id.
878
+ """
879
+ import uuid
880
+
881
+ doc_ref = self._dataset_doc_ref(namespace, dataset_name)
882
+ schemas_coll = doc_ref.collection("schemas")
883
+ sid = str(uuid.uuid4())
885
+ try:
886
+ cols = self._schema_to_columns(schema)
887
+ except Exception:
888
+ # Fall back to an empty columns list if schema conversion fails
891
+ cols = []
892
+ now_ms = int(time.time() * 1000)
893
+ if author is None:
894
+ raise ValueError("author must be provided when writing a schema")
895
+ # Determine next sequence number by scanning existing schema docs
896
+ try:
897
+ max_seq = 0
898
+ for d in schemas_coll.stream():
899
+ sd = d.to_dict() or {}
900
+ seq = sd.get("sequence-number") or 0
901
+ if isinstance(seq, int) and seq > max_seq:
902
+ max_seq = seq
903
+ new_seq = max_seq + 1
904
+ except Exception:
905
+ new_seq = 1
906
+
907
+ try:
911
+ schemas_coll.document(sid).set(
912
+ {
913
+ "columns": cols,
914
+ "timestamp-ms": now_ms,
915
+ "author": author,
916
+ "sequence-number": new_seq,
917
+ }
918
+ )
920
+ except Exception:
921
+ # print(f"[DEBUG] Failed to write schema doc {sid}: {e}")
922
+ pass
923
+ return sid