opteryx-catalog 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,979 @@
1
+ from __future__ import annotations
2
+
3
+ import time
4
+ from typing import Any
5
+ from typing import Iterable
6
+ from typing import List
7
+ from typing import Optional
8
+
9
+ from google.cloud import firestore
10
+ from google.cloud import storage
11
+
12
+ from .catalog.dataset import SimpleDataset
13
+ from .catalog.metadata import DatasetMetadata
14
+ from .catalog.metadata import Snapshot
15
+ from .catalog.metastore import Metastore
16
+ from .catalog.view import View as CatalogView
17
+ from .exceptions import CollectionAlreadyExists
18
+ from .exceptions import DatasetAlreadyExists
19
+ from .exceptions import DatasetNotFound
20
+ from .exceptions import ViewAlreadyExists
21
+ from .exceptions import ViewNotFound
22
+ from .iops.base import FileIO
23
+ from .webhooks import send_webhook
24
+ from .webhooks.events import dataset_created_payload
25
+ from .webhooks.events import view_created_payload
26
+
27
+
28
+ class OpteryxCatalog(Metastore):
29
+ """Firestore-backed Metastore implementation.
30
+
31
+ Terminology: catalog -> workspace -> collection -> dataset|view
32
+
33
+ Stores dataset documents under the configured workspace in Firestore.
34
+ Snapshots are stored in a `snapshots` subcollection under each
35
+ dataset's document. Parquet manifests are written to GCS under the
36
+ dataset location's `metadata/manifest-<snapshot_id>.parquet` path.
37
+ """
38
+
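For orientation, the Firestore layout implied by this class (segments in braces are placeholders; "datasets", "views", "snapshots", "schemas", "statement" and "$properties" are the literal names used in the code below):

    {workspace}/$properties
    {workspace}/{collection}/datasets/{dataset}
    {workspace}/{collection}/datasets/{dataset}/snapshots/{snapshot-id}
    {workspace}/{collection}/datasets/{dataset}/schemas/{schema-id}
    {workspace}/{collection}/views/{view}
    {workspace}/{collection}/views/{view}/statement/{statement-id}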
39
+ def __init__(
40
+ self,
41
+ workspace: str,
42
+ firestore_project: Optional[str] = None,
43
+ firestore_database: Optional[str] = None,
44
+ gcs_bucket: Optional[str] = None,
45
+ io: Optional[FileIO] = None,
46
+ ):
47
+ # `workspace` is the configured catalog/workspace name
48
+ self.workspace = workspace
49
+ # Backwards-compatible alias: keep `catalog_name` for older code paths
50
+ self.catalog_name = workspace
51
+ self.firestore_client = firestore.Client(
52
+ project=firestore_project, database=firestore_database
53
+ )
54
+ self._catalog_ref = self.firestore_client.collection(workspace)
55
+ # Ensure workspace-level properties document exists in Firestore.
56
+ # The $properties doc records metadata for the workspace such as
57
+ # 'timestamp-ms', 'author', 'billing-account-id' and 'owner'.
58
+ try:
59
+ props_ref = self._catalog_ref.document("$properties")
60
+ if not props_ref.get().exists:
61
+ now_ms = int(time.time() * 1000)
62
+ billing = None
63
+ owner = None
64
+ props_ref.set(
65
+ {
66
+ "timestamp-ms": now_ms,
67
+ "billing-account-id": billing,
68
+ "owner": owner,
69
+ }
70
+ )
71
+ except Exception:
72
+ # Be conservative: don't fail catalog initialization on Firestore errors
73
+ pass
74
+ self.gcs_bucket = gcs_bucket
75
+ self._storage_client = storage.Client() if gcs_bucket else None
76
+ # Default to a GCS-backed FileIO when a GCS bucket is configured and
77
+ # no explicit `io` was provided.
78
+ if io is not None:
79
+ self.io = io
80
+ else:
81
+ if gcs_bucket:
82
+ from .iops.gcs import GcsFileIO
83
+
84
+ self.io = GcsFileIO()
85
+ else:
86
+ self.io = FileIO()
87
+
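A minimal construction sketch, assuming GCP credentials are available; the project and bucket names are hypothetical and the import path is an assumption:

    from opteryx_catalog import OpteryxCatalog  # import path assumed

    catalog = OpteryxCatalog(
        workspace="analytics",               # top-level Firestore collection
        firestore_project="my-gcp-project",  # hypothetical project id
        gcs_bucket="my-data-bucket",         # enables the default GcsFileIO
    )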
88
+ def _collection_ref(self, collection: str):
89
+ """Alias for `_namespace_ref` using the preferred term `collection`.
90
+
91
+ Do NOT change call signatures; this helper provides a clearer name
92
+ for new code paths while remaining backwards-compatible.
93
+ """
94
+ return self._catalog_ref.document(collection)
95
+
96
+ def _datasets_collection(self, collection: str):
97
+ # Primary subcollection for datasets.
98
+ return self._collection_ref(collection).collection("datasets")
99
+
100
+ def _dataset_doc_ref(self, collection: str, dataset_name: str):
101
+ return self._datasets_collection(collection).document(dataset_name)
102
+
103
+ def _snapshots_collection(self, collection: str, dataset_name: str):
104
+ return self._dataset_doc_ref(collection, dataset_name).collection("snapshots")
105
+
106
+ def _views_collection(self, collection: str):
107
+ return self._collection_ref(collection).collection("views")
108
+
109
+ def _view_doc_ref(self, collection: str, view_name: str):
110
+ return self._views_collection(collection).document(view_name)
111
+
112
+ def create_dataset(
113
+ self, identifier: str, schema: Any, properties: dict | None = None, author: Optional[str] = None
114
+ ) -> SimpleDataset:
115
+ if author is None:
116
+ raise ValueError("author must be provided when creating a dataset")
117
+ collection, dataset_name = identifier.split(".")
118
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
119
+ # Check primary `datasets` location
120
+ if doc_ref.get().exists:
121
+ raise DatasetAlreadyExists(f"Dataset already exists: {identifier}")
122
+
123
+ # Build default dataset metadata
124
+ location = f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}"
125
+ metadata = DatasetMetadata(
126
+ dataset_identifier=identifier,
127
+ schema=schema,
128
+ location=location,
129
+ properties=properties or {},
130
+ )
131
+
132
+ # Persist document with timestamp and author
133
+ now_ms = int(time.time() * 1000)
134
+ metadata.timestamp_ms = now_ms
135
+ metadata.author = author
136
+ doc_ref.set(
137
+ {
138
+ "name": dataset_name,
139
+ "collection": collection,
140
+ "workspace": self.workspace,
141
+ "location": location,
142
+ "properties": metadata.properties,
143
+ "format-version": metadata.format_version,
144
+ "timestamp-ms": now_ms,
145
+ "author": author,
146
+ "maintenance-policy": metadata.maintenance_policy,
147
+ }
148
+ )
149
+
150
+ # Persist initial schema into `schemas` subcollection if provided
151
+ if schema is not None:
152
+ schema_id = self._write_schema(collection, dataset_name, schema, author=author)
153
+ metadata.current_schema_id = schema_id
154
+ # Read back the schema doc to capture timestamp-ms, author, sequence-number
155
+ try:
156
+ sdoc = doc_ref.collection("schemas").document(schema_id).get()
157
+ sdata = sdoc.to_dict() or {}
158
+ metadata.schemas = [
159
+ {
160
+ "schema_id": schema_id,
161
+ "columns": sdata.get("columns", self._schema_to_columns(schema)),
162
+ "timestamp-ms": sdata.get("timestamp-ms"),
163
+ "author": sdata.get("author"),
164
+ "sequence-number": sdata.get("sequence-number"),
165
+ }
166
+ ]
167
+ except Exception:
168
+ metadata.schemas = [
169
+ {"schema_id": schema_id, "columns": self._schema_to_columns(schema)}
170
+ ]
171
+ # update dataset doc to reference current schema
172
+ doc_ref.update({"current-schema-id": metadata.current_schema_id})
173
+
174
+ # Send webhook notification
175
+ send_webhook(
176
+ action="create",
177
+ workspace=self.workspace,
178
+ collection=collection,
179
+ resource_type="dataset",
180
+ resource_name=dataset_name,
181
+ payload=dataset_created_payload(
182
+ schema=schema,
183
+ location=location,
184
+ properties=properties,
185
+ ),
186
+ )
187
+
188
+ # Return SimpleDataset (attach this catalog so append() can persist)
189
+ return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
190
+
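A hedged usage sketch for create_dataset; the schema, properties and author values are illustrative, and the identifier must take the 'collection.dataset' form:

    import pyarrow as pa

    schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
    dataset = catalog.create_dataset(
        identifier="sales.orders",            # 'collection.dataset'
        schema=schema,                        # persisted into the 'schemas' subcollection
        properties={"retention-days": "90"},
        author="data-platform@example.com",   # required; ValueError if omitted
    )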
191
+ def load_dataset(self, identifier: str, load_history: bool = False) -> SimpleDataset:
192
+ """Load a dataset from Firestore.
193
+
194
+ Args:
195
+ identifier: Dataset identifier in format 'collection.dataset_name'
196
+ load_history: If True, load all snapshots from Firestore (expensive for
197
+ large histories). If False (default), only load the current snapshot,
198
+ which is sufficient for most write operations.
199
+
200
+ Returns:
201
+ SimpleDataset instance with metadata loaded from Firestore.
202
+
203
+ Raises:
204
+ DatasetNotFound: If the dataset does not exist in Firestore.
205
+ """
206
+ collection, dataset_name = identifier.split(".")
207
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
208
+ doc = doc_ref.get()
209
+ if not doc.exists:
210
+ raise DatasetNotFound(f"Dataset not found: {identifier}")
211
+
212
+ data = doc.to_dict() or {}
213
+ metadata = DatasetMetadata(
214
+ dataset_identifier=identifier,
215
+ location=data.get("location")
216
+ or f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}",
217
+ schema=data.get("schema"),  # usually None; the schema is loaded from the 'schemas' subcollection below
218
+ properties=data.get("properties") or {},
219
+ )
220
+
221
+ # Load dataset-level timestamp/author and collection/workspace
222
+ metadata.timestamp_ms = data.get("timestamp-ms")
223
+ metadata.author = data.get("author")
224
+ metadata.description = data.get("description")
225
+ metadata.describer = data.get("describer")
226
+
227
+ # Load snapshots based on load_history flag
228
+ snaps = []
229
+ if load_history:
230
+ # Load all snapshots from Firestore (expensive for large histories)
231
+ for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
232
+ sd = snap_doc.to_dict() or {}
233
+ snap = Snapshot(
234
+ snapshot_id=sd.get("snapshot-id"),
235
+ timestamp_ms=sd.get("timestamp-ms"),
236
+ author=sd.get("author"),
237
+ sequence_number=sd.get("sequence-number"),
238
+ user_created=sd.get("user-created"),
239
+ manifest_list=sd.get("manifest"),
240
+ schema_id=sd.get("schema-id"),
241
+ summary=sd.get("summary", {}),
242
+ operation_type=sd.get("operation-type"),
243
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
244
+ )
245
+ snaps.append(snap)
246
+ if snaps:
247
+ metadata.current_snapshot_id = snaps[-1].snapshot_id
248
+ else:
249
+ # Load only the current snapshot (efficient single read)
250
+ current_snap_id = data.get("current-snapshot-id")
251
+ if current_snap_id:
252
+ try:
253
+ snap_doc = (
254
+ self._snapshots_collection(collection, dataset_name)
255
+ .document(str(current_snap_id))
256
+ .get()
257
+ )
258
+ if snap_doc.exists:
259
+ sd = snap_doc.to_dict() or {}
260
+ snap = Snapshot(
261
+ snapshot_id=sd.get("snapshot-id"),
262
+ timestamp_ms=sd.get("timestamp-ms"),
263
+ author=sd.get("author"),
264
+ sequence_number=sd.get("sequence-number"),
265
+ user_created=sd.get("user-created"),
266
+ manifest_list=sd.get("manifest"),
267
+ schema_id=sd.get("schema-id"),
268
+ summary=sd.get("summary", {}),
269
+ operation_type=sd.get("operation-type"),
270
+ parent_snapshot_id=sd.get("parent-snapshot-id"),
271
+ )
272
+ snaps.append(snap)
273
+ metadata.current_snapshot_id = current_snap_id
274
+ except Exception:
275
+ pass
276
+ metadata.snapshots = snaps
277
+
278
+ # Load schemas subcollection
279
+ schemas_coll = doc_ref.collection("schemas")
280
+ # Load all schemas if requested; otherwise load only current schema
281
+ if load_history:
282
+ schemas = []
283
+ for sdoc in schemas_coll.stream():
284
+ sd = sdoc.to_dict() or {}
285
+ schemas.append(
286
+ {
287
+ "schema_id": sdoc.id,
288
+ "columns": sd.get("columns", []),
289
+ "timestamp-ms": sd.get("timestamp-ms"),
290
+ "author": sd.get("author"),
291
+ "sequence-number": sd.get("sequence-number"),
292
+ }
293
+ )
294
+ metadata.schemas = schemas
295
+ metadata.current_schema_id = data.get("current-schema-id")
296
+ else:
297
+ # Only load the current schema document for efficiency
298
+ current_schema_id = data.get("current-schema-id")
299
+ if current_schema_id:
300
+ sdoc = schemas_coll.document(str(current_schema_id)).get()
301
+ if sdoc.exists:
302
+ sd = sdoc.to_dict() or {}
303
+ metadata.schemas = [
304
+ {
305
+ "schema_id": sdoc.id,
306
+ "columns": sd.get("columns", []),
307
+ "timestamp-ms": sd.get("timestamp-ms"),
308
+ "author": sd.get("author"),
309
+ "sequence-number": sd.get("sequence-number"),
310
+ }
311
+ ]
312
+ metadata.current_schema_id = current_schema_id
313
+ return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
314
+
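Loading mirrors creation; load_history controls whether the full snapshot and schema history is fetched (sketch reusing the hypothetical catalog above):

    current = catalog.load_dataset("sales.orders")                     # current snapshot/schema only
    history = catalog.load_dataset("sales.orders", load_history=True)  # all snapshots and schemas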
315
+ def drop_dataset(self, identifier: str) -> None:
316
+ collection, dataset_name = identifier.split(".")
317
+ # Delete snapshots
318
+ snaps_coll = self._snapshots_collection(collection, dataset_name)
319
+ for doc in snaps_coll.stream():
320
+ snaps_coll.document(doc.id).delete()
321
+ # Delete dataset doc
322
+ self._dataset_doc_ref(collection, dataset_name).delete()
323
+
324
+ def list_datasets(self, collection: str) -> Iterable[str]:
325
+ coll = self._datasets_collection(collection)
326
+ return [doc.id for doc in coll.stream()]
327
+
328
+ def create_collection(
329
+ self,
330
+ collection: str,
331
+ properties: dict | None = None,
332
+ exists_ok: bool = False,
333
+ author: Optional[str] = None,
334
+ ) -> None:
335
+ """Create a collection document under the catalog.
336
+
337
+ If `exists_ok` is False and the collection already exists, CollectionAlreadyExists is raised.
338
+ """
339
+ doc_ref = self._collection_ref(collection)
340
+ if doc_ref.get().exists:
341
+ if exists_ok:
342
+ return
343
+ raise CollectionAlreadyExists(f"Collection already exists: {collection}")
344
+
345
+ now_ms = int(time.time() * 1000)
346
+ if author is None:
347
+ raise ValueError("author must be provided when creating a collection")
348
+ doc_ref.set(
349
+ {
350
+ "name": collection,
351
+ "properties": properties or {},
352
+ "timestamp-ms": now_ms,
353
+ "author": author,
354
+ }
355
+ )
356
+
357
+ def create_collection_if_not_exists(
358
+ self, collection: str, properties: dict | None = None, author: Optional[str] = None
359
+ ) -> None:
360
+ """Convenience wrapper that creates the collection only if missing."""
361
+ self.create_collection(collection, properties=properties, exists_ok=True, author=author)
362
+
363
+ def dataset_exists(
364
+ self, identifier_or_collection: str, dataset_name: Optional[str] = None
365
+ ) -> bool:
366
+ """Return True if the dataset exists.
367
+
368
+ Supports two call forms:
369
+ - dataset_exists("collection.dataset")
370
+ - dataset_exists("collection", "dataset")
371
+ """
372
+ # Normalize inputs
373
+ if dataset_name is None:
374
+ # Expect a single identifier like 'collection.dataset'
375
+ if "." not in identifier_or_collection:
376
+ raise ValueError(
377
+ "collection must be 'collection.dataset' or pass dataset_name separately"
378
+ )
379
+ collection, dataset_name = identifier_or_collection.rsplit(".", 1)
380
+ else:
381
+ collection = identifier_or_collection
382
+
383
+ try:
384
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
385
+ return doc_ref.get().exists
386
+ except Exception:
387
+ # On any error, be conservative and return False
388
+ return False
389
+
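Both call forms described above resolve to the same dataset document (names hypothetical):

    catalog.dataset_exists("sales.orders")     # 'collection.dataset'
    catalog.dataset_exists("sales", "orders")  # collection and dataset passed separately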
390
+ # Dataset API methods have been renamed to the preferred `dataset` terminology.
391
+
392
+ # --- View support -------------------------------------------------
393
+ def create_view(
394
+ self,
395
+ identifier: str | tuple,
396
+ sql: str,
397
+ schema: Any | None = None,
398
+ author: Optional[str] = None,
399
+ description: Optional[str] = None,
400
+ properties: dict | None = None,
401
+ update_if_exists: bool = False,
402
+ ) -> CatalogView:
403
+ """Create a view document and a statement version in the `statement` subcollection.
404
+
405
+ `identifier` may be a string like 'collection.view' or a tuple ('collection', 'view').
406
+ """
407
+ # Normalize identifier
408
+ if isinstance(identifier, (tuple, list)):
409
+ collection, view_name = identifier[0], identifier[1]
410
+ else:
411
+ collection, view_name = identifier.split(".")
412
+
413
+ doc_ref = self._view_doc_ref(collection, view_name)
414
+ if doc_ref.get().exists:
415
+ if not update_if_exists:
416
+ raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
417
+ # Update existing view - get current sequence number
418
+ existing_doc = doc_ref.get().to_dict()
419
+ current_statement_id = existing_doc.get("statement-id")
420
+ if current_statement_id:
421
+ stmt_ref = doc_ref.collection("statement").document(current_statement_id)
422
+ stmt_doc = stmt_ref.get()
423
+ if stmt_doc.exists:
424
+ sequence_number = stmt_doc.to_dict().get("sequence-number", 0) + 1
425
+ else:
426
+ sequence_number = 1
427
+ else:
428
+ sequence_number = 1
429
+ else:
430
+ sequence_number = 1
431
+
432
+ now_ms = int(time.time() * 1000)
433
+ if author is None:
434
+ raise ValueError("author must be provided when creating a view")
435
+
436
+ # Write statement version
437
+ statement_id = str(now_ms)
438
+ stmt_coll = doc_ref.collection("statement")
439
+ stmt_coll.document(statement_id).set(
440
+ {
441
+ "sql": sql,
442
+ "timestamp-ms": now_ms,
443
+ "author": author,
444
+ "sequence-number": sequence_number,
445
+ }
446
+ )
447
+
448
+ # Persist root view doc referencing the statement id
449
+ doc_ref.set(
450
+ {
451
+ "name": view_name,
452
+ "collection": collection,
453
+ "workspace": self.workspace,
454
+ "timestamp-ms": now_ms,
455
+ "author": author,
456
+ "description": description,
457
+ "describer": author,
458
+ "last-execution-ms": None,
459
+ "last-execution-data-size": None,
460
+ "last-execution-records": None,
461
+ "statement-id": statement_id,
462
+ "properties": properties or {},
463
+ }
464
+ )
465
+
466
+ # Send webhook notification
467
+ send_webhook(
468
+ action="create" if not update_if_exists else "update",
469
+ workspace=self.workspace,
470
+ collection=collection,
471
+ resource_type="view",
472
+ resource_name=view_name,
473
+ payload=view_created_payload(
474
+ definition=sql,
475
+ properties=properties,
476
+ ),
477
+ )
478
+
479
+ # Return a simple CatalogView wrapper
480
+ v = CatalogView(name=view_name, definition=sql, properties=properties or {})
481
+ # provide convenient attributes used by docs/examples
482
+ setattr(v, "sql", sql)
483
+ setattr(v, "metadata", type("M", (), {})())
484
+ v.metadata.schema = schema
485
+ # Attach catalog and identifier for describe() method
486
+ setattr(v, "_catalog", self)
487
+ setattr(v, "_identifier", f"{collection}.{view_name}")
488
+ return v
489
+
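A sketch of creating and then updating a view; the SQL text and author are illustrative:

    view = catalog.create_view(
        identifier="sales.recent_orders",
        sql="SELECT * FROM sales.orders WHERE order_date >= CURRENT_DATE - 7",
        author="data-platform@example.com",
        description="Orders from the last 7 days",
    )
    # Re-running with update_if_exists=True records a new statement version
    catalog.create_view(
        identifier="sales.recent_orders",
        sql="SELECT * FROM sales.orders WHERE order_date >= CURRENT_DATE - 30",
        author="data-platform@example.com",
        update_if_exists=True,
    )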
490
+ def load_view(self, identifier: str | tuple) -> CatalogView:
491
+ """Load a view by identifier. Returns a `CatalogView` with `.definition` and `.sql`.
492
+
493
+ Raises `ViewNotFound` if the view doc is missing.
494
+ """
495
+ if isinstance(identifier, (tuple, list)):
496
+ collection, view_name = identifier[0], identifier[1]
497
+ else:
498
+ collection, view_name = identifier.split(".")
499
+
500
+ doc_ref = self._view_doc_ref(collection, view_name)
501
+ doc = doc_ref.get()
502
+ if not doc.exists:
503
+ raise ViewNotFound(f"View not found: {collection}.{view_name}")
504
+
505
+ data = doc.to_dict() or {}
506
+ stmt_id = data.get("statement-id")
507
+ sql = None
508
+ schema = data.get("schema")
509
+
510
+ sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
511
+ sql = (sdoc.to_dict() or {}).get("sql")
512
+
513
+ v = CatalogView(name=view_name, definition=sql or "", properties=data.get("properties", {}))
514
+ setattr(v, "sql", sql or "")
515
+ setattr(v, "metadata", type("M", (), {})())
516
+ v.metadata.schema = schema
517
+ # Populate metadata fields from the stored view document so callers
518
+ # expecting attributes like `timestamp_ms` won't fail.
519
+ v.metadata.author = data.get("author")
520
+ v.metadata.description = data.get("description")
521
+ v.metadata.timestamp_ms = data.get("timestamp-ms")
522
+ # Execution/operational fields (may be None)
523
+ v.metadata.last_execution_ms = data.get("last-execution-ms")
524
+ v.metadata.last_execution_data_size = data.get("last-execution-data-size")
525
+ v.metadata.last_execution_records = data.get("last-execution-records")
526
+ # Optional describer (used to flag LLM-generated descriptions)
527
+ v.metadata.describer = data.get("describer")
528
+ # Attach catalog and identifier for describe() method
529
+ setattr(v, "_catalog", self)
530
+ setattr(v, "_identifier", f"{collection}.{view_name}")
531
+ return v
532
+
533
+ def drop_view(self, identifier: str | tuple) -> None:
534
+ if isinstance(identifier, (tuple, list)):
535
+ collection, view_name = identifier[0], identifier[1]
536
+ else:
537
+ collection, view_name = identifier.split(".")
538
+
539
+ doc_ref = self._view_doc_ref(collection, view_name)
540
+ # delete statement subcollection
541
+ for d in doc_ref.collection("statement").stream():
542
+ doc_ref.collection("statement").document(d.id).delete()
543
+
544
+ doc_ref.delete()
545
+
546
+ def list_views(self, collection: str) -> Iterable[str]:
547
+ coll = self._views_collection(collection)
548
+ return [doc.id for doc in coll.stream()]
549
+
550
+ def view_exists(
551
+ self, identifier_or_collection: str | tuple, view_name: Optional[str] = None
552
+ ) -> bool:
553
+ """Return True if the view exists.
554
+
555
+ Supports three call forms:
556
+ - view_exists("collection.view")
557
+ - view_exists(("collection", "view"))
558
+ - view_exists("collection", "view")
559
+ """
560
+ # Normalize inputs
561
+ if view_name is None:
562
+ if isinstance(identifier_or_collection, tuple) or isinstance(
563
+ identifier_or_collection, list
564
+ ):
565
+ collection, view_name = identifier_or_collection[0], identifier_or_collection[1]
566
+ else:
567
+ if "." not in identifier_or_collection:
568
+ raise ValueError(
569
+ "identifier must be 'collection.view' or pass view_name separately"
570
+ )
571
+ collection, view_name = identifier_or_collection.rsplit(".", 1)
572
+ else:
573
+ collection = identifier_or_collection
574
+
575
+ try:
576
+ doc_ref = self._view_doc_ref(collection, view_name)
577
+ return doc_ref.get().exists
578
+ except Exception:
579
+ return False
580
+
581
+ def update_view_execution_metadata(
582
+ self,
583
+ identifier: str | tuple,
584
+ row_count: Optional[int] = None,
585
+ execution_time: Optional[float] = None,
586
+ ) -> None:
587
+ if isinstance(identifier, (tuple, list)):
588
+ collection, view_name = identifier[0], identifier[1]
589
+ else:
590
+ collection, view_name = identifier.split(".")
591
+
592
+ doc_ref = self._view_doc_ref(collection, view_name)
593
+ updates = {}
594
+ now_ms = int(time.time() * 1000)
595
+ if row_count is not None:
596
+ updates["last-execution-records"] = row_count
597
+ if execution_time is not None:
598
+ updates["last-execution-time-ms"] = int(execution_time * 1000)
599
+ updates["last-execution-ms"] = now_ms
600
+ if updates:
601
+ doc_ref.update(updates)
602
+
603
+ def update_view_description(
604
+ self,
605
+ identifier: str | tuple,
606
+ description: str,
607
+ describer: Optional[str] = None,
608
+ ) -> None:
609
+ """Update the description for a view.
610
+
611
+ Args:
612
+ identifier: View identifier ('collection.view' or tuple)
613
+ description: The new description text
614
+ describer: Optional identifier for who/what created the description
615
+ """
616
+ if isinstance(identifier, (tuple, list)):
617
+ collection, view_name = identifier[0], identifier[1]
618
+ else:
619
+ collection, view_name = identifier.split(".")
620
+
621
+ doc_ref = self._view_doc_ref(collection, view_name)
622
+ updates = {
623
+ "description": description,
624
+ }
625
+ if describer is not None:
626
+ updates["describer"] = describer
627
+ doc_ref.update(updates)
628
+
629
+ def update_dataset_description(
630
+ self,
631
+ identifier: str | tuple,
632
+ description: str,
633
+ describer: Optional[str] = None,
634
+ ) -> None:
635
+ """Update the description for a dataset.
636
+
637
+ Args:
638
+ identifier: Dataset identifier in format 'collection.dataset_name'
639
+ description: The new description text
640
+ describer: Optional identifier for who/what created the description
641
+ """
642
+
643
+ if isinstance(identifier, (tuple, list)):
644
+ collection, dataset_name = identifier[0], identifier[1]
645
+ else:
646
+ collection, dataset_name = identifier.split(".")
647
+
648
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
649
+ updates = {
650
+ "description": description,
651
+ }
652
+ if describer is not None:
653
+ updates["describer"] = describer
654
+ doc_ref.update(updates)
655
+
656
+ def write_parquet_manifest(
657
+ self, snapshot_id: int, entries: List[dict], dataset_location: str
658
+ ) -> Optional[str]:
659
+ """Write a Parquet manifest for the given snapshot id and entries.
660
+
661
+ Entries should be plain dicts convertible by pyarrow.Table.from_pylist.
662
+ The manifest will be written to <dataset_location>/metadata/manifest-<snapshot_id>.parquet
663
+ """
664
+ import pyarrow as pa
665
+ import pyarrow.parquet as pq
666
+
667
+ # If entries is None we skip writing; if entries is empty list, write
668
+ # an empty Parquet manifest (represents an empty dataset for this
669
+ # snapshot). This preserves previous manifests so older snapshots
670
+ # remain readable.
671
+ if entries is None:
672
+ return None
673
+
674
+ parquet_path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"
675
+
676
+ # Serialize the manifest with pyarrow and write it via the configured FileIO
677
+ try:
678
+ # Use an explicit schema so PyArrow types (especially nested lists)
679
+ # are correct and we avoid integer overflow / inference issues.
680
+ schema = pa.schema(
681
+ [
682
+ ("file_path", pa.string()),
683
+ ("file_format", pa.string()),
684
+ ("record_count", pa.int64()),
685
+ ("file_size_in_bytes", pa.int64()),
686
+ ("uncompressed_size_in_bytes", pa.int64()),
687
+ ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
688
+ ("null_counts", pa.list_(pa.int64())),
689
+ ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
690
+ ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
691
+ ("histogram_bins", pa.int32()),
692
+ ("min_values", pa.list_(pa.binary())),
693
+ ("max_values", pa.list_(pa.binary())),
694
+ ]
695
+ )
696
+
697
+ # Normalize entries to match schema expectations:
698
+ normalized = []
699
+ for ent in entries:
700
+ if not isinstance(ent, dict):
701
+ normalized.append(ent)
702
+ continue
703
+ e = dict(ent)
704
+ # Ensure list fields exist
705
+ e.setdefault("min_k_hashes", [])
706
+ e.setdefault("histogram_counts", [])
707
+ e.setdefault("histogram_bins", 0)
708
+ e.setdefault("column_uncompressed_sizes_in_bytes", [])
709
+ e.setdefault("null_counts", [])
710
+
711
+ # Process min/max values: truncate to 16 bytes and append a 0xFF truncation marker if longer
712
+ mv = e.get("min_values") or []
713
+ xv = e.get("max_values") or []
714
+
715
+ def truncate_value(v):
716
+ """Convert value to binary and truncate to 16 bytes with marker if needed."""
717
+ if v is None:
718
+ return None
719
+ # Convert to bytes
720
+ if isinstance(v, bytes):
721
+ b = v
722
+ else:
723
+ b = str(v).encode("utf-8")
724
+ # Truncate if longer than 16 bytes, add 0xFF as 17th byte to indicate truncation
725
+ if len(b) > 16:
726
+ return b[:16] + b"\xff"
727
+ return b
728
+
729
+ e["min_values"] = [truncate_value(v) for v in mv]
730
+ e["max_values"] = [truncate_value(v) for v in xv]
731
+ normalized.append(e)
732
+
733
+ try:
734
+ table = pa.Table.from_pylist(normalized, schema=schema)
735
+ except Exception as exc:
736
+ # Diagnostic output to help find malformed manifest entries
737
+
738
+ print(
739
+ "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
740
+ )
741
+ for i, ent in enumerate(entries):
742
+ print(f" Entry {i}:")
743
+ if isinstance(ent, dict):
744
+ for k, v in ent.items():
745
+ tname = type(v).__name__
746
+ try:
747
+ s = repr(v)
748
+ except Exception:
749
+ s = "<unreprable>"
750
+ print(f" - {k}: type={tname} repr={s[:200]}")
751
+ else:
752
+ print(
753
+ f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
754
+ )
755
+ raise exc
756
+
757
+ buf = pa.BufferOutputStream()
758
+ pq.write_table(table, buf, compression="zstd")
759
+ data = buf.getvalue().to_pybytes()
760
+
761
+ if self.io:
762
+ out = self.io.new_output(parquet_path).create()
763
+ out.write(data)
764
+ try:
765
+ # Some OutputFile implementations buffer and require close()
766
+ out.close()
767
+ except Exception:
768
+ pass
769
+
770
+ return parquet_path
771
+ except Exception as e:
772
+ # Re-raise so callers can surface manifest write failures
773
+ # print(f"Failed to write Parquet manifest: {e}")
774
+ raise e
775
+
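A sketch of the manifest entry shape this method accepts (values are illustrative, and in normal use the call is made by the dataset commit path rather than directly). Fields omitted here, such as min_k_hashes and null_counts, default to empty lists; min/max values longer than 16 bytes are stored as their first 16 bytes plus a trailing 0xFF marker:

    entry = {
        "file_path": "gs://my-data-bucket/analytics/sales/orders/data/part-0000.parquet",
        "file_format": "parquet",
        "record_count": 1000,
        "file_size_in_bytes": 123456,
        "uncompressed_size_in_bytes": 234567,
        "min_values": [b"2024-01-01"],
        "max_values": [b"a value longer than sixteen bytes"],  # stored as first 16 bytes + b"\xff"
    }
    manifest_path = catalog.write_parquet_manifest(
        snapshot_id=1700000000000,
        entries=[entry],
        dataset_location="gs://my-data-bucket/analytics/sales/orders",
    )
    # => gs://my-data-bucket/analytics/sales/orders/metadata/manifest-1700000000000.parquet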
776
+ def save_snapshot(self, identifier: str, snapshot: Snapshot) -> None:
777
+ """Persist a single snapshot document for a dataset."""
778
+ namespace, dataset_name = identifier.split(".")
779
+ snaps = self._snapshots_collection(namespace, dataset_name)
780
+ doc_id = str(snapshot.snapshot_id)
781
+ # Ensure summary contains all expected keys (zero defaults applied in dataclass)
782
+ summary = snapshot.summary or {}
783
+ # Provide explicit keys if missing
784
+ for k in [
785
+ "added-data-files",
786
+ "added-files-size",
787
+ "added-records",
788
+ "deleted-data-files",
789
+ "deleted-files-size",
790
+ "deleted-records",
791
+ "total-data-files",
792
+ "total-files-size",
793
+ "total-records",
794
+ ]:
795
+ summary.setdefault(k, 0)
796
+
797
+ data = {
798
+ "snapshot-id": snapshot.snapshot_id,
799
+ "timestamp-ms": snapshot.timestamp_ms,
800
+ "manifest": snapshot.manifest_list,
801
+ "commit-message": getattr(snapshot, "commit_message", ""),
802
+ "summary": summary,
803
+ "author": getattr(snapshot, "author", None),
804
+ "sequence-number": getattr(snapshot, "sequence_number", None),
805
+ "operation-type": getattr(snapshot, "operation_type", None),
806
+ "parent-snapshot-id": getattr(snapshot, "parent_snapshot_id", None),
807
+ }
808
+ if getattr(snapshot, "schema_id", None) is not None:
809
+ data["schema-id"] = snapshot.schema_id
810
+ snaps.document(doc_id).set(data)
811
+
812
+ def save_dataset_metadata(self, identifier: str, metadata: DatasetMetadata) -> None:
813
+ """Persist dataset-level metadata and snapshots to Firestore.
814
+
815
+ This writes the dataset document and upserts snapshot documents.
816
+ """
817
+ collection, dataset_name = identifier.split(".")
818
+ doc_ref = self._dataset_doc_ref(collection, dataset_name)
819
+ doc_ref.set(
820
+ {
821
+ "name": dataset_name,
822
+ "collection": collection,
823
+ "workspace": self.workspace,
824
+ "location": metadata.location,
825
+ "properties": metadata.properties,
826
+ "format-version": metadata.format_version,
827
+ "current-snapshot-id": metadata.current_snapshot_id,
828
+ "current-schema-id": metadata.current_schema_id,
829
+ "timestamp-ms": metadata.timestamp_ms,
830
+ "author": metadata.author,
831
+ "description": metadata.description,
832
+ "describer": metadata.describer,
833
+ "maintenance-policy": metadata.maintenance_policy,
834
+ "sort-orders": metadata.sort_orders,
835
+ }
836
+ )
837
+
838
+ # Metadata persisted in primary `datasets` collection only.
839
+
840
+ snaps_coll = self._snapshots_collection(collection, dataset_name)
841
+ # Upsert snapshot documents. Do NOT delete existing snapshot documents
842
+ # here to avoid accidental removal of historical snapshots on save.
843
+ for snap in metadata.snapshots:
844
+ snaps_coll.document(str(snap.snapshot_id)).set(
845
+ {
846
+ "snapshot-id": snap.snapshot_id,
847
+ "timestamp-ms": snap.timestamp_ms,
848
+ "manifest": snap.manifest_list,
849
+ "commit-message": getattr(snap, "commit_message", ""),
850
+ "schema-id": snap.schema_id,
851
+ "summary": snap.summary or {},
852
+ "author": getattr(snap, "author", None),
853
+ "sequence-number": getattr(snap, "sequence_number", None),
854
+ "user-created": getattr(snap, "user_created", None),
855
+ }
856
+ )
857
+
858
+ # Persist schemas subcollection
859
+ schemas_coll = doc_ref.collection("schemas")
860
+ existing_schema_ids = {d.id for d in schemas_coll.stream()}
861
+ new_schema_ids = set()
862
+ for s in metadata.schemas:
863
+ sid = s.get("schema_id")
864
+ if not sid:
865
+ continue
866
+ new_schema_ids.add(sid)
867
+ schemas_coll.document(sid).set(
868
+ {
869
+ "columns": s.get("columns", []),
870
+ "timestamp-ms": s.get("timestamp-ms"),
871
+ "author": s.get("author"),
872
+ "sequence-number": s.get("sequence-number"),
873
+ }
874
+ )
875
+ # Delete stale schema docs
876
+ for stale in existing_schema_ids - new_schema_ids:
877
+ schemas_coll.document(stale).delete()
878
+
879
+ def _schema_to_columns(self, schema: Any) -> list:
880
+ """Convert a pyarrow.Schema into a simple columns list for storage.
881
+
882
+ Each column is a dict with keys "id" (1-based index), "name", "type", "element-type", "scale", "precision" and "expectation-policies".
883
+ """
884
+ # Support pyarrow.Schema and Orso RelationSchema. Orso is required here:
885
+ # a pyarrow.Schema is converted to an Orso RelationSchema so column
886
+ # metadata (type, element-type, scale, precision) can be derived
887
+ # consistently before being persisted to Firestore.
888
+ cols = []
889
+ # Orso and pyarrow are required for schema conversion
890
+ import orso
891
+ import pyarrow as pa
892
+
893
+ # If schema is an Orso RelationSchema, try to obtain a list of columns
894
+ columns = None
895
+ if isinstance(schema, orso.schema.RelationSchema):
896
+ columns = schema.columns
897
+ elif isinstance(schema, pa.Schema):
898
+ orso_schema = orso.schema.convert_arrow_schema_to_orso_schema(schema)
899
+ columns = orso_schema.columns
900
+ else:
901
+ # print(f"[DEBUG] _schema_to_columns: unsupported schema type: {type(schema)}")
902
+ raise ValueError(
903
+ "Unsupported schema type, expected pyarrow.Schema or orso.RelationSchema"
904
+ )
905
+
906
+ # print(f"[DEBUG] _schema_to_columns: processing {len(columns)} columns")
907
+
908
+ for idx, column in enumerate(columns, start=1):
909
+ # Each Orso column exposes its name directly
910
+ name = column.name
911
+
912
+ # Extract the type attributes used for persistence
913
+ ctype = column.type
914
+ element_type = column.element_type or None
915
+ scale = column.scale
916
+ precision = column.precision
917
+ typed = {
918
+ "id": idx,
919
+ "name": name,
920
+ "type": ctype,
921
+ "element-type": element_type,
922
+ "scale": scale,
923
+ "precision": precision,
924
+ "expectation-policies": [],
925
+ }
926
+
927
+ cols.append(typed)
928
+
929
+ return cols
930
+
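For illustration, a single-column pyarrow schema produces one column dict in the shape stored to Firestore; the exact Orso type values shown are an assumption:

    import pyarrow as pa

    cols = catalog._schema_to_columns(pa.schema([("id", pa.int64())]))
    # => [{"id": 1, "name": "id", "type": <Orso type for int64>,
    #      "element-type": None, "scale": ..., "precision": ...,
    #      "expectation-policies": []}]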
931
+ def _write_schema(self, namespace: str, dataset_name: str, schema: Any, author: str) -> str:
932
+ """Persist a schema document in the dataset's `schemas` subcollection and
933
+ return the new schema id.
934
+ """
935
+ import uuid
936
+
937
+ doc_ref = self._dataset_doc_ref(namespace, dataset_name)
938
+ schemas_coll = doc_ref.collection("schemas")
939
+ sid = str(uuid.uuid4())
940
+ # print(f"[DEBUG] _write_schema called for {namespace}/{dataset_name} sid={sid}")
941
+ try:
942
+ cols = self._schema_to_columns(schema)
943
+ except Exception:
944
+ # print(
945
+ # f"[DEBUG] _write_schema: _schema_to_columns raised: {e}; falling back to empty columns list"
946
+ # )
947
+ cols = []
948
+ now_ms = int(time.time() * 1000)
949
+ if author is None:
950
+ raise ValueError("author must be provided when writing a schema")
951
+ # Determine next sequence number by scanning existing schema docs
952
+ try:
953
+ max_seq = 0
954
+ for d in schemas_coll.stream():
955
+ sd = d.to_dict() or {}
956
+ seq = sd.get("sequence-number") or 0
957
+ if isinstance(seq, int) and seq > max_seq:
958
+ max_seq = seq
959
+ new_seq = max_seq + 1
960
+ except Exception:
961
+ new_seq = 1
962
+
963
+ try:
964
+ # print(
965
+ # f"[DEBUG] Writing schema doc {sid} for {namespace}/{dataset_name} (cols={len(cols)})"
966
+ # )
967
+ schemas_coll.document(sid).set(
968
+ {
969
+ "columns": cols,
970
+ "timestamp-ms": now_ms,
971
+ "author": author,
972
+ "sequence-number": new_seq,
973
+ }
974
+ )
975
+ # print(f"[DEBUG] Wrote schema doc {sid}")
976
+ except Exception:
977
+ # print(f"[DEBUG] Failed to write schema doc {sid}: {e}")
978
+ pass
979
+ return sid