opteryx_catalog-0.4.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of opteryx-catalog has been flagged as potentially problematic.

@@ -0,0 +1,857 @@
+from __future__ import annotations
+
+import time
+from typing import Any
+from typing import Iterable
+from typing import List
+from typing import Optional
+
+from google.cloud import firestore
+from google.cloud import storage
+
+from .catalog.dataset import SimpleDataset
+from .catalog.metadata import DatasetMetadata
+from .catalog.metadata import Snapshot
+from .catalog.metastore import Metastore
+from .catalog.view import View as CatalogView
+from .exceptions import CollectionAlreadyExists
+from .exceptions import DatasetAlreadyExists
+from .exceptions import DatasetNotFound
+from .exceptions import ViewAlreadyExists
+from .exceptions import ViewNotFound
+from .iops.base import FileIO
+
+
+class OpteryxCatalog(Metastore):
+    """Firestore-backed Metastore implementation.
+
+    Terminology: catalog -> workspace -> collection -> dataset|view
+
+    Stores dataset documents under the configured workspace in Firestore.
+    Snapshots are stored in a `snapshots` subcollection under each
+    dataset's document. Parquet manifests are written to GCS under the
+    dataset location's `metadata/manifest-<snapshot_id>.parquet` path.
+    """
+
+    def __init__(
+        self,
+        workspace: str,
+        firestore_project: Optional[str] = None,
+        firestore_database: Optional[str] = None,
+        gcs_bucket: Optional[str] = None,
+        io: Optional[FileIO] = None,
+    ):
+        # `workspace` is the configured catalog/workspace name
+        self.workspace = workspace
+        # Backwards-compatible alias: keep `catalog_name` for older code paths
+        self.catalog_name = workspace
+        self.firestore_client = firestore.Client(
+            project=firestore_project, database=firestore_database
+        )
+        self._catalog_ref = self.firestore_client.collection(workspace)
+        # Ensure workspace-level properties document exists in Firestore.
+        # The $properties doc records metadata for the workspace such as
+        # 'timestamp-ms', 'author', 'billing-account-id' and 'owner'.
+        try:
+            props_ref = self._catalog_ref.document("$properties")
+            if not props_ref.get().exists:
+                now_ms = int(time.time() * 1000)
+                billing = None
+                owner = None
+                props_ref.set(
+                    {
+                        "timestamp-ms": now_ms,
+                        "billing-account-id": billing,
+                        "owner": owner,
+                    }
+                )
+        except Exception:
+            # Be conservative: don't fail catalog initialization on Firestore errors
+            pass
+        self.gcs_bucket = gcs_bucket
+        self._storage_client = storage.Client() if gcs_bucket else None
+        # Default to a GCS-backed FileIO when a GCS bucket is configured and
+        # no explicit `io` was provided.
+        if io is not None:
+            self.io = io
+        else:
+            if gcs_bucket:
+                from .iops.gcs import GcsFileIO
+
+                self.io = GcsFileIO()
+            else:
+                self.io = FileIO()
+
+    def _collection_ref(self, collection: str):
+        """Alias for `_namespace_ref` using the preferred term `collection`.
+
+        Do NOT change call signatures; this helper provides a clearer name
+        for new code paths while remaining backwards-compatible.
+        """
+        return self._catalog_ref.document(collection)
+
+    def _datasets_collection(self, collection: str):
+        # Primary subcollection for datasets.
+        return self._collection_ref(collection).collection("datasets")
+
+    def _dataset_doc_ref(self, collection: str, dataset_name: str):
+        return self._datasets_collection(collection).document(dataset_name)
+
+    def _snapshots_collection(self, collection: str, dataset_name: str):
+        return self._dataset_doc_ref(collection, dataset_name).collection("snapshots")
+
+    def _views_collection(self, collection: str):
+        return self._collection_ref(collection).collection("views")
+
+    def _view_doc_ref(self, collection: str, view_name: str):
+        return self._views_collection(collection).document(view_name)
+
+    def create_dataset(
+        self, identifier: str, schema: Any, properties: dict | None = None, author: Optional[str] = None
+    ) -> SimpleDataset:
+        if author is None:
+            raise ValueError("author must be provided when creating a dataset")
+        collection, dataset_name = identifier.split(".")
+        doc_ref = self._dataset_doc_ref(collection, dataset_name)
+        # Check primary `datasets` location
+        if doc_ref.get().exists:
+            raise DatasetAlreadyExists(f"Dataset already exists: {identifier}")
+
+        # Build default dataset metadata
+        location = f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}"
+        metadata = DatasetMetadata(
+            dataset_identifier=identifier,
+            schema=schema,
+            location=location,
+            properties=properties or {},
+        )
+
+        # Persist document with timestamp and author
+        now_ms = int(time.time() * 1000)
+        metadata.timestamp_ms = now_ms
+        metadata.author = author
+        doc_ref.set(
+            {
+                "name": dataset_name,
+                "collection": collection,
+                "workspace": self.workspace,
+                "location": location,
+                "properties": metadata.properties,
+                "format-version": metadata.format_version,
+                "timestamp-ms": now_ms,
+                "author": author,
+                "maintenance-policy": metadata.maintenance_policy,
+            }
+        )
+
+        # Persist initial schema into `schemas` subcollection if provided
+        if schema is not None:
+            schema_id = self._write_schema(collection, dataset_name, schema, author=author)
+            metadata.current_schema_id = schema_id
+            # Read back the schema doc to capture timestamp-ms, author, sequence-number
+            try:
+                sdoc = doc_ref.collection("schemas").document(schema_id).get()
+                sdata = sdoc.to_dict() or {}
+                metadata.schemas = [
+                    {
+                        "schema_id": schema_id,
+                        "columns": sdata.get("columns", self._schema_to_columns(schema)),
+                        "timestamp-ms": sdata.get("timestamp-ms"),
+                        "author": sdata.get("author"),
+                        "sequence-number": sdata.get("sequence-number"),
+                    }
+                ]
+            except Exception:
+                metadata.schemas = [
+                    {"schema_id": schema_id, "columns": self._schema_to_columns(schema)}
+                ]
+            # update dataset doc to reference current schema
+            doc_ref.update({"current-schema-id": metadata.current_schema_id})
+
+        # Return SimpleDataset (attach this catalog so append() can persist)
+        return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
+
+    def load_dataset(self, identifier: str, load_history: bool = False) -> SimpleDataset:
+        """Load a dataset from Firestore.
+
+        Args:
+            identifier: Dataset identifier in format 'collection.dataset_name'
+            load_history: If True, load all snapshots from Firestore (expensive for
+                large histories). If False (default), only load the current snapshot,
+                which is sufficient for most write operations.
+
+        Returns:
+            SimpleDataset instance with metadata loaded from Firestore.
+
+        Raises:
+            DatasetNotFound: If the dataset does not exist in Firestore.
+        """
+        collection, dataset_name = identifier.split(".")
+        doc_ref = self._dataset_doc_ref(collection, dataset_name)
+        doc = doc_ref.get()
+        if not doc.exists:
+            raise DatasetNotFound(f"Dataset not found: {identifier}")
+
+        data = doc.to_dict() or {}
+        metadata = DatasetMetadata(
+            dataset_identifier=identifier,
+            location=data.get("location")
+            or f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}",
+            schema=data.get("schema"),
+            properties=data.get("properties") or {},
+        )
+
+        # Load dataset-level timestamp/author and collection/workspace
+        metadata.timestamp_ms = data.get("timestamp-ms")
+        metadata.author = data.get("author")
+        # note: Firestore dataset doc stores the original collection and workspace
+        # under keys `collection` and `workspace`.
+
+        # Load snapshots based on load_history flag
+        snaps = []
+        if load_history:
+            # Load all snapshots from Firestore (expensive for large histories)
+            for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
+                sd = snap_doc.to_dict() or {}
+                snap = Snapshot(
+                    snapshot_id=sd.get("snapshot-id"),
+                    timestamp_ms=sd.get("timestamp-ms"),
+                    author=sd.get("author"),
+                    sequence_number=sd.get("sequence-number"),
+                    user_created=sd.get("user-created"),
+                    manifest_list=sd.get("manifest"),
+                    schema_id=sd.get("schema-id"),
+                    summary=sd.get("summary", {}),
+                    operation_type=sd.get("operation-type"),
+                    parent_snapshot_id=sd.get("parent-snapshot-id"),
+                )
+                snaps.append(snap)
+            if snaps:
+                metadata.current_snapshot_id = snaps[-1].snapshot_id
+        else:
+            # Load only the current snapshot (efficient single read)
+            current_snap_id = data.get("current-snapshot-id")
+            if current_snap_id:
+                try:
+                    snap_doc = (
+                        self._snapshots_collection(collection, dataset_name)
+                        .document(str(current_snap_id))
+                        .get()
+                    )
+                    if snap_doc.exists:
+                        sd = snap_doc.to_dict() or {}
+                        snap = Snapshot(
+                            snapshot_id=sd.get("snapshot-id"),
+                            timestamp_ms=sd.get("timestamp-ms"),
+                            author=sd.get("author"),
+                            sequence_number=sd.get("sequence-number"),
+                            user_created=sd.get("user-created"),
+                            manifest_list=sd.get("manifest"),
+                            schema_id=sd.get("schema-id"),
+                            summary=sd.get("summary", {}),
+                            operation_type=sd.get("operation-type"),
+                            parent_snapshot_id=sd.get("parent-snapshot-id"),
+                        )
+                        snaps.append(snap)
+                        metadata.current_snapshot_id = current_snap_id
+                except Exception:
+                    pass
+        metadata.snapshots = snaps
+
+        # Load schemas subcollection
+        schemas_coll = doc_ref.collection("schemas")
+        # Load all schemas if requested; otherwise load only current schema
+        if load_history:
+            schemas = []
+            for sdoc in schemas_coll.stream():
+                sd = sdoc.to_dict() or {}
+                schemas.append(
+                    {
+                        "schema_id": sdoc.id,
+                        "columns": sd.get("columns", []),
+                        "timestamp-ms": sd.get("timestamp-ms"),
+                        "author": sd.get("author"),
+                        "sequence-number": sd.get("sequence-number"),
+                    }
+                )
+            metadata.schemas = schemas
+            metadata.current_schema_id = doc.to_dict().get("current-schema-id")
+        else:
+            # Only load the current schema document for efficiency
+            current_schema_id = doc.to_dict().get("current-schema-id")
+            if current_schema_id:
+                sdoc = schemas_coll.document(str(current_schema_id)).get()
+                if sdoc.exists:
+                    sd = sdoc.to_dict() or {}
+                    metadata.schemas = [
+                        {
+                            "schema_id": sdoc.id,
+                            "columns": sd.get("columns", []),
+                            "timestamp-ms": sd.get("timestamp-ms"),
+                            "author": sd.get("author"),
+                            "sequence-number": sd.get("sequence-number"),
+                        }
+                    ]
+                metadata.current_schema_id = current_schema_id
+        return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
+
+    def drop_dataset(self, identifier: str) -> None:
+        collection, dataset_name = identifier.split(".")
+        # Delete snapshots
+        snaps_coll = self._snapshots_collection(collection, dataset_name)
+        for doc in snaps_coll.stream():
+            snaps_coll.document(doc.id).delete()
+        # Delete dataset doc
+        self._dataset_doc_ref(collection, dataset_name).delete()
+
+    def list_datasets(self, collection: str) -> Iterable[str]:
+        coll = self._datasets_collection(collection)
+        return [doc.id for doc in coll.stream()]
+
+    def create_collection(
+        self,
+        collection: str,
+        properties: dict | None = None,
+        exists_ok: bool = False,
+        author: Optional[str] = None,
+    ) -> None:
+        """Create a collection document under the catalog.
+
+        If `exists_ok` is False and the collection already exists, `CollectionAlreadyExists` is raised.
+        """
+        doc_ref = self._collection_ref(collection)
+        if doc_ref.get().exists:
+            if exists_ok:
+                return
+            raise CollectionAlreadyExists(f"Collection already exists: {collection}")
+
+        now_ms = int(time.time() * 1000)
+        if author is None:
+            raise ValueError("author must be provided when creating a collection")
+        doc_ref.set(
+            {
+                "name": collection,
+                "properties": properties or {},
+                "timestamp-ms": now_ms,
+                "author": author,
+            }
+        )
+
+    def create_collection_if_not_exists(
+        self, collection: str, properties: dict | None = None, author: Optional[str] = None
+    ) -> None:
+        """Convenience wrapper that creates the collection only if missing."""
+        self.create_collection(collection, properties=properties, exists_ok=True, author=author)
+
+    def dataset_exists(
+        self, identifier_or_collection: str, dataset_name: Optional[str] = None
+    ) -> bool:
+        """Return True if the dataset exists.
+
+        Supports two call forms:
+          - dataset_exists("collection.dataset")
+          - dataset_exists("collection", "dataset")
+        """
+        # Normalize inputs
+        if dataset_name is None:
+            # Expect a single identifier like 'collection.dataset'
+            if "." not in identifier_or_collection:
+                raise ValueError(
+                    "identifier must be 'collection.dataset' or pass dataset_name separately"
+                )
+            collection, dataset_name = identifier_or_collection.rsplit(".", 1)
+        else:
+            collection = identifier_or_collection
+
+        try:
+            doc_ref = self._dataset_doc_ref(collection, dataset_name)
+            return doc_ref.get().exists
+        except Exception:
+            # On any error, be conservative and return False
+            return False
+
+    # Dataset API methods have been renamed to the preferred `dataset` terminology.
+
+    # --- View support -------------------------------------------------
+    def create_view(
+        self,
+        identifier: str | tuple,
+        sql: str,
+        schema: Any | None = None,
+        author: Optional[str] = None,
+        description: Optional[str] = None,
+        properties: dict | None = None,
+        update_if_exists: bool = False,
+    ) -> CatalogView:
+        """Create a view document and a statement version in the `statement` subcollection.
+
+        `identifier` may be a string like 'collection.view' or a tuple ('collection', 'view').
+        """
+        # Normalize identifier
+        if isinstance(identifier, tuple) or isinstance(identifier, list):
+            collection, view_name = identifier[0], identifier[1]
+        else:
+            collection, view_name = identifier.split(".")
+
+        doc_ref = self._view_doc_ref(collection, view_name)
+        if doc_ref.get().exists:
+            if not update_if_exists:
+                raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
+            # Update existing view - get current sequence number
+            existing_doc = doc_ref.get().to_dict()
+            current_statement_id = existing_doc.get("statement-id")
+            if current_statement_id:
+                stmt_ref = doc_ref.collection("statement").document(current_statement_id)
+                stmt_doc = stmt_ref.get()
+                if stmt_doc.exists:
+                    sequence_number = stmt_doc.to_dict().get("sequence-number", 0) + 1
+                else:
+                    sequence_number = 1
+            else:
+                sequence_number = 1
+        else:
+            sequence_number = 1
+
+        now_ms = int(time.time() * 1000)
+        if author is None:
+            raise ValueError("author must be provided when creating a view")
+
+        # Write statement version
+        statement_id = str(now_ms)
+        stmt_coll = doc_ref.collection("statement")
+        stmt_coll.document(statement_id).set(
+            {
+                "sql": sql,
+                "timestamp-ms": now_ms,
+                "author": author,
+                "sequence-number": sequence_number,
+            }
+        )
+
+        # Persist root view doc referencing the statement id
+        doc_ref.set(
+            {
+                "name": view_name,
+                "collection": collection,
+                "workspace": self.workspace,
+                "timestamp-ms": now_ms,
+                "author": author,
+                "description": description,
+                "describer": author,
+                "last-execution-ms": None,
+                "last-execution-data-size": None,
+                "last-execution-records": None,
+                "statement-id": statement_id,
+                "properties": properties or {},
+            }
+        )
+
+        # Return a simple CatalogView wrapper
+        v = CatalogView(name=view_name, definition=sql, properties=properties or {})
+        # provide convenient attributes used by docs/examples
+        setattr(v, "sql", sql)
+        setattr(v, "metadata", type("M", (), {})())
+        v.metadata.schema = schema
+        return v
+
+    def load_view(self, identifier: str | tuple) -> CatalogView:
+        """Load a view by identifier. Returns a `CatalogView` with `.definition` and `.sql`.
+
+        Raises `ViewNotFound` if the view doc is missing.
+        """
+        if isinstance(identifier, tuple) or isinstance(identifier, list):
+            collection, view_name = identifier[0], identifier[1]
+        else:
+            collection, view_name = identifier.split(".")
+
+        doc_ref = self._view_doc_ref(collection, view_name)
+        doc = doc_ref.get()
+        if not doc.exists:
+            raise ViewNotFound(f"View not found: {collection}.{view_name}")
+
+        data = doc.to_dict() or {}
+        stmt_id = data.get("statement-id")
+        sql = None
+        schema = data.get("schema")
+
+        sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
+        sql = (sdoc.to_dict() or {}).get("sql")
+
+        v = CatalogView(name=view_name, definition=sql or "", properties=data.get("properties", {}))
+        setattr(v, "sql", sql or "")
+        setattr(v, "metadata", type("M", (), {})())
+        v.metadata.schema = schema
+        # Populate metadata fields from the stored view document so callers
+        # expecting attributes like `timestamp_ms` won't fail.
+        v.metadata.author = data.get("author")
+        v.metadata.description = data.get("description")
+        v.metadata.timestamp_ms = data.get("timestamp-ms")
+        # Execution/operational fields (may be None)
+        v.metadata.last_execution_ms = data.get("last-execution-ms")
+        v.metadata.last_execution_data_size = data.get("last-execution-data-size")
+        v.metadata.last_execution_records = data.get("last-execution-records")
+        # Optional describer (used to flag LLM-generated descriptions)
+        v.metadata.describer = data.get("describer")
+        return v
+
+    def drop_view(self, identifier: str | tuple) -> None:
+        if isinstance(identifier, tuple) or isinstance(identifier, list):
+            collection, view_name = identifier[0], identifier[1]
+        else:
+            collection, view_name = identifier.split(".")
+
+        doc_ref = self._view_doc_ref(collection, view_name)
+        # delete statement subcollection
+        for d in doc_ref.collection("statement").stream():
+            doc_ref.collection("statement").document(d.id).delete()
+
+        doc_ref.delete()
+
+    def list_views(self, collection: str) -> Iterable[str]:
+        coll = self._views_collection(collection)
+        return [doc.id for doc in coll.stream()]
+
+    def view_exists(
+        self, identifier_or_collection: str | tuple, view_name: Optional[str] = None
+    ) -> bool:
+        """Return True if the view exists.
+
+        Supports three call forms:
+          - view_exists("collection.view")
+          - view_exists(("collection", "view"))
+          - view_exists("collection", "view")
+        """
+        # Normalize inputs
+        if view_name is None:
+            if isinstance(identifier_or_collection, tuple) or isinstance(
+                identifier_or_collection, list
+            ):
+                collection, view_name = identifier_or_collection[0], identifier_or_collection[1]
+            else:
+                if "." not in identifier_or_collection:
+                    raise ValueError(
+                        "identifier must be 'collection.view' or pass view_name separately"
+                    )
+                collection, view_name = identifier_or_collection.rsplit(".", 1)
+        else:
+            collection = identifier_or_collection
+
+        try:
+            doc_ref = self._view_doc_ref(collection, view_name)
+            return doc_ref.get().exists
+        except Exception:
+            return False
+
+    def update_view_execution_metadata(
+        self,
+        identifier: str | tuple,
+        row_count: Optional[int] = None,
+        execution_time: Optional[float] = None,
+    ) -> None:
+        if isinstance(identifier, tuple) or isinstance(identifier, list):
+            collection, view_name = identifier[0], identifier[1]
+        else:
+            collection, view_name = identifier.split(".")
+
+        doc_ref = self._view_doc_ref(collection, view_name)
+        updates = {}
+        now_ms = int(time.time() * 1000)
+        if row_count is not None:
+            updates["last-execution-records"] = row_count
+        if execution_time is not None:
+            updates["last-execution-time-ms"] = int(execution_time * 1000)
+        updates["last-execution-ms"] = now_ms
+        if updates:
+            doc_ref.update(updates)
+
+    def write_parquet_manifest(
+        self, snapshot_id: int, entries: List[dict], dataset_location: str
+    ) -> Optional[str]:
+        """Write a Parquet manifest for the given snapshot id and entries.
+
+        Entries should be plain dicts convertible by pyarrow.Table.from_pylist.
+        The manifest will be written to <dataset_location>/metadata/manifest-<snapshot_id>.parquet
+        """
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+
+        # If entries is None we skip writing; if entries is an empty list, write
+        # an empty Parquet manifest (represents an empty dataset for this
+        # snapshot). This preserves previous manifests so older snapshots
+        # remain readable.
+        if entries is None:
+            return None
+
+        parquet_path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"
+
+        # Write via the configured FileIO when one is available.
+        try:
+            # Use an explicit schema so PyArrow types (especially nested lists)
+            # are correct and we avoid integer overflow / inference issues.
+            schema = pa.schema(
+                [
+                    ("file_path", pa.string()),
+                    ("file_format", pa.string()),
+                    ("record_count", pa.int64()),
+                    ("file_size_in_bytes", pa.int64()),
+                    ("uncompressed_size_in_bytes", pa.int64()),
+                    ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
+                    ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
+                    ("histogram_bins", pa.int32()),
+                    ("min_values", pa.list_(pa.int64())),
+                    ("max_values", pa.list_(pa.int64())),
+                ]
+            )
+
+            try:
+                table = pa.Table.from_pylist(entries, schema=schema)
+            except Exception as exc:
+                # Diagnostic output to help find malformed manifest entries
+
+                print(
+                    "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
+                )
+                for i, ent in enumerate(entries):
+                    print(f" Entry {i}:")
+                    if isinstance(ent, dict):
+                        for k, v in ent.items():
+                            tname = type(v).__name__
+                            try:
+                                s = repr(v)
+                            except Exception:
+                                s = "<unreprable>"
+                            print(f" - {k}: type={tname} repr={s[:200]}")
+                    else:
+                        print(
+                            f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
+                        )
+                raise exc
+
+            buf = pa.BufferOutputStream()
+            pq.write_table(table, buf, compression="zstd")
+            data = buf.getvalue().to_pybytes()
+
+            if self.io:
+                out = self.io.new_output(parquet_path).create()
+                out.write(data)
+                try:
+                    # Some OutputFile implementations buffer and require close()
+                    out.close()
+                except Exception:
+                    pass
+
+            return parquet_path
+        except Exception as e:
+            # Re-raise so callers can handle or report the failure
+            # print(f"Failed to write Parquet manifest: {e}")
+            raise e
+
+    def save_snapshot(self, identifier: str, snapshot: Snapshot) -> None:
+        """Persist a single snapshot document for a dataset."""
+        namespace, dataset_name = identifier.split(".")
+        snaps = self._snapshots_collection(namespace, dataset_name)
+        doc_id = str(snapshot.snapshot_id)
+        # Ensure summary contains all expected keys (zero defaults applied in dataclass)
+        summary = snapshot.summary or {}
+        # Provide explicit keys if missing
+        for k in [
+            "added-data-files",
+            "added-files-size",
+            "added-records",
+            "deleted-data-files",
+            "deleted-files-size",
+            "deleted-records",
+            "total-data-files",
+            "total-files-size",
+            "total-records",
+        ]:
+            summary.setdefault(k, 0)
+
+        data = {
+            "snapshot-id": snapshot.snapshot_id,
+            "timestamp-ms": snapshot.timestamp_ms,
+            "manifest": snapshot.manifest_list,
+            "commit-message": getattr(snapshot, "commit_message", ""),
+            "summary": summary,
+            "author": getattr(snapshot, "author", None),
+            "sequence-number": getattr(snapshot, "sequence_number", None),
+            "operation-type": getattr(snapshot, "operation_type", None),
+            "parent-snapshot-id": getattr(snapshot, "parent_snapshot_id", None),
+        }
+        if getattr(snapshot, "schema_id", None) is not None:
+            data["schema-id"] = snapshot.schema_id
+        snaps.document(doc_id).set(data)
+
+    def save_dataset_metadata(self, identifier: str, metadata: DatasetMetadata) -> None:
+        """Persist dataset-level metadata and snapshots to Firestore.
+
+        This writes the dataset document and upserts snapshot documents.
+        """
+        collection, dataset_name = identifier.split(".")
+        doc_ref = self._dataset_doc_ref(collection, dataset_name)
+        doc_ref.set(
+            {
+                "name": dataset_name,
+                "collection": collection,
+                "workspace": self.workspace,
+                "location": metadata.location,
+                "properties": metadata.properties,
+                "format-version": metadata.format_version,
+                "current-snapshot-id": metadata.current_snapshot_id,
+                "current-schema-id": metadata.current_schema_id,
+                "timestamp-ms": metadata.timestamp_ms,
+                "author": metadata.author,
+                "description": metadata.description,
+                "describer": metadata.describer,
+                "maintenance-policy": metadata.maintenance_policy,
+                "sort-orders": metadata.sort_orders,
+            }
+        )
+
+        # Metadata persisted in primary `datasets` collection only.
+
+        snaps_coll = self._snapshots_collection(collection, dataset_name)
+        existing = {d.id for d in snaps_coll.stream()}
+        new_ids = set()
+        for snap in metadata.snapshots:
+            new_ids.add(str(snap.snapshot_id))
+            snaps_coll.document(str(snap.snapshot_id)).set(
+                {
+                    "snapshot-id": snap.snapshot_id,
+                    "timestamp-ms": snap.timestamp_ms,
+                    "manifest": snap.manifest_list,
+                    "commit-message": getattr(snap, "commit_message", ""),
+                    "schema-id": snap.schema_id,
+                    "summary": snap.summary or {},
+                    "author": getattr(snap, "author", None),
+                    "sequence-number": getattr(snap, "sequence_number", None),
+                    "user-created": getattr(snap, "user_created", None),
+                }
+            )
+
+        # Delete stale snapshots
+        for stale in existing - new_ids:
+            snaps_coll.document(stale).delete()
+
+        # Persist schemas subcollection
+        schemas_coll = doc_ref.collection("schemas")
+        existing_schema_ids = {d.id for d in schemas_coll.stream()}
+        new_schema_ids = set()
+        for s in metadata.schemas:
+            sid = s.get("schema_id")
+            if not sid:
+                continue
+            new_schema_ids.add(sid)
+            schemas_coll.document(sid).set(
+                {
+                    "columns": s.get("columns", []),
+                    "timestamp-ms": s.get("timestamp-ms"),
+                    "author": s.get("author"),
+                    "sequence-number": s.get("sequence-number"),
+                }
+            )
+        # Delete stale schema docs
+        for stale in existing_schema_ids - new_schema_ids:
+            schemas_coll.document(stale).delete()
+
+    def _schema_to_columns(self, schema: Any) -> list:
+        """Convert a pyarrow.Schema into a simple columns list for storage.
+
+        Each column is a dict: {"id": index (1-based), "name": column_name, "type": str(type)}
+        """
+        # Support pyarrow.Schema and Orso RelationSchema. When Orso's
+        # FlatColumn.from_arrow is available, use it to derive Orso types
+        # (type, element-type, scale, precision). Fall back to simple
+        # stringified types if Orso isn't installed.
+        cols = []
+        # Try Orso FlatColumn importer
+        import orso
+        import pyarrow as pa
+
+        # If schema is an Orso RelationSchema, try to obtain a list of columns
+        columns = None
+        if isinstance(schema, orso.schema.RelationSchema):
+            columns = schema.columns
+        elif isinstance(schema, pa.Schema):
+            orso_schema = orso.schema.convert_arrow_schema_to_orso_schema(schema)
+            columns = orso_schema.columns
+        else:
+            # print(f"[DEBUG] _schema_to_columns: unsupported schema type: {type(schema)}")
+            raise ValueError(
+                "Unsupported schema type, expected pyarrow.Schema or orso.RelationSchema"
+            )
+
+        # print(f"[DEBUG] _schema_to_columns: processing {len(columns)} columns")
+
+        for idx, column in enumerate(columns, start=1):
+            # Use the column's name and Orso-derived type information
+            name = column.name
+
+            # Extract expected attributes safely
+            ctype = column.type
+            element_type = column.element_type if column.element_type else None
+            scale = column.scale
+            precision = column.precision
+            typed = {
+                "id": idx,
+                "name": name,
+                "type": ctype,
+                "element-type": element_type,
+                "scale": scale,
+                "precision": precision,
+                "expectation-policies": [],
+            }
+
+            cols.append(typed)
+
+        return cols
+
+    def _write_schema(self, namespace: str, dataset_name: str, schema: Any, author: str) -> str:
+        """Persist a schema document in the dataset's `schemas` subcollection and
+        return the new schema id.
+        """
+        import uuid
+
+        doc_ref = self._dataset_doc_ref(namespace, dataset_name)
+        schemas_coll = doc_ref.collection("schemas")
+        sid = str(uuid.uuid4())
+        # print(f"[DEBUG] _write_schema called for {namespace}/{dataset_name} sid={sid}")
+        try:
+            cols = self._schema_to_columns(schema)
+        except Exception:
+            # print(
+            #     f"[DEBUG] _write_schema: _schema_to_columns raised: {e}; falling back to empty columns list"
+            # )
+            cols = []
+        now_ms = int(time.time() * 1000)
+        if author is None:
+            raise ValueError("author must be provided when writing a schema")
+        # Determine next sequence number by scanning existing schema docs
+        try:
+            max_seq = 0
+            for d in schemas_coll.stream():
+                sd = d.to_dict() or {}
+                seq = sd.get("sequence-number") or 0
+                if isinstance(seq, int) and seq > max_seq:
+                    max_seq = seq
+            new_seq = max_seq + 1
+        except Exception:
+            new_seq = 1
+
+        try:
+            # print(
+            #     f"[DEBUG] Writing schema doc {sid} for {namespace}/{dataset_name} (cols={len(cols)})"
+            # )
+            schemas_coll.document(sid).set(
+                {
+                    "columns": cols,
+                    "timestamp-ms": now_ms,
+                    "author": author,
+                    "sequence-number": new_seq,
+                }
+            )
+            # print(f"[DEBUG] Wrote schema doc {sid}")
+        except Exception:
+            # print(f"[DEBUG] Failed to write schema doc {sid}: {e}")
+            pass
+        return sid
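
For orientation, a minimal usage sketch of the OpteryxCatalog class above follows. Only the method signatures come from the diff; the import path, project, bucket, collection, and dataset names are illustrative assumptions, and Firestore/GCS credentials are expected to be available in the environment.

import pyarrow as pa

from opteryx_catalog import OpteryxCatalog  # assumed import path, not taken from the diff

# The workspace name becomes the top-level Firestore collection for the catalog.
catalog = OpteryxCatalog(
    workspace="analytics",               # hypothetical workspace
    firestore_project="my-gcp-project",  # hypothetical GCP project
    gcs_bucket="my-data-bucket",         # enables the GCS-backed FileIO default
)

# Collections, datasets, views and schemas all require an explicit author.
catalog.create_collection_if_not_exists("sales", author="data-eng")

schema = pa.schema([("order_id", pa.int64()), ("amount", pa.float64())])
dataset = catalog.create_dataset("sales.orders", schema=schema, author="data-eng")

# Reads default to the current snapshot only; pass load_history=True for all snapshots.
current = catalog.load_dataset("sales.orders")
history = catalog.load_dataset("sales.orders", load_history=True)

# Views keep their SQL in a versioned `statement` subcollection under the view document.
catalog.create_view(
    "sales.recent_orders",
    sql="SELECT * FROM sales.orders WHERE amount > 100",
    author="data-eng",
    update_if_exists=True,
)
print(catalog.load_view("sales.recent_orders").sql)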