opteryx-catalog 0.4.8 (py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of opteryx-catalog might be problematic.
- opteryx_catalog/__init__.py +31 -0
- opteryx_catalog/catalog/__init__.py +4 -0
- opteryx_catalog/catalog/compaction.py +529 -0
- opteryx_catalog/catalog/dataset.py +1199 -0
- opteryx_catalog/catalog/manifest.py +289 -0
- opteryx_catalog/catalog/metadata.py +81 -0
- opteryx_catalog/catalog/metastore.py +68 -0
- opteryx_catalog/catalog/view.py +12 -0
- opteryx_catalog/exceptions.py +38 -0
- opteryx_catalog/iops/__init__.py +6 -0
- opteryx_catalog/iops/base.py +42 -0
- opteryx_catalog/iops/fileio.py +125 -0
- opteryx_catalog/iops/gcs.py +255 -0
- opteryx_catalog/opteryx_catalog.py +857 -0
- opteryx_catalog-0.4.8.dist-info/METADATA +464 -0
- opteryx_catalog-0.4.8.dist-info/RECORD +25 -0
- opteryx_catalog-0.4.8.dist-info/WHEEL +5 -0
- opteryx_catalog-0.4.8.dist-info/licenses/LICENSE +201 -0
- opteryx_catalog-0.4.8.dist-info/top_level.txt +3 -0
- scripts/create_dataset.py +201 -0
- scripts/read_dataset.py +268 -0
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +29 -0
- tests/test_import.py +5 -0
- tests/test_pyproject.py +8 -0
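The bulk of this release is the new opteryx_catalog/opteryx_catalog.py module, reproduced in the diff below. For orientation, here is a minimal usage sketch pieced together from the signatures in that file; the workspace, project, bucket, schema, SQL and author values are illustrative, and it assumes `OpteryxCatalog` is re-exported from the package's __init__.py:

    import pyarrow as pa

    from opteryx_catalog import OpteryxCatalog  # assumed re-export from __init__.py

    # Firestore stores the catalog documents; GCS stores data files and Parquet manifests.
    catalog = OpteryxCatalog(
        workspace="analytics",
        firestore_project="example-project",
        gcs_bucket="example-bucket",
    )

    # Every create/write operation requires an explicit author.
    catalog.create_collection_if_not_exists("sales", author="data-eng@example.com")

    schema = pa.schema([("order_id", pa.int64()), ("amount", pa.float64())])
    dataset = catalog.create_dataset("sales.orders", schema=schema, author="data-eng@example.com")

    # By default only the current snapshot and schema are loaded; pass
    # load_history=True to fetch every snapshot document (more Firestore reads).
    dataset = catalog.load_dataset("sales.orders")

    # Views live alongside datasets, with versioned SQL statements.
    catalog.create_view(
        "sales.recent_orders",
        sql="SELECT * FROM sales.orders WHERE order_ts > NOW() - INTERVAL '7' DAY",
        author="data-eng@example.com",
        update_if_exists=True,
    )
    view = catalog.load_view("sales.recent_orders")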
opteryx_catalog/opteryx_catalog.py
@@ -0,0 +1,857 @@
from __future__ import annotations

import time
from typing import Any
from typing import Iterable
from typing import List
from typing import Optional

from google.cloud import firestore
from google.cloud import storage

from .catalog.dataset import SimpleDataset
from .catalog.metadata import DatasetMetadata
from .catalog.metadata import Snapshot
from .catalog.metastore import Metastore
from .catalog.view import View as CatalogView
from .exceptions import CollectionAlreadyExists
from .exceptions import DatasetAlreadyExists
from .exceptions import DatasetNotFound
from .exceptions import ViewAlreadyExists
from .exceptions import ViewNotFound
from .iops.base import FileIO


class OpteryxCatalog(Metastore):
    """Firestore-backed Metastore implementation.

    Terminology: catalog -> workspace -> collection -> dataset|view

    Stores dataset documents under the configured workspace in Firestore.
    Snapshots are stored in a `snapshots` subcollection under each
    dataset's document. Parquet manifests are written to GCS under the
    dataset location's `metadata/manifest-<snapshot_id>.parquet` path.
    """

    def __init__(
        self,
        workspace: str,
        firestore_project: Optional[str] = None,
        firestore_database: Optional[str] = None,
        gcs_bucket: Optional[str] = None,
        io: Optional[FileIO] = None,
    ):
        # `workspace` is the configured catalog/workspace name
        self.workspace = workspace
        # Backwards-compatible alias: keep `catalog_name` for older code paths
        self.catalog_name = workspace
        self.firestore_client = firestore.Client(
            project=firestore_project, database=firestore_database
        )
        self._catalog_ref = self.firestore_client.collection(workspace)
        # Ensure workspace-level properties document exists in Firestore.
        # The $properties doc records metadata for the workspace such as
        # 'timestamp-ms', 'author', 'billing-account-id' and 'owner'.
        try:
            props_ref = self._catalog_ref.document("$properties")
            if not props_ref.get().exists:
                now_ms = int(time.time() * 1000)
                billing = None
                owner = None
                props_ref.set(
                    {
                        "timestamp-ms": now_ms,
                        "billing-account-id": billing,
                        "owner": owner,
                    }
                )
        except Exception:
            # Be conservative: don't fail catalog initialization on Firestore errors
            pass
        self.gcs_bucket = gcs_bucket
        self._storage_client = storage.Client() if gcs_bucket else None
        # Default to a GCS-backed FileIO when a GCS bucket is configured and
        # no explicit `io` was provided.
        if io is not None:
            self.io = io
        else:
            if gcs_bucket:
                from .iops.gcs import GcsFileIO

                self.io = GcsFileIO()
            else:
                self.io = FileIO()

    def _collection_ref(self, collection: str):
        """Alias for `_namespace_ref` using the preferred term `collection`.

        Do NOT change call signatures; this helper provides a clearer name
        for new code paths while remaining backwards-compatible.
        """
        return self._catalog_ref.document(collection)

    def _datasets_collection(self, collection: str):
        # Primary subcollection for datasets.
        return self._collection_ref(collection).collection("datasets")

    def _dataset_doc_ref(self, collection: str, dataset_name: str):
        return self._datasets_collection(collection).document(dataset_name)

    def _snapshots_collection(self, collection: str, dataset_name: str):
        return self._dataset_doc_ref(collection, dataset_name).collection("snapshots")

    def _views_collection(self, collection: str):
        return self._collection_ref(collection).collection("views")

    def _view_doc_ref(self, collection: str, view_name: str):
        return self._views_collection(collection).document(view_name)

    def create_dataset(
        self, identifier: str, schema: Any, properties: dict | None = None, author: str = None
    ) -> SimpleDataset:
        if author is None:
            raise ValueError("author must be provided when creating a dataset")
        collection, dataset_name = identifier.split(".")
        doc_ref = self._dataset_doc_ref(collection, dataset_name)
        # Check primary `datasets` location
        if doc_ref.get().exists:
            raise DatasetAlreadyExists(f"Dataset already exists: {identifier}")

        # Build default dataset metadata
        location = f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}"
        metadata = DatasetMetadata(
            dataset_identifier=identifier,
            schema=schema,
            location=location,
            properties=properties or {},
        )

        # Persist document with timestamp and author
        now_ms = int(time.time() * 1000)
        metadata.timestamp_ms = now_ms
        metadata.author = author
        doc_ref.set(
            {
                "name": dataset_name,
                "collection": collection,
                "workspace": self.workspace,
                "location": location,
                "properties": metadata.properties,
                "format-version": metadata.format_version,
                "timestamp-ms": now_ms,
                "author": author,
                "maintenance-policy": metadata.maintenance_policy,
            }
        )

        # Persist initial schema into `schemas` subcollection if provided
        if schema is not None:
            schema_id = self._write_schema(collection, dataset_name, schema, author=author)
            metadata.current_schema_id = schema_id
            # Read back the schema doc to capture timestamp-ms, author, sequence-number
            try:
                sdoc = doc_ref.collection("schemas").document(schema_id).get()
                sdata = sdoc.to_dict() or {}
                metadata.schemas = [
                    {
                        "schema_id": schema_id,
                        "columns": sdata.get("columns", self._schema_to_columns(schema)),
                        "timestamp-ms": sdata.get("timestamp-ms"),
                        "author": sdata.get("author"),
                        "sequence-number": sdata.get("sequence-number"),
                    }
                ]
            except Exception:
                metadata.schemas = [
                    {"schema_id": schema_id, "columns": self._schema_to_columns(schema)}
                ]
            # update dataset doc to reference current schema
            doc_ref.update({"current-schema-id": metadata.current_schema_id})

        # Return SimpleDataset (attach this catalog so append() can persist)
        return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)

    def load_dataset(self, identifier: str, load_history: bool = False) -> SimpleDataset:
        """Load a dataset from Firestore.

        Args:
            identifier: Dataset identifier in format 'collection.dataset_name'
            load_history: If True, load all snapshots from Firestore (expensive for
                large histories). If False (default), only load the current snapshot,
                which is sufficient for most write operations.

        Returns:
            SimpleDataset instance with metadata loaded from Firestore.

        Raises:
            DatasetNotFound: If the dataset does not exist in Firestore.
        """
        collection, dataset_name = identifier.split(".")
        doc_ref = self._dataset_doc_ref(collection, dataset_name)
        doc = doc_ref.get()
        if not doc.exists:
            raise DatasetNotFound(f"Dataset not found: {identifier}")

        data = doc.to_dict() or {}
        metadata = DatasetMetadata(
            dataset_identifier=identifier,
            location=data.get("location")
            or f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}",
            schema=data.get("schema"),
            properties=data.get("properties") or {},
        )

        # Load dataset-level timestamp/author and collection/workspace
        metadata.timestamp_ms = data.get("timestamp-ms")
        metadata.author = data.get("author")
        # note: Firestore dataset doc stores the original collection and workspace
        # under keys `collection` and `workspace`.

        # Load snapshots based on load_history flag
        snaps = []
        if load_history:
            # Load all snapshots from Firestore (expensive for large histories)
            for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
                sd = snap_doc.to_dict() or {}
                snap = Snapshot(
                    snapshot_id=sd.get("snapshot-id"),
                    timestamp_ms=sd.get("timestamp-ms"),
                    author=sd.get("author"),
                    sequence_number=sd.get("sequence-number"),
                    user_created=sd.get("user-created"),
                    manifest_list=sd.get("manifest"),
                    schema_id=sd.get("schema-id"),
                    summary=sd.get("summary", {}),
                    operation_type=sd.get("operation-type"),
                    parent_snapshot_id=sd.get("parent-snapshot-id"),
                )
                snaps.append(snap)
            if snaps:
                metadata.current_snapshot_id = snaps[-1].snapshot_id
        else:
            # Load only the current snapshot (efficient single read)
            current_snap_id = data.get("current-snapshot-id")
            if current_snap_id:
                try:
                    snap_doc = (
                        self._snapshots_collection(collection, dataset_name)
                        .document(str(current_snap_id))
                        .get()
                    )
                    if snap_doc.exists:
                        sd = snap_doc.to_dict() or {}
                        snap = Snapshot(
                            snapshot_id=sd.get("snapshot-id"),
                            timestamp_ms=sd.get("timestamp-ms"),
                            author=sd.get("author"),
                            sequence_number=sd.get("sequence-number"),
                            user_created=sd.get("user-created"),
                            manifest_list=sd.get("manifest"),
                            schema_id=sd.get("schema-id"),
                            summary=sd.get("summary", {}),
                            operation_type=sd.get("operation-type"),
                            parent_snapshot_id=sd.get("parent-snapshot-id"),
                        )
                        snaps.append(snap)
                        metadata.current_snapshot_id = current_snap_id
                except Exception:
                    pass
        metadata.snapshots = snaps

        # Load schemas subcollection
        schemas_coll = doc_ref.collection("schemas")
        # Load all schemas if requested; otherwise load only current schema
        if load_history:
            schemas = []
            for sdoc in schemas_coll.stream():
                sd = sdoc.to_dict() or {}
                schemas.append(
                    {
                        "schema_id": sdoc.id,
                        "columns": sd.get("columns", []),
                        "timestamp-ms": sd.get("timestamp-ms"),
                        "author": sd.get("author"),
                        "sequence-number": sd.get("sequence-number"),
                    }
                )
            metadata.schemas = schemas
            metadata.current_schema_id = doc.to_dict().get("current-schema-id")
        else:
            # Only load the current schema document for efficiency
            current_schema_id = doc.to_dict().get("current-schema-id")
            if current_schema_id:
                sdoc = schemas_coll.document(str(current_schema_id)).get()
                if sdoc.exists:
                    sd = sdoc.to_dict() or {}
                    metadata.schemas = [
                        {
                            "schema_id": sdoc.id,
                            "columns": sd.get("columns", []),
                            "timestamp-ms": sd.get("timestamp-ms"),
                            "author": sd.get("author"),
                            "sequence-number": sd.get("sequence-number"),
                        }
                    ]
                    metadata.current_schema_id = current_schema_id
        return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)

    def drop_dataset(self, identifier: str) -> None:
        collection, dataset_name = identifier.split(".")
        # Delete snapshots
        snaps_coll = self._snapshots_collection(collection, dataset_name)
        for doc in snaps_coll.stream():
            snaps_coll.document(doc.id).delete()
        # Delete dataset doc
        self._dataset_doc_ref(collection, dataset_name).delete()

    def list_datasets(self, collection: str) -> Iterable[str]:
        coll = self._datasets_collection(collection)
        return [doc.id for doc in coll.stream()]

    def create_collection(
        self,
        collection: str,
        properties: dict | None = None,
        exists_ok: bool = False,
        author: str = None,
    ) -> None:
        """Create a collection document under the catalog.

        If `exists_ok` is False and the collection already exists, a KeyError is raised.
        """
        doc_ref = self._collection_ref(collection)
        if doc_ref.get().exists:
            if exists_ok:
                return
            raise CollectionAlreadyExists(f"Collection already exists: {collection}")

        now_ms = int(time.time() * 1000)
        if author is None:
            raise ValueError("author must be provided when creating a collection")
        doc_ref.set(
            {
                "name": collection,
                "properties": properties or {},
                "timestamp-ms": now_ms,
                "author": author,
            }
        )

    def create_collection_if_not_exists(
        self, collection: str, properties: dict | None = None, author: Optional[str] = None
    ) -> None:
        """Convenience wrapper that creates the collection only if missing."""
        self.create_collection(collection, properties=properties, exists_ok=True, author=author)

    def dataset_exists(
        self, identifier_or_collection: str, dataset_name: Optional[str] = None
    ) -> bool:
        """Return True if the dataset exists.

        Supports two call forms:
        - dataset_exists("collection.dataset")
        - dataset_exists("collection", "dataset")
        """
        # Normalize inputs
        if dataset_name is None:
            # Expect a single collection like 'collection.dataset'
            if "." not in identifier_or_collection:
                raise ValueError(
                    "collection must be 'collection.dataset' or pass dataset_name separately"
                )
            collection, dataset_name = identifier_or_collection.rsplit(".", 1)
        else:
            collection = identifier_or_collection

        try:
            doc_ref = self._dataset_doc_ref(collection, dataset_name)
            return doc_ref.get().exists
        except Exception:
            # On any error, be conservative and return False
            return False

    # Dataset API methods have been renamed to the preferred `dataset` terminology.

    # --- View support -------------------------------------------------
    def create_view(
        self,
        identifier: str | tuple,
        sql: str,
        schema: Any | None = None,
        author: str = None,
        description: Optional[str] = None,
        properties: dict | None = None,
        update_if_exists: bool = False,
    ) -> CatalogView:
        """Create a view document and a statement version in the `statement` subcollection.

        `identifier` may be a string like 'namespace.view' or a tuple ('namespace','view').
        """
        # Normalize identifier
        if isinstance(identifier, tuple) or isinstance(identifier, list):
            collection, view_name = identifier[0], identifier[1]
        else:
            collection, view_name = identifier.split(".")

        doc_ref = self._view_doc_ref(collection, view_name)
        if doc_ref.get().exists:
            if not update_if_exists:
                raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
            # Update existing view - get current sequence number
            existing_doc = doc_ref.get().to_dict()
            current_statement_id = existing_doc.get("statement-id")
            if current_statement_id:
                stmt_ref = doc_ref.collection("statement").document(current_statement_id)
                stmt_doc = stmt_ref.get()
                if stmt_doc.exists:
                    sequence_number = stmt_doc.to_dict().get("sequence-number", 0) + 1
                else:
                    sequence_number = 1
            else:
                sequence_number = 1
        else:
            sequence_number = 1

        now_ms = int(time.time() * 1000)
        if author is None:
            raise ValueError("author must be provided when creating a view")

        # Write statement version
        statement_id = str(now_ms)
        stmt_coll = doc_ref.collection("statement")
        stmt_coll.document(statement_id).set(
            {
                "sql": sql,
                "timestamp-ms": now_ms,
                "author": author,
                "sequence-number": sequence_number,
            }
        )

        # Persist root view doc referencing the statement id
        doc_ref.set(
            {
                "name": view_name,
                "collection": collection,
                "workspace": self.workspace,
                "timestamp-ms": now_ms,
                "author": author,
                "description": description,
                "describer": author,
                "last-execution-ms": None,
                "last-execution-data-size": None,
                "last-execution-records": None,
                "statement-id": statement_id,
                "properties": properties or {},
            }
        )

        # Return a simple CatalogView wrapper
        v = CatalogView(name=view_name, definition=sql, properties=properties or {})
        # provide convenient attributes used by docs/examples
        setattr(v, "sql", sql)
        setattr(v, "metadata", type("M", (), {})())
        v.metadata.schema = schema
        return v

    def load_view(self, identifier: str | tuple) -> CatalogView:
        """Load a view by identifier. Returns a `CatalogView` with `.definition` and `.sql`.

        Raises `ViewNotFound` if the view doc is missing.
        """
        if isinstance(identifier, tuple) or isinstance(identifier, list):
            collection, view_name = identifier[0], identifier[1]
        else:
            collection, view_name = identifier.split(".")

        doc_ref = self._view_doc_ref(collection, view_name)
        doc = doc_ref.get()
        if not doc.exists:
            raise ViewNotFound(f"View not found: {collection}.{view_name}")

        data = doc.to_dict() or {}
        stmt_id = data.get("statement-id")
        sql = None
        schema = data.get("schema")

        sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
        sql = (sdoc.to_dict() or {}).get("sql")

        v = CatalogView(name=view_name, definition=sql or "", properties=data.get("properties", {}))
        setattr(v, "sql", sql or "")
        setattr(v, "metadata", type("M", (), {})())
        v.metadata.schema = schema
        # Populate metadata fields from the stored view document so callers
        # expecting attributes like `timestamp_ms` won't fail.
        v.metadata.author = data.get("author")
        v.metadata.description = data.get("description")
        v.metadata.timestamp_ms = data.get("timestamp-ms")
        # Execution/operational fields (may be None)
        v.metadata.last_execution_ms = data.get("last-execution-ms")
        v.metadata.last_execution_data_size = data.get("last-execution-data-size")
        v.metadata.last_execution_records = data.get("last-execution-records")
        # Optional describer (used to flag LLM-generated descriptions)
        v.metadata.describer = data.get("describer")
        return v

    def drop_view(self, identifier: str | tuple) -> None:
        if isinstance(identifier, tuple) or isinstance(identifier, list):
            collection, view_name = identifier[0], identifier[1]
        else:
            collection, view_name = identifier.split(".")

        doc_ref = self._view_doc_ref(collection, view_name)
        # delete statement subcollection
        for d in doc_ref.collection("statement").stream():
            doc_ref.collection("statement").document(d.id).delete()

        doc_ref.delete()

    def list_views(self, collection: str) -> Iterable[str]:
        coll = self._views_collection(collection)
        return [doc.id for doc in coll.stream()]

    def view_exists(
        self, identifier_or_collection: str | tuple, view_name: Optional[str] = None
    ) -> bool:
        """Return True if the view exists.

        Supports two call forms:
        - view_exists("collection.view")
        - view_exists(("collection", "view"))
        - view_exists("collection", "view")
        """
        # Normalize inputs
        if view_name is None:
            if isinstance(identifier_or_collection, tuple) or isinstance(
                identifier_or_collection, list
            ):
                collection, view_name = identifier_or_collection[0], identifier_or_collection[1]
            else:
                if "." not in identifier_or_collection:
                    raise ValueError(
                        "identifier must be 'collection.view' or pass view_name separately"
                    )
                collection, view_name = identifier_or_collection.rsplit(".", 1)
        else:
            collection = identifier_or_collection

        try:
            doc_ref = self._view_doc_ref(collection, view_name)
            return doc_ref.get().exists
        except Exception:
            return False

    def update_view_execution_metadata(
        self,
        identifier: str | tuple,
        row_count: Optional[int] = None,
        execution_time: Optional[float] = None,
    ) -> None:
        if isinstance(identifier, tuple) or isinstance(identifier, list):
            collection, view_name = identifier[0], identifier[1]
        else:
            collection, view_name = identifier.split(".")

        doc_ref = self._view_doc_ref(collection, view_name)
        updates = {}
        now_ms = int(time.time() * 1000)
        if row_count is not None:
            updates["last-execution-records"] = row_count
        if execution_time is not None:
            updates["last-execution-time-ms"] = int(execution_time * 1000)
        updates["last-execution-ms"] = now_ms
        if updates:
            doc_ref.update(updates)

    def write_parquet_manifest(
        self, snapshot_id: int, entries: List[dict], dataset_location: str
    ) -> Optional[str]:
        """Write a Parquet manifest for the given snapshot id and entries.

        Entries should be plain dicts convertible by pyarrow.Table.from_pylist.
        The manifest will be written to <dataset_location>/metadata/manifest-<snapshot_id>.parquet
        """
        import pyarrow as pa
        import pyarrow.parquet as pq

        # If entries is None we skip writing; if entries is empty list, write
        # an empty Parquet manifest (represents an empty dataset for this
        # snapshot). This preserves previous manifests so older snapshots
        # remain readable.
        if entries is None:
            return None

        parquet_path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"

        # Use provided FileIO if it supports writing; otherwise write to GCS
        try:
            # Use an explicit schema so PyArrow types (especially nested lists)
            # are correct and we avoid integer overflow / inference issues.
            schema = pa.schema(
                [
                    ("file_path", pa.string()),
                    ("file_format", pa.string()),
                    ("record_count", pa.int64()),
                    ("file_size_in_bytes", pa.int64()),
                    ("uncompressed_size_in_bytes", pa.int64()),
                    ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
                    ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
                    ("histogram_bins", pa.int32()),
                    ("min_values", pa.list_(pa.int64())),
                    ("max_values", pa.list_(pa.int64())),
                ]
            )

            try:
                table = pa.Table.from_pylist(entries, schema=schema)
            except Exception as exc:
                # Diagnostic output to help find malformed manifest entries

                print(
                    "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
                )
                for i, ent in enumerate(entries):
                    print(f"  Entry {i}:")
                    if isinstance(ent, dict):
                        for k, v in ent.items():
                            tname = type(v).__name__
                            try:
                                s = repr(v)
                            except Exception:
                                s = "<unreprable>"
                            print(f"    - {k}: type={tname} repr={s[:200]}")
                    else:
                        print(
                            f"    - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
                        )
                raise exc

            buf = pa.BufferOutputStream()
            pq.write_table(table, buf, compression="zstd")
            data = buf.getvalue().to_pybytes()

            if self.io:
                out = self.io.new_output(parquet_path).create()
                out.write(data)
                try:
                    # Some OutputFile implementations buffer and require close()
                    out.close()
                except Exception:
                    pass

            return parquet_path
        except Exception as e:
            # Log and return None on failure
            # print(f"Failed to write Parquet manifest: {e}")
            raise e

    def save_snapshot(self, identifier: str, snapshot: Snapshot) -> None:
        """Persist a single snapshot document for a dataset."""
        namespace, dataset_name = identifier.split(".")
        snaps = self._snapshots_collection(namespace, dataset_name)
        doc_id = str(snapshot.snapshot_id)
        # Ensure summary contains all expected keys (zero defaults applied in dataclass)
        summary = snapshot.summary or {}
        # Provide explicit keys if missing
        for k in [
            "added-data-files",
            "added-files-size",
            "added-records",
            "deleted-data-files",
            "deleted-files-size",
            "deleted-records",
            "total-data-files",
            "total-files-size",
            "total-records",
        ]:
            summary.setdefault(k, 0)

        data = {
            "snapshot-id": snapshot.snapshot_id,
            "timestamp-ms": snapshot.timestamp_ms,
            "manifest": snapshot.manifest_list,
            "commit-message": getattr(snapshot, "commit_message", ""),
            "summary": summary,
            "author": getattr(snapshot, "author", None),
            "sequence-number": getattr(snapshot, "sequence_number", None),
            "operation-type": getattr(snapshot, "operation_type", None),
            "parent-snapshot-id": getattr(snapshot, "parent_snapshot_id", None),
        }
        if getattr(snapshot, "schema_id", None) is not None:
            data["schema-id"] = snapshot.schema_id
        snaps.document(doc_id).set(data)

    def save_dataset_metadata(self, identifier: str, metadata: DatasetMetadata) -> None:
        """Persist dataset-level metadata and snapshots to Firestore.

        This writes the dataset document and upserts snapshot documents.
        """
        collection, dataset_name = identifier.split(".")
        doc_ref = self._dataset_doc_ref(collection, dataset_name)
        doc_ref.set(
            {
                "name": dataset_name,
                "collection": collection,
                "workspace": self.workspace,
                "location": metadata.location,
                "properties": metadata.properties,
                "format-version": metadata.format_version,
                "current-snapshot-id": metadata.current_snapshot_id,
                "current-schema-id": metadata.current_schema_id,
                "timestamp-ms": metadata.timestamp_ms,
                "author": metadata.author,
                "description": metadata.description,
                "describer": metadata.describer,
                "maintenance-policy": metadata.maintenance_policy,
                "sort-orders": metadata.sort_orders,
            }
        )

        # Metadata persisted in primary `datasets` collection only.

        snaps_coll = self._snapshots_collection(collection, dataset_name)
        existing = {d.id for d in snaps_coll.stream()}
        new_ids = set()
        for snap in metadata.snapshots:
            new_ids.add(str(snap.snapshot_id))
            snaps_coll.document(str(snap.snapshot_id)).set(
                {
                    "snapshot-id": snap.snapshot_id,
                    "timestamp-ms": snap.timestamp_ms,
                    "manifest": snap.manifest_list,
                    "commit-message": getattr(snap, "commit_message", ""),
                    "schema-id": snap.schema_id,
                    "summary": snap.summary or {},
                    "author": getattr(snap, "author", None),
                    "sequence-number": getattr(snap, "sequence_number", None),
                    "user-created": getattr(snap, "user_created", None),
                }
            )

        # Delete stale snapshots
        for stale in existing - new_ids:
            snaps_coll.document(stale).delete()

        # Persist schemas subcollection
        schemas_coll = doc_ref.collection("schemas")
        existing_schema_ids = {d.id for d in schemas_coll.stream()}
        new_schema_ids = set()
        for s in metadata.schemas:
            sid = s.get("schema_id")
            if not sid:
                continue
            new_schema_ids.add(sid)
            schemas_coll.document(sid).set(
                {
                    "columns": s.get("columns", []),
                    "timestamp-ms": s.get("timestamp-ms"),
                    "author": s.get("author"),
                    "sequence-number": s.get("sequence-number"),
                }
            )
        # Delete stale schema docs
        for stale in existing_schema_ids - new_schema_ids:
            schemas_coll.document(stale).delete()

    def _schema_to_columns(self, schema: Any) -> list:
        """Convert a pyarrow.Schema into a simple columns list for storage.

        Each column is a dict: {"id": index (1-based), "name": column_name, "type": str(type)}
        """
        # Support pyarrow.Schema and Orso RelationSchema. When Orso's
        # FlatColumn.from_arrow is available, use it to derive Orso types
        # (type, element-type, scale, precision). Fall back to simple
        # stringified types if Orso isn't installed.
        cols = []
        # Try Orso FlatColumn importer
        import orso
        import pyarrow as pa

        # If schema is an Orso RelationSchema, try to obtain a list of columns
        columns = None
        if isinstance(schema, orso.schema.RelationSchema):
            columns = schema.columns
        elif isinstance(schema, pa.Schema):
            orso_schema = orso.schema.convert_arrow_schema_to_orso_schema(schema)
            columns = orso_schema.columns
        else:
            # print(f"[DEBUG] _schema_to_columns: unsupported schema type: {type(schema)}")
            raise ValueError(
                "Unsupported schema type, expected pyarrow.Schema or orso.RelationSchema"
            )

        # print(f"[DEBUG] _schema_to_columns: processing {len(columns)} columns")

        for idx, column in enumerate(columns, start=1):
            # If f looks like a pyarrow.Field, use its name/type
            name = column.name

            # Extract expected attributes safely
            ctype = column.type
            element_type = column.element_type if column.element_type else None
            scale = column.scale
            precision = column.precision
            typed = {
                "id": idx,
                "name": name,
                "type": ctype,
                "element-type": element_type,
                "scale": scale,
                "precision": precision,
                "expectation-policies": [],
            }

            cols.append(typed)

        return cols

    def _write_schema(self, namespace: str, dataset_name: str, schema: Any, author: str) -> str:
        """Persist a schema document in the dataset's `schemas` subcollection and
        return the new schema id.
        """
        import uuid

        doc_ref = self._dataset_doc_ref(namespace, dataset_name)
        schemas_coll = doc_ref.collection("schemas")
        sid = str(uuid.uuid4())
        # print(f"[DEBUG] _write_schema called for {namespace}/{dataset_name} sid={sid}")
        try:
            cols = self._schema_to_columns(schema)
        except Exception:
            # print(
            #     f"[DEBUG] _write_schema: _schema_to_columns raised: {e}; falling back to empty columns list"
            # )
            cols = []
        now_ms = int(time.time() * 1000)
        if author is None:
            raise ValueError("author must be provided when writing a schema")
        # Determine next sequence number by scanning existing schema docs
        try:
            max_seq = 0
            for d in schemas_coll.stream():
                sd = d.to_dict() or {}
                seq = sd.get("sequence-number") or 0
                if isinstance(seq, int) and seq > max_seq:
                    max_seq = seq
            new_seq = max_seq + 1
        except Exception:
            new_seq = 1

        try:
            # print(
            #     f"[DEBUG] Writing schema doc {sid} for {namespace}/{dataset_name} (cols={len(cols)})"
            # )
            schemas_coll.document(sid).set(
                {
                    "columns": cols,
                    "timestamp-ms": now_ms,
                    "author": author,
                    "sequence-number": new_seq,
                }
            )
            # print(f"[DEBUG] Wrote schema doc {sid}")
        except Exception:
            # print(f"[DEBUG] Failed to write schema doc {sid}: {e}")
            pass
        return sid
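
Taken together, the persistence methods above imply roughly the following Firestore layout. This is a reviewer's sketch assembled from the document writes in the code; angle-bracketed names are placeholders:

    <workspace>/                              top-level Firestore collection
        $properties                           workspace metadata: timestamp-ms, billing-account-id, owner
        <collection>                          collection doc: name, properties, timestamp-ms, author
            datasets/<dataset_name>           dataset doc: location, format-version, current-snapshot-id,
                                              current-schema-id, timestamp-ms, author, maintenance-policy, ...
                snapshots/<snapshot_id>       snapshot-id, timestamp-ms, manifest, summary, sequence-number, ...
                schemas/<schema_id>           columns, timestamp-ms, author, sequence-number
            views/<view_name>                 view doc: statement-id, description, last-execution-*, properties
                statement/<statement_id>      sql, timestamp-ms, author, sequence-number

Data files and manifests live in GCS under gs://<gcs_bucket>/<workspace>/<collection>/<dataset_name>/, with each snapshot's manifest written to metadata/manifest-<snapshot_id>.parquet.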
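The entries passed to write_parquet_manifest must match the Arrow schema declared inside that method. A sketch of one conforming entry follows; all values are illustrative, and reading the list-typed fields as per-column statistics is an assumption rather than something this diff states:

    entry = {
        "file_path": "gs://example-bucket/analytics/sales/orders/data/000001.parquet",
        "file_format": "parquet",
        "record_count": 1000,
        "file_size_in_bytes": 123456,
        "uncompressed_size_in_bytes": 456789,
        "min_k_hashes": [[17, 42, 99]],       # list<list<uint64>>
        "histogram_counts": [[10, 25, 65]],   # list<list<int64>>
        "histogram_bins": 3,                  # int32
        "min_values": [1],                    # list<int64>
        "max_values": [1000],                 # list<int64>
    }

    # catalog.write_parquet_manifest(snapshot_id, [entry], dataset_location) then writes a
    # zstd-compressed Parquet file to <dataset_location>/metadata/manifest-<snapshot_id>.parquet.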