opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/__init__.py +1 -1
- opteryx_catalog/catalog/__init__.py +2 -1
- opteryx_catalog/catalog/compaction.py +536 -0
- opteryx_catalog/catalog/dataset.py +840 -520
- opteryx_catalog/catalog/manifest.py +475 -0
- opteryx_catalog/catalog/metadata.py +5 -2
- opteryx_catalog/catalog/metastore.py +2 -2
- opteryx_catalog/exceptions.py +1 -1
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/iops/gcs.py +35 -5
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +296 -242
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/create_dataset.py +1 -1
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- scripts/read_dataset.py +1 -1
- tests/test_collections.py +37 -0
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +14 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
opteryx_catalog/opteryx_catalog.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import os
 import time
 from typing import Any
 from typing import Iterable
@@ -21,6 +20,9 @@ from .exceptions import DatasetNotFound
 from .exceptions import ViewAlreadyExists
 from .exceptions import ViewNotFound
 from .iops.base import FileIO
+from .webhooks import send_webhook
+from .webhooks.events import dataset_created_payload
+from .webhooks.events import view_created_payload
 
 
 class OpteryxCatalog(Metastore):
@@ -28,7 +30,7 @@ class OpteryxCatalog(Metastore):
 
     Terminology: catalog -> workspace -> collection -> dataset|view
 
-    Stores
+    Stores dataset documents under the configured workspace in Firestore.
     Snapshots are stored in a `snapshots` subcollection under each
     dataset's document. Parquet manifests are written to GCS under the
     dataset location's `metadata/manifest-<snapshot_id>.parquet` path.
@@ -57,12 +59,8 @@ class OpteryxCatalog(Metastore):
         props_ref = self._catalog_ref.document("$properties")
         if not props_ref.get().exists:
             now_ms = int(time.time() * 1000)
-            billing =
-
-                or os.environ.get("BILLING_ACCOUNT")
-                or None
-            )
-            owner = os.environ.get("WORKSPACE_OWNER") or None
+            billing = None
+            owner = None
             props_ref.set(
                 {
                     "timestamp-ms": now_ms,
@@ -81,12 +79,9 @@ class OpteryxCatalog(Metastore):
             self.io = io
         else:
             if gcs_bucket:
-
-                from .iops.gcs import GcsFileIO
+                from .iops.gcs import GcsFileIO
 
-
-                except Exception:
-                    self.io = FileIO()
+                self.io = GcsFileIO()
             else:
                 self.io = FileIO()
 
@@ -109,7 +104,7 @@ class OpteryxCatalog(Metastore):
         return self._dataset_doc_ref(collection, dataset_name).collection("snapshots")
 
     def _views_collection(self, collection: str):
-        return self.
+        return self._collection_ref(collection).collection("views")
 
     def _view_doc_ref(self, collection: str, view_name: str):
         return self._views_collection(collection).document(view_name)
@@ -125,7 +120,7 @@ class OpteryxCatalog(Metastore):
         if doc_ref.get().exists:
             raise DatasetAlreadyExists(f"Dataset already exists: {identifier}")
 
-        # Build default
+        # Build default dataset metadata
         location = f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}"
         metadata = DatasetMetadata(
             dataset_identifier=identifier,
@@ -149,11 +144,10 @@ class OpteryxCatalog(Metastore):
                 "timestamp-ms": now_ms,
                 "author": author,
                 "maintenance-policy": metadata.maintenance_policy,
+                "annotations": metadata.annotations,
             }
         )
 
-        # Persisted in primary `datasets` collection only.
-
         # Persist initial schema into `schemas` subcollection if provided
         if schema is not None:
             schema_id = self._write_schema(collection, dataset_name, schema, author=author)
@@ -175,13 +169,41 @@ class OpteryxCatalog(Metastore):
             metadata.schemas = [
                 {"schema_id": schema_id, "columns": self._schema_to_columns(schema)}
             ]
-            # update
+            # update dataset doc to reference current schema
             doc_ref.update({"current-schema-id": metadata.current_schema_id})
 
+        # Send webhook notification
+        send_webhook(
+            action="create",
+            workspace=self.workspace,
+            collection=collection,
+            resource_type="dataset",
+            resource_name=dataset_name,
+            payload=dataset_created_payload(
+                schema=schema,
+                location=location,
+                properties=properties,
+            ),
+        )
+
         # Return SimpleDataset (attach this catalog so append() can persist)
         return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
 
-    def load_dataset(self, identifier: str) -> SimpleDataset:
+    def load_dataset(self, identifier: str, load_history: bool = False) -> SimpleDataset:
+        """Load a dataset from Firestore.
+
+        Args:
+            identifier: Dataset identifier in format 'collection.dataset_name'
+            load_history: If True, load all snapshots from Firestore (expensive for
+                large histories). If False (default), only load the current snapshot,
+                which is sufficient for most write operations.
+
+        Returns:
+            SimpleDataset instance with metadata loaded from Firestore.
+
+        Raises:
+            DatasetNotFound: If the dataset does not exist in Firestore.
+        """
         collection, dataset_name = identifier.split(".")
         doc_ref = self._dataset_doc_ref(collection, dataset_name)
         doc = doc_ref.get()
@@ -197,37 +219,69 @@ class OpteryxCatalog(Metastore):
             properties=data.get("properties") or {},
         )
 
-        # Load
+        # Load dataset-level timestamp/author and collection/workspace
         metadata.timestamp_ms = data.get("timestamp-ms")
         metadata.author = data.get("author")
-
-
+        metadata.description = data.get("description")
+        metadata.describer = data.get("describer")
+        metadata.annotations = data.get("annotations") or []
 
-        # Load snapshots
+        # Load snapshots based on load_history flag
         snaps = []
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if load_history:
+            # Load all snapshots from Firestore (expensive for large histories)
+            for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
+                sd = snap_doc.to_dict() or {}
+                snap = Snapshot(
+                    snapshot_id=sd.get("snapshot-id"),
+                    timestamp_ms=sd.get("timestamp-ms"),
+                    author=sd.get("author"),
+                    sequence_number=sd.get("sequence-number"),
+                    user_created=sd.get("user-created"),
+                    manifest_list=sd.get("manifest"),
+                    schema_id=sd.get("schema-id"),
+                    summary=sd.get("summary", {}),
+                    operation_type=sd.get("operation-type"),
+                    parent_snapshot_id=sd.get("parent-snapshot-id"),
+                )
+                snaps.append(snap)
+            if snaps:
+                metadata.current_snapshot_id = snaps[-1].snapshot_id
+        else:
+            # Load only the current snapshot (efficient single read)
+            current_snap_id = data.get("current-snapshot-id")
+            if current_snap_id:
+                try:
+                    snap_doc = (
+                        self._snapshots_collection(collection, dataset_name)
+                        .document(str(current_snap_id))
+                        .get()
+                    )
+                    if snap_doc.exists:
+                        sd = snap_doc.to_dict() or {}
+                        snap = Snapshot(
+                            snapshot_id=sd.get("snapshot-id"),
+                            timestamp_ms=sd.get("timestamp-ms"),
+                            author=sd.get("author"),
+                            sequence_number=sd.get("sequence-number"),
+                            user_created=sd.get("user-created"),
+                            manifest_list=sd.get("manifest"),
+                            schema_id=sd.get("schema-id"),
+                            summary=sd.get("summary", {}),
+                            operation_type=sd.get("operation-type"),
+                            parent_snapshot_id=sd.get("parent-snapshot-id"),
+                        )
+                        snaps.append(snap)
+                        metadata.current_snapshot_id = current_snap_id
+                except Exception:
+                    pass
         metadata.snapshots = snaps
-        if snaps:
-            metadata.current_snapshot_id = snaps[-1].snapshot_id
 
         # Load schemas subcollection
-
+        schemas_coll = doc_ref.collection("schemas")
+        # Load all schemas if requested; otherwise load only current schema
+        if load_history:
             schemas = []
-        schemas_coll = doc_ref.collection("schemas")
             for sdoc in schemas_coll.stream():
                 sd = sdoc.to_dict() or {}
                 schemas.append(
@@ -241,9 +295,23 @@ class OpteryxCatalog(Metastore):
                 )
             metadata.schemas = schemas
             metadata.current_schema_id = doc.to_dict().get("current-schema-id")
-
-
-
+        else:
+            # Only load the current schema document for efficiency
+            current_schema_id = doc.to_dict().get("current-schema-id")
+            if current_schema_id:
+                sdoc = schemas_coll.document(str(current_schema_id)).get()
+                if sdoc.exists:
+                    sd = sdoc.to_dict() or {}
+                    metadata.schemas = [
+                        {
+                            "schema_id": sdoc.id,
+                            "columns": sd.get("columns", []),
+                            "timestamp-ms": sd.get("timestamp-ms"),
+                            "author": sd.get("author"),
+                            "sequence-number": sd.get("sequence-number"),
+                        }
+                    ]
+                    metadata.current_schema_id = current_schema_id
         return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
 
     def drop_dataset(self, identifier: str) -> None:
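A minimal usage sketch of the reworked `load_dataset` above; the constructor arguments shown are illustrative assumptions, not taken from this diff (the class visibly accepts `io` and `gcs_bucket`, and has a `workspace` attribute):

```python
# Illustrative only: OpteryxCatalog construction details are assumed.
from opteryx_catalog import OpteryxCatalog

catalog = OpteryxCatalog(workspace="analytics", gcs_bucket="my-bucket")

# Default: single reads for the dataset doc, current snapshot, and
# current schema document.
ds = catalog.load_dataset("sales.orders")

# Opt-in: streams every snapshot and schema document, which the new
# docstring warns is expensive for large histories.
ds_full = catalog.load_dataset("sales.orders", load_history=True)
```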
@@ -259,6 +327,13 @@ class OpteryxCatalog(Metastore):
         coll = self._datasets_collection(collection)
         return [doc.id for doc in coll.stream()]
 
+    def list_collections(self) -> Iterable[str]:
+        """List top-level collections (documents) in this workspace."""
+        try:
+            return [col.id for col in self._catalog_ref.list_documents() if col.id[0] != "$"]
+        except:
+            return []
+
     def create_collection(
         self,
         collection: str,
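A sketch of the new `list_collections` helper, which skips internal `$`-prefixed documents such as the `$properties` document seen earlier. The `list_datasets` name is an assumption inferred from the adjacent hunk's context lines:

```python
# Sketch: enumerate collections, skipping internal "$"-prefixed documents.
# `list_datasets(collection)` is assumed, not confirmed by this diff.
for collection in catalog.list_collections():
    print(collection, list(catalog.list_datasets(collection)))
```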
@@ -270,7 +345,7 @@ class OpteryxCatalog(Metastore):
 
         If `exists_ok` is False and the collection already exists, a KeyError is raised.
         """
-        doc_ref = self.
+        doc_ref = self._collection_ref(collection)
         if doc_ref.get().exists:
             if exists_ok:
                 return
@@ -285,6 +360,7 @@ class OpteryxCatalog(Metastore):
                 "properties": properties or {},
                 "timestamp-ms": now_ms,
                 "author": author,
+                "annotations": [],
             }
         )
 
@@ -292,11 +368,7 @@ class OpteryxCatalog(Metastore):
         self, collection: str, properties: dict | None = None, author: Optional[str] = None
     ) -> None:
         """Convenience wrapper that creates the collection only if missing."""
-
-            self.create_collection(collection, properties=properties, exists_ok=True, author=author)
-        except Exception:
-            # Be conservative: surface caller-level warnings rather than failing
-            return
+        self.create_collection(collection, properties=properties, exists_ok=True, author=author)
 
     def dataset_exists(
         self, identifier_or_collection: str, dataset_name: Optional[str] = None
@@ -309,12 +381,14 @@ class OpteryxCatalog(Metastore):
         """
         # Normalize inputs
         if dataset_name is None:
-            # Expect a single collection like 'collection.
+            # Expect a single collection like 'collection.dataset'
             if "." not in identifier_or_collection:
                 raise ValueError(
-                    "collection must be 'collection.
+                    "collection must be 'collection.dataset' or pass dataset_name separately"
                 )
             collection, dataset_name = identifier_or_collection.rsplit(".", 1)
+        else:
+            collection = identifier_or_collection
 
         try:
             doc_ref = self._dataset_doc_ref(collection, dataset_name)
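The added `else` branch above fixes `dataset_exists` so the two-argument form binds `collection`; previously only the dotted-identifier path assigned it. Both call forms, sketched:

```python
# Both forms now resolve the same document; before this fix, the
# two-argument form left `collection` unbound.
catalog.dataset_exists("sales.orders")      # dotted identifier
catalog.dataset_exists("sales", "orders")   # collection plus dataset_name
```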
@@ -334,6 +408,7 @@ class OpteryxCatalog(Metastore):
         author: str = None,
         description: Optional[str] = None,
         properties: dict | None = None,
+        update_if_exists: bool = False,
     ) -> CatalogView:
         """Create a view document and a statement version in the `statement` subcollection.
 
@@ -347,7 +422,22 @@ class OpteryxCatalog(Metastore):
 
         doc_ref = self._view_doc_ref(collection, view_name)
         if doc_ref.get().exists:
-
+            if not update_if_exists:
+                raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
+            # Update existing view - get current sequence number
+            existing_doc = doc_ref.get().to_dict()
+            current_statement_id = existing_doc.get("statement-id")
+            if current_statement_id:
+                stmt_ref = doc_ref.collection("statement").document(current_statement_id)
+                stmt_doc = stmt_ref.get()
+                if stmt_doc.exists:
+                    sequence_number = stmt_doc.to_dict().get("sequence-number", 0) + 1
+                else:
+                    sequence_number = 1
+            else:
+                sequence_number = 1
+        else:
+            sequence_number = 1
 
         now_ms = int(time.time() * 1000)
         if author is None:
@@ -361,7 +451,7 @@ class OpteryxCatalog(Metastore):
                 "sql": sql,
                 "timestamp-ms": now_ms,
                 "author": author,
-                "sequence-number":
+                "sequence-number": sequence_number,
             }
         )
 
@@ -383,12 +473,28 @@ class OpteryxCatalog(Metastore):
             }
         )
 
+        # Send webhook notification
+        send_webhook(
+            action="create" if not update_if_exists else "update",
+            workspace=self.workspace,
+            collection=collection,
+            resource_type="view",
+            resource_name=view_name,
+            payload=view_created_payload(
+                definition=sql,
+                properties=properties,
+            ),
+        )
+
         # Return a simple CatalogView wrapper
         v = CatalogView(name=view_name, definition=sql, properties=properties or {})
         # provide convenient attributes used by docs/examples
         setattr(v, "sql", sql)
         setattr(v, "metadata", type("M", (), {})())
         v.metadata.schema = schema
+        # Attach catalog and identifier for describe() method
+        setattr(v, "_catalog", self)
+        setattr(v, "_identifier", f"{collection}.{view_name}")
         return v
 
     def load_view(self, identifier: str | tuple) -> CatalogView:
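A sketch of the new `update_if_exists` flow. The full `create_view` signature is not shown in this diff, so the positional `collection`/`view_name` arguments and the `sql` keyword are assumptions based on the names visible in the hunks:

```python
# Assumed call shape; only author/description/properties/update_if_exists
# are confirmed parameters in the diff.
catalog.create_view("sales", "daily_totals", sql="SELECT 1", author="etl")

# Re-creating the same view raises ViewAlreadyExists unless
# update_if_exists=True, in which case the statement's sequence-number is
# incremented and an "update" webhook fires instead of "create".
catalog.create_view(
    "sales", "daily_totals", sql="SELECT 2", author="etl", update_if_exists=True
)
```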
@@ -410,27 +516,28 @@ class OpteryxCatalog(Metastore):
         stmt_id = data.get("statement-id")
         sql = None
         schema = data.get("schema")
-
-
-
-            if sdoc.exists:
-                sql = (sdoc.to_dict() or {}).get("sql")
-            # fallback: pick the most recent statement
-            if not sql:
-                for s in doc_ref.collection("statement").stream():
-                    sd = s.to_dict() or {}
-                    if sd.get("sql"):
-                        sql = sd.get("sql")
-                        break
-        except Exception:
-            pass
+
+        sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
+        sql = (sdoc.to_dict() or {}).get("sql")
 
         v = CatalogView(name=view_name, definition=sql or "", properties=data.get("properties", {}))
         setattr(v, "sql", sql or "")
         setattr(v, "metadata", type("M", (), {})())
         v.metadata.schema = schema
+        # Populate metadata fields from the stored view document so callers
+        # expecting attributes like `timestamp_ms` won't fail.
         v.metadata.author = data.get("author")
         v.metadata.description = data.get("description")
+        v.metadata.timestamp_ms = data.get("timestamp-ms")
+        # Execution/operational fields (may be None)
+        v.metadata.last_execution_ms = data.get("last-execution-ms")
+        v.metadata.last_execution_data_size = data.get("last-execution-data-size")
+        v.metadata.last_execution_records = data.get("last-execution-records")
+        # Optional describer (used to flag LLM-generated descriptions)
+        v.metadata.describer = data.get("describer")
+        # Attach catalog and identifier for describe() method
+        setattr(v, "_catalog", self)
+        setattr(v, "_identifier", f"{collection}.{view_name}")
         return v
 
     def drop_view(self, identifier: str | tuple) -> None:
@@ -441,11 +548,9 @@ class OpteryxCatalog(Metastore):
 
         doc_ref = self._view_doc_ref(collection, view_name)
         # delete statement subcollection
-
-
-
-        except Exception:
-            pass
+        for d in doc_ref.collection("statement").stream():
+            doc_ref.collection("statement").document(d.id).delete()
+
         doc_ref.delete()
 
     def list_views(self, collection: str) -> Iterable[str]:
@@ -474,6 +579,8 @@ class OpteryxCatalog(Metastore):
                     "identifier must be 'collection.view' or pass view_name separately"
                 )
             collection, view_name = identifier_or_collection.rsplit(".", 1)
+        else:
+            collection = identifier_or_collection
 
         try:
             doc_ref = self._view_doc_ref(collection, view_name)
@@ -501,40 +608,82 @@ class OpteryxCatalog(Metastore):
             updates["last-execution-time-ms"] = int(execution_time * 1000)
         updates["last-execution-ms"] = now_ms
         if updates:
-
-
-
-
+            doc_ref.update(updates)
+
+    def update_view_description(
+        self,
+        identifier: str | tuple,
+        description: str,
+        describer: Optional[str] = None,
+    ) -> None:
+        """Update the description for a view.
+
+        Args:
+            identifier: View identifier ('collection.view' or tuple)
+            description: The new description text
+            describer: Optional identifier for who/what created the description
+        """
+        if isinstance(identifier, tuple) or isinstance(identifier, list):
+            collection, view_name = identifier[0], identifier[1]
+        else:
+            collection, view_name = identifier.split(".")
+
+        doc_ref = self._view_doc_ref(collection, view_name)
+        updates = {
+            "description": description,
+        }
+        if describer is not None:
+            updates["describer"] = describer
+        doc_ref.update(updates)
+
+    def update_dataset_description(
+        self,
+        identifier: str | tuple,
+        description: str,
+        describer: Optional[str] = None,
+    ) -> None:
+        """Update the description for a dataset.
+
+        Args:
+            identifier: Dataset identifier in format 'collection.dataset_name'
+            description: The new description text
+            describer: Optional identifier for who/what created the description
+        """
+
+        if isinstance(identifier, tuple) or isinstance(identifier, list):
+            collection, dataset_name = identifier[0], identifier[1]
+        else:
+            collection, dataset_name = identifier.split(".")
+
+        doc_ref = self._dataset_doc_ref(collection, dataset_name)
+        updates = {
+            "description": description,
+        }
+        if describer is not None:
+            updates["describer"] = describer
+        doc_ref.update(updates)
 
     def write_parquet_manifest(
-        self, snapshot_id: int, entries: List[dict],
+        self, snapshot_id: int, entries: List[dict], dataset_location: str
     ) -> Optional[str]:
         """Write a Parquet manifest for the given snapshot id and entries.
 
         Entries should be plain dicts convertible by pyarrow.Table.from_pylist.
-        The manifest will be written to <
+        The manifest will be written to <dataset_location>/metadata/manifest-<snapshot_id>.parquet
         """
         import pyarrow as pa
         import pyarrow.parquet as pq
 
+        from .iops.fileio import WRITE_PARQUET_OPTIONS
+
         # If entries is None we skip writing; if entries is empty list, write
-        # an empty Parquet manifest (represents an empty
+        # an empty Parquet manifest (represents an empty dataset for this
         # snapshot). This preserves previous manifests so older snapshots
         # remain readable.
         if entries is None:
             return None
 
-
-        try:
-            pass
-
-            # print("[MANIFEST] Parquet manifest entries to write:")
-            # print(json.dumps(entries, indent=2, default=str))
-        except Exception:
-            # print("[MANIFEST] Parquet manifest entries:", entries)
-            pass
-
-        parquet_path = f"{table_location}/metadata/manifest-{snapshot_id}.parquet"
+        parquet_path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"
 
         # Use provided FileIO if it supports writing; otherwise write to GCS
         try:
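The two description setters above behave identically apart from the target document. A usage sketch; the `describer` value is illustrative (the comments in `load_view` suggest it flags machine-generated descriptions):

```python
catalog.update_dataset_description(
    "sales.orders",
    description="One row per order line item.",
    describer="llm",  # illustrative tag; any string identifier works
)
# Tuple identifiers are accepted too, per the isinstance check above.
catalog.update_view_description(("sales", "daily_totals"), "Daily rollup.")
```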
@@ -546,146 +695,63 @@ class OpteryxCatalog(Metastore):
                     ("file_format", pa.string()),
                     ("record_count", pa.int64()),
                     ("file_size_in_bytes", pa.int64()),
+                    ("uncompressed_size_in_bytes", pa.int64()),
+                    ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+                    ("null_counts", pa.list_(pa.int64())),
                     ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
                     ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
                     ("histogram_bins", pa.int32()),
                     ("min_values", pa.list_(pa.int64())),
                     ("max_values", pa.list_(pa.int64())),
+                    ("min_values_display", pa.list_(pa.string())),
+                    ("max_values_display", pa.list_(pa.string())),
                 ]
             )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            # Normalize entries to match schema expectations:
+            normalized = []
+            for ent in entries:
+                if not isinstance(ent, dict):
+                    normalized.append(ent)
+                    continue
+                e = dict(ent)
+                # Ensure list fields exist
+                e.setdefault("min_k_hashes", [])
+                e.setdefault("histogram_counts", [])
+                e.setdefault("histogram_bins", 0)
+                e.setdefault("column_uncompressed_sizes_in_bytes", [])
+                e.setdefault("null_counts", [])
+                e.setdefault("min_values_display", [])
+                e.setdefault("max_values_display", [])
+
+                # min/max values are stored as compressed int64 values
+                # display values are string representations for human readability
+                mv = e.get("min_values") or []
+                xv = e.get("max_values") or []
+                mv_disp = e.get("min_values_display") or []
+                xv_disp = e.get("max_values_display") or []
+
+                def truncate_display(v, max_len=32):
+                    """Truncate display value to max_len characters, adding '...' if longer."""
+                    if v is None:
+                        return None
+                    s = str(v)
+                    if len(s) > max_len:
+                        return s[:max_len] + "..."
+                    return s
+
+                # Ensure int64 values are properly typed for min/max
+                e["min_values"] = [int(v) if v is not None else None for v in mv]
+                e["max_values"] = [int(v) if v is not None else None for v in xv]
+                # Display values truncated to 32 chars with '...' suffix if longer
+                e["min_values_display"] = [truncate_display(v) for v in mv_disp]
+                e["max_values_display"] = [truncate_display(v) for v in xv_disp]
+                normalized.append(e)
+
+            table = pa.Table.from_pylist(normalized, schema=schema)
 
-            # Attempt to sanitize entries and retry conversion.
-            try:
-                print("[MANIFEST DEBUG] Attempting to sanitize entries and retry")
-                sanitized = []
-                for ent in entries:
-                    if not isinstance(ent, dict):
-                        sanitized.append(ent)
-                        continue
-                    e2 = dict(ent)  # copy
-                    # Ensure numeric fields
-                    for k in ("record_count", "file_size_in_bytes", "histogram_bins"):
-                        v = e2.get(k)
-                        try:
-                            e2[k] = int(v) if v is not None else 0
-                        except Exception:
-                            e2[k] = 0
-                    # Ensure min_k_hashes is list[list[int]]
-                    mk = e2.get("min_k_hashes")
-                    if not isinstance(mk, list):
-                        e2["min_k_hashes"] = []
-                    else:
-                        new_mk = []
-                        for sub in mk:
-                            if isinstance(sub, list):
-                                try:
-                                    new_mk.append([int(x) for x in sub])
-                                except Exception:
-                                    new_mk.append([])
-                            else:
-                                new_mk.append([])
-                        e2["min_k_hashes"] = new_mk
-                    # Ensure histogram_counts is list[list[int]]
-                    hc = e2.get("histogram_counts")
-                    if not isinstance(hc, list):
-                        e2["histogram_counts"] = []
-                    else:
-                        new_hc = []
-                        for sub in hc:
-                            if isinstance(sub, list):
-                                try:
-                                    new_hc.append([int(x) for x in sub])
-                                except Exception:
-                                    new_hc.append([])
-                            else:
-                                new_hc.append([])
-                        e2["histogram_counts"] = new_hc
-                    # Sanitize min_values / max_values: must be list[int] or None
-                    # Sanitize min_values / max_values: coerce to int64 using to_int() if available
-                    try:
-                        from opteryx.compiled.structures.relation_statistics import to_int
-                    except Exception:
-
-                        def to_int(val):
-                            # Best-effort fallback: handle numpy types, strings and numbers
-                            try:
-                                if val is None:
-                                    return None
-                                if hasattr(val, "item"):
-                                    val = val.item()
-                                if isinstance(val, (bytes, bytearray)):
-                                    val = val.decode(errors="ignore")
-                                if isinstance(val, str):
-                                    # empty strings are invalid
-                                    if val == "":
-                                        return None
-                                    try:
-                                        return int(val)
-                                    except Exception:
-                                        return None
-                                if isinstance(val, float):
-                                    return int(val)
-                                return int(val)
-                            except Exception:
-                                return None
-
-                    for key in ("min_values", "max_values"):
-                        mv = e2.get(key)
-                        if not isinstance(mv, list):
-                            e2[key] = [None]
-                        else:
-                            new_mv = []
-                            for x in mv:
-                                try:
-                                    if x is None:
-                                        new_mv.append(None)
-                                        continue
-                                    # Use to_int to coerce into int64 semantics
-                                    v = x
-                                    if hasattr(v, "item"):
-                                        v = v.item()
-                                    coerced = to_int(v)
-                                    # to_int may return None-like sentinel; accept ints only
-                                    if coerced is None:
-                                        new_mv.append(None)
-                                    else:
-                                        new_mv.append(int(coerced))
-                                except Exception:
-                                    new_mv.append(None)
-                            e2[key] = new_mv
-                    sanitized.append(e2)
-                table = pa.Table.from_pylist(sanitized, schema=schema)
-                print("[MANIFEST DEBUG] Sanitized entries converted successfully")
-            except Exception:
-                print("[MANIFEST DEBUG] Sanitization failed; re-raising original exception")
-                raise
             buf = pa.BufferOutputStream()
-            pq.write_table(table, buf,
+            pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
             data = buf.getvalue().to_pybytes()
 
             if self.io:
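For reference, the `truncate_display` helper introduced above is self-contained; reproduced standalone with worked inputs:

```python
def truncate_display(v, max_len=32):
    """Truncate display value to max_len characters, adding '...' if longer."""
    if v is None:
        return None
    s = str(v)
    if len(s) > max_len:
        return s[:max_len] + "..."
    return s

assert truncate_display(None) is None
assert truncate_display("short") == "short"
assert truncate_display("x" * 40) == "x" * 32 + "..."  # 35 chars total
assert truncate_display(12345) == "12345"              # non-strings are str()'d
```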
@@ -696,15 +762,6 @@ class OpteryxCatalog(Metastore):
                     out.close()
                 except Exception:
                     pass
-            elif self._storage_client and self.gcs_bucket:
-                # Write to GCS bucket
-                bucket = self._storage_client.bucket(self.gcs_bucket)
-                # object path: remove gs://bucket/ prefix
-                parsed = parquet_path
-                if parsed.startswith("gs://"):
-                    parsed = parsed[5 + len(self.gcs_bucket) + 1 :]
-                blob = bucket.blob(parsed)
-                blob.upload_from_string(data)
 
             return parquet_path
         except Exception as e:
@@ -713,7 +770,7 @@ class OpteryxCatalog(Metastore):
             raise e
 
     def save_snapshot(self, identifier: str, snapshot: Snapshot) -> None:
-        """Persist a single snapshot document for a
+        """Persist a single snapshot document for a dataset."""
         namespace, dataset_name = identifier.split(".")
         snaps = self._snapshots_collection(namespace, dataset_name)
         doc_id = str(snapshot.snapshot_id)
@@ -749,9 +806,9 @@ class OpteryxCatalog(Metastore):
         snaps.document(doc_id).set(data)
 
     def save_dataset_metadata(self, identifier: str, metadata: DatasetMetadata) -> None:
-        """Persist
+        """Persist dataset-level metadata and snapshots to Firestore.
 
-        This writes the
+        This writes the dataset document and upserts snapshot documents.
         """
         collection, dataset_name = identifier.split(".")
         doc_ref = self._dataset_doc_ref(collection, dataset_name)
@@ -763,6 +820,7 @@ class OpteryxCatalog(Metastore):
             "location": metadata.location,
             "properties": metadata.properties,
             "format-version": metadata.format_version,
+            "annotations": metadata.annotations,
             "current-snapshot-id": metadata.current_snapshot_id,
             "current-schema-id": metadata.current_schema_id,
             "timestamp-ms": metadata.timestamp_ms,
@@ -777,10 +835,9 @@ class OpteryxCatalog(Metastore):
         # Metadata persisted in primary `datasets` collection only.
 
         snaps_coll = self._snapshots_collection(collection, dataset_name)
-
-
+        # Upsert snapshot documents. Do NOT delete existing snapshot documents
+        # here to avoid accidental removal of historical snapshots on save.
         for snap in metadata.snapshots:
-            new_ids.add(str(snap.snapshot_id))
             snaps_coll.document(str(snap.snapshot_id)).set(
                 {
                     "snapshot-id": snap.snapshot_id,
@@ -795,10 +852,6 @@ class OpteryxCatalog(Metastore):
                 }
             )
 
-        # Delete stale snapshots
-        for stale in existing - new_ids:
-            snaps_coll.document(stale).delete()
-
         # Persist schemas subcollection
         schemas_coll = doc_ref.collection("schemas")
         existing_schema_ids = {d.id for d in schemas_coll.stream()}
@@ -866,6 +919,7 @@ class OpteryxCatalog(Metastore):
                 "scale": scale,
                 "precision": precision,
                 "expectation-policies": [],
+                "annotations": [],
             }
 
             cols.append(typed)
@@ -873,7 +927,7 @@ class OpteryxCatalog(Metastore):
         return cols
 
     def _write_schema(self, namespace: str, dataset_name: str, schema: Any, author: str) -> str:
-        """Persist a schema document in the
+        """Persist a schema document in the dataset's `schemas` subcollection and
         return the new schema id.
         """
         import uuid