opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/__init__.py +1 -1
- opteryx_catalog/catalog/__init__.py +2 -1
- opteryx_catalog/catalog/compaction.py +529 -0
- opteryx_catalog/catalog/dataset.py +433 -451
- opteryx_catalog/catalog/manifest.py +415 -0
- opteryx_catalog/catalog/metadata.py +2 -2
- opteryx_catalog/catalog/metastore.py +2 -2
- opteryx_catalog/exceptions.py +1 -1
- opteryx_catalog/iops/gcs.py +35 -5
- opteryx_catalog/opteryx_catalog.py +257 -231
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/METADATA +1 -1
- opteryx_catalog-0.4.11.dist-info/RECORD +25 -0
- scripts/create_dataset.py +1 -1
- scripts/read_dataset.py +1 -1
- tests/test_compaction.py +233 -0
- tests/test_dataset_metadata.py +14 -0
- opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/WHEEL +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.11.dist-info}/top_level.txt +0 -0
opteryx_catalog/opteryx_catalog.py

@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import os
 import time
 from typing import Any
 from typing import Iterable
@@ -28,7 +27,7 @@ class OpteryxCatalog(Metastore):
 
     Terminology: catalog -> workspace -> collection -> dataset|view
 
-    Stores
+    Stores dataset documents under the configured workspace in Firestore.
     Snapshots are stored in a `snapshots` subcollection under each
     dataset's document. Parquet manifests are written to GCS under the
     dataset location's `metadata/manifest-<snapshot_id>.parquet` path.
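For orientation, the path convention the docstring describes can be pieced together from the hunks below; a minimal sketch, with illustrative names (nothing here beyond the `gs://` pattern and manifest suffix is taken from this diff):

```python
# Illustrative names only; the pattern mirrors create_dataset and the docstring.
bucket, workspace, collection, dataset_name, snapshot_id = (
    "example-bucket", "analytics", "sales", "orders", 12345,
)
dataset_location = f"gs://{bucket}/{workspace}/{collection}/{dataset_name}"
manifest_path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"
```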
@@ -57,12 +56,8 @@ class OpteryxCatalog(Metastore):
         props_ref = self._catalog_ref.document("$properties")
         if not props_ref.get().exists:
             now_ms = int(time.time() * 1000)
-            billing =
-
-                or os.environ.get("BILLING_ACCOUNT")
-                or None
-            )
-            owner = os.environ.get("WORKSPACE_OWNER") or None
+            billing = None
+            owner = None
             props_ref.set(
                 {
                     "timestamp-ms": now_ms,
@@ -81,12 +76,9 @@ class OpteryxCatalog(Metastore):
             self.io = io
         else:
             if gcs_bucket:
-                try:
-                    from .iops.gcs import GcsFileIO
+                from .iops.gcs import GcsFileIO
 
-
-                except Exception:
-                    self.io = FileIO()
+                self.io = GcsFileIO()
             else:
                 self.io = FileIO()
 
@@ -109,7 +101,7 @@ class OpteryxCatalog(Metastore):
         return self._dataset_doc_ref(collection, dataset_name).collection("snapshots")
 
     def _views_collection(self, collection: str):
-        return self.
+        return self._collection_ref(collection).collection("views")
 
     def _view_doc_ref(self, collection: str, view_name: str):
         return self._views_collection(collection).document(view_name)
@@ -125,7 +117,7 @@ class OpteryxCatalog(Metastore):
         if doc_ref.get().exists:
             raise DatasetAlreadyExists(f"Dataset already exists: {identifier}")
 
-        # Build default
+        # Build default dataset metadata
         location = f"gs://{self.gcs_bucket}/{self.workspace}/{collection}/{dataset_name}"
         metadata = DatasetMetadata(
             dataset_identifier=identifier,
@@ -152,8 +144,6 @@ class OpteryxCatalog(Metastore):
             }
         )
 
-        # Persisted in primary `datasets` collection only.
-
         # Persist initial schema into `schemas` subcollection if provided
         if schema is not None:
             schema_id = self._write_schema(collection, dataset_name, schema, author=author)
@@ -175,13 +165,27 @@ class OpteryxCatalog(Metastore):
             metadata.schemas = [
                 {"schema_id": schema_id, "columns": self._schema_to_columns(schema)}
             ]
-            # update
+            # update dataset doc to reference current schema
             doc_ref.update({"current-schema-id": metadata.current_schema_id})
 
         # Return SimpleDataset (attach this catalog so append() can persist)
         return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
 
-    def load_dataset(self, identifier: str) -> SimpleDataset:
+    def load_dataset(self, identifier: str, load_history: bool = False) -> SimpleDataset:
+        """Load a dataset from Firestore.
+
+        Args:
+            identifier: Dataset identifier in format 'collection.dataset_name'
+            load_history: If True, load all snapshots from Firestore (expensive for
+                large histories). If False (default), only load the current snapshot,
+                which is sufficient for most write operations.
+
+        Returns:
+            SimpleDataset instance with metadata loaded from Firestore.
+
+        Raises:
+            DatasetNotFound: If the dataset does not exist in Firestore.
+        """
         collection, dataset_name = identifier.split(".")
         doc_ref = self._dataset_doc_ref(collection, dataset_name)
         doc = doc_ref.get()
@@ -197,37 +201,68 @@ class OpteryxCatalog(Metastore):
             properties=data.get("properties") or {},
         )
 
-        # Load
+        # Load dataset-level timestamp/author and collection/workspace
         metadata.timestamp_ms = data.get("timestamp-ms")
         metadata.author = data.get("author")
-        # note: Firestore
+        # note: Firestore dataset doc stores the original collection and workspace
         # under keys `collection` and `workspace`.
 
-        # Load snapshots
+        # Load snapshots based on load_history flag
         snaps = []
-        for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
-            sd = snap_doc.to_dict() or {}
-            snap = Snapshot(
-                snapshot_id=sd.get("snapshot-id"),
-                timestamp_ms=sd.get("timestamp-ms"),
-                author=sd.get("author"),
-                sequence_number=sd.get("sequence-number"),
-                user_created=sd.get("user-created"),
-                manifest_list=sd.get("manifest"),
-                schema_id=sd.get("schema-id"),
-                summary=sd.get("summary", {}),
-                operation_type=sd.get("operation-type"),
-                parent_snapshot_id=sd.get("parent-snapshot-id"),
-            )
-            snaps.append(snap)
+        if load_history:
+            # Load all snapshots from Firestore (expensive for large histories)
+            for snap_doc in self._snapshots_collection(collection, dataset_name).stream():
+                sd = snap_doc.to_dict() or {}
+                snap = Snapshot(
+                    snapshot_id=sd.get("snapshot-id"),
+                    timestamp_ms=sd.get("timestamp-ms"),
+                    author=sd.get("author"),
+                    sequence_number=sd.get("sequence-number"),
+                    user_created=sd.get("user-created"),
+                    manifest_list=sd.get("manifest"),
+                    schema_id=sd.get("schema-id"),
+                    summary=sd.get("summary", {}),
+                    operation_type=sd.get("operation-type"),
+                    parent_snapshot_id=sd.get("parent-snapshot-id"),
+                )
+                snaps.append(snap)
+            if snaps:
+                metadata.current_snapshot_id = snaps[-1].snapshot_id
+        else:
+            # Load only the current snapshot (efficient single read)
+            current_snap_id = data.get("current-snapshot-id")
+            if current_snap_id:
+                try:
+                    snap_doc = (
+                        self._snapshots_collection(collection, dataset_name)
+                        .document(str(current_snap_id))
+                        .get()
+                    )
+                    if snap_doc.exists:
+                        sd = snap_doc.to_dict() or {}
+                        snap = Snapshot(
+                            snapshot_id=sd.get("snapshot-id"),
+                            timestamp_ms=sd.get("timestamp-ms"),
+                            author=sd.get("author"),
+                            sequence_number=sd.get("sequence-number"),
+                            user_created=sd.get("user-created"),
+                            manifest_list=sd.get("manifest"),
+                            schema_id=sd.get("schema-id"),
+                            summary=sd.get("summary", {}),
+                            operation_type=sd.get("operation-type"),
+                            parent_snapshot_id=sd.get("parent-snapshot-id"),
+                        )
+                        snaps.append(snap)
+                        metadata.current_snapshot_id = current_snap_id
+                except Exception:
+                    pass
         metadata.snapshots = snaps
-        if snaps:
-            metadata.current_snapshot_id = snaps[-1].snapshot_id
 
         # Load schemas subcollection
-
+        schemas_coll = doc_ref.collection("schemas")
+        # Load all schemas if requested; otherwise load only current schema
+        if load_history:
             schemas = []
-        schemas_coll = doc_ref.collection("schemas")
             for sdoc in schemas_coll.stream():
                 sd = sdoc.to_dict() or {}
                 schemas.append(
@@ -241,9 +276,23 @@ class OpteryxCatalog(Metastore):
                 )
             metadata.schemas = schemas
             metadata.current_schema_id = doc.to_dict().get("current-schema-id")
-
-
-
+        else:
+            # Only load the current schema document for efficiency
+            current_schema_id = doc.to_dict().get("current-schema-id")
+            if current_schema_id:
+                sdoc = schemas_coll.document(str(current_schema_id)).get()
+                if sdoc.exists:
+                    sd = sdoc.to_dict() or {}
+                    metadata.schemas = [
+                        {
+                            "schema_id": sdoc.id,
+                            "columns": sd.get("columns", []),
+                            "timestamp-ms": sd.get("timestamp-ms"),
+                            "author": sd.get("author"),
+                            "sequence-number": sd.get("sequence-number"),
+                        }
+                    ]
+                    metadata.current_schema_id = current_schema_id
         return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)
 
     def drop_dataset(self, identifier: str) -> None:
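The reworked `load_dataset` makes history loading opt-in: the default path is a single Firestore read of the current snapshot and current schema. A hedged usage sketch; the `OpteryxCatalog` constructor arguments are assumptions, only `load_dataset`'s signature comes from this diff:

```python
# Assumed constructor arguments, for illustration only.
catalog = OpteryxCatalog(workspace="analytics", gcs_bucket="example-bucket")

ds = catalog.load_dataset("sales.orders")  # current snapshot/schema only
ds_full = catalog.load_dataset("sales.orders", load_history=True)  # streams every snapshot doc
```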
@@ -270,7 +319,7 @@ class OpteryxCatalog(Metastore):
 
         If `exists_ok` is False and the collection already exists, a KeyError is raised.
         """
-        doc_ref = self.
+        doc_ref = self._collection_ref(collection)
         if doc_ref.get().exists:
             if exists_ok:
                 return
@@ -292,11 +341,7 @@ class OpteryxCatalog(Metastore):
         self, collection: str, properties: dict | None = None, author: Optional[str] = None
     ) -> None:
         """Convenience wrapper that creates the collection only if missing."""
-        try:
-            self.create_collection(collection, properties=properties, exists_ok=True, author=author)
-        except Exception:
-            # Be conservative: surface caller-level warnings rather than failing
-            return
+        self.create_collection(collection, properties=properties, exists_ok=True, author=author)
 
     def dataset_exists(
         self, identifier_or_collection: str, dataset_name: Optional[str] = None
@@ -309,12 +354,14 @@ class OpteryxCatalog(Metastore):
         """
         # Normalize inputs
         if dataset_name is None:
-            # Expect a single collection like 'collection.
+            # Expect a single collection like 'collection.dataset'
             if "." not in identifier_or_collection:
                 raise ValueError(
-                    "collection must be 'collection.
+                    "collection must be 'collection.dataset' or pass dataset_name separately"
                 )
             collection, dataset_name = identifier_or_collection.rsplit(".", 1)
+        else:
+            collection = identifier_or_collection
 
         try:
             doc_ref = self._dataset_doc_ref(collection, dataset_name)
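The added `else` branch fixes a latent `NameError` in `dataset_exists`: when `dataset_name` was passed separately, `collection` was never assigned. Both call forms should now resolve to the same document reference; a sketch (names are illustrative):

```python
catalog.dataset_exists("sales.orders")     # combined 'collection.dataset' identifier
catalog.dataset_exists("sales", "orders")  # collection and dataset_name passed apart
```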
@@ -334,6 +381,7 @@ class OpteryxCatalog(Metastore):
         author: str = None,
         description: Optional[str] = None,
         properties: dict | None = None,
+        update_if_exists: bool = False,
     ) -> CatalogView:
         """Create a view document and a statement version in the `statement` subcollection.
 
@@ -347,7 +395,22 @@ class OpteryxCatalog(Metastore):
 
         doc_ref = self._view_doc_ref(collection, view_name)
         if doc_ref.get().exists:
-
+            if not update_if_exists:
+                raise ViewAlreadyExists(f"View already exists: {collection}.{view_name}")
+            # Update existing view - get current sequence number
+            existing_doc = doc_ref.get().to_dict()
+            current_statement_id = existing_doc.get("statement-id")
+            if current_statement_id:
+                stmt_ref = doc_ref.collection("statement").document(current_statement_id)
+                stmt_doc = stmt_ref.get()
+                if stmt_doc.exists:
+                    sequence_number = stmt_doc.to_dict().get("sequence-number", 0) + 1
+                else:
+                    sequence_number = 1
+            else:
+                sequence_number = 1
+        else:
+            sequence_number = 1
 
         now_ms = int(time.time() * 1000)
         if author is None:
@@ -361,7 +424,7 @@ class OpteryxCatalog(Metastore):
                 "sql": sql,
                 "timestamp-ms": now_ms,
                 "author": author,
-                "sequence-number":
+                "sequence-number": sequence_number,
             }
         )
 
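Taken together, the two hunks above give views versioned statements: re-creating an existing view with `update_if_exists=True` writes a new `statement` document whose `sequence-number` is the previous one plus one. A hedged sketch; the keyword names are inferred from the surrounding code, not a confirmed signature:

```python
# First write stores a statement with sequence-number 1 (per the fallback above).
catalog.create_view(collection="sales", view_name="daily_totals", sql="SELECT 1")

# Without the flag this raises ViewAlreadyExists; with it, a second statement
# version is stored with sequence-number 2.
catalog.create_view(
    collection="sales",
    view_name="daily_totals",
    sql="SELECT 2",
    update_if_exists=True,
)
```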
@@ -389,6 +452,9 @@ class OpteryxCatalog(Metastore):
         setattr(v, "sql", sql)
         setattr(v, "metadata", type("M", (), {})())
         v.metadata.schema = schema
+        # Attach catalog and identifier for describe() method
+        setattr(v, "_catalog", self)
+        setattr(v, "_identifier", f"{collection}.{view_name}")
         return v
 
     def load_view(self, identifier: str | tuple) -> CatalogView:
@@ -410,27 +476,28 @@ class OpteryxCatalog(Metastore):
         stmt_id = data.get("statement-id")
         sql = None
         schema = data.get("schema")
-
-
-
-            if sdoc.exists:
-                sql = (sdoc.to_dict() or {}).get("sql")
-            # fallback: pick the most recent statement
-            if not sql:
-                for s in doc_ref.collection("statement").stream():
-                    sd = s.to_dict() or {}
-                    if sd.get("sql"):
-                        sql = sd.get("sql")
-                        break
-        except Exception:
-            pass
+
+        sdoc = doc_ref.collection("statement").document(str(stmt_id)).get()
+        sql = (sdoc.to_dict() or {}).get("sql")
 
         v = CatalogView(name=view_name, definition=sql or "", properties=data.get("properties", {}))
         setattr(v, "sql", sql or "")
         setattr(v, "metadata", type("M", (), {})())
         v.metadata.schema = schema
+        # Populate metadata fields from the stored view document so callers
+        # expecting attributes like `timestamp_ms` won't fail.
         v.metadata.author = data.get("author")
         v.metadata.description = data.get("description")
+        v.metadata.timestamp_ms = data.get("timestamp-ms")
+        # Execution/operational fields (may be None)
+        v.metadata.last_execution_ms = data.get("last-execution-ms")
+        v.metadata.last_execution_data_size = data.get("last-execution-data-size")
+        v.metadata.last_execution_records = data.get("last-execution-records")
+        # Optional describer (used to flag LLM-generated descriptions)
+        v.metadata.describer = data.get("describer")
+        # Attach catalog and identifier for describe() method
+        setattr(v, "_catalog", self)
+        setattr(v, "_identifier", f"{collection}.{view_name}")
         return v
 
     def drop_view(self, identifier: str | tuple) -> None:
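`load_view` now surfaces the operational fields stored on the view document, so callers can inspect them without reading Firestore directly. A small sketch (assumes a `catalog` instance and an existing view):

```python
v = catalog.load_view("sales.daily_totals")
print(v.sql)                    # statement resolved via the stored statement-id
print(v.metadata.timestamp_ms)  # populated from the view document
print(v.metadata.describer)     # e.g. flags LLM-generated descriptions
```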
@@ -441,11 +508,9 @@ class OpteryxCatalog(Metastore):
 
         doc_ref = self._view_doc_ref(collection, view_name)
         # delete statement subcollection
-        try:
-
-
-        except Exception:
-            pass
+        for d in doc_ref.collection("statement").stream():
+            doc_ref.collection("statement").document(d.id).delete()
+
         doc_ref.delete()
 
     def list_views(self, collection: str) -> Iterable[str]:
@@ -474,6 +539,8 @@ class OpteryxCatalog(Metastore):
                 "identifier must be 'collection.view' or pass view_name separately"
             )
             collection, view_name = identifier_or_collection.rsplit(".", 1)
+        else:
+            collection = identifier_or_collection
 
         try:
             doc_ref = self._view_doc_ref(collection, view_name)
@@ -501,40 +568,75 @@ class OpteryxCatalog(Metastore):
             updates["last-execution-time-ms"] = int(execution_time * 1000)
             updates["last-execution-ms"] = now_ms
         if updates:
-
-
-
-
+            doc_ref.update(updates)
+
+    def update_view_description(
+        self,
+        identifier: str | tuple,
+        description: str,
+        describer: Optional[str] = None,
+    ) -> None:
+        """Update the description for a view.
+
+        Args:
+            identifier: View identifier ('collection.view' or tuple)
+            description: The new description text
+            describer: Optional identifier for who/what created the description
+        """
+        if isinstance(identifier, tuple) or isinstance(identifier, list):
+            collection, view_name = identifier[0], identifier[1]
+        else:
+            collection, view_name = identifier.split(".")
+
+        doc_ref = self._view_doc_ref(collection, view_name)
+        updates = {
+            "description": description,
+        }
+        if describer is not None:
+            updates["describer"] = describer
+        doc_ref.update(updates)
+
+    def update_dataset_description(
+        self,
+        identifier: str,
+        description: str,
+        describer: Optional[str] = None,
+    ) -> None:
+        """Update the description for a dataset.
+
+        Args:
+            identifier: Dataset identifier in format 'collection.dataset_name'
+            description: The new description text
+            describer: Optional identifier for who/what created the description
+        """
+        collection, dataset_name = identifier.split(".")
+        doc_ref = self._dataset_doc_ref(collection, dataset_name)
+        updates = {
+            "description": description,
+        }
+        if describer is not None:
+            updates["describer"] = describer
+        doc_ref.update(updates)
 
     def write_parquet_manifest(
-        self, snapshot_id: int, entries: List[dict],
+        self, snapshot_id: int, entries: List[dict], dataset_location: str
     ) -> Optional[str]:
         """Write a Parquet manifest for the given snapshot id and entries.
 
         Entries should be plain dicts convertible by pyarrow.Table.from_pylist.
-        The manifest will be written to <
+        The manifest will be written to <dataset_location>/metadata/manifest-<snapshot_id>.parquet
         """
         import pyarrow as pa
         import pyarrow.parquet as pq
 
         # If entries is None we skip writing; if entries is empty list, write
-        # an empty Parquet manifest (represents an empty
+        # an empty Parquet manifest (represents an empty dataset for this
         # snapshot). This preserves previous manifests so older snapshots
         # remain readable.
         if entries is None:
             return None
 
-
-        try:
-            pass
-
-            # print("[MANIFEST] Parquet manifest entries to write:")
-            # print(json.dumps(entries, indent=2, default=str))
-        except Exception:
-            # print("[MANIFEST] Parquet manifest entries:", entries)
-            pass
-
-        parquet_path = f"{table_location}/metadata/manifest-{snapshot_id}.parquet"
+        parquet_path = f"{dataset_location}/metadata/manifest-{snapshot_id}.parquet"
 
         # Use provided FileIO if it supports writing; otherwise write to GCS
         try:
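The two new description setters share one shape: build an `updates` dict and attach `describer` only when given, so an existing describer is not clobbered by plain edits. A usage sketch (identifier and text values are illustrative):

```python
catalog.update_dataset_description(
    "sales.orders",
    description="Daily order lines landed from the POS export.",
    describer="llm",  # optional marker for machine-written descriptions
)
catalog.update_view_description(
    ("sales", "daily_totals"),  # tuple form is accepted alongside 'collection.view'
    description="Aggregated order totals per day.",
)
```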
@@ -546,144 +648,77 @@ class OpteryxCatalog(Metastore):
                 ("file_format", pa.string()),
                 ("record_count", pa.int64()),
                 ("file_size_in_bytes", pa.int64()),
+                ("uncompressed_size_in_bytes", pa.int64()),
+                ("column_uncompressed_sizes_in_bytes", pa.list_(pa.int64())),
+                ("null_counts", pa.list_(pa.int64())),
                 ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
                 ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
                 ("histogram_bins", pa.int32()),
-                ("min_values", pa.list_(pa.
-                ("max_values", pa.list_(pa.
+                ("min_values", pa.list_(pa.binary())),
+                ("max_values", pa.list_(pa.binary())),
             ]
         )
 
+        # Normalize entries to match schema expectations:
+        normalized = []
+        for ent in entries:
+            if not isinstance(ent, dict):
+                normalized.append(ent)
+                continue
+            e = dict(ent)
+            # Ensure list fields exist
+            e.setdefault("min_k_hashes", [])
+            e.setdefault("histogram_counts", [])
+            e.setdefault("histogram_bins", 0)
+            e.setdefault("column_uncompressed_sizes_in_bytes", [])
+            e.setdefault("null_counts", [])
+
+            # Process min/max values: truncate to 16 bytes with ellipsis marker if longer
+            mv = e.get("min_values") or []
+            xv = e.get("max_values") or []
+
+            def truncate_value(v):
+                """Convert value to binary and truncate to 16 bytes with marker if needed."""
+                if v is None:
+                    return None
+                # Convert to bytes
+                if isinstance(v, bytes):
+                    b = v
+                else:
+                    b = str(v).encode('utf-8')
+                # Truncate if longer than 16 bytes, add 0xFF as 17th byte to indicate truncation
+                if len(b) > 16:
+                    return b[:16] + b'\xff'
+                return b
+
+            e["min_values"] = [truncate_value(v) for v in mv]
+            e["max_values"] = [truncate_value(v) for v in xv]
+            normalized.append(e)
+
         try:
-            table = pa.Table.from_pylist(
-        except Exception:
+            table = pa.Table.from_pylist(normalized, schema=schema)
+        except Exception as exc:
             # Diagnostic output to help find malformed manifest entries
-            try:
-                print(
-                    "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
-                )
-                for i, ent in enumerate(entries):
-                    print(f" Entry {i}:")
-                    if isinstance(ent, dict):
-                        for k, v in ent.items():
-                            tname = type(v).__name__
-                            try:
-                                s = repr(v)
-                            except Exception:
-                                s = "<unreprable>"
-                            print(f" - {k}: type={tname} repr={s[:200]}")
-                    else:
-                        print(
-                            f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
-                        )
-            except Exception:
-                pass
 
-
-
-
-
-
-
-
-
-                e2 = dict(ent)  # copy
-                # Ensure numeric fields
-                for k in ("record_count", "file_size_in_bytes", "histogram_bins"):
-                    v = e2.get(k)
+            print(
+                "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
+            )
+            for i, ent in enumerate(entries):
+                print(f" Entry {i}:")
+                if isinstance(ent, dict):
+                    for k, v in ent.items():
+                        tname = type(v).__name__
                         try:
-
+                            s = repr(v)
                         except Exception:
-
-
-
-
-
-
-
-
-                            if isinstance(sub, list):
-                                try:
-                                    new_mk.append([int(x) for x in sub])
-                                except Exception:
-                                    new_mk.append([])
-                            else:
-                                new_mk.append([])
-                e2["min_k_hashes"] = new_mk
-                # Ensure histogram_counts is list[list[int]]
-                hc = e2.get("histogram_counts")
-                if not isinstance(hc, list):
-                    e2["histogram_counts"] = []
-                else:
-                    new_hc = []
-                    for sub in hc:
-                        if isinstance(sub, list):
-                            try:
-                                new_hc.append([int(x) for x in sub])
-                            except Exception:
-                                new_hc.append([])
-                        else:
-                            new_hc.append([])
-                    e2["histogram_counts"] = new_hc
-                # Sanitize min_values / max_values: must be list[int] or None
-                # Sanitize min_values / max_values: coerce to int64 using to_int() if available
-                try:
-                    from opteryx.compiled.structures.relation_statistics import to_int
-                except Exception:
-
-                    def to_int(val):
-                        # Best-effort fallback: handle numpy types, strings and numbers
-                        try:
-                            if val is None:
-                                return None
-                            if hasattr(val, "item"):
-                                val = val.item()
-                            if isinstance(val, (bytes, bytearray)):
-                                val = val.decode(errors="ignore")
-                            if isinstance(val, str):
-                                # empty strings are invalid
-                                if val == "":
-                                    return None
-                                try:
-                                    return int(val)
-                                except Exception:
-                                    return None
-                            if isinstance(val, float):
-                                return int(val)
-                            return int(val)
-                        except Exception:
-                            return None
-
-                for key in ("min_values", "max_values"):
-                    mv = e2.get(key)
-                    if not isinstance(mv, list):
-                        e2[key] = [None]
-                    else:
-                        new_mv = []
-                        for x in mv:
-                            try:
-                                if x is None:
-                                    new_mv.append(None)
-                                    continue
-                                # Use to_int to coerce into int64 semantics
-                                v = x
-                                if hasattr(v, "item"):
-                                    v = v.item()
-                                coerced = to_int(v)
-                                # to_int may return None-like sentinel; accept ints only
-                                if coerced is None:
-                                    new_mv.append(None)
-                                else:
-                                    new_mv.append(int(coerced))
-                            except Exception:
-                                new_mv.append(None)
-                        e2[key] = new_mv
-                sanitized.append(e2)
-            table = pa.Table.from_pylist(sanitized, schema=schema)
-            print("[MANIFEST DEBUG] Sanitized entries converted successfully")
-        except Exception:
-            print("[MANIFEST DEBUG] Sanitization failed; re-raising original exception")
-            raise
+                            s = "<unreprable>"
+                        print(f" - {k}: type={tname} repr={s[:200]}")
+                else:
+                    print(
+                        f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
+                    )
+            raise exc
+
         buf = pa.BufferOutputStream()
         pq.write_table(table, buf, compression="zstd")
         data = buf.getvalue().to_pybytes()
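The replacement for the old int-coercion path stores min/max statistics as raw bytes capped at 16 bytes, appending 0xFF as a 17th byte to mark truncation (0xFF never occurs in valid UTF-8, so the marker is unambiguous). A standalone restatement of the rule from this hunk:

```python
def truncate_value(v):
    # Mirror of the helper added above: bytes out, 16-byte cap, 0xFF marker.
    if v is None:
        return None
    b = v if isinstance(v, bytes) else str(v).encode("utf-8")
    return b[:16] + b"\xff" if len(b) > 16 else b

assert truncate_value("short") == b"short"
assert truncate_value("exactly-16-bytes") == b"exactly-16-bytes"  # fits, untouched
assert truncate_value("twenty-characters-xx") == b"twenty-character\xff"
```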
@@ -696,15 +731,6 @@ class OpteryxCatalog(Metastore):
                     out.close()
                 except Exception:
                     pass
-            elif self._storage_client and self.gcs_bucket:
-                # Write to GCS bucket
-                bucket = self._storage_client.bucket(self.gcs_bucket)
-                # object path: remove gs://bucket/ prefix
-                parsed = parquet_path
-                if parsed.startswith("gs://"):
-                    parsed = parsed[5 + len(self.gcs_bucket) + 1 :]
-                blob = bucket.blob(parsed)
-                blob.upload_from_string(data)
 
             return parquet_path
         except Exception as e:
@@ -713,7 +739,7 @@ class OpteryxCatalog(Metastore):
             raise e
 
     def save_snapshot(self, identifier: str, snapshot: Snapshot) -> None:
-        """Persist a single snapshot document for a
+        """Persist a single snapshot document for a dataset."""
         namespace, dataset_name = identifier.split(".")
         snaps = self._snapshots_collection(namespace, dataset_name)
         doc_id = str(snapshot.snapshot_id)
@@ -749,9 +775,9 @@ class OpteryxCatalog(Metastore):
         snaps.document(doc_id).set(data)
 
     def save_dataset_metadata(self, identifier: str, metadata: DatasetMetadata) -> None:
-        """Persist
+        """Persist dataset-level metadata and snapshots to Firestore.
 
-        This writes the
+        This writes the dataset document and upserts snapshot documents.
         """
         collection, dataset_name = identifier.split(".")
         doc_ref = self._dataset_doc_ref(collection, dataset_name)
@@ -873,7 +899,7 @@ class OpteryxCatalog(Metastore):
         return cols
 
     def _write_schema(self, namespace: str, dataset_name: str, schema: Any, author: str) -> str:
-        """Persist a schema document in the
+        """Persist a schema document in the dataset's `schemas` subcollection and
         return the new schema id.
         """
         import uuid