opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- opteryx_catalog/catalog/compaction.py +15 -8
- opteryx_catalog/catalog/dataset.py +449 -111
- opteryx_catalog/catalog/manifest.py +390 -330
- opteryx_catalog/catalog/metadata.py +3 -0
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +82 -54
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- tests/test_collections.py +37 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/opteryx_catalog/maki_nage/tests/test_count_at.py
@@ -0,0 +1,89 @@
+# type:ignore
+# isort: skip_file
+import sys
+import os
+
+sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+from opteryx.third_party.maki_nage import distogram
+from pytest import approx
+import random
+
+
+def test_count_at():
+    h = distogram.Distogram(bin_count=3)
+    print(h)
+
+    # fill histogram
+    distogram.update(h, 16, count=4)
+    distogram.update(h, 23, count=3)
+    distogram.update(h, 28, count=5)
+    print(h)
+
+    actual_result = distogram.count_at(h, 25)
+    assert actual_result == approx(6.859999999)
+
+
+def test_count_at_normal():
+    points = 10000
+    normal = [random.normalvariate(0.0, 1.0) for _ in range(points)]
+    h = distogram.Distogram()
+
+    for i in normal:
+        distogram.update(h, i)
+
+    assert distogram.count_at(h, 0) == approx(points / 2, rel=0.05)
+
+
+def test_count_at_not_enough_elements():
+    h = distogram.Distogram()
+
+    distogram.update(h, 1)
+    distogram.update(h, 2)
+    distogram.update(h, 3)
+
+    assert distogram.count_at(h, 2.5) == 2
+
+
+def test_count_at_left():
+    h = distogram.Distogram(bin_count=6)
+
+    for i in [1, 2, 3, 4, 5, 6, 0.7, 1.1]:
+        distogram.update(h, i)
+
+    assert distogram.count_at(h, 0.77) == approx(0.14), distogram.count_at(h, 0.77)
+
+
+def test_count_at_right():
+    h = distogram.Distogram(bin_count=6)
+
+    for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]:
+        distogram.update(h, i)
+
+    assert distogram.count_at(h, 6.5) == approx(7.307692307692308)
+
+
+def test_count_at_empty():
+    h = distogram.Distogram()
+
+    assert distogram.count_at(h, 6.5) is None
+
+
+def test_count_at_out_of_bouns():
+    h = distogram.Distogram()
+
+    for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]:
+        distogram.update(h, i)
+
+    assert distogram.count_at(h, 0.2) is None
+    assert distogram.count_at(h, 10) is None
+
+
+if __name__ == "__main__":  # pragma: no cover
+    test_count_at()
+    test_count_at_empty()
+    test_count_at_left()
+    test_count_at_normal()
+    test_count_at_not_enough_elements()
+    test_count_at_out_of_bouns()
+    test_count_at_right()
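The expected value `approx(6.859999999)` in `test_count_at` can be sanity-checked by hand. Assuming `count_at` follows the usual streaming-histogram "sum" rule (full counts of bins strictly left of the bracketing bin, half of that bin, plus a trapezoidal interpolation towards the query point), the bins `(16, 4)`, `(23, 3)`, `(28, 5)` give exactly that number:

```python
# Hand check of the expected count_at(h, 25) value, assuming the
# trapezoidal "sum" rule commonly used by streaming histograms.
# Bins after the updates: (16, 4), (23, 3), (28, 5); the query value 25
# falls between centroids 23 and 28.
p_i, m_i = 23, 3                 # left bracketing bin
p_j, m_j = 28, 5                 # right bracketing bin
frac = (25 - p_i) / (p_j - p_i)  # 0.4
m_b = m_i + (m_j - m_i) * frac   # interpolated bin height: 3.8
partial = (m_i + m_b) / 2 * frac # trapezoid area between 23 and 25: 1.36
total = 4 + m_i / 2 + partial    # bins to the left + half of bin i + partial
print(total)                     # 6.86, matching the assertion
```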
--- /dev/null
+++ b/opteryx_catalog/maki_nage/tests/test_quantile.py
@@ -0,0 +1,81 @@
+# type:ignore
+# isort: skip_file
+import sys
+import os
+
+sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+from opteryx.third_party.maki_nage import distogram
+from pytest import approx
+
+import numpy as np
+import random
+
+
+def test_quantile():
+    h = distogram.Distogram(bin_count=3)
+    distogram.update(h, 16, count=4)
+    distogram.update(h, 23, count=3)
+    distogram.update(h, 28, count=5)
+
+    assert distogram.quantile(h, 0.5) == approx(23.625)
+
+
+def test_quantile_not_enough_elemnts():
+    h = distogram.Distogram(bin_count=10)
+
+    for i in [12.3, 5.4, 8.2, 100.53, 23.5, 13.98]:
+        distogram.update(h, i)
+
+    assert distogram.quantile(h, 0.5) == approx(13.14)
+
+
+def test_quantile_on_left():
+    h = distogram.Distogram(bin_count=6)
+
+    data = [12.3, 5.2, 5.4, 4.9, 5.5, 5.6, 8.2, 30.53, 23.5, 13.98]
+    for i in data:
+        distogram.update(h, i)
+
+    assert distogram.quantile(h, 0.01) == approx(np.quantile(data, 0.01), rel=0.01)
+    assert distogram.quantile(h, 0.05) == approx(np.quantile(data, 0.05), rel=0.05)
+    assert distogram.quantile(h, 0.25) == approx(np.quantile(data, 0.25), rel=0.05)
+
+
+def test_quantile_on_right():
+    h = distogram.Distogram(bin_count=6)
+
+    data = [12.3, 8.2, 100.53, 23.5, 13.98, 200, 200.2, 200.8, 200.4, 200.1]
+    for i in data:
+        distogram.update(h, i)
+
+    assert distogram.quantile(h, 0.99) == approx(np.quantile(data, 0.99), rel=0.01)
+    assert distogram.quantile(h, 0.85) == approx(np.quantile(data, 0.85), rel=0.01)
+
+
+def test_normal():
+    # normal = np.random.normal(0,1, 1000)
+    normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)]
+    h = distogram.Distogram(bin_count=64)
+
+    for i in normal:
+        distogram.update(h, i)
+
+    assert distogram.quantile(h, 0.5) == approx(np.quantile(normal, 0.5), abs=0.2)
+    assert distogram.quantile(h, 0.95) == approx(np.quantile(normal, 0.95), abs=0.2)
+
+
+def test_quantile_empty():
+    h = distogram.Distogram()
+
+    assert distogram.quantile(h, 0.3) is None
+
+
+def test_quantile_out_of_bouns():
+    h = distogram.Distogram()
+
+    for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]:
+        distogram.update(h, i)
+
+    assert distogram.quantile(h, -0.2) is None
+    assert distogram.quantile(h, 10) is None
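The expected median `23.625` in `test_quantile` is likewise easy to check by hand; it is consistent with linear interpolation between the cumulative counts attributed to adjacent bin centroids (a sketch of the arithmetic, not the library's exact code path):

```python
# Hand check of the expected quantile(h, 0.5) value.
# Bins: (16, 4), (23, 3), (28, 5); total count 12, so the median rank is 6.
# Cumulative count attributed to each centroid (bins to the left + half of the bin):
# at 16 -> 2.0, at 23 -> 5.5, at 28 -> 9.5.
rank = 0.5 * 12                  # 6.0, falls between centroids 23 and 28
left_c, left_cum = 23, 5.5
right_c, right_cum = 28, 9.5
median = left_c + (rank - left_cum) / (right_cum - left_cum) * (right_c - left_c)
print(median)                    # 23.625, matching the assertion
```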
--- /dev/null
+++ b/opteryx_catalog/maki_nage/tests/test_stats.py
@@ -0,0 +1,25 @@
+# type:ignore
+# isort: skip_file
+import sys
+import os
+
+sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+from opteryx.third_party.maki_nage import distogram
+from pytest import approx
+
+
+import numpy as np
+import random
+
+
+def test_stats():
+    normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)]
+    h = distogram.Distogram()
+
+    for i in normal:
+        distogram.update(h, i)
+
+    assert distogram.mean(h) == approx(np.mean(normal), abs=0.1)
+    assert distogram.variance(h) == approx(np.var(normal), abs=0.1)
+    assert distogram.stddev(h) == approx(np.std(normal), abs=0.1)
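These statistics only approximate the exact sample values because the histogram keeps bin centroids and counts rather than the raw data, hence the `abs=0.1` tolerances. A minimal sketch of the weighted moments such a (centroid, count) summary supports, as an illustration rather than the library's implementation:

```python
# Approximate moments from (centroid, count) pairs - an illustration of why
# mean/variance/stddev over a Distogram are close to, but not exactly,
# the numpy values computed from the raw samples.
bins = [(16.0, 4), (23.0, 3), (28.0, 5)]   # hypothetical histogram state
total = sum(count for _, count in bins)
mean = sum(value * count for value, count in bins) / total
variance = sum(count * (value - mean) ** 2 for value, count in bins) / total
stddev = variance ** 0.5
print(mean, variance, stddev)
```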
--- /dev/null
+++ b/opteryx_catalog/maki_nage/tests/test_update.py
@@ -0,0 +1,44 @@
+# type:ignore
+# isort: skip_file
+import sys
+import os
+
+sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+from opteryx.third_party.maki_nage import distogram
+import pytest
+from pytest import approx
+
+
+def test_update():
+    h = distogram.Distogram(bin_count=3)
+
+    # fill histogram
+    distogram.update(h, 23)
+    assert h.bins == [(23, 1)]
+    distogram.update(h, 28)
+    assert h.bins == [(23, 1), (28, 1)]
+    distogram.update(h, 16)
+    assert h.bins == [(16, 1), (23, 1), (28, 1)]
+
+    # update count on existing value
+    distogram.update(h, 23)
+    assert h.bins == [(16, 1), (23, 2), (28, 1)]
+    distogram.update(h, 28)
+    assert h.bins == [(16, 1), (23, 2), (28, 2)]
+    distogram.update(h, 16)
+    assert h.bins == [(16, 2), (23, 2), (28, 2)]
+
+    # merge values
+    h = distogram.update(h, 26)
+    assert h.bins[0] == (16, 2)
+    assert h.bins[1] == (23, 2)
+    assert h.bins[2][0] == approx(27.33333)
+    assert h.bins[2][1] == 3
+
+
+def test_update_with_invalid_count():
+    h = distogram.Distogram(bin_count=3)
+
+    with pytest.raises(ValueError):
+        distogram.update(h, 23, count=0)
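The merge assertion at the end of `test_update` can also be checked by hand: with `bin_count=3`, inserting `26` would create a fourth bin, so the two closest centroids (`26` and `28`) collapse into one count-weighted centroid. A sketch of the arithmetic, assuming the usual centroid-merge rule:

```python
# Hand check of the merged bin (approx 27.33333, count 3).
# Before the merge the bins are (16, 2), (23, 2), (26, 1), (28, 2);
# the closest pair is 26 and 28, so they are combined.
merged_count = 1 + 2
merged_centroid = (26 * 1 + 28 * 2) / merged_count
print(merged_centroid, merged_count)   # 27.333..., 3 - matching the assertions
```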
--- a/opteryx_catalog/opteryx_catalog.py
+++ b/opteryx_catalog/opteryx_catalog.py
@@ -20,6 +20,9 @@ from .exceptions import DatasetNotFound
 from .exceptions import ViewAlreadyExists
 from .exceptions import ViewNotFound
 from .iops.base import FileIO
+from .webhooks import send_webhook
+from .webhooks.events import dataset_created_payload
+from .webhooks.events import view_created_payload


 class OpteryxCatalog(Metastore):
@@ -141,6 +144,7 @@ class OpteryxCatalog(Metastore):
                 "timestamp-ms": now_ms,
                 "author": author,
                 "maintenance-policy": metadata.maintenance_policy,
+                "annotations": metadata.annotations,
             }
         )

@@ -168,6 +172,20 @@ class OpteryxCatalog(Metastore):
         # update dataset doc to reference current schema
         doc_ref.update({"current-schema-id": metadata.current_schema_id})

+        # Send webhook notification
+        send_webhook(
+            action="create",
+            workspace=self.workspace,
+            collection=collection,
+            resource_type="dataset",
+            resource_name=dataset_name,
+            payload=dataset_created_payload(
+                schema=schema,
+                location=location,
+                properties=properties,
+            ),
+        )
+
         # Return SimpleDataset (attach this catalog so append() can persist)
         return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)

@@ -204,8 +222,9 @@ class OpteryxCatalog(Metastore):
         # Load dataset-level timestamp/author and collection/workspace
         metadata.timestamp_ms = data.get("timestamp-ms")
         metadata.author = data.get("author")
-
-
+        metadata.description = data.get("description")
+        metadata.describer = data.get("describer")
+        metadata.annotations = data.get("annotations") or []

         # Load snapshots based on load_history flag
         snaps = []
@@ -308,6 +327,13 @@ class OpteryxCatalog(Metastore):
         coll = self._datasets_collection(collection)
         return [doc.id for doc in coll.stream()]

+    def list_collections(self) -> Iterable[str]:
+        """List top-level collections (documents) in this workspace."""
+        try:
+            return [col.id for col in self._catalog_ref.list_documents() if col.id[0] != "$"]
+        except:
+            return []
+
     def create_collection(
         self,
         collection: str,
@@ -334,6 +360,7 @@ class OpteryxCatalog(Metastore):
                 "properties": properties or {},
                 "timestamp-ms": now_ms,
                 "author": author,
+                "annotations": [],
             }
         )

@@ -446,6 +473,19 @@ class OpteryxCatalog(Metastore):
             }
         )

+        # Send webhook notification
+        send_webhook(
+            action="create" if not update_if_exists else "update",
+            workspace=self.workspace,
+            collection=collection,
+            resource_type="view",
+            resource_name=view_name,
+            payload=view_created_payload(
+                definition=sql,
+                properties=properties,
+            ),
+        )
+
         # Return a simple CatalogView wrapper
         v = CatalogView(name=view_name, definition=sql, properties=properties or {})
         # provide convenient attributes used by docs/examples
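Both `create_dataset` and `create_view` now emit a webhook after persisting metadata. The payload construction lives in `opteryx_catalog/webhooks/events.py`, which this diff does not expand, so the exact body shape is not visible here. Purely as orientation, a receiver for such notifications could look like the following sketch, which assumes delivery as an HTTP POST with a JSON body; the field names are not taken from the package.

```python
# A minimal, hypothetical receiver for catalog webhook notifications.
# The real payload schema is defined in opteryx_catalog/webhooks/events.py
# (not shown in this diff); treat the body as opaque JSON.
from http.server import BaseHTTPRequestHandler, HTTPServer
import json


class WebhookHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        event = json.loads(self.rfile.read(length) or b"{}")
        # Log whatever the catalog sent; do not assume specific keys exist.
        print("received catalog event:", event)
        self.send_response(204)
        self.end_headers()


if __name__ == "__main__":
    HTTPServer(("localhost", 8080), WebhookHandler).serve_forever()
```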
@@ -598,7 +638,7 @@ class OpteryxCatalog(Metastore):

     def update_dataset_description(
         self,
-        identifier: str,
+        identifier: str | tuple,
         description: str,
         describer: Optional[str] = None,
     ) -> None:
@@ -609,7 +649,12 @@ class OpteryxCatalog(Metastore):
             description: The new description text
             describer: Optional identifier for who/what created the description
         """
-
+
+        if isinstance(identifier, tuple) or isinstance(identifier, list):
+            collection, dataset_name = identifier[0], identifier[1]
+        else:
+            collection, dataset_name = identifier.split(".")
+
         doc_ref = self._dataset_doc_ref(collection, dataset_name)
         updates = {
             "description": description,
@@ -629,6 +674,8 @@ class OpteryxCatalog(Metastore):
         import pyarrow as pa
         import pyarrow.parquet as pq

+        from .iops.fileio import WRITE_PARQUET_OPTIONS
+
         # If entries is None we skip writing; if entries is empty list, write
         # an empty Parquet manifest (represents an empty dataset for this
         # snapshot). This preserves previous manifests so older snapshots
@@ -654,8 +701,10 @@ class OpteryxCatalog(Metastore):
                 ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
                 ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
                 ("histogram_bins", pa.int32()),
-                ("min_values", pa.list_(pa.
-                ("max_values", pa.list_(pa.
+                ("min_values", pa.list_(pa.int64())),
+                ("max_values", pa.list_(pa.int64())),
+                ("min_values_display", pa.list_(pa.string())),
+                ("max_values_display", pa.list_(pa.string())),
             ]
         )

@@ -672,55 +721,37 @@ class OpteryxCatalog(Metastore):
             e.setdefault("histogram_bins", 0)
             e.setdefault("column_uncompressed_sizes_in_bytes", [])
             e.setdefault("null_counts", [])
-
-
+            e.setdefault("min_values_display", [])
+            e.setdefault("max_values_display", [])
+
+            # min/max values are stored as compressed int64 values
+            # display values are string representations for human readability
             mv = e.get("min_values") or []
             xv = e.get("max_values") or []
-
-
-
+            mv_disp = e.get("min_values_display") or []
+            xv_disp = e.get("max_values_display") or []
+
+            def truncate_display(v, max_len=32):
+                """Truncate display value to max_len characters, adding '...' if longer."""
                 if v is None:
                     return None
-
-                if
-
-
-
-
-
-
-
-            e["
-            e["max_values"] = [truncate_value(v) for v in xv]
+                s = str(v)
+                if len(s) > max_len:
+                    return s[:max_len] + "..."
+                return s
+
+            # Ensure int64 values are properly typed for min/max
+            e["min_values"] = [int(v) if v is not None else None for v in mv]
+            e["max_values"] = [int(v) if v is not None else None for v in xv]
+            # Display values truncated to 32 chars with '...' suffix if longer
+            e["min_values_display"] = [truncate_display(v) for v in mv_disp]
+            e["max_values_display"] = [truncate_display(v) for v in xv_disp]
             normalized.append(e)

-
-            table = pa.Table.from_pylist(normalized, schema=schema)
-        except Exception as exc:
-            # Diagnostic output to help find malformed manifest entries
-
-            print(
-                "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
-            )
-            for i, ent in enumerate(entries):
-                print(f" Entry {i}:")
-                if isinstance(ent, dict):
-                    for k, v in ent.items():
-                        tname = type(v).__name__
-                        try:
-                            s = repr(v)
-                        except Exception:
-                            s = "<unreprable>"
-                        print(f" - {k}: type={tname} repr={s[:200]}")
-                else:
-                    print(
-                        f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
-                    )
-            raise exc
+        table = pa.Table.from_pylist(normalized, schema=schema)

         buf = pa.BufferOutputStream()
-        pq.write_table(table, buf,
+        pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
         data = buf.getvalue().to_pybytes()

         if self.io:
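The normalisation step above now keeps two parallel representations per column: `min_values`/`max_values` as int64 (cast with `int(...)` when present) and `min_values_display`/`max_values_display` as strings truncated to 32 characters. A small illustration of the effect on a single manifest entry, using made-up values and mirroring the `truncate_display` logic shown in the diff:

```python
# Illustration of the manifest-entry normalisation shown above,
# with hypothetical column statistics for a two-column file.
def truncate_display(v, max_len=32):
    if v is None:
        return None
    s = str(v)
    return s[:max_len] + "..." if len(s) > max_len else s

entry = {
    "min_values": [12, None],
    "max_values": [9876543210, None],
    "min_values_display": ["12", "aardvark"],
    "max_values_display": ["9876543210", "a very long string value that keeps going and going"],
}
entry["min_values"] = [int(v) if v is not None else None for v in entry["min_values"]]
entry["max_values"] = [int(v) if v is not None else None for v in entry["max_values"]]
entry["min_values_display"] = [truncate_display(v) for v in entry["min_values_display"]]
entry["max_values_display"] = [truncate_display(v) for v in entry["max_values_display"]]
print(entry["max_values_display"][1])  # 'a very long string value that ke...'
```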
@@ -789,6 +820,7 @@ class OpteryxCatalog(Metastore):
             "location": metadata.location,
             "properties": metadata.properties,
             "format-version": metadata.format_version,
+            "annotations": metadata.annotations,
             "current-snapshot-id": metadata.current_snapshot_id,
             "current-schema-id": metadata.current_schema_id,
             "timestamp-ms": metadata.timestamp_ms,
@@ -803,10 +835,9 @@ class OpteryxCatalog(Metastore):
         # Metadata persisted in primary `datasets` collection only.

         snaps_coll = self._snapshots_collection(collection, dataset_name)
-
-
+        # Upsert snapshot documents. Do NOT delete existing snapshot documents
+        # here to avoid accidental removal of historical snapshots on save.
         for snap in metadata.snapshots:
-            new_ids.add(str(snap.snapshot_id))
             snaps_coll.document(str(snap.snapshot_id)).set(
                 {
                     "snapshot-id": snap.snapshot_id,
@@ -821,10 +852,6 @@ class OpteryxCatalog(Metastore):
                 }
             )

-        # Delete stale snapshots
-        for stale in existing - new_ids:
-            snaps_coll.document(stale).delete()
-
         # Persist schemas subcollection
         schemas_coll = doc_ref.collection("schemas")
         existing_schema_ids = {d.id for d in schemas_coll.stream()}
@@ -892,6 +919,7 @@ class OpteryxCatalog(Metastore):
             "scale": scale,
             "precision": precision,
             "expectation-policies": [],
+            "annotations": [],
         }

         cols.append(typed)