opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. opteryx_catalog/catalog/compaction.py +15 -8
  2. opteryx_catalog/catalog/dataset.py +449 -111
  3. opteryx_catalog/catalog/manifest.py +390 -330
  4. opteryx_catalog/catalog/metadata.py +3 -0
  5. opteryx_catalog/iops/fileio.py +13 -0
  6. opteryx_catalog/maki_nage/__init__.py +8 -0
  7. opteryx_catalog/maki_nage/distogram.py +558 -0
  8. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  9. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  10. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  11. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  12. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  13. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  14. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  15. opteryx_catalog/opteryx_catalog.py +82 -54
  16. opteryx_catalog/webhooks/__init__.py +230 -0
  17. opteryx_catalog/webhooks/events.py +177 -0
  18. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  19. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  20. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  21. scripts/collect_byte_counts.py +42 -0
  22. scripts/emit_full_single_file.py +81 -0
  23. scripts/inspect_manifest_dryrun.py +322 -0
  24. scripts/inspect_single_file.py +147 -0
  25. scripts/inspect_single_file_gcs.py +124 -0
  26. tests/test_collections.py +37 -0
  27. tests/test_describe_uncompressed.py +127 -0
  28. tests/test_refresh_manifest.py +275 -0
  29. tests/test_webhooks.py +177 -0
  30. opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
  31. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  32. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
opteryx_catalog/maki_nage/tests/test_count_at.py
@@ -0,0 +1,89 @@
+ # type:ignore
+ # isort: skip_file
+ import sys
+ import os
+
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+ from opteryx.third_party.maki_nage import distogram
+ from pytest import approx
+ import random
+
+
+ def test_count_at():
+     h = distogram.Distogram(bin_count=3)
+     print(h)
+
+     # fill histogram
+     distogram.update(h, 16, count=4)
+     distogram.update(h, 23, count=3)
+     distogram.update(h, 28, count=5)
+     print(h)
+
+     actual_result = distogram.count_at(h, 25)
+     assert actual_result == approx(6.859999999)
+
+
+ def test_count_at_normal():
+     points = 10000
+     normal = [random.normalvariate(0.0, 1.0) for _ in range(points)]
+     h = distogram.Distogram()
+
+     for i in normal:
+         distogram.update(h, i)
+
+     assert distogram.count_at(h, 0) == approx(points / 2, rel=0.05)
+
+
+ def test_count_at_not_enough_elements():
+     h = distogram.Distogram()
+
+     distogram.update(h, 1)
+     distogram.update(h, 2)
+     distogram.update(h, 3)
+
+     assert distogram.count_at(h, 2.5) == 2
+
+
+ def test_count_at_left():
+     h = distogram.Distogram(bin_count=6)
+
+     for i in [1, 2, 3, 4, 5, 6, 0.7, 1.1]:
+         distogram.update(h, i)
+
+     assert distogram.count_at(h, 0.77) == approx(0.14), distogram.count_at(h, 0.77)
+
+
+ def test_count_at_right():
+     h = distogram.Distogram(bin_count=6)
+
+     for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]:
+         distogram.update(h, i)
+
+     assert distogram.count_at(h, 6.5) == approx(7.307692307692308)
+
+
+ def test_count_at_empty():
+     h = distogram.Distogram()
+
+     assert distogram.count_at(h, 6.5) is None
+
+
+ def test_count_at_out_of_bounds():
+     h = distogram.Distogram()
+
+     for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]:
+         distogram.update(h, i)
+
+     assert distogram.count_at(h, 0.2) is None
+     assert distogram.count_at(h, 10) is None
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     test_count_at()
+     test_count_at_empty()
+     test_count_at_left()
+     test_count_at_normal()
+     test_count_at_not_enough_elements()
+     test_count_at_out_of_bounds()
+     test_count_at_right()
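The expected value 6.859999999 in test_count_at above can be reproduced by hand. Assuming distogram follows the usual Ben-Haim/Tom-Tov streaming-histogram interpolation (an assumption about its internals, consistent with the asserted number), a count_at query sums whole bins below the query point, half of the straddling bin, and a trapezoidal correction:

# Hand-check of test_count_at's expected value (a sketch, not package code).
# Bins after the updates: [(16, 4), (23, 3), (28, 5)]; the query point is 25.
ratio = (25 - 23) / (28 - 23)        # fraction of the gap 23..28 covered: 0.4
height = 3 + (5 - 3) * ratio         # interpolated bin height at 25: 3.8
partial = (3 + height) / 2 * ratio   # trapezoid area from 23 to 25: 1.36
count = 4 + 3 / 2 + partial          # 4 + 1.5 + 1.36 = 6.86
assert abs(count - 6.859999999) < 1e-6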
opteryx_catalog/maki_nage/tests/test_quantile.py
@@ -0,0 +1,81 @@
+ # type:ignore
+ # isort: skip_file
+ import sys
+ import os
+
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+ from opteryx.third_party.maki_nage import distogram
+ from pytest import approx
+
+ import numpy as np
+ import random
+
+
+ def test_quantile():
+     h = distogram.Distogram(bin_count=3)
+     distogram.update(h, 16, count=4)
+     distogram.update(h, 23, count=3)
+     distogram.update(h, 28, count=5)
+
+     assert distogram.quantile(h, 0.5) == approx(23.625)
+
+
+ def test_quantile_not_enough_elements():
+     h = distogram.Distogram(bin_count=10)
+
+     for i in [12.3, 5.4, 8.2, 100.53, 23.5, 13.98]:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, 0.5) == approx(13.14)
+
+
+ def test_quantile_on_left():
+     h = distogram.Distogram(bin_count=6)
+
+     data = [12.3, 5.2, 5.4, 4.9, 5.5, 5.6, 8.2, 30.53, 23.5, 13.98]
+     for i in data:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, 0.01) == approx(np.quantile(data, 0.01), rel=0.01)
+     assert distogram.quantile(h, 0.05) == approx(np.quantile(data, 0.05), rel=0.05)
+     assert distogram.quantile(h, 0.25) == approx(np.quantile(data, 0.25), rel=0.05)
+
+
+ def test_quantile_on_right():
+     h = distogram.Distogram(bin_count=6)
+
+     data = [12.3, 8.2, 100.53, 23.5, 13.98, 200, 200.2, 200.8, 200.4, 200.1]
+     for i in data:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, 0.99) == approx(np.quantile(data, 0.99), rel=0.01)
+     assert distogram.quantile(h, 0.85) == approx(np.quantile(data, 0.85), rel=0.01)
+
+
+ def test_normal():
+     # normal = np.random.normal(0,1, 1000)
+     normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)]
+     h = distogram.Distogram(bin_count=64)
+
+     for i in normal:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, 0.5) == approx(np.quantile(normal, 0.5), abs=0.2)
+     assert distogram.quantile(h, 0.95) == approx(np.quantile(normal, 0.95), abs=0.2)
+
+
+ def test_quantile_empty():
+     h = distogram.Distogram()
+
+     assert distogram.quantile(h, 0.3) is None
+
+
+ def test_quantile_out_of_bounds():
+     h = distogram.Distogram()
+
+     for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, -0.2) is None
+     assert distogram.quantile(h, 10) is None
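The median assertion in test_quantile (23.625) also falls out by hand if distogram, as assumed here, interpolates linearly on the cumulative-count curve between bin centroids:

# Hand-check of test_quantile's expected median (a sketch, not package code).
# Bins: [(16, 4), (23, 3), (28, 5)], total 12 values, target rank 0.5 * 12 = 6.
# Cumulative count at a centroid = counts of earlier bins + half its own bin:
c23 = 4 + 3 / 2          # cumulative count at centroid 23: 5.5
c28 = 4 + 3 + 5 / 2      # cumulative count at centroid 28: 9.5
median = 23 + (28 - 23) * (6 - c23) / (c28 - c23)
assert median == 23.625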
opteryx_catalog/maki_nage/tests/test_stats.py
@@ -0,0 +1,25 @@
+ # type:ignore
+ # isort: skip_file
+ import sys
+ import os
+
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+ from opteryx.third_party.maki_nage import distogram
+ from pytest import approx
+
+
+ import numpy as np
+ import random
+
+
+ def test_stats():
+     normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)]
+     h = distogram.Distogram()
+
+     for i in normal:
+         distogram.update(h, i)
+
+     assert distogram.mean(h) == approx(np.mean(normal), abs=0.1)
+     assert distogram.variance(h) == approx(np.var(normal), abs=0.1)
+     assert distogram.stddev(h) == approx(np.std(normal), abs=0.1)
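test_stats checks the sketch's summary statistics against numpy on the raw stream. The estimates a bin-based sketch can report are presumably the count-weighted moments of its centroids, along these lines (toy bin values, not taken from the tests):

# Count-weighted moment estimates over an illustrative bin list.
bins = [(16, 4), (23, 3), (28, 5)]
n = sum(c for _, c in bins)                               # total count
mean = sum(v * c for v, c in bins) / n                    # weighted mean
variance = sum(c * (v - mean) ** 2 for v, c in bins) / n  # weighted variance
stddev = variance ** 0.5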
opteryx_catalog/maki_nage/tests/test_update.py
@@ -0,0 +1,44 @@
+ # type:ignore
+ # isort: skip_file
+ import sys
+ import os
+
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+ from opteryx.third_party.maki_nage import distogram
+ import pytest
+ from pytest import approx
+
+
+ def test_update():
+     h = distogram.Distogram(bin_count=3)
+
+     # fill histogram
+     distogram.update(h, 23)
+     assert h.bins == [(23, 1)]
+     distogram.update(h, 28)
+     assert h.bins == [(23, 1), (28, 1)]
+     distogram.update(h, 16)
+     assert h.bins == [(16, 1), (23, 1), (28, 1)]
+
+     # update count on existing value
+     distogram.update(h, 23)
+     assert h.bins == [(16, 1), (23, 2), (28, 1)]
+     distogram.update(h, 28)
+     assert h.bins == [(16, 1), (23, 2), (28, 2)]
+     distogram.update(h, 16)
+     assert h.bins == [(16, 2), (23, 2), (28, 2)]
+
+     # merge values
+     h = distogram.update(h, 26)
+     assert h.bins[0] == (16, 2)
+     assert h.bins[1] == (23, 2)
+     assert h.bins[2][0] == approx(27.33333)
+     assert h.bins[2][1] == 3
+
+
+ def test_update_with_invalid_count():
+     h = distogram.Distogram(bin_count=3)
+
+     with pytest.raises(ValueError):
+         distogram.update(h, 23, count=0)
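The merge step asserted at the end of test_update is a count-weighted mean: with bin_count=3 already full, inserting 26 creates a fourth bin (26, 1), and the two closest bins, (26, 1) and (28, 2), collapse into one:

# Hand-check of the merged centroid in test_update (a sketch, not package code).
value = (26 * 1 + 28 * 2) / (1 + 2)   # 82 / 3 = 27.333...
count = 1 + 2
assert round(value, 5) == 27.33333
assert count == 3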
opteryx_catalog/opteryx_catalog.py
@@ -20,6 +20,9 @@ from .exceptions import DatasetNotFound
  from .exceptions import ViewAlreadyExists
  from .exceptions import ViewNotFound
  from .iops.base import FileIO
+ from .webhooks import send_webhook
+ from .webhooks.events import dataset_created_payload
+ from .webhooks.events import view_created_payload


  class OpteryxCatalog(Metastore):
@@ -141,6 +144,7 @@ class OpteryxCatalog(Metastore):
      "timestamp-ms": now_ms,
      "author": author,
      "maintenance-policy": metadata.maintenance_policy,
+     "annotations": metadata.annotations,
  }
  )

@@ -168,6 +172,20 @@ class OpteryxCatalog(Metastore):
  # update dataset doc to reference current schema
  doc_ref.update({"current-schema-id": metadata.current_schema_id})

+ # Send webhook notification
+ send_webhook(
+     action="create",
+     workspace=self.workspace,
+     collection=collection,
+     resource_type="dataset",
+     resource_name=dataset_name,
+     payload=dataset_created_payload(
+         schema=schema,
+         location=location,
+         properties=properties,
+     ),
+ )
+
  # Return SimpleDataset (attach this catalog so append() can persist)
  return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)

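The hunk above fires a webhook whenever a dataset is created. The diff does not show the wire format, so the following receiver is a hypothetical sketch: it assumes the send_webhook() keywords above arrive as JSON fields, which is an assumption, not documented behavior.

# Hypothetical webhook receiver (endpoint, port, and wire format are assumptions).
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class CatalogWebhookHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        event = json.loads(self.rfile.read(length) or b"{}")
        # Fields assumed to mirror send_webhook(): action, workspace,
        # collection, resource_type, resource_name, payload.
        print(event.get("action"), event.get("resource_type"), event.get("resource_name"))
        self.send_response(204)
        self.end_headers()

if __name__ == "__main__":
    HTTPServer(("", 8080), CatalogWebhookHandler).serve_forever()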
@@ -204,8 +222,9 @@ class OpteryxCatalog(Metastore):
  # Load dataset-level timestamp/author and collection/workspace
  metadata.timestamp_ms = data.get("timestamp-ms")
  metadata.author = data.get("author")
- # note: Firestore dataset doc stores the original collection and workspace
- # under keys `collection` and `workspace`.
+ metadata.description = data.get("description")
+ metadata.describer = data.get("describer")
+ metadata.annotations = data.get("annotations") or []

  # Load snapshots based on load_history flag
  snaps = []
@@ -308,6 +327,13 @@ class OpteryxCatalog(Metastore):
  coll = self._datasets_collection(collection)
  return [doc.id for doc in coll.stream()]

+ def list_collections(self) -> Iterable[str]:
+     """List top-level collections (documents) in this workspace."""
+     try:
+         return [col.id for col in self._catalog_ref.list_documents() if col.id[0] != "$"]
+     except Exception:
+         return []
+
  def create_collection(
      self,
      collection: str,
@@ -334,6 +360,7 @@ class OpteryxCatalog(Metastore):
      "properties": properties or {},
      "timestamp-ms": now_ms,
      "author": author,
+     "annotations": [],
  }
  )

@@ -446,6 +473,19 @@ class OpteryxCatalog(Metastore):
  }
  )

+ # Send webhook notification
+ send_webhook(
+     action="create" if not update_if_exists else "update",
+     workspace=self.workspace,
+     collection=collection,
+     resource_type="view",
+     resource_name=view_name,
+     payload=view_created_payload(
+         definition=sql,
+         properties=properties,
+     ),
+ )
+
  # Return a simple CatalogView wrapper
  v = CatalogView(name=view_name, definition=sql, properties=properties or {})
  # provide convenient attributes used by docs/examples
@@ -598,7 +638,7 @@ class OpteryxCatalog(Metastore):

  def update_dataset_description(
      self,
-     identifier: str,
+     identifier: str | tuple,
      description: str,
      describer: Optional[str] = None,
  ) -> None:
@@ -609,7 +649,12 @@ class OpteryxCatalog(Metastore):
      description: The new description text
      describer: Optional identifier for who/what created the description
  """
- collection, dataset_name = identifier.split(".")
+
+ if isinstance(identifier, (tuple, list)):
+     collection, dataset_name = identifier[0], identifier[1]
+ else:
+     collection, dataset_name = identifier.split(".")
+
  doc_ref = self._dataset_doc_ref(collection, dataset_name)
  updates = {
      "description": description,
@@ -629,6 +674,8 @@
  import pyarrow as pa
  import pyarrow.parquet as pq

+ from .iops.fileio import WRITE_PARQUET_OPTIONS
+
  # If entries is None we skip writing; if entries is empty list, write
  # an empty Parquet manifest (represents an empty dataset for this
  # snapshot). This preserves previous manifests so older snapshots
@@ -654,8 +701,10 @@ class OpteryxCatalog(Metastore):
      ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
      ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
      ("histogram_bins", pa.int32()),
-     ("min_values", pa.list_(pa.binary())),
-     ("max_values", pa.list_(pa.binary())),
+     ("min_values", pa.list_(pa.int64())),
+     ("max_values", pa.list_(pa.int64())),
+     ("min_values_display", pa.list_(pa.string())),
+     ("max_values_display", pa.list_(pa.string())),
  ]
  )

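With the schema change above, a manifest now carries int64 min/max columns plus parallel human-readable display columns. Reading them back with pyarrow might look like this (the file path is illustrative):

# Sketch: inspect the new min/max columns of a written manifest.
import pyarrow.parquet as pq

table = pq.read_table("manifest.parquet")                   # path is a placeholder
mins = table.column("min_values").to_pylist()               # list<int64> per entry
mins_text = table.column("min_values_display").to_pylist()  # truncated display strings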
@@ -672,55 +721,37 @@ class OpteryxCatalog(Metastore):
  e.setdefault("histogram_bins", 0)
  e.setdefault("column_uncompressed_sizes_in_bytes", [])
  e.setdefault("null_counts", [])
-
- # Process min/max values: truncate to 16 bytes with ellipsis marker if longer
+ e.setdefault("min_values_display", [])
+ e.setdefault("max_values_display", [])
+
+ # min/max values are stored as compressed int64 values
+ # display values are string representations for human readability
  mv = e.get("min_values") or []
  xv = e.get("max_values") or []
-
- def truncate_value(v):
-     """Convert value to binary and truncate to 16 bytes with marker if needed."""
+ mv_disp = e.get("min_values_display") or []
+ xv_disp = e.get("max_values_display") or []
+
+ def truncate_display(v, max_len=32):
+     """Truncate display value to max_len characters, adding '...' if longer."""
      if v is None:
          return None
-     # Convert to bytes
-     if isinstance(v, bytes):
-         b = v
-     else:
-         b = str(v).encode('utf-8')
-     # Truncate if longer than 16 bytes, add 0xFF as 17th byte to indicate truncation
-     if len(b) > 16:
-         return b[:16] + b'\xff'
-     return b
-
- e["min_values"] = [truncate_value(v) for v in mv]
- e["max_values"] = [truncate_value(v) for v in xv]
+     s = str(v)
+     if len(s) > max_len:
+         return s[:max_len] + "..."
+     return s
+
+ # Ensure int64 values are properly typed for min/max
+ e["min_values"] = [int(v) if v is not None else None for v in mv]
+ e["max_values"] = [int(v) if v is not None else None for v in xv]
+ # Display values truncated to 32 chars with '...' suffix if longer
+ e["min_values_display"] = [truncate_display(v) for v in mv_disp]
+ e["max_values_display"] = [truncate_display(v) for v in xv_disp]
  normalized.append(e)

- try:
-     table = pa.Table.from_pylist(normalized, schema=schema)
- except Exception as exc:
-     # Diagnostic output to help find malformed manifest entries
-
-     print(
-         "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
-     )
-     for i, ent in enumerate(entries):
-         print(f" Entry {i}:")
-         if isinstance(ent, dict):
-             for k, v in ent.items():
-                 tname = type(v).__name__
-                 try:
-                     s = repr(v)
-                 except Exception:
-                     s = "<unreprable>"
-                 print(f" - {k}: type={tname} repr={s[:200]}")
-         else:
-             print(
-                 f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
-             )
-     raise exc
+ table = pa.Table.from_pylist(normalized, schema=schema)

  buf = pa.BufferOutputStream()
- pq.write_table(table, buf, compression="zstd")
+ pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
  data = buf.getvalue().to_pybytes()

  if self.io:
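The truncate_display helper introduced in this hunk caps display strings at 32 characters. Restated standalone, its behavior is:

# Standalone restatement of truncate_display from the hunk above.
def truncate_display(v, max_len=32):
    if v is None:
        return None
    s = str(v)
    return s[:max_len] + "..." if len(s) > max_len else s

assert truncate_display(None) is None
assert truncate_display("short") == "short"
assert truncate_display("x" * 40) == "x" * 32 + "..."  # truncated with marker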
@@ -789,6 +820,7 @@ class OpteryxCatalog(Metastore):
  "location": metadata.location,
  "properties": metadata.properties,
  "format-version": metadata.format_version,
+ "annotations": metadata.annotations,
  "current-snapshot-id": metadata.current_snapshot_id,
  "current-schema-id": metadata.current_schema_id,
  "timestamp-ms": metadata.timestamp_ms,
@@ -803,10 +835,9 @@ class OpteryxCatalog(Metastore):
  # Metadata persisted in primary `datasets` collection only.

  snaps_coll = self._snapshots_collection(collection, dataset_name)
- existing = {d.id for d in snaps_coll.stream()}
- new_ids = set()
+ # Upsert snapshot documents. Do NOT delete existing snapshot documents
+ # here to avoid accidental removal of historical snapshots on save.
  for snap in metadata.snapshots:
-     new_ids.add(str(snap.snapshot_id))
      snaps_coll.document(str(snap.snapshot_id)).set(
          {
              "snapshot-id": snap.snapshot_id,
@@ -821,10 +852,6 @@ class OpteryxCatalog(Metastore):
          }
      )

- # Delete stale snapshots
- for stale in existing - new_ids:
-     snaps_coll.document(stale).delete()
-
  # Persist schemas subcollection
  schemas_coll = doc_ref.collection("schemas")
  existing_schema_ids = {d.id for d in schemas_coll.stream()}
@@ -892,6 +919,7 @@ class OpteryxCatalog(Metastore):
      "scale": scale,
      "precision": precision,
      "expectation-policies": [],
+     "annotations": [],
  }

  cols.append(typed)