opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. opteryx_catalog/catalog/compaction.py +15 -8
  2. opteryx_catalog/catalog/dataset.py +449 -111
  3. opteryx_catalog/catalog/manifest.py +390 -330
  4. opteryx_catalog/catalog/metadata.py +3 -0
  5. opteryx_catalog/iops/fileio.py +13 -0
  6. opteryx_catalog/maki_nage/__init__.py +8 -0
  7. opteryx_catalog/maki_nage/distogram.py +558 -0
  8. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  9. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  10. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  11. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  12. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  13. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  14. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  15. opteryx_catalog/opteryx_catalog.py +82 -54
  16. opteryx_catalog/webhooks/__init__.py +230 -0
  17. opteryx_catalog/webhooks/events.py +177 -0
  18. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  19. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  20. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  21. scripts/collect_byte_counts.py +42 -0
  22. scripts/emit_full_single_file.py +81 -0
  23. scripts/inspect_manifest_dryrun.py +322 -0
  24. scripts/inspect_single_file.py +147 -0
  25. scripts/inspect_single_file_gcs.py +124 -0
  26. tests/test_collections.py +37 -0
  27. tests/test_describe_uncompressed.py +127 -0
  28. tests/test_refresh_manifest.py +275 -0
  29. tests/test_webhooks.py +177 -0
  30. opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
  31. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  32. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
opteryx_catalog/maki_nage/tests/test_count_at.py
@@ -0,0 +1,89 @@
+ # type:ignore
+ # isort: skip_file
+ import sys
+ import os
+
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+ from opteryx.third_party.maki_nage import distogram
+ from pytest import approx
+ import random
+
+
+ def test_count_at():
+     h = distogram.Distogram(bin_count=3)
+     print(h)
+
+     # fill histogram
+     distogram.update(h, 16, count=4)
+     distogram.update(h, 23, count=3)
+     distogram.update(h, 28, count=5)
+     print(h)
+
+     actual_result = distogram.count_at(h, 25)
+     assert actual_result == approx(6.859999999)
+
+
+ def test_count_at_normal():
+     points = 10000
+     normal = [random.normalvariate(0.0, 1.0) for _ in range(points)]
+     h = distogram.Distogram()
+
+     for i in normal:
+         distogram.update(h, i)
+
+     assert distogram.count_at(h, 0) == approx(points / 2, rel=0.05)
+
+
+ def test_count_at_not_enough_elements():
+     h = distogram.Distogram()
+
+     distogram.update(h, 1)
+     distogram.update(h, 2)
+     distogram.update(h, 3)
+
+     assert distogram.count_at(h, 2.5) == 2
+
+
+ def test_count_at_left():
+     h = distogram.Distogram(bin_count=6)
+
+     for i in [1, 2, 3, 4, 5, 6, 0.7, 1.1]:
+         distogram.update(h, i)
+
+     assert distogram.count_at(h, 0.77) == approx(0.14), distogram.count_at(h, 0.77)
+
+
+ def test_count_at_right():
+     h = distogram.Distogram(bin_count=6)
+
+     for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]:
+         distogram.update(h, i)
+
+     assert distogram.count_at(h, 6.5) == approx(7.307692307692308)
+
+
+ def test_count_at_empty():
+     h = distogram.Distogram()
+
+     assert distogram.count_at(h, 6.5) is None
+
+
+ def test_count_at_out_of_bounds():
+     h = distogram.Distogram()
+
+     for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]:
+         distogram.update(h, i)
+
+     assert distogram.count_at(h, 0.2) is None
+     assert distogram.count_at(h, 10) is None
+
+
+ if __name__ == "__main__":  # pragma: no cover
+     test_count_at()
+     test_count_at_empty()
+     test_count_at_left()
+     test_count_at_normal()
+     test_count_at_not_enough_elements()
+     test_count_at_out_of_bounds()
+     test_count_at_right()
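The expected value 6.859999999 in test_count_at above can be reproduced by hand. Assuming distogram follows the usual Ben-Haim/Tom-Tov streaming-histogram interpolation (an assumption about its internals, consistent with the asserted number), a count_at query sums whole bins below the query point, half of the straddling bin, and a trapezoidal correction:

# Hand-check of test_count_at's expected value (a sketch, not package code).
# Bins after the updates: [(16, 4), (23, 3), (28, 5)]; the query point is 25.
ratio = (25 - 23) / (28 - 23)        # fraction of the gap 23..28 covered: 0.4
height = 3 + (5 - 3) * ratio         # interpolated bin height at 25: 3.8
partial = (3 + height) / 2 * ratio   # trapezoid area from 23 to 25: 1.36
count = 4 + 3 / 2 + partial          # 4 + 1.5 + 1.36 = 6.86
assert abs(count - 6.859999999) < 1e-6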
opteryx_catalog/maki_nage/tests/test_quantile.py
@@ -0,0 +1,81 @@
+ # type:ignore
+ # isort: skip_file
+ import sys
+ import os
+
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+ from opteryx.third_party.maki_nage import distogram
+ from pytest import approx
+
+ import numpy as np
+ import random
+
+
+ def test_quantile():
+     h = distogram.Distogram(bin_count=3)
+     distogram.update(h, 16, count=4)
+     distogram.update(h, 23, count=3)
+     distogram.update(h, 28, count=5)
+
+     assert distogram.quantile(h, 0.5) == approx(23.625)
+
+
+ def test_quantile_not_enough_elements():
+     h = distogram.Distogram(bin_count=10)
+
+     for i in [12.3, 5.4, 8.2, 100.53, 23.5, 13.98]:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, 0.5) == approx(13.14)
+
+
+ def test_quantile_on_left():
+     h = distogram.Distogram(bin_count=6)
+
+     data = [12.3, 5.2, 5.4, 4.9, 5.5, 5.6, 8.2, 30.53, 23.5, 13.98]
+     for i in data:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, 0.01) == approx(np.quantile(data, 0.01), rel=0.01)
+     assert distogram.quantile(h, 0.05) == approx(np.quantile(data, 0.05), rel=0.05)
+     assert distogram.quantile(h, 0.25) == approx(np.quantile(data, 0.25), rel=0.05)
+
+
+ def test_quantile_on_right():
+     h = distogram.Distogram(bin_count=6)
+
+     data = [12.3, 8.2, 100.53, 23.5, 13.98, 200, 200.2, 200.8, 200.4, 200.1]
+     for i in data:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, 0.99) == approx(np.quantile(data, 0.99), rel=0.01)
+     assert distogram.quantile(h, 0.85) == approx(np.quantile(data, 0.85), rel=0.01)
+
+
+ def test_normal():
+     # normal = np.random.normal(0,1, 1000)
+     normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)]
+     h = distogram.Distogram(bin_count=64)
+
+     for i in normal:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, 0.5) == approx(np.quantile(normal, 0.5), abs=0.2)
+     assert distogram.quantile(h, 0.95) == approx(np.quantile(normal, 0.95), abs=0.2)
+
+
+ def test_quantile_empty():
+     h = distogram.Distogram()
+
+     assert distogram.quantile(h, 0.3) is None
+
+
+ def test_quantile_out_of_bounds():
+     h = distogram.Distogram()
+
+     for i in [1, 2, 3, 4, 5, 6, 6.7, 6.1]:
+         distogram.update(h, i)
+
+     assert distogram.quantile(h, -0.2) is None
+     assert distogram.quantile(h, 10) is None
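The median assertion in test_quantile (23.625) also falls out by hand if distogram, as assumed here, interpolates linearly on the cumulative-count curve between bin centroids:

# Hand-check of test_quantile's expected median (a sketch, not package code).
# Bins: [(16, 4), (23, 3), (28, 5)], total 12 values, target rank 0.5 * 12 = 6.
# Cumulative count at a centroid = counts of earlier bins + half its own bin:
c23 = 4 + 3 / 2          # cumulative count at centroid 23: 5.5
c28 = 4 + 3 + 5 / 2      # cumulative count at centroid 28: 9.5
median = 23 + (28 - 23) * (6 - c23) / (c28 - c23)
assert median == 23.625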
opteryx_catalog/maki_nage/tests/test_stats.py
@@ -0,0 +1,25 @@
+ # type:ignore
+ # isort: skip_file
+ import sys
+ import os
+
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+ from opteryx.third_party.maki_nage import distogram
+ from pytest import approx
+
+
+ import numpy as np
+ import random
+
+
+ def test_stats():
+     normal = [random.normalvariate(0.0, 1.0) for _ in range(10000)]
+     h = distogram.Distogram()
+
+     for i in normal:
+         distogram.update(h, i)
+
+     assert distogram.mean(h) == approx(np.mean(normal), abs=0.1)
+     assert distogram.variance(h) == approx(np.var(normal), abs=0.1)
+     assert distogram.stddev(h) == approx(np.std(normal), abs=0.1)
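test_stats checks the sketch's summary statistics against numpy on the raw stream. The estimates a bin-based sketch can report are presumably the count-weighted moments of its centroids, along these lines (toy bin values, not taken from the tests):

# Count-weighted moment estimates over an illustrative bin list.
bins = [(16, 4), (23, 3), (28, 5)]
n = sum(c for _, c in bins)                               # total count
mean = sum(v * c for v, c in bins) / n                    # weighted mean
variance = sum(c * (v - mean) ** 2 for v, c in bins) / n  # weighted variance
stddev = variance ** 0.5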
opteryx_catalog/maki_nage/tests/test_update.py
@@ -0,0 +1,44 @@
+ # type:ignore
+ # isort: skip_file
+ import sys
+ import os
+
+ sys.path.insert(1, os.path.join(sys.path[0], "../../../.."))
+
+ from opteryx.third_party.maki_nage import distogram
+ import pytest
+ from pytest import approx
+
+
+ def test_update():
+     h = distogram.Distogram(bin_count=3)
+
+     # fill histogram
+     distogram.update(h, 23)
+     assert h.bins == [(23, 1)]
+     distogram.update(h, 28)
+     assert h.bins == [(23, 1), (28, 1)]
+     distogram.update(h, 16)
+     assert h.bins == [(16, 1), (23, 1), (28, 1)]
+
+     # update count on existing value
+     distogram.update(h, 23)
+     assert h.bins == [(16, 1), (23, 2), (28, 1)]
+     distogram.update(h, 28)
+     assert h.bins == [(16, 1), (23, 2), (28, 2)]
+     distogram.update(h, 16)
+     assert h.bins == [(16, 2), (23, 2), (28, 2)]
+
+     # merge values
+     h = distogram.update(h, 26)
+     assert h.bins[0] == (16, 2)
+     assert h.bins[1] == (23, 2)
+     assert h.bins[2][0] == approx(27.33333)
+     assert h.bins[2][1] == 3
+
+
+ def test_update_with_invalid_count():
+     h = distogram.Distogram(bin_count=3)
+
+     with pytest.raises(ValueError):
+         distogram.update(h, 23, count=0)
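The merge step asserted at the end of test_update is a count-weighted mean: with bin_count=3 already full, inserting 26 creates a fourth bin (26, 1), and the two closest bins, (26, 1) and (28, 2), collapse into one:

# Hand-check of the merged centroid in test_update (a sketch, not package code).
value = (26 * 1 + 28 * 2) / (1 + 2)   # 82 / 3 = 27.333...
count = 1 + 2
assert round(value, 5) == 27.33333
assert count == 3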
opteryx_catalog/opteryx_catalog.py
@@ -20,6 +20,9 @@ from .exceptions import DatasetNotFound
  from .exceptions import ViewAlreadyExists
  from .exceptions import ViewNotFound
  from .iops.base import FileIO
+ from .webhooks import send_webhook
+ from .webhooks.events import dataset_created_payload
+ from .webhooks.events import view_created_payload


  class OpteryxCatalog(Metastore):
@@ -141,6 +144,7 @@ class OpteryxCatalog(Metastore):
      "timestamp-ms": now_ms,
      "author": author,
      "maintenance-policy": metadata.maintenance_policy,
+     "annotations": metadata.annotations,
  }
  )

@@ -168,6 +172,20 @@ class OpteryxCatalog(Metastore):
  # update dataset doc to reference current schema
  doc_ref.update({"current-schema-id": metadata.current_schema_id})

+ # Send webhook notification
+ send_webhook(
+     action="create",
+     workspace=self.workspace,
+     collection=collection,
+     resource_type="dataset",
+     resource_name=dataset_name,
+     payload=dataset_created_payload(
+         schema=schema,
+         location=location,
+         properties=properties,
+     ),
+ )
+
  # Return SimpleDataset (attach this catalog so append() can persist)
  return SimpleDataset(identifier=identifier, _metadata=metadata, io=self.io, catalog=self)

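The hunk above fires a webhook whenever a dataset is created. The diff does not show the wire format, so the following receiver is a hypothetical sketch: it assumes the send_webhook() keywords above arrive as JSON fields, which is an assumption, not documented behavior.

# Hypothetical webhook receiver (endpoint, port, and wire format are assumptions).
import json
from http.server import BaseHTTPRequestHandler, HTTPServer

class CatalogWebhookHandler(BaseHTTPRequestHandler):
    def do_POST(self):
        length = int(self.headers.get("Content-Length", 0))
        event = json.loads(self.rfile.read(length) or b"{}")
        # Fields assumed to mirror send_webhook(): action, workspace,
        # collection, resource_type, resource_name, payload.
        print(event.get("action"), event.get("resource_type"), event.get("resource_name"))
        self.send_response(204)
        self.end_headers()

if __name__ == "__main__":
    HTTPServer(("", 8080), CatalogWebhookHandler).serve_forever()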
@@ -204,8 +222,9 @@ class OpteryxCatalog(Metastore):
  # Load dataset-level timestamp/author and collection/workspace
  metadata.timestamp_ms = data.get("timestamp-ms")
  metadata.author = data.get("author")
- # note: Firestore dataset doc stores the original collection and workspace
- # under keys `collection` and `workspace`.
+ metadata.description = data.get("description")
+ metadata.describer = data.get("describer")
+ metadata.annotations = data.get("annotations") or []

  # Load snapshots based on load_history flag
  snaps = []
@@ -308,6 +327,13 @@ class OpteryxCatalog(Metastore):
  coll = self._datasets_collection(collection)
  return [doc.id for doc in coll.stream()]

+ def list_collections(self) -> Iterable[str]:
+     """List top-level collections (documents) in this workspace."""
+     try:
+         return [col.id for col in self._catalog_ref.list_documents() if col.id[0] != "$"]
+     except Exception:
+         return []
+
  def create_collection(
      self,
      collection: str,
@@ -334,6 +360,7 @@ class OpteryxCatalog(Metastore):
      "properties": properties or {},
      "timestamp-ms": now_ms,
      "author": author,
+     "annotations": [],
  }
  )

@@ -446,6 +473,19 @@ class OpteryxCatalog(Metastore):
  }
  )

+ # Send webhook notification
+ send_webhook(
+     action="create" if not update_if_exists else "update",
+     workspace=self.workspace,
+     collection=collection,
+     resource_type="view",
+     resource_name=view_name,
+     payload=view_created_payload(
+         definition=sql,
+         properties=properties,
+     ),
+ )
+
  # Return a simple CatalogView wrapper
  v = CatalogView(name=view_name, definition=sql, properties=properties or {})
  # provide convenient attributes used by docs/examples
@@ -598,7 +638,7 @@ class OpteryxCatalog(Metastore):

  def update_dataset_description(
      self,
-     identifier: str,
+     identifier: str | tuple,
      description: str,
      describer: Optional[str] = None,
  ) -> None:
@@ -609,7 +649,12 @@ class OpteryxCatalog(Metastore):
      description: The new description text
      describer: Optional identifier for who/what created the description
  """
- collection, dataset_name = identifier.split(".")
+
+ if isinstance(identifier, (tuple, list)):
+     collection, dataset_name = identifier[0], identifier[1]
+ else:
+     collection, dataset_name = identifier.split(".")
+
  doc_ref = self._dataset_doc_ref(collection, dataset_name)
  updates = {
      "description": description,
@@ -629,6 +674,8 @@
  import pyarrow as pa
  import pyarrow.parquet as pq

+ from .iops.fileio import WRITE_PARQUET_OPTIONS
+
  # If entries is None we skip writing; if entries is empty list, write
  # an empty Parquet manifest (represents an empty dataset for this
  # snapshot). This preserves previous manifests so older snapshots
@@ -654,8 +701,10 @@ class OpteryxCatalog(Metastore):
      ("min_k_hashes", pa.list_(pa.list_(pa.uint64()))),
      ("histogram_counts", pa.list_(pa.list_(pa.int64()))),
      ("histogram_bins", pa.int32()),
-     ("min_values", pa.list_(pa.binary())),
-     ("max_values", pa.list_(pa.binary())),
+     ("min_values", pa.list_(pa.int64())),
+     ("max_values", pa.list_(pa.int64())),
+     ("min_values_display", pa.list_(pa.string())),
+     ("max_values_display", pa.list_(pa.string())),
  ]
  )

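With the schema change above, a manifest now carries int64 min/max columns plus parallel human-readable display columns. Reading them back with pyarrow might look like this (the file path is illustrative):

# Sketch: inspect the new min/max columns of a written manifest.
import pyarrow.parquet as pq

table = pq.read_table("manifest.parquet")                   # path is a placeholder
mins = table.column("min_values").to_pylist()               # list<int64> per entry
mins_text = table.column("min_values_display").to_pylist()  # truncated display strings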
@@ -672,55 +721,37 @@ class OpteryxCatalog(Metastore):
  e.setdefault("histogram_bins", 0)
  e.setdefault("column_uncompressed_sizes_in_bytes", [])
  e.setdefault("null_counts", [])
-
- # Process min/max values: truncate to 16 bytes with ellipsis marker if longer
+ e.setdefault("min_values_display", [])
+ e.setdefault("max_values_display", [])
+
+ # min/max values are stored as compressed int64 values
+ # display values are string representations for human readability
  mv = e.get("min_values") or []
  xv = e.get("max_values") or []
-
- def truncate_value(v):
-     """Convert value to binary and truncate to 16 bytes with marker if needed."""
+ mv_disp = e.get("min_values_display") or []
+ xv_disp = e.get("max_values_display") or []
+
+ def truncate_display(v, max_len=32):
+     """Truncate display value to max_len characters, adding '...' if longer."""
      if v is None:
          return None
-     # Convert to bytes
-     if isinstance(v, bytes):
-         b = v
-     else:
-         b = str(v).encode('utf-8')
-     # Truncate if longer than 16 bytes, add 0xFF as 17th byte to indicate truncation
-     if len(b) > 16:
-         return b[:16] + b'\xff'
-     return b
-
- e["min_values"] = [truncate_value(v) for v in mv]
- e["max_values"] = [truncate_value(v) for v in xv]
+     s = str(v)
+     if len(s) > max_len:
+         return s[:max_len] + "..."
+     return s
+
+ # Ensure int64 values are properly typed for min/max
+ e["min_values"] = [int(v) if v is not None else None for v in mv]
+ e["max_values"] = [int(v) if v is not None else None for v in xv]
+ # Display values truncated to 32 chars with '...' suffix if longer
+ e["min_values_display"] = [truncate_display(v) for v in mv_disp]
+ e["max_values_display"] = [truncate_display(v) for v in xv_disp]
  normalized.append(e)

- try:
-     table = pa.Table.from_pylist(normalized, schema=schema)
- except Exception as exc:
-     # Diagnostic output to help find malformed manifest entries
-
-     print(
-         "[MANIFEST DEBUG] Failed to convert entries to Parquet manifest table. Dumping entries:"
-     )
-     for i, ent in enumerate(entries):
-         print(f" Entry {i}:")
-         if isinstance(ent, dict):
-             for k, v in ent.items():
-                 tname = type(v).__name__
-                 try:
-                     s = repr(v)
-                 except Exception:
-                     s = "<unreprable>"
-                 print(f" - {k}: type={tname} repr={s[:200]}")
-         else:
-             print(
-                 f" - non-dict entry: type={type(ent).__name__} repr={repr(ent)[:200]}"
-             )
-     raise exc
+ table = pa.Table.from_pylist(normalized, schema=schema)

  buf = pa.BufferOutputStream()
- pq.write_table(table, buf, compression="zstd")
+ pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
  data = buf.getvalue().to_pybytes()

  if self.io:
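The truncate_display helper introduced in this hunk caps display strings at 32 characters. Restated standalone, its behavior is:

# Standalone restatement of truncate_display from the hunk above.
def truncate_display(v, max_len=32):
    if v is None:
        return None
    s = str(v)
    return s[:max_len] + "..." if len(s) > max_len else s

assert truncate_display(None) is None
assert truncate_display("short") == "short"
assert truncate_display("x" * 40) == "x" * 32 + "..."  # truncated with marker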
@@ -789,6 +820,7 @@ class OpteryxCatalog(Metastore):
  "location": metadata.location,
  "properties": metadata.properties,
  "format-version": metadata.format_version,
+ "annotations": metadata.annotations,
  "current-snapshot-id": metadata.current_snapshot_id,
  "current-schema-id": metadata.current_schema_id,
  "timestamp-ms": metadata.timestamp_ms,
@@ -803,10 +835,9 @@ class OpteryxCatalog(Metastore):
  # Metadata persisted in primary `datasets` collection only.

  snaps_coll = self._snapshots_collection(collection, dataset_name)
- existing = {d.id for d in snaps_coll.stream()}
- new_ids = set()
+ # Upsert snapshot documents. Do NOT delete existing snapshot documents
+ # here to avoid accidental removal of historical snapshots on save.
  for snap in metadata.snapshots:
-     new_ids.add(str(snap.snapshot_id))
      snaps_coll.document(str(snap.snapshot_id)).set(
          {
              "snapshot-id": snap.snapshot_id,
@@ -821,10 +852,6 @@ class OpteryxCatalog(Metastore):
          }
      )

- # Delete stale snapshots
- for stale in existing - new_ids:
-     snaps_coll.document(stale).delete()
-
  # Persist schemas subcollection
  schemas_coll = doc_ref.collection("schemas")
  existing_schema_ids = {d.id for d in schemas_coll.stream()}
@@ -892,6 +919,7 @@ class OpteryxCatalog(Metastore):
      "scale": scale,
      "precision": precision,
      "expectation-policies": [],
+     "annotations": [],
  }

  cols.append(typed)