PyPI - opteryx-catalog - Versions diffs - 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl - Mend

opteryx-catalog 0.4.4-py3-none-any.whl → 0.4.26-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41)

opteryx_catalog/__init__.py +1 -1
opteryx_catalog/catalog/__init__.py +2 -1
opteryx_catalog/catalog/compaction.py +536 -0
opteryx_catalog/catalog/dataset.py +840 -520
opteryx_catalog/catalog/manifest.py +475 -0
opteryx_catalog/catalog/metadata.py +5 -2
opteryx_catalog/catalog/metastore.py +2 -2
opteryx_catalog/exceptions.py +1 -1
opteryx_catalog/iops/fileio.py +13 -0
opteryx_catalog/iops/gcs.py +35 -5
opteryx_catalog/maki_nage/__init__.py +8 -0
opteryx_catalog/maki_nage/distogram.py +558 -0
opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
opteryx_catalog/maki_nage/tests/test_count.py +19 -0
opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
opteryx_catalog/maki_nage/tests/test_update.py +44 -0
opteryx_catalog/opteryx_catalog.py +296 -242
opteryx_catalog/webhooks/__init__.py +230 -0
opteryx_catalog/webhooks/events.py +177 -0
{opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
{opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
scripts/collect_byte_counts.py +42 -0
scripts/create_dataset.py +1 -1
scripts/emit_full_single_file.py +81 -0
scripts/inspect_manifest_dryrun.py +322 -0
scripts/inspect_single_file.py +147 -0
scripts/inspect_single_file_gcs.py +124 -0
scripts/read_dataset.py +1 -1
tests/test_collections.py +37 -0
tests/test_compaction.py +233 -0
tests/test_dataset_metadata.py +14 -0
tests/test_describe_uncompressed.py +127 -0
tests/test_refresh_manifest.py +275 -0
tests/test_webhooks.py +177 -0
opteryx_catalog-0.4.4.dist-info/RECORD +0 -23
{opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
{opteryx_catalog-0.4.4.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0

opteryx_catalog/catalog/dataset.py CHANGED Viewed

@@ -8,6 +8,8 @@ from typing import Any
 from typing import Iterable
 from typing import Optional
+from .manifest import ParquetManifestEntry
+from .manifest import build_parquet_manifest_entry_from_bytes
 from .metadata import DatasetMetadata
 from .metadata import Snapshot
 from .metastore import Dataset
@@ -69,6 +71,26 @@ class SimpleDataset(Dataset):
     def metadata(self) -> DatasetMetadata:
         return self._metadata
+    def _next_sequence_number(self) -> int:
+        """Calculate the sequence number for the next snapshot.
+        Prefers the current snapshot's sequence number + 1, which only needs
+        the most recent snapshot rather than the full history. When the
+        current snapshot cannot be resolved, or carries no sequence number,
+        falls back to the highest sequence number found on the snapshots held
+        in ``self.metadata.snapshots``.
+        Returns:
+            The next sequence number; 1 when no snapshot supplies one.
+        """
+        if not self.metadata.snapshots:
+            # No snapshots yet - this is the first one
+            return 1
+        # The current (most recent) snapshot should have the highest sequence number
+        current = self.snapshot()
+        seq = getattr(current, "sequence_number", None) if current else None
+        if seq is not None:
+            return int(seq) + 1
+        # Without this fallback the method fell off the end and returned None
+        # whenever snapshot() yielded nothing, despite the `-> int` contract.
+        known = [
+            int(snap.sequence_number)
+            for snap in self.metadata.snapshots
+            if getattr(snap, "sequence_number", None) is not None
+        ]
+        return max(known, default=0) + 1
     def snapshot(self, snapshot_id: Optional[int] = None) -> Optional[Snapshot]:
         """Return a Snapshot.
@@ -95,20 +117,17 @@ class SimpleDataset(Dataset):
                 if doc.exists:
                     sd = doc.to_dict() or {}
                     snap = Snapshot(
-                        snapshot_id=int(
-                            sd.get("snapshot-id") or sd.get("snapshot_id") or snapshot_id
-                        ),
-                        timestamp_ms=int(sd.get("timestamp-ms") or sd.get("timestamp_ms") or 0),
+                        snapshot_id=int(sd.get("snapshot-id") or snapshot_id),
+                        timestamp_ms=int(sd.get("timestamp-ms", 0)),
                         author=sd.get("author"),
-                        sequence_number=sd.get("sequence-number") or sd.get("sequence_number"),
-                        user_created=sd.get("user-created") or sd.get("user_created"),
-                        manifest_list=sd.get("manifest") or sd.get("manifest_list"),
-                        schema_id=sd.get("schema-id") or sd.get("schema_id"),
+                        sequence_number=sd.get("sequence-number", 0),
+                        user_created=sd.get("user-created"),
+                        manifest_list=sd.get("manifest"),
+                        schema_id=sd.get("schema-id"),
                         summary=sd.get("summary", {}),
-                        operation_type=sd.get("operation-type") or sd.get("operation_type"),
-                        parent_snapshot_id=sd.get("parent-snapshot-id")
-                        or sd.get("parent_snapshot_id"),
-                        commit_message=sd.get("commit-message") or sd.get("commit_message"),
+                        operation_type=sd.get("operation-type"),
+                        parent_snapshot_id=sd.get("parent-snapshot-id"),
+                        commit_message=sd.get("commit-message"),
                     )
                     return snap
             except Exception:
@@ -227,148 +246,9 @@ class SimpleDataset(Dataset):
         if not hasattr(table, "schema"):
             raise TypeError("append() expects a pyarrow.Table-like object")
-        # Write parquet file with collision-resistant name
-        fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
-        data_path = f"{self.metadata.location}/data/{fname}"
-        buf = pa.BufferOutputStream()
-        pq.write_table(table, buf, compression="zstd")
-        pdata = buf.getvalue().to_pybytes()
-        out = self.io.new_output(data_path).create()
-        out.write(pdata)
-        out.close()
-        # Prepare sketches/stats
-        K = 32
-        HBINS = 32
-        min_k_hashes: list[list[int]] = []
-        histograms: list[list[int]] = []
-        min_values: list[int] = []
-        max_values: list[int] = []
-        # Use draken for efficient hashing and compression when available.
-        import heapq
-        # canonical NULL flag for missing values
-        NULL_FLAG = -(1 << 63)
-        try:
-            import opteryx.draken as draken  # type: ignore
-            num_rows = int(table.num_rows)
-            for col_idx, col in enumerate(table.columns):
-                # hash column values to 64-bit via draken (new cpdef API)
-                vec = draken.Vector.from_arrow(col)
-                hashes = list(vec.hash())
-                # Decide whether to compute min-k/histogram for this column based
-                # on field type and, for strings, average length of values.
-                field_type = table.schema.field(col_idx).type
-                compute_min_k = False
-                if (
-                    pa.types.is_integer(field_type)
-                    or pa.types.is_floating(field_type)
-                    or pa.types.is_decimal(field_type)
-                ):
-                    compute_min_k = True
-                elif (
-                    pa.types.is_timestamp(field_type)
-                    or pa.types.is_date(field_type)
-                    or pa.types.is_time(field_type)
-                ):
-                    compute_min_k = True
-                elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
-                    # compute average length from non-null values; only allow
-                    # min-k/histogram for short strings (avg <= 16)
-                    col_py = None
-                    try:
-                        col_py = col.to_pylist()
-                    except Exception:
-                        col_py = None
-                    if col_py is not None:
-                        lens = [len(x) for x in col_py if x is not None]
-                        if lens:
-                            avg_len = sum(lens) / len(lens)
-                            if avg_len <= 16:
-                                compute_min_k = True
-                # KMV: take K smallest hashes when allowed; otherwise store an
-                # empty list for this column.
-                if compute_min_k:
-                    smallest = heapq.nsmallest(K, hashes)
-                    col_min_k = sorted(smallest)
-                else:
-                    col_min_k = []
-                # For histogram decisions follow the same rule as min-k
-                compute_hist = compute_min_k
-                # Use draken.compress() to get canonical int64 per value
-                mapped = list(vec.compress())
-                non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
-                if non_nulls_mapped:
-                    vmin = min(non_nulls_mapped)
-                    vmax = max(non_nulls_mapped)
-                    col_min = int(vmin)
-                    col_max = int(vmax)
-                    if compute_hist:
-                        if vmin == vmax:
-                            col_hist = [0] * HBINS
-                            col_hist[-1] = len(non_nulls_mapped)
-                        else:
-                            col_hist = [0] * HBINS
-                            span = float(vmax - vmin)
-                            for m in non_nulls_mapped:
-                                b = int(((float(m) - float(vmin)) / span) * (HBINS - 1))
-                                if b < 0:
-                                    b = 0
-                                if b >= HBINS:
-                                    b = HBINS - 1
-                                col_hist[b] += 1
-                    else:
-                        col_hist = [0] * HBINS
-                else:
-                    # no non-null values; histogram via hash buckets
-                    col_min = NULL_FLAG
-                    col_max = NULL_FLAG
-                    if compute_hist:
-                        col_hist = [0] * HBINS
-                        for h in hashes:
-                            b = (h >> (64 - 5)) & 0x1F
-                            col_hist[b] += 1
-                    else:
-                        col_hist = [0] * HBINS
-                min_k_hashes.append(col_min_k)
-                histograms.append(col_hist)
-                min_values.append(col_min)
-                max_values.append(col_max)
-        except Exception:
-            # If draken or its dependencies are unavailable, fall back to
-            # conservative defaults so we can still write the manifest and
-            # snapshot without failing the append operation.
-            num_cols = table.num_columns
-            min_k_hashes = [[] for _ in range(num_cols)]
-            HBINS = 32
-            histograms = [[0] * HBINS for _ in range(num_cols)]
-            min_values = [NULL_FLAG] * num_cols
-            max_values = [NULL_FLAG] * num_cols
-        entries = [
-            {
-                "file_path": data_path,
-                "file_format": "parquet",
-                "record_count": int(table.num_rows),
-                "file_size_in_bytes": len(pdata),
-                "min_k_hashes": min_k_hashes,
-                "histogram_counts": histograms,
-                "histogram_bins": HBINS,
-                "min_values": min_values,
-                "max_values": max_values,
-            }
-        ]
+        # Write table and build manifest entry
+        manifest_entry = self._write_table_and_build_entry(table)
+        entries = [manifest_entry.to_dict()]
         # persist manifest: for append, merge previous manifest entries
         # with the new entries so the snapshot's manifest is cumulative.
@@ -384,35 +264,15 @@ class SimpleDataset(Dataset):
                 prev_manifest_path = prev_snap.manifest_list
                 try:
                     # Prefer FileIO when available
-                    if self.io and hasattr(self.io, "new_input"):
-                        inp = self.io.new_input(prev_manifest_path)
-                        with inp.open() as f:
-                            prev_data = f.read()
-                        import pyarrow as pa
-                        import pyarrow.parquet as pq
-                        prev_table = pq.read_table(pa.BufferReader(prev_data))
-                        prev_rows = prev_table.to_pylist()
-                        merged_entries = prev_rows + merged_entries
-                    else:
-                        # Fall back to catalog storage client (GCS)
-                        if (
-                            self.catalog
-                            and getattr(self.catalog, "_storage_client", None)
-                            and getattr(self.catalog, "gcs_bucket", None)
-                        ):
-                            bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                            parsed = prev_manifest_path
-                            if parsed.startswith("gs://"):
-                                parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                            blob = bucket.blob(parsed)
-                            prev_data = blob.download_as_bytes()
-                            import pyarrow as pa
-                            import pyarrow.parquet as pq
-                            prev_table = pq.read_table(pa.BufferReader(prev_data))
-                            prev_rows = prev_table.to_pylist()
-                            merged_entries = prev_rows + merged_entries
+                    inp = self.io.new_input(prev_manifest_path)
+                    with inp.open() as f:
+                        prev_data = f.read()
+                    import pyarrow as pa
+                    import pyarrow.parquet as pq
+                    prev_table = pq.read_table(pa.BufferReader(prev_data))
+                    prev_rows = prev_table.to_pylist()
+                    merged_entries = prev_rows + merged_entries
                 except Exception:
                     # If we can't read the previous manifest, continue with
                     # just the new entries (don't fail the append).
@@ -433,63 +293,52 @@ class SimpleDataset(Dataset):
             commit_message = f"commit by {author}"
         recs = int(table.num_rows)
-        fsize = len(pdata)
+        fsize = int(getattr(manifest_entry, "file_size_in_bytes", 0))
+        # Calculate uncompressed size from the manifest entry
+        added_data_size = manifest_entry.uncompressed_size_in_bytes
         added_data_files = 1
         added_files_size = fsize
         added_records = recs
         deleted_data_files = 0
         deleted_files_size = 0
+        deleted_data_size = 0
         deleted_records = 0
         prev = self.snapshot()
         if prev and prev.summary:
-            try:
-                prev_total_files = int(prev.summary.get("total-data-files", 0))
-            except Exception:
-                prev_total_files = 0
-            try:
-                prev_total_size = int(prev.summary.get("total-files-size", 0))
-            except Exception:
-                prev_total_size = 0
-            try:
-                prev_total_records = int(prev.summary.get("total-records", 0))
-            except Exception:
-                prev_total_records = 0
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
         else:
             prev_total_files = 0
             prev_total_size = 0
+            prev_total_data_size = 0
             prev_total_records = 0
         total_data_files = prev_total_files + added_data_files - deleted_data_files
         total_files_size = prev_total_size + added_files_size - deleted_files_size
+        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
         total_records = prev_total_records + added_records - deleted_records
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
         # sequence number
         try:
-            max_seq = 0
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
@@ -518,6 +367,140 @@ class SimpleDataset(Dataset):
         if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
+    def _write_table_and_build_entry(self, table: Any):
+        """Write a PyArrow table to storage and return a ParquetManifestEntry.
+        This centralizes the IO and manifest construction so other operations
+        (e.g. `overwrite`) can reuse the same behavior as `append`.
+        Args:
+            table: Table handed to `pyarrow.parquet.write_table`.
+        Returns:
+            The value produced by `build_parquet_manifest_entry_from_bytes`
+            for the file that was written.
+        """
+        # Write parquet file with collision-resistant name
+        # (hex nanosecond timestamp plus the value of self._get_node())
+        fname = f"{time.time_ns():x}-{self._get_node()}.parquet"
+        data_path = f"{self.metadata.location}/data/{fname}"
+        # Imported at call time rather than at module level
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+        from ..iops.fileio import WRITE_PARQUET_OPTIONS
+        # Serialize into an in-memory buffer first so the very same bytes can
+        # be both written to storage and scanned for statistics below.
+        buf = pa.BufferOutputStream()
+        pq.write_table(table, buf, **WRITE_PARQUET_OPTIONS)
+        pdata = buf.getvalue().to_pybytes()
+        # NOTE(review): close() is skipped if write() raises - confirm whether
+        # the output stream needs explicit cleanup on failure.
+        out = self.io.new_output(data_path).create()
+        out.write(pdata)
+        out.close()
+        # Build manifest entry with statistics using a bytes-based, per-column scan
+        # (the original table is passed along as well; its use is defined in .manifest)
+        manifest_entry = build_parquet_manifest_entry_from_bytes(
+            pdata, data_path, len(pdata), orig_table=table
+        )
+        return manifest_entry
+    def overwrite(self, table: Any, author: str = None, commit_message: Optional[str] = None):
+        """Replace the dataset entirely with `table` in a single snapshot.
+        Semantics:
+        - Write the provided table as new data file(s)
+        - Create a new parquet manifest that contains only the new entries
+        - Create a snapshot that records previous files as deleted and the
+          new files as added (logical replace)
+        Args:
+            table: Object exposing a `schema` attribute (a pyarrow.Table-like).
+            author: Required; recorded on the snapshot.
+            commit_message: Optional; defaults to "overwrite by {author}".
+        Raises:
+            TypeError: If `table` has no `schema` attribute.
+            ValueError: If `author` is None.
+        """
+        # Snapshot id is the wall-clock time in milliseconds; the same value is
+        # reused as the snapshot's timestamp_ms below.
+        snapshot_id = int(time.time() * 1000)
+        # Validate inputs before any data is written
+        if not hasattr(table, "schema"):
+            raise TypeError("overwrite() expects a pyarrow.Table-like object")
+        if author is None:
+            raise ValueError("author must be provided when overwriting a dataset")
+        # Write new data and build manifest entries (single table -> single entry)
+        manifest_entry = self._write_table_and_build_entry(table)
+        new_entries = [manifest_entry.to_dict()]
+        # Write manifest containing only the new entries; manifest_path stays
+        # None when the catalog does not offer write_parquet_manifest.
+        manifest_path = None
+        if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
+            manifest_path = self.catalog.write_parquet_manifest(
+                snapshot_id, new_entries, self.metadata.location
+            )
+        # Compute deltas: everything counted in the previous snapshot's summary
+        # is reported as deleted. The previous manifest itself is not read here.
+        prev = self.snapshot(None)
+        prev_total_files = 0
+        prev_total_size = 0
+        prev_total_data_size = 0
+        prev_total_records = 0
+        if prev and prev.summary:
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_data_size = int(prev.summary.get("total-data-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
+        deleted_data_files = prev_total_files
+        deleted_files_size = prev_total_size
+        deleted_data_size = prev_total_data_size
+        deleted_records = prev_total_records
+        # Added figures are summed from the new manifest entries
+        added_data_files = len(new_entries)
+        added_files_size = sum(e.get("file_size_in_bytes", 0) for e in new_entries)
+        added_data_size = sum(e.get("uncompressed_size_in_bytes", 0) for e in new_entries)
+        added_records = sum(e.get("record_count", 0) for e in new_entries)
+        # After an overwrite the totals are exactly what was just added
+        total_data_files = added_data_files
+        total_files_size = added_files_size
+        total_data_size = added_data_size
+        total_records = added_records
+        summary = {
+            "added-data-files": added_data_files,
+            "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
+            "added-records": added_records,
+            "deleted-data-files": deleted_data_files,
+            "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
+            "deleted-records": deleted_records,
+            "total-data-files": total_data_files,
+            "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
+            "total-records": total_records,
+        }
+        # sequence number (falls back to 1 if it cannot be computed)
+        try:
+            next_seq = self._next_sequence_number()
+        except Exception:
+            next_seq = 1
+        # The snapshot being replaced becomes the parent of the new one
+        parent_id = self.metadata.current_snapshot_id
+        if commit_message is None:
+            commit_message = f"overwrite by {author}"
+        snap = Snapshot(
+            snapshot_id=snapshot_id,
+            timestamp_ms=snapshot_id,
+            author=author,
+            sequence_number=next_seq,
+            user_created=True,
+            operation_type="overwrite",
+            parent_snapshot_id=parent_id,
+            manifest_list=manifest_path,
+            schema_id=self.metadata.current_schema_id,
+            commit_message=commit_message,
+            summary=summary,
+        )
+        # Append the new snapshot to the in-memory history (earlier snapshots
+        # are kept) and make it the current one
+        self.metadata.snapshots.append(snap)
+        self.metadata.current_snapshot_id = snapshot_id
+        # Persist the snapshot and metadata only when the catalog supports it
+        if self.catalog and hasattr(self.catalog, "save_snapshot"):
+            self.catalog.save_snapshot(self.identifier, snap)
+        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
+            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
     def add_files(self, files: list[str], author: str = None, commit_message: Optional[str] = None):
         """Add filenames to the dataset manifest without writing the files.
@@ -540,45 +523,20 @@ class SimpleDataset(Dataset):
         prev_total_records = 0
         prev_entries = []
         if prev and prev.summary:
-            try:
-                prev_total_files = int(prev.summary.get("total-data-files", 0))
-            except Exception:
-                prev_total_files = 0
-            try:
-                prev_total_size = int(prev.summary.get("total-files-size", 0))
-            except Exception:
-                prev_total_size = 0
-            try:
-                prev_total_records = int(prev.summary.get("total-records", 0))
-            except Exception:
-                prev_total_records = 0
+            prev_total_files = int(prev.summary.get("total-data-files", 0))
+            prev_total_size = int(prev.summary.get("total-files-size", 0))
+            prev_total_records = int(prev.summary.get("total-records", 0))
         if prev and getattr(prev, "manifest_list", None):
             # try to read prev manifest entries
             try:
                 import pyarrow as pa
                 import pyarrow.parquet as pq
-                if self.io and hasattr(self.io, "new_input"):
-                    inp = self.io.new_input(prev.manifest_list)
-                    with inp.open() as f:
-                        data = f.read()
-                    table = pq.read_table(pa.BufferReader(data))
-                    prev_entries = table.to_pylist()
-                else:
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = prev.manifest_list
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        data = blob.download_as_bytes()
-                        table = pq.read_table(pa.BufferReader(data))
-                        prev_entries = table.to_pylist()
+                inp = self.io.new_input(prev.manifest_list)
+                with inp.open() as f:
+                    data = f.read()
+                table = pq.read_table(pa.BufferReader(data))
+                prev_entries = table.to_pylist()
             except Exception:
                 prev_entries = []
@@ -587,9 +545,7 @@ class SimpleDataset(Dataset):
         }
         # Build new entries for files that don't already exist. Only accept
-        # Parquet files and attempt to read lightweight metadata (bytes,
-        # row count, per-column min/max) from the Parquet footer when
-        # available.
+        # Parquet files and compute full statistics for each file.
         new_entries = []
         seen = set()
         for fp in files:
@@ -600,147 +556,52 @@ class SimpleDataset(Dataset):
                 continue
             seen.add(fp)
-            # Attempt to read file bytes and parquet metadata
-            file_size = 0
-            record_count = 0
-            min_values = []
-            max_values = []
+            # Read file and compute full statistics
             try:
                 import pyarrow as pa
                 import pyarrow.parquet as pq
-                data = None
-                if self.io and hasattr(self.io, "new_input"):
-                    inp = self.io.new_input(fp)
-                    with inp.open() as f:
-                        data = f.read()
-                else:
-                    if (
-                        self.catalog
-                        and getattr(self.catalog, "_storage_client", None)
-                        and getattr(self.catalog, "gcs_bucket", None)
-                    ):
-                        bucket = self.catalog._storage_client.bucket(self.catalog.gcs_bucket)
-                        parsed = fp
-                        if parsed.startswith("gs://"):
-                            parsed = parsed[5 + len(self.catalog.gcs_bucket) + 1 :]
-                        blob = bucket.blob(parsed)
-                        data = blob.download_as_bytes()
+                inp = self.io.new_input(fp)
+                with inp.open() as f:
+                    data = f.read()
                 if data:
+                    # Compute statistics using a single read of the compressed bytes
                     file_size = len(data)
-                    pf = pq.ParquetFile(pa.BufferReader(data))
-                    record_count = int(pf.metadata.num_rows or 0)
-                    # Prefer computing min/max via draken.compress() over
-                    # relying on Parquet footer stats which may contain
-                    # heterogenous or non-numeric values. Fall back to
-                    # footer stats only if draken is unavailable.
-                    try:
-                        import opteryx.draken as draken  # type: ignore
-                        table = pq.read_table(pa.BufferReader(data))
-                        ncols = table.num_columns
-                        mins = [None] * ncols
-                        maxs = [None] * ncols
-                        NULL_FLAG = -(1 << 63)
-                        for ci in range(ncols):
-                            try:
-                                col = table.column(ci)
-                                # combine chunks if needed
-                                if hasattr(col, "combine_chunks"):
-                                    arr = col.combine_chunks()
-                                else:
-                                    arr = col
-                                vec = draken.Vector.from_arrow(arr)
-                                mapped = list(vec.compress())
-                                non_nulls = [m for m in mapped if m != NULL_FLAG]
-                                if non_nulls:
-                                    mins[ci] = int(min(non_nulls))
-                                    maxs[ci] = int(max(non_nulls))
-                                else:
-                                    mins[ci] = None
-                                    maxs[ci] = None
-                            except Exception:
-                                # per-column fallback: leave None
-                                mins[ci] = None
-                                maxs[ci] = None
-                    except Exception:
-                        # Draken not available; fall back to Parquet footer stats
-                        ncols = pf.metadata.num_columns
-                        mins = [None] * ncols
-                        maxs = [None] * ncols
-                        for rg in range(pf.num_row_groups):
-                            for ci in range(ncols):
-                                col_meta = pf.metadata.row_group(rg).column(ci)
-                                stats = getattr(col_meta, "statistics", None)
-                                if not stats:
-                                    continue
-                                smin = getattr(stats, "min", None)
-                                smax = getattr(stats, "max", None)
-                                if smin is None and smax is None:
-                                    continue
-                                def _to_py(v):
-                                    try:
-                                        return int(v)
-                                    except Exception:
-                                        try:
-                                            return float(v)
-                                        except Exception:
-                                            try:
-                                                if isinstance(v, (bytes, bytearray)):
-                                                    return v.decode("utf-8", errors="ignore")
-                                            except Exception:
-                                                pass
-                                            return v
-                                if smin is not None:
-                                    sval = _to_py(smin)
-                                    if mins[ci] is None:
-                                        mins[ci] = sval
-                                    else:
-                                        try:
-                                            if sval < mins[ci]:
-                                                mins[ci] = sval
-                                        except Exception:
-                                            pass
-                                if smax is not None:
-                                    sval = _to_py(smax)
-                                    if maxs[ci] is None:
-                                        maxs[ci] = sval
-                                    else:
-                                        try:
-                                            if sval > maxs[ci]:
-                                                maxs[ci] = sval
-                                        except Exception:
-                                            pass
-                    # normalize lists to empty lists when values missing
-                    min_values = [m for m in mins if m is not None]
-                    max_values = [m for m in maxs if m is not None]
+                    manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
+                else:
+                    # Empty file, create placeholder entry
+                    manifest_entry = ParquetManifestEntry(
+                        file_path=fp,
+                        file_format="parquet",
+                        record_count=0,
+                        null_counts=[],
+                        file_size_in_bytes=0,
+                        uncompressed_size_in_bytes=0,
+                        column_uncompressed_sizes_in_bytes=[],
+                        min_k_hashes=[],
+                        histogram_counts=[],
+                        histogram_bins=0,
+                        min_values=[],
+                        max_values=[],
+                    )
             except Exception:
-                # If metadata read fails, fall back to placeholders
-                file_size = 0
-                record_count = 0
-                min_values = []
-                max_values = []
-            new_entries.append(
-                {
-                    "file_path": fp,
-                    "file_format": "parquet",
-                    "record_count": int(record_count),
-                    "file_size_in_bytes": int(file_size),
-                    "min_k_hashes": [],
-                    "histogram_counts": [],
-                    "histogram_bins": 0,
-                    "min_values": min_values,
-                    "max_values": max_values,
-                }
-            )
+                # If read fails, fall back to placeholders
+                manifest_entry = ParquetManifestEntry(
+                    file_path=fp,
+                    file_format="parquet",
+                    record_count=0,
+                    null_counts=[],
+                    file_size_in_bytes=0,
+                    uncompressed_size_in_bytes=0,
+                    column_uncompressed_sizes_in_bytes=[],
+                    min_k_hashes=[],
+                    histogram_counts=[],
+                    histogram_bins=0,
+                    min_values=[],
+                    max_values=[],
+                )
+            new_entries.append(manifest_entry.to_dict())
         merged_entries = prev_entries + new_entries
@@ -754,41 +615,44 @@ class SimpleDataset(Dataset):
         # Build summary deltas
         added_data_files = len(new_entries)
         added_files_size = 0
+        added_data_size = 0
         added_records = 0
+        # Sum statistics from new entries
+        for entry in new_entries:
+            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+            added_records += entry.get("record_count", 0)
         deleted_data_files = 0
         deleted_files_size = 0
+        deleted_data_size = 0
         deleted_records = 0
+        prev_total_data_size = (
+            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
+        )
         total_data_files = prev_total_files + added_data_files - deleted_data_files
         total_files_size = prev_total_size + added_files_size - deleted_files_size
+        total_data_size = prev_total_data_size + added_data_size - deleted_data_size
         total_records = prev_total_records + added_records - deleted_records
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
         # Sequence number
         try:
-            max_seq = 0
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
@@ -853,7 +717,7 @@ class SimpleDataset(Dataset):
                 prev_total_records = 0
         # Build unique new entries (ignore duplicates in input). Only accept
-        # parquet files and try to read lightweight metadata from each file.
+        # parquet files and compute full statistics for each file.
         new_entries = []
         seen = set()
         for fp in files:
@@ -863,14 +727,7 @@ class SimpleDataset(Dataset):
                 continue
             seen.add(fp)
-            file_size = 0
-            record_count = 0
-            min_values = []
-            max_values = []
             try:
-                import pyarrow as pa
-                import pyarrow.parquet as pq
                 data = None
                 if self.io and hasattr(self.io, "new_input"):
                     inp = self.io.new_input(fp)
@@ -890,80 +747,42 @@ class SimpleDataset(Dataset):
                         data = blob.download_as_bytes()
                 if data:
+                    # Compute statistics using a single read of the compressed bytes
                     file_size = len(data)
-                    pf = pq.ParquetFile(pa.BufferReader(data))
-                    record_count = int(pf.metadata.num_rows or 0)
-                    ncols = pf.metadata.num_columns
-                    mins = [None] * ncols
-                    maxs = [None] * ncols
-                    for rg in range(pf.num_row_groups):
-                        for ci in range(ncols):
-                            col_meta = pf.metadata.row_group(rg).column(ci)
-                            stats = getattr(col_meta, "statistics", None)
-                            if not stats:
-                                continue
-                            smin = getattr(stats, "min", None)
-                            smax = getattr(stats, "max", None)
-                            if smin is None and smax is None:
-                                continue
-                            def _to_py(v):
-                                try:
-                                    return int(v)
-                                except Exception:
-                                    try:
-                                        return float(v)
-                                    except Exception:
-                                        try:
-                                            if isinstance(v, (bytes, bytearray)):
-                                                return v.decode("utf-8", errors="ignore")
-                                        except Exception:
-                                            pass
-                                        return v
-                            if smin is not None:
-                                sval = _to_py(smin)
-                                if mins[ci] is None:
-                                    mins[ci] = sval
-                                else:
-                                    try:
-                                        if sval < mins[ci]:
-                                            mins[ci] = sval
-                                    except Exception:
-                                        pass
-                            if smax is not None:
-                                sval = _to_py(smax)
-                                if maxs[ci] is None:
-                                    maxs[ci] = sval
-                                else:
-                                    try:
-                                        if sval > maxs[ci]:
-                                            maxs[ci] = sval
-                                    except Exception:
-                                        pass
-                    min_values = [m for m in mins if m is not None]
-                    max_values = [m for m in maxs if m is not None]
+                    manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
+                else:
+                    # Empty file, create placeholder entry
+                    manifest_entry = ParquetManifestEntry(
+                        file_path=fp,
+                        file_format="parquet",
+                        record_count=0,
+                        null_counts=[],
+                        file_size_in_bytes=0,
+                        uncompressed_size_in_bytes=0,
+                        column_uncompressed_sizes_in_bytes=[],
+                        min_k_hashes=[],
+                        histogram_counts=[],
+                        histogram_bins=0,
+                        min_values=[],
+                        max_values=[],
+                    )
             except Exception:
-                file_size = 0
-                record_count = 0
-                min_values = []
-                max_values = []
-            new_entries.append(
-                {
-                    "file_path": fp,
-                    "file_format": "parquet",
-                    "record_count": int(record_count),
-                    "file_size_in_bytes": int(file_size),
-                    "min_k_hashes": [],
-                    "histogram_counts": [],
-                    "histogram_bins": 0,
-                    "min_values": min_values,
-                    "max_values": max_values,
-                }
-            )
+                # If read fails, create placeholder entry
+                manifest_entry = ParquetManifestEntry(
+                    file_path=fp,
+                    file_format="parquet",
+                    record_count=0,
+                    null_counts=[],
+                    file_size_in_bytes=0,
+                    uncompressed_size_in_bytes=0,
+                    column_uncompressed_sizes_in_bytes=[],
+                    min_k_hashes=[],
+                    histogram_counts=[],
+                    histogram_bins=0,
+                    min_values=[],
+                    max_values=[],
+                )
+            new_entries.append(manifest_entry.to_dict())
         manifest_path = None
         if self.catalog and hasattr(self.catalog, "write_parquet_manifest"):
@@ -974,42 +793,43 @@ class SimpleDataset(Dataset):
         # Build summary: previous entries become deleted
         deleted_data_files = prev_total_files
         deleted_files_size = prev_total_size
+        deleted_data_size = (
+            int(prev.summary.get("total-data-size", 0)) if prev and prev.summary else 0
+        )
         deleted_records = prev_total_records
         added_data_files = len(new_entries)
         added_files_size = 0
+        added_data_size = 0
         added_records = 0
+        # Sum statistics from new entries
+        for entry in new_entries:
+            added_data_size += entry.get("uncompressed_size_in_bytes", 0)
+            added_records += entry.get("record_count", 0)
         total_data_files = added_data_files
         total_files_size = added_files_size
+        total_data_size = added_data_size
         total_records = added_records
         summary = {
             "added-data-files": added_data_files,
             "added-files-size": added_files_size,
+            "added-data-size": added_data_size,
             "added-records": added_records,
             "deleted-data-files": deleted_data_files,
             "deleted-files-size": deleted_files_size,
+            "deleted-data-size": deleted_data_size,
             "deleted-records": deleted_records,
             "total-data-files": total_data_files,
             "total-files-size": total_files_size,
+            "total-data-size": total_data_size,
             "total-records": total_records,
         }
         # Sequence number
         try:
-            max_seq = 0
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
@@ -1041,14 +861,10 @@ class SimpleDataset(Dataset):
         if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
             self.catalog.save_dataset_metadata(self.identifier, self.metadata)
-    def scan(
-        self, row_filter=None, row_limit=None, snapshot_id: Optional[int] = None
-    ) -> Iterable[Datafile]:
+    def scan(self, row_filter=None, snapshot_id: Optional[int] = None) -> Iterable[Datafile]:
         """Return Datafile objects for the given snapshot.
         - If `snapshot_id` is None, use the current snapshot.
-        - Ignore `row_filter` for now and return all files listed in the
-          snapshot's parquet manifest (if present).
         """
         # Determine snapshot to read using the dataset-level helper which
         # prefers the in-memory current snapshot and otherwise performs a
@@ -1065,8 +881,6 @@ class SimpleDataset(Dataset):
             import pyarrow as pa
             import pyarrow.parquet as pq
-            data = None
             inp = self.io.new_input(manifest_path)
             with inp.open() as f:
                 data = f.read()
@@ -1076,23 +890,536 @@ class SimpleDataset(Dataset):
             table = pq.read_table(pa.BufferReader(data))
             rows = table.to_pylist()
-            cum_rows = 0
             for r in rows:
                 yield Datafile(entry=r)
-                try:
-                    rc = int(r.get("record_count") or 0)
-                except Exception:
-                    rc = 0
-                cum_rows += rc
-                if row_limit is not None and cum_rows >= row_limit:
-                    break
         except FileNotFoundError:
             return iter(())
         except Exception:
             return iter(())
+    def describe(self, snapshot_id: Optional[int] = None, bins: int = 10) -> dict:
+        """Describe all schema columns for the given snapshot.
+        Reads the snapshot's parquet manifest once and folds the per-file
+        statistics stored there into one summary per schema column.
+        - `snapshot_id`: snapshot to describe; passed straight to `self.snapshot`.
+        - `bins`: number of buckets in each column's `distribution`.
+        Returns a dict mapping column name -> statistics (same shape as
+        the previous `describe` per-column output). `distribution` is None
+        when the column has no usable numeric histogram or `bins` is not
+        positive.
+        Raises ValueError when the snapshot has no manifest, the manifest is
+        empty or the schema cannot be resolved; errors raised while reading
+        or parsing the manifest propagate unchanged.
+        """
+        import heapq
+        snap = self.snapshot(snapshot_id)
+        if snap is None or not getattr(snap, "manifest_list", None):
+            raise ValueError("No manifest available for this dataset/snapshot")
+        manifest_path = snap.manifest_list
+        # Read manifest once; any read/parse failure propagates to the caller
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+        inp = self.io.new_input(manifest_path)
+        with inp.open() as f:
+            data = f.read()
+        if not data:
+            raise ValueError("Empty manifest data")
+        table = pq.read_table(pa.BufferReader(data))
+        entries = table.to_pylist()
+        # Resolve schema and describe all columns
+        try:
+            orso_schema = self.schema()
+        except Exception:
+            orso_schema = None
+        if orso_schema is None:
+            raise ValueError("Schema unavailable; cannot describe all columns")
+        # Map column name -> index for every schema column
+        col_to_idx: dict[str, int] = {c.name: i for i, c in enumerate(orso_schema.columns)}
+        # Initialize accumulators per column
+        stats: dict[str, dict] = {
+            name: {
+                "null_count": 0,
+                "mins": [],
+                "maxs": [],
+                "hashes": set(),
+                "file_hist_infos": [],
+                "min_displays": [],
+                "max_displays": [],
+                "uncompressed_bytes": 0,
+            }
+            for name in col_to_idx
+        }
+        total_rows = 0
+        def _at(seq, idx, default=None):
+            # Positional lookup that tolerates short, missing or malformed lists
+            try:
+                return seq[idx]
+            except Exception:
+                return default
+        def _strip_marker(v):
+            # Bytes-like -> bytes without a trailing 0xFF truncation marker
+            b = bytes(v)
+            if b and b[-1] == 0xFF:
+                b = b[:-1]
+            return b
+        def _number_or_text(text):
+            # Prefer int, then float (backward compatibility); otherwise keep
+            # the string itself so text columns still get a min/max
+            try:
+                return int(text)
+            except Exception:
+                try:
+                    return float(text)
+                except Exception:
+                    return text
+        def _decode_minmax(v):
+            # Normalise a stored min/max to a number or a string; None if unusable
+            if v is None:
+                return None
+            if isinstance(v, (int, float)):
+                return v
+            if isinstance(v, str):
+                return _number_or_text(v)
+            if isinstance(v, (bytes, bytearray, memoryview)):
+                try:
+                    return _number_or_text(_strip_marker(v).decode("utf-8"))
+                except Exception:
+                    # Bytes that are not valid UTF-8 are ignored
+                    return None
+            return None
+        def _decode_display(v):
+            # Textual display value as str, or None when absent/unsupported
+            try:
+                if isinstance(v, (bytes, bytearray, memoryview)):
+                    return _strip_marker(v).decode("utf-8", errors="replace")
+                if isinstance(v, str):
+                    return v
+            except Exception:
+                return None
+            return None
+        def _display_prefix(v):
+            # Display value truncated to its first 16 characters
+            try:
+                if isinstance(v, (bytes, bytearray, memoryview)):
+                    return _strip_marker(v).decode("utf-8", errors="replace")[:16]
+                if isinstance(v, str):
+                    return v[:16]
+            except Exception:
+                return None
+            return None
+        # Single pass through entries updating per-column accumulators
+        for ent in entries:
+            if not isinstance(ent, dict):
+                continue
+            total_rows += int(ent.get("record_count") or 0)
+            # prefetch lists
+            ncounts = ent.get("null_counts") or []
+            mks = ent.get("min_k_hashes") or []
+            hists = ent.get("histogram_counts") or []
+            mv = ent.get("min_values") or []
+            xv = ent.get("max_values") or []
+            mv_disp = ent.get("min_values_display") or []
+            xv_disp = ent.get("max_values_display") or []
+            col_sizes = ent.get("column_uncompressed_sizes_in_bytes") or []
+            for cname, cidx in col_to_idx.items():
+                acc = stats[cname]
+                # nulls (best effort: a missing or bad value counts as nothing)
+                try:
+                    acc["null_count"] += int((ncounts or [0])[cidx])
+                except Exception:
+                    pass
+                # mins/maxs
+                dmin = _decode_minmax(_at(mv, cidx))
+                dmax = _decode_minmax(_at(xv, cidx))
+                if dmin is not None:
+                    acc["mins"].append(dmin)
+                if dmax is not None:
+                    acc["maxs"].append(dmax)
+                # collect textual display values when present
+                md = _decode_display(_at(mv_disp, cidx))
+                xd = _decode_display(_at(xv_disp, cidx))
+                if md is not None:
+                    acc["min_displays"].append(md)
+                if xd is not None:
+                    acc["max_displays"].append(xd)
+                # min-k hashes
+                for h in _at(mks, cidx) or []:
+                    try:
+                        acc["hashes"].add(int(h))
+                    except Exception:
+                        pass
+                # histograms: only kept when the file has a non-degenerate
+                # numeric range to spread the bucket counts over
+                col_hist = _at(hists, cidx, [])
+                if col_hist:
+                    try:
+                        if dmin is not None and dmax is not None and dmin != dmax:
+                            acc["file_hist_infos"].append(
+                                (float(dmin), float(dmax), list(col_hist))
+                            )
+                    except Exception:
+                        pass
+                # uncompressed bytes for this column (sum across files)
+                try:
+                    acc["uncompressed_bytes"] += int((col_sizes or [0])[cidx])
+                except Exception:
+                    pass
+        # Build results per column
+        results: dict[str, dict] = {}
+        for cname, cidx in col_to_idx.items():
+            s = stats[cname]
+            # Handle mixed types: separate strings from numbers
+            str_mins = [v for v in s["mins"] if isinstance(v, str)]
+            num_mins = [v for v in s["mins"] if not isinstance(v, str)]
+            str_maxs = [v for v in s["maxs"] if isinstance(v, str)]
+            num_maxs = [v for v in s["maxs"] if not isinstance(v, str)]
+            # Use whichever type has values (strings take precedence for text columns)
+            global_min = None
+            global_max = None
+            if str_mins:
+                global_min = min(str_mins)
+            elif num_mins:
+                global_min = min(num_mins)
+            if str_maxs:
+                global_max = max(str_maxs)
+            elif num_maxs:
+                global_max = max(num_maxs)
+            # kmv approx
+            # NOTE(review): up to 32 hashes are kept but the count is treated as
+            # exact only when fewer than 31 were collected - confirm the intended
+            # threshold against the per-file sketch size.
+            cardinality = 0
+            cardinality_is_exact = False
+            try:
+                collected = s["hashes"]
+                if collected:
+                    smallest = heapq.nsmallest(32, collected)
+                    k = len(smallest)
+                    if k < 31:
+                        cardinality = len(set(smallest))
+                        cardinality_is_exact = True
+                    else:
+                        MAX_HASH = (1 << 64) - 1
+                        R = max(smallest)
+                        if R == 0:
+                            cardinality = len(set(smallest))
+                        else:
+                            cardinality = int((k - 1) * (MAX_HASH + 1) / (R + 1))
+            except Exception:
+                cardinality = 0
+            # distribution via distogram
+            # Only numeric ranges can be bucketed. The type checks come before
+            # the comparison so that string or mixed min/max values (and a
+            # non-positive `bins`) yield no distribution instead of raising.
+            distribution = None
+            if (
+                s["file_hist_infos"]
+                and bins > 0
+                and isinstance(global_min, (int, float))
+                and isinstance(global_max, (int, float))
+                and global_max > global_min
+            ):
+                try:
+                    from opteryx_catalog.maki_nage.distogram import Distogram
+                    from opteryx_catalog.maki_nage.distogram import count as _count_dist
+                    from opteryx_catalog.maki_nage.distogram import count_up_to as _count_up_to
+                    from opteryx_catalog.maki_nage.distogram import merge as _merge_distogram
+                    from opteryx_catalog.maki_nage.distogram import update as _update_distogram
+                    dist_bin_count = max(50, bins * 5)
+                    global_d = Distogram(bin_count=dist_bin_count)
+                    for fmin, fmax, counts in s["file_hist_infos"]:
+                        fbins = len(counts)
+                        if fbins <= 0:
+                            continue
+                        temp = Distogram(bin_count=dist_bin_count)
+                        span = float(fmax - fmin) if fmax != fmin else 0.0
+                        for bi, cnt in enumerate(counts):
+                            if cnt <= 0:
+                                continue
+                            # Represent each file bucket by its midpoint
+                            if span == 0.0:
+                                rep = float(fmin)
+                            else:
+                                rep = fmin + (bi + 0.5) * span / fbins
+                            _update_distogram(temp, float(rep), int(cnt))
+                        global_d = _merge_distogram(global_d, temp)
+                    distribution = [0] * bins
+                    total = int(_count_dist(global_d) or 0)
+                    if total != 0:
+                        prev = 0.0
+                        gmin = float(global_min)
+                        gmax = float(global_max)
+                        for i in range(1, bins + 1):
+                            edge = gmin + (i / bins) * (gmax - gmin)
+                            cum = _count_up_to(global_d, edge) or 0.0
+                            distribution[i - 1] = int(round(cum - prev))
+                            prev = cum
+                        # Put any rounding drift in the last bucket so the
+                        # buckets still sum to the total count
+                        diff = total - sum(distribution)
+                        if diff != 0:
+                            distribution[-1] += diff
+                except Exception:
+                    # Fallback: assign each file bucket's count to the global
+                    # bucket containing its midpoint
+                    distribution = [0] * bins
+                    gspan = float(global_max - global_min)
+                    for fmin, fmax, counts in s["file_hist_infos"]:
+                        fbins = len(counts)
+                        if fbins <= 0:
+                            continue
+                        for bi, cnt in enumerate(counts):
+                            if cnt <= 0:
+                                continue
+                            rep = fmin + (bi + 0.5) * (fmax - fmin) / fbins
+                            gi = int((rep - global_min) / gspan * bins)
+                            if gi < 0:
+                                gi = 0
+                            if gi >= bins:
+                                gi = bins - 1
+                            distribution[gi] += int(cnt)
+            res = {
+                "dataset": self.identifier,
+                "description": getattr(self.metadata, "description", None),
+                "row_count": total_rows,
+                "column": cname,
+                "min": global_min,
+                "max": global_max,
+                "null_count": s["null_count"],
+                "uncompressed_bytes": s["uncompressed_bytes"],
+                "cardinality": cardinality,
+                "cardinality_is_exact": cardinality_is_exact,
+                "distribution": distribution,
+            }
+            # If textual, attempt display prefixes
+            is_text = False
+            try:
+                ctype = getattr(orso_schema.columns[cidx], "type", None)
+                if ctype is not None:
+                    sctype = str(ctype).lower()
+                    # "varchar" is already covered by the "char" test
+                    is_text = "char" in sctype or "string" in sctype
+            except Exception:
+                is_text = False
+            if is_text:
+                # Use only textual display values collected from manifests:
+                # the first non-empty one in manifest order, cut to 16 chars.
+                min_disp = next(
+                    (d for d in map(_display_prefix, s["min_displays"]) if d), None
+                )
+                max_disp = next(
+                    (d for d in map(_display_prefix, s["max_displays"]) if d), None
+                )
+                if min_disp is not None or max_disp is not None:
+                    res["min_display"] = min_disp
+                    res["max_display"] = max_disp
+            results[cname] = res
+        return results
+    def refresh_manifest(self, agent: str, author: Optional[str] = None) -> Optional[int]:
+        """Refresh manifest statistics and create a new snapshot.
+        - `agent`: identifier for the agent performing the refresh (string)
+        - `author`: optional author to record; if omitted uses current snapshot author
+        This recalculates per-file statistics (min/max, record counts, sizes)
+        for every file in the current manifest, writes a new manifest and
+        creates a new snapshot with `user_created=False` and
+        `operation_type='statistics-refresh'`. A data file that cannot be
+        re-read keeps its existing manifest entry.
+        Returns the new `snapshot_id` on success or None when the current
+        manifest cannot be read or parsed; in that case nothing is written.
+        Raises ValueError when there is no current manifest to refresh.
+        """
+        prev = self.snapshot(None)
+        if prev is None or not getattr(prev, "manifest_list", None):
+            raise ValueError("No current manifest available to refresh")
+        # Use same author/commit-timestamp as previous snapshot unless overridden
+        use_author = author if author is not None else getattr(prev, "author", None)
+        snapshot_id = int(time.time() * 1000)
+        import pyarrow as pa
+        import pyarrow.parquet as pq
+        # Read previous manifest entries. Abort on failure: carrying on with an
+        # empty list would publish an empty manifest as the current snapshot
+        # and silently drop every data file from the dataset.
+        try:
+            inp = self.io.new_input(prev.manifest_list)
+            with inp.open() as f:
+                prev_data = f.read()
+            # the manifest is a parquet file, read into a pyarrow Table
+            prev_manifest = pq.read_table(pa.BufferReader(prev_data))
+            prev_rows = prev_manifest.to_pylist()
+        except Exception:
+            return None
+        # Rebuild manifest entries by re-reading each data file
+        entries = []
+        total_files = 0
+        total_size = 0
+        total_data_size = 0
+        total_records = 0
+        for ent in prev_rows:
+            if not isinstance(ent, dict):
+                continue
+            fp = ent.get("file_path")
+            if not fp:
+                continue
+            try:
+                inp = self.io.new_input(fp)
+                with inp.open() as f:
+                    data = f.read()
+                # Full statistics including histograms and k-hashes
+                file_size = len(data)
+                manifest_entry = build_parquet_manifest_entry_from_bytes(data, fp, file_size)
+                dent = manifest_entry.to_dict()
+            except Exception:
+                # Fall back to original entry if re-read fails
+                dent = ent
+            entries.append(dent)
+            total_files += 1
+            total_size += int(dent.get("file_size_in_bytes") or 0)
+            total_data_size += int(dent.get("uncompressed_size_in_bytes") or 0)
+            total_records += int(dent.get("record_count") or 0)
+        # write new manifest
+        manifest_path = self.catalog.write_parquet_manifest(
+            snapshot_id, entries, self.metadata.location
+        )
+        # Build summary: a refresh adds and deletes nothing, only totals change
+        summary = {
+            "added-data-files": 0,
+            "added-files-size": 0,
+            "added-data-size": 0,
+            "added-records": 0,
+            "deleted-data-files": 0,
+            "deleted-files-size": 0,
+            "deleted-data-size": 0,
+            "deleted-records": 0,
+            "total-data-files": total_files,
+            "total-files-size": total_size,
+            "total-data-size": total_data_size,
+            "total-records": total_records,
+        }
+        # sequence number
+        try:
+            next_seq = self._next_sequence_number()
+        except Exception:
+            next_seq = 1
+        parent_id = self.metadata.current_snapshot_id
+        # Agent committer metadata
+        agent_meta = {
+            "timestamp": int(time.time() * 1000),
+            "action": "statistics-refresh",
+            "agent": agent,
+        }
+        snap = Snapshot(
+            snapshot_id=snapshot_id,
+            # keep the previous snapshot's commit timestamp and message
+            timestamp_ms=getattr(prev, "timestamp_ms", snapshot_id),
+            author=use_author,
+            sequence_number=next_seq,
+            user_created=False,
+            operation_type="statistics-refresh",
+            parent_snapshot_id=parent_id,
+            manifest_list=manifest_path,
+            schema_id=self.metadata.current_schema_id,
+            commit_message=getattr(prev, "commit_message", "statistics refresh"),
+            summary=summary,
+        )
+        # attach agent metadata under summary
+        if snap.summary is None:
+            snap.summary = {}
+        snap.summary["agent-committer"] = agent_meta
+        # update in-memory metadata
+        self.metadata.snapshots.append(snap)
+        self.metadata.current_snapshot_id = snapshot_id
+        # persist
+        if self.catalog and hasattr(self.catalog, "save_snapshot"):
+            self.catalog.save_snapshot(self.identifier, snap)
+        if self.catalog and hasattr(self.catalog, "save_dataset_metadata"):
+            self.catalog.save_dataset_metadata(self.identifier, self.metadata)
+        return snapshot_id
     def truncate(self, author: str = None, commit_message: Optional[str] = None) -> None:
-        """Delete all data files and manifests for this table.
+        """Delete all data files and manifests for this dataset.
         This attempts to delete every data file referenced by existing
         Parquet manifests and then delete the manifest files themselves.
@@ -1109,6 +1436,7 @@ class SimpleDataset(Dataset):
         snaps = list(self.metadata.snapshots)
         removed_files = []
         removed_total_size = 0
+        removed_data_size = 0
         for snap in snaps:
             manifest_path = getattr(snap, "manifest_list", None)
@@ -1118,31 +1446,34 @@ class SimpleDataset(Dataset):
             # Read manifest via FileIO if available
             rows = []
             try:
-                if hasattr(io, "new_input"):
-                    inp = io.new_input(manifest_path)
-                    with inp.open() as f:
-                        data = f.read()
-                    table = pq.read_table(pa.BufferReader(data))
-                    rows = table.to_pylist()
+                inp = io.new_input(manifest_path)
+                with inp.open() as f:
+                    data = f.read()
+                table = pq.read_table(pa.BufferReader(data))
+                rows = table.to_pylist()
             except Exception:
                 rows = []
             for r in rows:
                 fp = None
                 fsize = 0
+                data_size = 0
                 if isinstance(r, dict):
                     fp = r.get("file_path")
                     fsize = int(r.get("file_size_in_bytes") or 0)
+                    data_size = int(r.get("uncompressed_size_in_bytes") or 0)
                     if not fp and "data_file" in r and isinstance(r["data_file"], dict):
                         fp = r["data_file"].get("file_path") or r["data_file"].get("path")
                         fsize = int(r["data_file"].get("file_size_in_bytes") or 0)
+                        data_size = int(r["data_file"].get("uncompressed_size_in_bytes") or 0)
                 if fp:
                     removed_files.append(fp)
                     removed_total_size += fsize
+                    removed_data_size += data_size
         # Create a new empty Parquet manifest (entries=[]) to represent the
-        # truncated table for the new snapshot. Do not delete objects.
+        # truncated dataset for the new snapshot. Do not delete objects.
         snapshot_id = int(time.time() * 1000)
         # Do NOT write an empty Parquet manifest when there are no entries.
@@ -1157,29 +1488,21 @@ class SimpleDataset(Dataset):
         summary = {
             "added-data-files": 0,
             "added-files-size": 0,
+            "added-data-size": 0,
             "added-records": 0,
             "deleted-data-files": deleted_count,
             "deleted-files-size": deleted_size,
+            "deleted-data-size": removed_data_size,
             "deleted-records": 0,
             "total-data-files": 0,
             "total-files-size": 0,
+            "total-data-size": 0,
             "total-records": 0,
         }
         # Sequence number
         try:
-            max_seq = 0
-            for s in self.metadata.snapshots:
-                seq = getattr(s, "sequence_number", None)
-                if seq is None:
-                    continue
-                try:
-                    ival = int(seq)
-                except Exception:
-                    continue
-                if ival > max_seq:
-                    max_seq = ival
-            next_seq = max_seq + 1
+            next_seq = self._next_sequence_number()
         except Exception:
             next_seq = 1
@@ -1215,7 +1538,4 @@ class SimpleDataset(Dataset):
         self.metadata.current_snapshot_id = snapshot_id
         if self.catalog and hasattr(self.catalog, "save_snapshot"):
-            try:
-                self.catalog.save_snapshot(self.identifier, snap)
-            except Exception:
-                pass
+            self.catalog.save_snapshot(self.identifier, snap)

opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl

opteryx-catalog 0.4.4__py3-none-any.whl → 0.4.26__py3-none-any.whl