opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. opteryx_catalog/catalog/compaction.py +15 -8
  2. opteryx_catalog/catalog/dataset.py +449 -111
  3. opteryx_catalog/catalog/manifest.py +390 -330
  4. opteryx_catalog/catalog/metadata.py +3 -0
  5. opteryx_catalog/iops/fileio.py +13 -0
  6. opteryx_catalog/maki_nage/__init__.py +8 -0
  7. opteryx_catalog/maki_nage/distogram.py +558 -0
  8. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  9. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  10. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  11. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  12. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  13. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  14. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  15. opteryx_catalog/opteryx_catalog.py +82 -54
  16. opteryx_catalog/webhooks/__init__.py +230 -0
  17. opteryx_catalog/webhooks/events.py +177 -0
  18. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  19. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  20. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  21. scripts/collect_byte_counts.py +42 -0
  22. scripts/emit_full_single_file.py +81 -0
  23. scripts/inspect_manifest_dryrun.py +322 -0
  24. scripts/inspect_single_file.py +147 -0
  25. scripts/inspect_single_file_gcs.py +124 -0
  26. tests/test_collections.py +37 -0
  27. tests/test_describe_uncompressed.py +127 -0
  28. tests/test_refresh_manifest.py +275 -0
  29. tests/test_webhooks.py +177 -0
  30. opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
  31. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  32. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
opteryx_catalog/catalog/manifest.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+import logging
+import time
+from collections import Counter
 from dataclasses import dataclass
 from dataclasses import field
 from typing import Any
@@ -44,6 +47,8 @@ class ParquetManifestEntry:
     histogram_bins: int
     min_values: list
     max_values: list
+    min_values_display: list
+    max_values_display: list
 
     def to_dict(self) -> dict:
         return {
@@ -59,380 +64,435 @@ class ParquetManifestEntry:
             "histogram_bins": self.histogram_bins,
             "min_values": self.min_values,
             "max_values": self.max_values,
+            "min_values_display": self.min_values_display,
+            "max_values_display": self.max_values_display,
         }
 
 
-def build_parquet_manifest_entry(
-    table: Any, file_path: str, file_size_in_bytes: int
-) -> ParquetManifestEntry:
-    """Build a Parquet manifest entry with statistics for a PyArrow table.
+logger = logging.getLogger(__name__)
+_manifest_metrics = Counter()
+
 
-    Args:
-        table: PyArrow table to analyze
-        file_path: Path where the file is stored
-        file_size_in_bytes: Size of the parquet file in bytes
+def _compute_stats_for_arrow_column(col, field_type, file_path: str):
+    """Compute statistics for a single PyArrow column (Array or ChunkedArray).
 
-    Returns:
-        ParquetManifestEntry with computed statistics
+    Returns a tuple: (col_min_k, col_hist, col_min, col_max, min_display, max_display, null_count)
     """
+    import heapq
+
+    import opteryx.draken as draken  # type: ignore
     import pyarrow as pa
 
-    min_k_hashes: list[list[int]] = []
-    histograms: list[list[int]] = []
-    min_values: list[int] = []
-    null_counts: list[int] = []
-    max_values: list[int] = []
+    # Ensure single contiguous array when possible
+    if hasattr(col, "combine_chunks"):
+        try:
+            col = col.combine_chunks()
+        except Exception:
+            # leave as-is
+            pass
 
-    # Use draken for efficient hashing and compression when available.
-    import heapq
+    # Record compress/hash usage
+    _manifest_metrics["hash_calls"] += 1
+    _manifest_metrics["compress_calls"] += 1
 
-    # Try to compute additional per-column statistics when draken is available.
+    col_py = None
     try:
-        import opteryx.draken as draken  # type: ignore
-
-        for col_idx, col in enumerate(table.columns):
-            # hash column values to 64-bit via draken (new cpdef API)
-            vec = draken.Vector.from_arrow(col)
-            hashes = list(vec.hash())
-
-            # Decide whether to compute min-k/histogram for this column based
-            # on field type and, for strings, average length of values.
-            field_type = table.schema.field(col_idx).type
-            compute_min_k = False
-            if (
-                pa.types.is_integer(field_type)
-                or pa.types.is_floating(field_type)
-                or pa.types.is_decimal(field_type)
-            ):
-                compute_min_k = True
-            elif (
-                pa.types.is_timestamp(field_type)
-                or pa.types.is_date(field_type)
-                or pa.types.is_time(field_type)
-            ):
-                compute_min_k = True
-            elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
-                # compute average length from non-null values; only allow
-                # min-k/histogram for short strings (avg <= 16)
-                col_py = None
+        vec = draken.Vector.from_arrow(col)
+    except Exception:  # pragma: no cover - be robust
+        raise
+
+    hashes = set(vec.hash())
+
+    # Decide whether to compute min-k/histogram for this column
+    compute_min_k = False
+    if (
+        pa.types.is_integer(field_type)
+        or pa.types.is_floating(field_type)
+        or pa.types.is_decimal(field_type)
+    ):
+        compute_min_k = True
+    elif (
+        pa.types.is_timestamp(field_type)
+        or pa.types.is_date(field_type)
+        or pa.types.is_time(field_type)
+    ):
+        compute_min_k = True
+    elif (
+        pa.types.is_string(field_type)
+        or pa.types.is_large_string(field_type)
+        or pa.types.is_binary(field_type)
+        or pa.types.is_large_binary(field_type)
+    ):
+        # For strings/binary we may need pylist for display
+        try:
+            col_py = col.to_pylist()
+        except Exception:
+            col_py = None
+        compute_min_k = True
+
+    if compute_min_k:
+        smallest = heapq.nsmallest(MIN_K_HASHES, hashes)
+        col_min_k = sorted(smallest)
+    else:
+        col_min_k = []
+
+    import pyarrow as pa  # local import for types
+
+    compute_hist = compute_min_k
+    if pa.types.is_boolean(field_type):
+        compute_hist = True
+
+    # Use draken.compress() to get canonical int64 per value
+    compressed = list(vec.compress())
+    null_count = sum(1 for m in compressed if m == NULL_FLAG)
+
+    non_nulls_compressed = [m for m in compressed if m != NULL_FLAG]
+    if non_nulls_compressed:
+        vmin = min(non_nulls_compressed)
+        vmax = max(non_nulls_compressed)
+        col_min = int(vmin)
+        col_max = int(vmax)
+        if compute_hist:
+            # Special-case boolean histograms
+            if pa.types.is_boolean(field_type):
                 try:
-                    col_py = col.to_pylist()
+                    if col_py is None:
+                        try:
+                            col_py = col.to_pylist()
+                        except Exception:
+                            col_py = None
+                    if col_py is not None:
+                        non_nulls_bool = [v for v in col_py if v is not None]
+                        false_count = sum(1 for v in non_nulls_bool if v is False)
+                        true_count = sum(1 for v in non_nulls_bool if v is True)
+                    else:
+                        # Fallback: infer from compressed mapping (assume 0/1)
+                        false_count = sum(1 for m in non_nulls_compressed if m == 0)
+                        true_count = sum(1 for m in non_nulls_compressed if m != 0)
                 except Exception:
-                    col_py = None
+                    false_count = 0
+                    true_count = 0
 
-                if col_py is not None:
-                    lens = [len(x) for x in col_py if x is not None]
-                    if lens:
-                        avg_len = sum(lens) / len(lens)
-                        if avg_len <= 16:
-                            compute_min_k = True
-
-            # KMV: take K smallest unique hashes when allowed; otherwise
-            # store an empty list for this column. Deduplicate hashes so
-            # the KMV sketch contains unique hashes (avoids duplicates
-            # skewing cardinality estimates).
-            if compute_min_k:
-                unique_hashes = set(hashes)
-                smallest = heapq.nsmallest(MIN_K_HASHES, unique_hashes)
-                col_min_k = sorted(smallest)
+                col_hist = [int(true_count), int(false_count)]
             else:
-                col_min_k = []
-
-            # For histogram decisions follow the same rule as min-k
-            compute_hist = compute_min_k
-
-            # Use draken.compress() to get canonical int64 per value
-            mapped = list(vec.compress())
-            # Compute null count from compressed representation
-            null_count = sum(1 for m in mapped if m == NULL_FLAG)
-            null_counts.append(int(null_count))
-            non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
-            if non_nulls_mapped:
-                vmin = min(non_nulls_mapped)
-                vmax = max(non_nulls_mapped)
-                col_min = int(vmin)
-                col_max = int(vmax)
-                if compute_hist:
-                    if vmin == vmax:
-                        col_hist = [0] * HISTOGRAM_BINS
-                        col_hist[-1] = len(non_nulls_mapped)
-                    else:
-                        col_hist = [0] * HISTOGRAM_BINS
-                        span = float(vmax - vmin)
-                        for m in non_nulls_mapped:
-                            b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
-                            if b < 0:
-                                b = 0
-                            if b >= HISTOGRAM_BINS:
-                                b = HISTOGRAM_BINS - 1
-                            col_hist[b] += 1
+                if vmin == vmax:
+                    col_hist = []
                 else:
                     col_hist = [0] * HISTOGRAM_BINS
-            else:
-                # no non-null values; histogram via hash buckets
-                col_min = NULL_FLAG
-                col_max = NULL_FLAG
-                if compute_hist:
-                    col_hist = [0] * HISTOGRAM_BINS
-                    for h in hashes:
-                        b = (h >> (64 - 5)) & 0x1F
+                    span = float(vmax - vmin)
+                    for m in non_nulls_compressed:
+                        b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
+                        if b < 0:
+                            b = 0
+                        if b >= HISTOGRAM_BINS:
+                            b = HISTOGRAM_BINS - 1
                         col_hist[b] += 1
-                else:
-                    col_hist = [0] * HISTOGRAM_BINS
+        else:
+            col_hist = []
+    else:
+        # no non-null values
+        col_min = NULL_FLAG
+        col_max = NULL_FLAG
+        col_hist = []
 
-            min_k_hashes.append(col_min_k)
-            histograms.append(col_hist)
-            min_values.append(col_min)
-            max_values.append(col_max)
-            # end for
-    except Exception:
-        # Draken not available or failed; leave min_k_hashes/histograms empty
-        min_k_hashes = [[] for _ in table.columns]
-        histograms = [[] for _ in table.columns]
-        # Attempt to compute per-column min/max from the table directly
-        try:
-            for col in table.columns:
+    # display values
+    try:
+        if pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
+            if col_py is None:
                 try:
                     col_py = col.to_pylist()
-                    non_nulls = [v for v in col_py if v is not None]
-                    null_count = len(col_py) - len(non_nulls)
-                    null_counts.append(int(null_count))
-                    if non_nulls:
-                        try:
-                            min_values.append(min(non_nulls))
-                            max_values.append(max(non_nulls))
-                        except Exception:
-                            min_values.append(None)
-                            max_values.append(None)
-                    else:
-                        min_values.append(None)
-                        max_values.append(None)
                 except Exception:
-                    min_values.append(None)
-                    max_values.append(None)
-                    # If we couldn't introspect column values, assume 0 nulls
-                    null_counts.append(0)
-        except Exception:
-            # If even direct inspection fails, ensure lists lengths match
-            min_values = [None] * len(table.columns)
-            max_values = [None] * len(table.columns)
-            null_counts = [0] * len(table.columns)
-
-    # Calculate uncompressed size from table buffers — must be accurate.
-    column_uncompressed: list[int] = []
-    uncompressed_size = 0
-    for col in table.columns:
-        col_total = 0
-        for chunk in col.chunks:
-            try:
-                buffs = chunk.buffers()
-            except Exception as exc:
-                raise RuntimeError(
-                    f"Unable to access chunk buffers to calculate uncompressed size for {file_path}: {exc}"
-                ) from exc
-            for buffer in buffs:
-                if buffer is not None:
-                    col_total += buffer.size
-        column_uncompressed.append(int(col_total))
-        uncompressed_size += col_total
-
-    return ParquetManifestEntry(
-        file_path=file_path,
-        file_format="parquet",
-        record_count=int(table.num_rows),
-        file_size_in_bytes=file_size_in_bytes,
-        uncompressed_size_in_bytes=uncompressed_size,
-        column_uncompressed_sizes_in_bytes=column_uncompressed,
-        null_counts=null_counts,
-        min_k_hashes=min_k_hashes,
-        histogram_counts=histograms,
-        histogram_bins=HISTOGRAM_BINS,
-        min_values=min_values,
-        max_values=max_values,
+                    col_py = None
+            if col_py is not None:
+                non_nulls_str = [x for x in col_py if x is not None]
+                if non_nulls_str:
+                    min_value = min(non_nulls_str)
+                    max_value = max(non_nulls_str)
+                    if len(min_value) > 16:
+                        min_value = min_value[:16] + "..."
+                    if len(max_value) > 16:
+                        max_value = max_value[:16] + "..."
+                    min_display = min_value
+                    max_display = max_value
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+        elif pa.types.is_binary(field_type) or pa.types.is_large_binary(field_type):
+            if col_py is None:
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+            if col_py is not None:
+                non_nulls = [x for x in col_py if x is not None]
+                if non_nulls:
+                    min_value = min(non_nulls)
+                    max_value = max(non_nulls)
+                    if len(min_value) > 16:
+                        min_value = min_value[:16] + "..."
+                    if len(max_value) > 16:
+                        max_value = max_value[:16] + "..."
+                    if any(ord(b) < 32 or ord(b) > 126 for b in min_value):
+                        min_value = min_value.hex()
+                        min_value = min_value[:16] + "..."
+                    if any(ord(b) < 32 or ord(b) > 126 for b in max_value):
+                        max_value = max_value.hex()
+                        max_value = max_value[:16] + "..."
+                    min_display = min_value
+                    max_display = max_value
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+        else:
+            if col_py is None:
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+            if col_py is not None:
+                non_nulls = [x for x in col_py if x is not None]
+                if non_nulls:
+                    min_display = min(non_nulls)
+                    max_display = max(non_nulls)
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+    except Exception:
+        min_display = None
+        max_display = None
+
+    return (
+        col_min_k,
+        col_hist,
+        int(col_min),
+        int(col_max),
+        min_display,
+        max_display,
+        int(null_count),
     )
 
 
-def build_parquet_manifest_minmax_entry(data: bytes, file_path: str) -> ParquetManifestEntry:
-    """Build a Parquet manifest entry with min/max statistics using fast rugo reader.
+def build_parquet_manifest_entry_from_bytes(
+    data_bytes: bytes,
+    file_path: str,
+    file_size_in_bytes: int | None = None,
+    orig_table: Any | None = None,
+) -> ParquetManifestEntry:
+    """Build a manifest entry by reading a parquet file as bytes and scanning column-by-column.
 
-    This is much faster than build_parquet_manifest_entry (microseconds per file)
-    and is suitable for bulk file operations where full statistics are not needed.
+    This reads the compressed file once and materializes one full column at a time
+    (combine_chunks) which keeps peak memory low while letting per-column
+    stat calculation (draken) operate on contiguous arrays.
+    """
+    import pyarrow as pa
+    import pyarrow.parquet as pq
 
-    Args:
-        data: Raw parquet file bytes
-        file_path: Path where the file is stored
+    t_start = time.perf_counter()
+    _manifest_metrics["files_read"] += 1
+    _manifest_metrics["bytes_read"] += len(data_bytes)
 
-    Returns:
-        ParquetManifestEntry with min/max statistics only (no histograms or k-hashes)
-    """
-    file_size = len(data)
+    buf = pa.BufferReader(data_bytes)
+    pf = pq.ParquetFile(buf)
+    meta = pf.metadata
 
-    # Prefer rugo fast metadata reader when available, otherwise fall back
-    # to pyarrow ParquetFile to extract row-group statistics.
+    # Try to read rugo metadata early so we can compute sizes without
+    # materializing the table later. This is zero-copy and fast.
     try:
-        import opteryx.rugo.parquet as parquet_meta
-        from opteryx.compiled.structures.relation_statistics import to_int
+        from opteryx.rugo.parquet import read_metadata_from_memoryview
 
-        if isinstance(data, memoryview):
-            metadata = parquet_meta.read_metadata_from_memoryview(data, include_statistics=True)
-        else:
-            metadata = parquet_meta.read_metadata_from_memoryview(
-                memoryview(data), include_statistics=True
-            )
+        rmeta = read_metadata_from_memoryview(memoryview(data_bytes))
+    except Exception:
+        rmeta = None
 
-        record_count = metadata["num_rows"]
-    except ImportError:
-        # Fallback: use pyarrow to read Parquet metadata
-        import pyarrow as pa
-        import pyarrow.parquet as pq
-
-        pf = pq.ParquetFile(pa.BufferReader(data))
-        record_count = int(pf.metadata.num_rows or 0)
-
-        # Construct minimal metadata structure compatible with expected shape
-        metadata = {"num_rows": record_count, "row_groups": []}
-        for rg in range(pf.num_row_groups):
-            rg_entry = {"columns": []}
-            for ci in range(pf.metadata.num_columns):
-                col_meta = pf.metadata.row_group(rg).column(ci)
-                col_entry = {"name": pf.schema.names[ci]}
-                stats = getattr(col_meta, "statistics", None)
-                if stats:
-                    col_entry["min"] = getattr(stats, "min", None)
-                    col_entry["max"] = getattr(stats, "max", None)
-                rg_entry["columns"].append(col_entry)
-            # total_byte_size may not be available; leave out to trigger full-table calculation later
-            metadata["row_groups"].append(rg_entry)
-
-        # Define a simple to_int fallback for the pyarrow path
-        def to_int(v: object) -> int:
+    # Prepare result containers
+    min_k_hashes: list[list[int]] = []
+    histograms: list[list[int]] = []
+    min_values: list[int] = []
+    null_counts: list[int] = []
+    max_values: list[int] = []
+    min_values_display: list = []
+    max_values_display: list = []
+
+    # iterate schema fields and process each column independently
+    schema = pf.schema_arrow
+    for col_idx, field in enumerate(schema):
+        col_name = field.name
+        try:
+            col_table = pf.read(columns=[col_name])
+            col = col_table.column(0).combine_chunks()
+        except Exception:
+            # fallback: try reading the row group column (more granular)
             try:
-                return int(v)
+                tbl = pf.read_row_group(0, columns=[col_name])
+                col = tbl.column(0).combine_chunks()
             except Exception:
-                try:
-                    if isinstance(v, (bytes, bytearray)):
-                        s = v.decode("utf-8", errors="ignore")
-                        return int(float(s)) if s else 0
-                    return int(float(v))
-                except Exception:
-                    return 0
-
-    # Gather min/max per column across all row groups
-    column_stats = {}
-    for row_group in metadata["row_groups"]:
-        for column in row_group["columns"]:
-            column_name = column["name"]
-
-            if column_name not in column_stats:
-                column_stats[column_name] = {"min": None, "max": None}
-
-            min_value = column.get("min")
-            if min_value is not None:
-                # Compress value to int using to_int
-                min_compressed = to_int(min_value)
-                if column_stats[column_name]["min"] is None:
-                    column_stats[column_name]["min"] = min_compressed
-                else:
-                    column_stats[column_name]["min"] = min(
-                        column_stats[column_name]["min"], min_compressed
-                    )
-
-            max_value = column.get("max")
-            if max_value is not None:
-                # Compress value to int using to_int
-                max_compressed = to_int(max_value)
-                if column_stats[column_name]["max"] is None:
-                    column_stats[column_name]["max"] = max_compressed
-                else:
-                    column_stats[column_name]["max"] = max(
-                        column_stats[column_name]["max"], max_compressed
-                    )
-
-    # Extract min/max values (filter out None)
-    min_values = [stats["min"] for stats in column_stats.values() if stats["min"] is not None]
-    max_values = [stats["max"] for stats in column_stats.values() if stats["max"] is not None]
-
-    # Attempt to gather null counts from metadata row groups if available
-    column_nulls: dict = {}
-    for row_group in metadata["row_groups"]:
-        for column in row_group["columns"]:
-            cname = column["name"]
-            if cname not in column_nulls:
-                column_nulls[cname] = 0
-            nc = column.get("null_count")
-            if nc is not None:
-                try:
-                    column_nulls[cname] += int(nc)
-                except Exception:
-                    pass
-
-    if column_nulls:
-        null_counts = [column_nulls.get(n, 0) for n in column_stats.keys()]
-    else:
-        null_counts = []
+                # Last resort: read entire file and then take the column
+                tbl = pf.read()
+                col = tbl.column(col_idx).combine_chunks()
+
+        # compute stats using existing logic encapsulated in helper
+        (
+            col_min_k,
+            col_hist,
+            col_min,
+            col_max,
+            col_min_display,
+            col_max_display,
+            null_count,
+        ) = _compute_stats_for_arrow_column(col, field.type, file_path)
+
+        # free the table-level reference if present so memory can be reclaimed
+        try:
+            del col_table
+        except Exception:
+            pass
+        try:
+            del tbl
+        except Exception:
+            pass
+
+        min_k_hashes.append(col_min_k)
+        histograms.append(col_hist)
+        min_values.append(col_min)
+        max_values.append(col_max)
+        min_values_display.append(col_min_display)
+        max_values_display.append(col_max_display)
+        null_counts.append(null_count)
+
+    # Calculate uncompressed sizes. When the original in-memory table is
+    # available (we just wrote it), prefer using it so sizes match the
+    # table-based builder exactly. Otherwise materialize the table from
+    # bytes and compute sizes the same way.
+    import pyarrow as pa
+    import pyarrow.parquet as pq
 
-    # Get uncompressed size from metadata; if missing, read full table and
-    # compute accurate uncompressed size from buffers. Also attempt to
-    # compute per-column uncompressed byte counts when reading the table.
-    uncompressed_size = 0
     column_uncompressed: list[int] = []
-    missing = False
-    for row_group in metadata["row_groups"]:
-        v = row_group.get("total_byte_size", None)
-        if v is None:
-            missing = True
-            break
-        uncompressed_size += v
-
-    if missing or uncompressed_size == 0:
-        try:
-            import pyarrow as pa
-            import pyarrow.parquet as pq
+    uncompressed_size = 0
 
-            table = pq.read_table(pa.BufferReader(data))
-            uncompressed_size = 0
-            # Compute per-column uncompressed sizes and null counts from the table
-            for col in table.columns:
-                col_total = 0
-                null_total = 0
-                for chunk in col.chunks:
-                    for buffer in chunk.buffers():
-                        if buffer is not None:
-                            col_total += buffer.size
-                    try:
-                        null_total += int(chunk.null_count)
-                    except Exception:
-                        # Fallback to slow python inspection
-                        try:
-                            col_py = col.to_pylist()
-                            null_total = len(col_py) - len([v for v in col_py if v is not None])
-                        except Exception:
-                            null_total = 0
-
-                column_uncompressed.append(int(col_total))
-                uncompressed_size += col_total
-                null_counts = null_counts or []
-                null_counts.append(int(null_total))
-        except Exception as exc:
-            raise RuntimeError(
-                f"Unable to determine uncompressed size for {file_path}: {exc}"
-            ) from exc
+    # Free references to large objects we no longer need so memory can be reclaimed
+    try:
+        del buf
+    except Exception:
+        pass
+    try:
+        del pf
+    except Exception:
+        pass
+    try:
+        del data_bytes
+    except Exception:
+        pass
+
+    if orig_table is not None:
+        # Use the original table buffers so results match the table-based route
+        for col in orig_table.columns:
+            col_total = 0
+            for chunk in col.chunks:
+                try:
+                    buffs = chunk.buffers()
+                except Exception as exc:
+                    raise RuntimeError(
+                        f"Unable to access chunk buffers to calculate uncompressed size for {file_path}: {exc}"
+                    ) from exc
+                for buffer in buffs:
+                    if buffer is not None:
+                        col_total += buffer.size
+            column_uncompressed.append(int(col_total))
+            uncompressed_size += col_total
     else:
-        # If we didn't read the table and null_counts is still empty, default to zeros
-        if not null_counts:
-            null_counts = [0] * len(column_stats)
+        # Use rugo metadata (if available) to compute per-column uncompressed sizes
+        if rmeta:
+            rgs = rmeta.get("row_groups", [])
+            if rgs:
+                ncols = len(rgs[0].get("columns", []))
+                for cidx in range(ncols):
+                    col_total = 0
+                    for rg in rgs:
+                        cols = rg.get("columns", [])
+                        if cidx < len(cols):
+                            col_total += int(cols[cidx].get("total_byte_size", 0) or 0)
+                    column_uncompressed.append(int(col_total))
+                    uncompressed_size += col_total
+                _manifest_metrics["sizes_from_rugo"] += 1
+            else:
+                column_uncompressed = [0] * len(schema)
+                uncompressed_size = 0
+                _manifest_metrics["sizes_from_rugo_missing"] += 1
+        else:
+            # If rugo metadata isn't available, avoid materializing the table;
+            # emit zero sizes (safe and memory-light) and track that we lacked
+            # metadata for sizes.
+            column_uncompressed = [0] * len(schema)
+            uncompressed_size = 0
+            _manifest_metrics["sizes_from_rugo_unavailable"] += 1
+            logger.debug(
+                "rugo metadata unavailable for %s; emitting zero column sizes to avoid materializing table",
+                file_path,
+            )
 
-    return ParquetManifestEntry(
+    entry = ParquetManifestEntry(
         file_path=file_path,
         file_format="parquet",
-        record_count=int(record_count),
-        file_size_in_bytes=file_size,
+        record_count=int(meta.num_rows),
+        file_size_in_bytes=int(file_size_in_bytes or len(data_bytes)),
         uncompressed_size_in_bytes=uncompressed_size,
         column_uncompressed_sizes_in_bytes=column_uncompressed,
         null_counts=null_counts,
-        min_k_hashes=[],
-        histogram_counts=[],
-        histogram_bins=0,
+        min_k_hashes=min_k_hashes,
+        histogram_counts=histograms,
+        histogram_bins=HISTOGRAM_BINS,
        min_values=min_values,
        max_values=max_values,
+        min_values_display=min_values_display,
+        max_values_display=max_values_display,
+    )
+
+    logger.debug(
+        "build_parquet_manifest_entry_from_bytes %s files=%d dur=%.3fs",
+        file_path,
+        _manifest_metrics["files_read"],
+        time.perf_counter() - t_start,
+    )
+    return entry
+
+
+# Backwards-compatible wrapper that keeps the original calling convention
+# when a pyarrow Table is already provided (tests and some scripts rely on it).
+def build_parquet_manifest_entry(
+    table: Any, file_path: str, file_size_in_bytes: int | None = None
+) -> ParquetManifestEntry:
+    """DEPRECATED: explicit table-based manifest building is removed.
+
+    The implementation previously accepted a PyArrow ``table`` and performed
+    the same per-column statistics calculation. That behavior hid a different
+    IO/scan path and led to inconsistent performance characteristics.
+
+    Use ``build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=None)``
+    instead. If you have an in-memory table you can serialize it and call the
+    bytes-based builder, or pass ``orig_table`` to preserve exact uncompressed
+    size calculations.
+
+    This function now fails fast to avoid silently using the removed path.
+    """
+    raise RuntimeError(
+        "table-based manifest builder removed: use build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=table) instead"
     )
+
+
+def get_manifest_metrics() -> dict:
+    """Return a snapshot of manifest instrumentation counters (for tests/benchmarks)."""
+    return dict(_manifest_metrics)
+
+
+def reset_manifest_metrics() -> None:
+    """Reset the manifest metrics counters to zero."""
+    _manifest_metrics.clear()