opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. opteryx_catalog/catalog/compaction.py +15 -8
  2. opteryx_catalog/catalog/dataset.py +449 -111
  3. opteryx_catalog/catalog/manifest.py +390 -330
  4. opteryx_catalog/catalog/metadata.py +3 -0
  5. opteryx_catalog/iops/fileio.py +13 -0
  6. opteryx_catalog/maki_nage/__init__.py +8 -0
  7. opteryx_catalog/maki_nage/distogram.py +558 -0
  8. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  9. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  10. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  11. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  12. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  13. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  14. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  15. opteryx_catalog/opteryx_catalog.py +82 -54
  16. opteryx_catalog/webhooks/__init__.py +230 -0
  17. opteryx_catalog/webhooks/events.py +177 -0
  18. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  19. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  20. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  21. scripts/collect_byte_counts.py +42 -0
  22. scripts/emit_full_single_file.py +81 -0
  23. scripts/inspect_manifest_dryrun.py +322 -0
  24. scripts/inspect_single_file.py +147 -0
  25. scripts/inspect_single_file_gcs.py +124 -0
  26. tests/test_collections.py +37 -0
  27. tests/test_describe_uncompressed.py +127 -0
  28. tests/test_refresh_manifest.py +275 -0
  29. tests/test_webhooks.py +177 -0
  30. opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
  31. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  32. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
opteryx_catalog/catalog/manifest.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+import logging
+import time
+from collections import Counter
 from dataclasses import dataclass
 from dataclasses import field
 from typing import Any
@@ -44,6 +47,8 @@ class ParquetManifestEntry:
     histogram_bins: int
     min_values: list
     max_values: list
+    min_values_display: list
+    max_values_display: list
 
     def to_dict(self) -> dict:
         return {
@@ -59,380 +64,435 @@ class ParquetManifestEntry:
             "histogram_bins": self.histogram_bins,
             "min_values": self.min_values,
             "max_values": self.max_values,
+            "min_values_display": self.min_values_display,
+            "max_values_display": self.max_values_display,
         }
 
 
-def build_parquet_manifest_entry(
-    table: Any, file_path: str, file_size_in_bytes: int
-) -> ParquetManifestEntry:
-    """Build a Parquet manifest entry with statistics for a PyArrow table.
+logger = logging.getLogger(__name__)
+_manifest_metrics = Counter()
+
 
-    Args:
-        table: PyArrow table to analyze
-        file_path: Path where the file is stored
-        file_size_in_bytes: Size of the parquet file in bytes
+def _compute_stats_for_arrow_column(col, field_type, file_path: str):
+    """Compute statistics for a single PyArrow column (Array or ChunkedArray).
 
-    Returns:
-        ParquetManifestEntry with computed statistics
+    Returns a tuple: (col_min_k, col_hist, col_min, col_max, min_display, max_display, null_count)
     """
+    import heapq
+
+    import opteryx.draken as draken  # type: ignore
     import pyarrow as pa
 
-    min_k_hashes: list[list[int]] = []
-    histograms: list[list[int]] = []
-    min_values: list[int] = []
-    null_counts: list[int] = []
-    max_values: list[int] = []
+    # Ensure single contiguous array when possible
+    if hasattr(col, "combine_chunks"):
+        try:
+            col = col.combine_chunks()
+        except Exception:
+            # leave as-is
+            pass
 
-    # Use draken for efficient hashing and compression when available.
-    import heapq
+    # Record compress/hash usage
+    _manifest_metrics["hash_calls"] += 1
+    _manifest_metrics["compress_calls"] += 1
 
-    # Try to compute additional per-column statistics when draken is available.
+    col_py = None
     try:
-        import opteryx.draken as draken  # type: ignore
-
-        for col_idx, col in enumerate(table.columns):
-            # hash column values to 64-bit via draken (new cpdef API)
-            vec = draken.Vector.from_arrow(col)
-            hashes = list(vec.hash())
-
-            # Decide whether to compute min-k/histogram for this column based
-            # on field type and, for strings, average length of values.
-            field_type = table.schema.field(col_idx).type
-            compute_min_k = False
-            if (
-                pa.types.is_integer(field_type)
-                or pa.types.is_floating(field_type)
-                or pa.types.is_decimal(field_type)
-            ):
-                compute_min_k = True
-            elif (
-                pa.types.is_timestamp(field_type)
-                or pa.types.is_date(field_type)
-                or pa.types.is_time(field_type)
-            ):
-                compute_min_k = True
-            elif pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
-                # compute average length from non-null values; only allow
-                # min-k/histogram for short strings (avg <= 16)
-                col_py = None
+        vec = draken.Vector.from_arrow(col)
+    except Exception:  # pragma: no cover - be robust
+        raise
+
+    hashes = set(vec.hash())
+
+    # Decide whether to compute min-k/histogram for this column
+    compute_min_k = False
+    if (
+        pa.types.is_integer(field_type)
+        or pa.types.is_floating(field_type)
+        or pa.types.is_decimal(field_type)
+    ):
+        compute_min_k = True
+    elif (
+        pa.types.is_timestamp(field_type)
+        or pa.types.is_date(field_type)
+        or pa.types.is_time(field_type)
+    ):
+        compute_min_k = True
+    elif (
+        pa.types.is_string(field_type)
+        or pa.types.is_large_string(field_type)
+        or pa.types.is_binary(field_type)
+        or pa.types.is_large_binary(field_type)
+    ):
+        # For strings/binary we may need pylist for display
+        try:
+            col_py = col.to_pylist()
+        except Exception:
+            col_py = None
+        compute_min_k = True
+
+    if compute_min_k:
+        smallest = heapq.nsmallest(MIN_K_HASHES, hashes)
+        col_min_k = sorted(smallest)
+    else:
+        col_min_k = []
+
+    import pyarrow as pa  # local import for types
+
+    compute_hist = compute_min_k
+    if pa.types.is_boolean(field_type):
+        compute_hist = True
+
+    # Use draken.compress() to get canonical int64 per value
+    compressed = list(vec.compress())
+    null_count = sum(1 for m in compressed if m == NULL_FLAG)
+
+    non_nulls_compressed = [m for m in compressed if m != NULL_FLAG]
+    if non_nulls_compressed:
+        vmin = min(non_nulls_compressed)
+        vmax = max(non_nulls_compressed)
+        col_min = int(vmin)
+        col_max = int(vmax)
+        if compute_hist:
+            # Special-case boolean histograms
+            if pa.types.is_boolean(field_type):
                 try:
-                    col_py = col.to_pylist()
+                    if col_py is None:
+                        try:
+                            col_py = col.to_pylist()
+                        except Exception:
+                            col_py = None
+                    if col_py is not None:
+                        non_nulls_bool = [v for v in col_py if v is not None]
+                        false_count = sum(1 for v in non_nulls_bool if v is False)
+                        true_count = sum(1 for v in non_nulls_bool if v is True)
+                    else:
+                        # Fallback: infer from compressed mapping (assume 0/1)
+                        false_count = sum(1 for m in non_nulls_compressed if m == 0)
+                        true_count = sum(1 for m in non_nulls_compressed if m != 0)
                 except Exception:
-                    col_py = None
+                    false_count = 0
+                    true_count = 0
 
-                if col_py is not None:
-                    lens = [len(x) for x in col_py if x is not None]
-                    if lens:
-                        avg_len = sum(lens) / len(lens)
-                        if avg_len <= 16:
-                            compute_min_k = True
-
-            # KMV: take K smallest unique hashes when allowed; otherwise
-            # store an empty list for this column. Deduplicate hashes so
-            # the KMV sketch contains unique hashes (avoids duplicates
-            # skewing cardinality estimates).
-            if compute_min_k:
-                unique_hashes = set(hashes)
-                smallest = heapq.nsmallest(MIN_K_HASHES, unique_hashes)
-                col_min_k = sorted(smallest)
+                col_hist = [int(true_count), int(false_count)]
             else:
-                col_min_k = []
-
-            # For histogram decisions follow the same rule as min-k
-            compute_hist = compute_min_k
-
-            # Use draken.compress() to get canonical int64 per value
-            mapped = list(vec.compress())
-            # Compute null count from compressed representation
-            null_count = sum(1 for m in mapped if m == NULL_FLAG)
-            null_counts.append(int(null_count))
-            non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
-            if non_nulls_mapped:
-                vmin = min(non_nulls_mapped)
-                vmax = max(non_nulls_mapped)
-                col_min = int(vmin)
-                col_max = int(vmax)
-                if compute_hist:
-                    if vmin == vmax:
-                        col_hist = [0] * HISTOGRAM_BINS
-                        col_hist[-1] = len(non_nulls_mapped)
-                    else:
-                        col_hist = [0] * HISTOGRAM_BINS
-                        span = float(vmax - vmin)
-                        for m in non_nulls_mapped:
-                            b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
-                            if b < 0:
-                                b = 0
-                            if b >= HISTOGRAM_BINS:
-                                b = HISTOGRAM_BINS - 1
-                            col_hist[b] += 1
+                if vmin == vmax:
+                    col_hist = []
                 else:
                     col_hist = [0] * HISTOGRAM_BINS
-            else:
-                # no non-null values; histogram via hash buckets
-                col_min = NULL_FLAG
-                col_max = NULL_FLAG
-                if compute_hist:
-                    col_hist = [0] * HISTOGRAM_BINS
-                    for h in hashes:
-                        b = (h >> (64 - 5)) & 0x1F
+                    span = float(vmax - vmin)
+                    for m in non_nulls_compressed:
+                        b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
+                        if b < 0:
+                            b = 0
+                        if b >= HISTOGRAM_BINS:
+                            b = HISTOGRAM_BINS - 1
                         col_hist[b] += 1
-                else:
-                    col_hist = [0] * HISTOGRAM_BINS
+        else:
+            col_hist = []
+    else:
+        # no non-null values
+        col_min = NULL_FLAG
+        col_max = NULL_FLAG
+        col_hist = []
 
-            min_k_hashes.append(col_min_k)
-            histograms.append(col_hist)
-            min_values.append(col_min)
-            max_values.append(col_max)
-            # end for
-    except Exception:
-        # Draken not available or failed; leave min_k_hashes/histograms empty
-        min_k_hashes = [[] for _ in table.columns]
-        histograms = [[] for _ in table.columns]
-        # Attempt to compute per-column min/max from the table directly
-        try:
-            for col in table.columns:
+    # display values
+    try:
+        if pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
+            if col_py is None:
                 try:
                     col_py = col.to_pylist()
-                    non_nulls = [v for v in col_py if v is not None]
-                    null_count = len(col_py) - len(non_nulls)
-                    null_counts.append(int(null_count))
-                    if non_nulls:
-                        try:
-                            min_values.append(min(non_nulls))
-                            max_values.append(max(non_nulls))
-                        except Exception:
-                            min_values.append(None)
-                            max_values.append(None)
-                    else:
-                        min_values.append(None)
-                        max_values.append(None)
                 except Exception:
-                    min_values.append(None)
-                    max_values.append(None)
-                    # If we couldn't introspect column values, assume 0 nulls
-                    null_counts.append(0)
-        except Exception:
-            # If even direct inspection fails, ensure lists lengths match
-            min_values = [None] * len(table.columns)
-            max_values = [None] * len(table.columns)
-            null_counts = [0] * len(table.columns)
-
-    # Calculate uncompressed size from table buffers — must be accurate.
-    column_uncompressed: list[int] = []
-    uncompressed_size = 0
-    for col in table.columns:
-        col_total = 0
-        for chunk in col.chunks:
-            try:
-                buffs = chunk.buffers()
-            except Exception as exc:
-                raise RuntimeError(
-                    f"Unable to access chunk buffers to calculate uncompressed size for {file_path}: {exc}"
-                ) from exc
-            for buffer in buffs:
-                if buffer is not None:
-                    col_total += buffer.size
-        column_uncompressed.append(int(col_total))
-        uncompressed_size += col_total
-
-    return ParquetManifestEntry(
-        file_path=file_path,
-        file_format="parquet",
-        record_count=int(table.num_rows),
-        file_size_in_bytes=file_size_in_bytes,
-        uncompressed_size_in_bytes=uncompressed_size,
-        column_uncompressed_sizes_in_bytes=column_uncompressed,
-        null_counts=null_counts,
-        min_k_hashes=min_k_hashes,
-        histogram_counts=histograms,
-        histogram_bins=HISTOGRAM_BINS,
-        min_values=min_values,
-        max_values=max_values,
+                    col_py = None
+            if col_py is not None:
+                non_nulls_str = [x for x in col_py if x is not None]
+                if non_nulls_str:
+                    min_value = min(non_nulls_str)
+                    max_value = max(non_nulls_str)
+                    if len(min_value) > 16:
+                        min_value = min_value[:16] + "..."
+                    if len(max_value) > 16:
+                        max_value = max_value[:16] + "..."
+                    min_display = min_value
+                    max_display = max_value
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+        elif pa.types.is_binary(field_type) or pa.types.is_large_binary(field_type):
+            if col_py is None:
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+            if col_py is not None:
+                non_nulls = [x for x in col_py if x is not None]
+                if non_nulls:
+                    min_value = min(non_nulls)
+                    max_value = max(non_nulls)
+                    if len(min_value) > 16:
+                        min_value = min_value[:16] + "..."
+                    if len(max_value) > 16:
+                        max_value = max_value[:16] + "..."
+                    if any(ord(b) < 32 or ord(b) > 126 for b in min_value):
+                        min_value = min_value.hex()
+                        min_value = min_value[:16] + "..."
+                    if any(ord(b) < 32 or ord(b) > 126 for b in max_value):
+                        max_value = max_value.hex()
+                        max_value = max_value[:16] + "..."
+                    min_display = min_value
+                    max_display = max_value
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+        else:
+            if col_py is None:
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+            if col_py is not None:
+                non_nulls = [x for x in col_py if x is not None]
+                if non_nulls:
+                    min_display = min(non_nulls)
+                    max_display = max(non_nulls)
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+    except Exception:
+        min_display = None
+        max_display = None
+
+    return (
+        col_min_k,
+        col_hist,
+        int(col_min),
+        int(col_max),
+        min_display,
+        max_display,
+        int(null_count),
     )
 
 
-def build_parquet_manifest_minmax_entry(data: bytes, file_path: str) -> ParquetManifestEntry:
-    """Build a Parquet manifest entry with min/max statistics using fast rugo reader.
+def build_parquet_manifest_entry_from_bytes(
+    data_bytes: bytes,
+    file_path: str,
+    file_size_in_bytes: int | None = None,
+    orig_table: Any | None = None,
+) -> ParquetManifestEntry:
+    """Build a manifest entry by reading a parquet file as bytes and scanning column-by-column.
 
-    This is much faster than build_parquet_manifest_entry (microseconds per file)
-    and is suitable for bulk file operations where full statistics are not needed.
+    This reads the compressed file once and materializes one full column at a time
+    (combine_chunks) which keeps peak memory low while letting per-column
+    stat calculation (draken) operate on contiguous arrays.
+    """
+    import pyarrow as pa
+    import pyarrow.parquet as pq
 
-    Args:
-        data: Raw parquet file bytes
-        file_path: Path where the file is stored
+    t_start = time.perf_counter()
+    _manifest_metrics["files_read"] += 1
+    _manifest_metrics["bytes_read"] += len(data_bytes)
 
-    Returns:
-        ParquetManifestEntry with min/max statistics only (no histograms or k-hashes)
-    """
-    file_size = len(data)
+    buf = pa.BufferReader(data_bytes)
+    pf = pq.ParquetFile(buf)
+    meta = pf.metadata
 
-    # Prefer rugo fast metadata reader when available, otherwise fall back
-    # to pyarrow ParquetFile to extract row-group statistics.
+    # Try to read rugo metadata early so we can compute sizes without
+    # materializing the table later. This is zero-copy and fast.
     try:
-        import opteryx.rugo.parquet as parquet_meta
-        from opteryx.compiled.structures.relation_statistics import to_int
+        from opteryx.rugo.parquet import read_metadata_from_memoryview
 
-        if isinstance(data, memoryview):
-            metadata = parquet_meta.read_metadata_from_memoryview(data, include_statistics=True)
-        else:
-            metadata = parquet_meta.read_metadata_from_memoryview(
-                memoryview(data), include_statistics=True
-            )
+        rmeta = read_metadata_from_memoryview(memoryview(data_bytes))
+    except Exception:
+        rmeta = None
 
-        record_count = metadata["num_rows"]
-    except ImportError:
-        # Fallback: use pyarrow to read Parquet metadata
-        import pyarrow as pa
-        import pyarrow.parquet as pq
-
-        pf = pq.ParquetFile(pa.BufferReader(data))
-        record_count = int(pf.metadata.num_rows or 0)
-
-        # Construct minimal metadata structure compatible with expected shape
-        metadata = {"num_rows": record_count, "row_groups": []}
-        for rg in range(pf.num_row_groups):
-            rg_entry = {"columns": []}
-            for ci in range(pf.metadata.num_columns):
-                col_meta = pf.metadata.row_group(rg).column(ci)
-                col_entry = {"name": pf.schema.names[ci]}
-                stats = getattr(col_meta, "statistics", None)
-                if stats:
-                    col_entry["min"] = getattr(stats, "min", None)
-                    col_entry["max"] = getattr(stats, "max", None)
-                rg_entry["columns"].append(col_entry)
-            # total_byte_size may not be available; leave out to trigger full-table calculation later
-            metadata["row_groups"].append(rg_entry)
-
-        # Define a simple to_int fallback for the pyarrow path
-        def to_int(v: object) -> int:
+    # Prepare result containers
+    min_k_hashes: list[list[int]] = []
+    histograms: list[list[int]] = []
+    min_values: list[int] = []
+    null_counts: list[int] = []
+    max_values: list[int] = []
+    min_values_display: list = []
+    max_values_display: list = []
+
+    # iterate schema fields and process each column independently
+    schema = pf.schema_arrow
+    for col_idx, field in enumerate(schema):
+        col_name = field.name
+        try:
+            col_table = pf.read(columns=[col_name])
+            col = col_table.column(0).combine_chunks()
+        except Exception:
+            # fallback: try reading the row group column (more granular)
             try:
-                return int(v)
+                tbl = pf.read_row_group(0, columns=[col_name])
+                col = tbl.column(0).combine_chunks()
             except Exception:
-                try:
-                    if isinstance(v, (bytes, bytearray)):
-                        s = v.decode("utf-8", errors="ignore")
-                        return int(float(s)) if s else 0
-                    return int(float(v))
-                except Exception:
-                    return 0
-
-    # Gather min/max per column across all row groups
-    column_stats = {}
-    for row_group in metadata["row_groups"]:
-        for column in row_group["columns"]:
-            column_name = column["name"]
-
-            if column_name not in column_stats:
-                column_stats[column_name] = {"min": None, "max": None}
-
-            min_value = column.get("min")
-            if min_value is not None:
-                # Compress value to int using to_int
-                min_compressed = to_int(min_value)
-                if column_stats[column_name]["min"] is None:
-                    column_stats[column_name]["min"] = min_compressed
-                else:
-                    column_stats[column_name]["min"] = min(
-                        column_stats[column_name]["min"], min_compressed
-                    )
-
-            max_value = column.get("max")
-            if max_value is not None:
-                # Compress value to int using to_int
-                max_compressed = to_int(max_value)
-                if column_stats[column_name]["max"] is None:
-                    column_stats[column_name]["max"] = max_compressed
-                else:
-                    column_stats[column_name]["max"] = max(
-                        column_stats[column_name]["max"], max_compressed
-                    )
-
-    # Extract min/max values (filter out None)
-    min_values = [stats["min"] for stats in column_stats.values() if stats["min"] is not None]
-    max_values = [stats["max"] for stats in column_stats.values() if stats["max"] is not None]
-
-    # Attempt to gather null counts from metadata row groups if available
-    column_nulls: dict = {}
-    for row_group in metadata["row_groups"]:
-        for column in row_group["columns"]:
-            cname = column["name"]
-            if cname not in column_nulls:
-                column_nulls[cname] = 0
-            nc = column.get("null_count")
-            if nc is not None:
-                try:
-                    column_nulls[cname] += int(nc)
-                except Exception:
-                    pass
-
-    if column_nulls:
-        null_counts = [column_nulls.get(n, 0) for n in column_stats.keys()]
-    else:
-        null_counts = []
+                # Last resort: read entire file and then take the column
+                tbl = pf.read()
+                col = tbl.column(col_idx).combine_chunks()
+
+        # compute stats using existing logic encapsulated in helper
+        (
+            col_min_k,
+            col_hist,
+            col_min,
+            col_max,
+            col_min_display,
+            col_max_display,
+            null_count,
+        ) = _compute_stats_for_arrow_column(col, field.type, file_path)
+
+        # free the table-level reference if present so memory can be reclaimed
+        try:
+            del col_table
+        except Exception:
+            pass
+        try:
+            del tbl
+        except Exception:
+            pass
+
+        min_k_hashes.append(col_min_k)
+        histograms.append(col_hist)
+        min_values.append(col_min)
+        max_values.append(col_max)
+        min_values_display.append(col_min_display)
+        max_values_display.append(col_max_display)
+        null_counts.append(null_count)
+
+    # Calculate uncompressed sizes. When the original in-memory table is
+    # available (we just wrote it), prefer using it so sizes match the
+    # table-based builder exactly. Otherwise materialize the table from
+    # bytes and compute sizes the same way.
+    import pyarrow as pa
+    import pyarrow.parquet as pq
 
-    # Get uncompressed size from metadata; if missing, read full table and
-    # compute accurate uncompressed size from buffers. Also attempt to
-    # compute per-column uncompressed byte counts when reading the table.
-    uncompressed_size = 0
     column_uncompressed: list[int] = []
-    missing = False
-    for row_group in metadata["row_groups"]:
-        v = row_group.get("total_byte_size", None)
-        if v is None:
-            missing = True
-            break
-        uncompressed_size += v
-
-    if missing or uncompressed_size == 0:
-        try:
-            import pyarrow as pa
-            import pyarrow.parquet as pq
+    uncompressed_size = 0
 
-            table = pq.read_table(pa.BufferReader(data))
-            uncompressed_size = 0
-            # Compute per-column uncompressed sizes and null counts from the table
-            for col in table.columns:
-                col_total = 0
-                null_total = 0
-                for chunk in col.chunks:
-                    for buffer in chunk.buffers():
-                        if buffer is not None:
-                            col_total += buffer.size
-                    try:
-                        null_total += int(chunk.null_count)
-                    except Exception:
-                        # Fallback to slow python inspection
-                        try:
-                            col_py = col.to_pylist()
-                            null_total = len(col_py) - len([v for v in col_py if v is not None])
-                        except Exception:
-                            null_total = 0
-
-                column_uncompressed.append(int(col_total))
-                uncompressed_size += col_total
-                null_counts = null_counts or []
-                null_counts.append(int(null_total))
-        except Exception as exc:
-            raise RuntimeError(
-                f"Unable to determine uncompressed size for {file_path}: {exc}"
-            ) from exc
+    # Free references to large objects we no longer need so memory can be reclaimed
+    try:
+        del buf
+    except Exception:
+        pass
+    try:
+        del pf
+    except Exception:
+        pass
+    try:
+        del data_bytes
+    except Exception:
+        pass
+
+    if orig_table is not None:
+        # Use the original table buffers so results match the table-based route
+        for col in orig_table.columns:
+            col_total = 0
+            for chunk in col.chunks:
+                try:
+                    buffs = chunk.buffers()
+                except Exception as exc:
+                    raise RuntimeError(
+                        f"Unable to access chunk buffers to calculate uncompressed size for {file_path}: {exc}"
+                    ) from exc
+                for buffer in buffs:
+                    if buffer is not None:
+                        col_total += buffer.size
+            column_uncompressed.append(int(col_total))
+            uncompressed_size += col_total
     else:
-        # If we didn't read the table and null_counts is still empty, default to zeros
-        if not null_counts:
-            null_counts = [0] * len(column_stats)
+        # Use rugo metadata (if available) to compute per-column uncompressed sizes
+        if rmeta:
+            rgs = rmeta.get("row_groups", [])
+            if rgs:
+                ncols = len(rgs[0].get("columns", []))
+                for cidx in range(ncols):
+                    col_total = 0
+                    for rg in rgs:
+                        cols = rg.get("columns", [])
+                        if cidx < len(cols):
+                            col_total += int(cols[cidx].get("total_byte_size", 0) or 0)
+                    column_uncompressed.append(int(col_total))
+                    uncompressed_size += col_total
+                _manifest_metrics["sizes_from_rugo"] += 1
+            else:
+                column_uncompressed = [0] * len(schema)
+                uncompressed_size = 0
+                _manifest_metrics["sizes_from_rugo_missing"] += 1
+        else:
+            # If rugo metadata isn't available, avoid materializing the table;
+            # emit zero sizes (safe and memory-light) and track that we lacked
+            # metadata for sizes.
+            column_uncompressed = [0] * len(schema)
+            uncompressed_size = 0
+            _manifest_metrics["sizes_from_rugo_unavailable"] += 1
+            logger.debug(
+                "rugo metadata unavailable for %s; emitting zero column sizes to avoid materializing table",
+                file_path,
+            )
 
-    return ParquetManifestEntry(
+    entry = ParquetManifestEntry(
         file_path=file_path,
         file_format="parquet",
-        record_count=int(record_count),
-        file_size_in_bytes=file_size,
+        record_count=int(meta.num_rows),
+        file_size_in_bytes=int(file_size_in_bytes or len(data_bytes)),
         uncompressed_size_in_bytes=uncompressed_size,
         column_uncompressed_sizes_in_bytes=column_uncompressed,
         null_counts=null_counts,
-        min_k_hashes=[],
-        histogram_counts=[],
-        histogram_bins=0,
+        min_k_hashes=min_k_hashes,
+        histogram_counts=histograms,
+        histogram_bins=HISTOGRAM_BINS,
        min_values=min_values,
        max_values=max_values,
+        min_values_display=min_values_display,
+        max_values_display=max_values_display,
+    )
+
+    logger.debug(
+        "build_parquet_manifest_entry_from_bytes %s files=%d dur=%.3fs",
+        file_path,
+        _manifest_metrics["files_read"],
+        time.perf_counter() - t_start,
+    )
+    return entry
+
+
+# Backwards-compatible wrapper that keeps the original calling convention
+# when a pyarrow Table is already provided (tests and some scripts rely on it).
+def build_parquet_manifest_entry(
+    table: Any, file_path: str, file_size_in_bytes: int | None = None
+) -> ParquetManifestEntry:
+    """DEPRECATED: explicit table-based manifest building is removed.
+
+    The implementation previously accepted a PyArrow ``table`` and performed
+    the same per-column statistics calculation. That behavior hid a different
+    IO/scan path and led to inconsistent performance characteristics.
+
+    Use ``build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=None)``
+    instead. If you have an in-memory table you can serialize it and call the
+    bytes-based builder, or pass ``orig_table`` to preserve exact uncompressed
+    size calculations.
+
+    This function now fails fast to avoid silently using the removed path.
+    """
+    raise RuntimeError(
+        "table-based manifest builder removed: use build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=table) instead"
     )
+
+
+def get_manifest_metrics() -> dict:
+    """Return a snapshot of manifest instrumentation counters (for tests/benchmarks)."""
+    return dict(_manifest_metrics)
+
+
+def reset_manifest_metrics() -> None:
+    """Reset the manifest metrics counters to zero."""
+    _manifest_metrics.clear()