opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff compares the contents of two publicly released versions of this package as they appear in their public registry. It is provided for informational purposes only.
- opteryx_catalog/catalog/compaction.py +15 -8
- opteryx_catalog/catalog/dataset.py +449 -111
- opteryx_catalog/catalog/manifest.py +390 -330
- opteryx_catalog/catalog/metadata.py +3 -0
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +82 -54
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- tests/test_collections.py +37 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
```diff
--- a/opteryx_catalog/catalog/manifest.py
+++ b/opteryx_catalog/catalog/manifest.py
@@ -1,5 +1,8 @@
 from __future__ import annotations

+import logging
+import time
+from collections import Counter
 from dataclasses import dataclass
 from dataclasses import field
 from typing import Any
@@ -44,6 +47,8 @@ class ParquetManifestEntry:
     histogram_bins: int
     min_values: list
     max_values: list
+    min_values_display: list
+    max_values_display: list

     def to_dict(self) -> dict:
         return {
```
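The two `*_display` fields added here carry human-readable copies of the per-column min/max statistics alongside the canonical integer encodings in `min_values`/`max_values`. A minimal sketch of the shape (class name hypothetical; the real dataclass has many more fields):

```python
from dataclasses import dataclass, field

@dataclass
class ManifestEntrySketch:
    # canonical (integer-encoded) min/max per column
    min_values: list = field(default_factory=list)
    max_values: list = field(default_factory=list)
    # human-readable copies, truncated for long strings/binary
    min_values_display: list = field(default_factory=list)
    max_values_display: list = field(default_factory=list)

    def to_dict(self) -> dict:
        return {
            "min_values": self.min_values,
            "max_values": self.max_values,
            "min_values_display": self.min_values_display,
            "max_values_display": self.max_values_display,
        }
```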
```diff
@@ -59,380 +64,435 @@ class ParquetManifestEntry:
             "histogram_bins": self.histogram_bins,
             "min_values": self.min_values,
             "max_values": self.max_values,
+            "min_values_display": self.min_values_display,
+            "max_values_display": self.max_values_display,
         }


-
-
-
-    """Build a Parquet manifest entry with statistics for a PyArrow table.
+logger = logging.getLogger(__name__)
+_manifest_metrics = Counter()
```
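`_manifest_metrics` uses `collections.Counter` as a zero-setup metrics registry: absent keys read as 0, so increments need no initialization, and the `get_manifest_metrics()` / `reset_manifest_metrics()` helpers added at the end of this diff expose and clear it. A quick illustration of that stdlib behavior:

```python
from collections import Counter

metrics = Counter()
metrics["files_read"] += 1      # no need to pre-initialize the key
metrics["bytes_read"] += 4096

print(metrics["hash_calls"])    # 0 - missing keys default to zero
print(dict(metrics))            # {'files_read': 1, 'bytes_read': 4096}

metrics.clear()                 # what reset_manifest_metrics() does
```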
```diff
+

-
-
-        file_path: Path where the file is stored
-        file_size_in_bytes: Size of the parquet file in bytes
+def _compute_stats_for_arrow_column(col, field_type, file_path: str):
+    """Compute statistics for a single PyArrow column (Array or ChunkedArray).

-    Returns:
-        ParquetManifestEntry with computed statistics
+    Returns a tuple: (col_min_k, col_hist, col_min, col_max, min_display, max_display, null_count)
     """
+    import heapq
+
+    import opteryx.draken as draken  # type: ignore
     import pyarrow as pa

- [... 5 deleted lines; content not preserved in this extract ...]
+    # Ensure single contiguous array when possible
+    if hasattr(col, "combine_chunks"):
+        try:
+            col = col.combine_chunks()
+        except Exception:
+            # leave as-is
+            pass

-    #
-
+    # Record compress/hash usage
+    _manifest_metrics["hash_calls"] += 1
+    _manifest_metrics["compress_calls"] += 1

-
+    col_py = None
     try:
- [... 27 deleted lines; content not preserved in this extract ...]
+        vec = draken.Vector.from_arrow(col)
+    except Exception:  # pragma: no cover - be robust
+        raise
+
+    hashes = set(vec.hash())
+
+    # Decide whether to compute min-k/histogram for this column
+    compute_min_k = False
+    if (
+        pa.types.is_integer(field_type)
+        or pa.types.is_floating(field_type)
+        or pa.types.is_decimal(field_type)
+    ):
+        compute_min_k = True
+    elif (
+        pa.types.is_timestamp(field_type)
+        or pa.types.is_date(field_type)
+        or pa.types.is_time(field_type)
+    ):
+        compute_min_k = True
+    elif (
+        pa.types.is_string(field_type)
+        or pa.types.is_large_string(field_type)
+        or pa.types.is_binary(field_type)
+        or pa.types.is_large_binary(field_type)
+    ):
+        # For strings/binary we may need pylist for display
+        try:
+            col_py = col.to_pylist()
+        except Exception:
+            col_py = None
+        compute_min_k = True
+
+    if compute_min_k:
+        smallest = heapq.nsmallest(MIN_K_HASHES, hashes)
+        col_min_k = sorted(smallest)
+    else:
+        col_min_k = []
```
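The k smallest distinct hashes form a KMV ("k minimum values") sketch; the deleted comments further down note that the hashes are deduplicated so repeats don't skew cardinality estimates. A hedged sketch of how such a sketch estimates distinct counts, assuming roughly uniform 64-bit hashes (constant and exact formula are illustrative):

```python
import heapq

MIN_K = 32  # stand-in for MIN_K_HASHES

def kmv_estimate(hashes: set[int]) -> float:
    """Estimate the distinct count from the k smallest unique 64-bit hashes."""
    if len(hashes) <= MIN_K:
        return float(len(hashes))  # small sets are exact
    kth_min = max(heapq.nsmallest(MIN_K, hashes)) or 1
    # Uniform hashes on [0, 2**64): if the k-th minimum sits at relative
    # position p = kth_min / 2**64, there are roughly (k - 1) / p values.
    return (MIN_K - 1) * (2.0 ** 64) / kth_min
```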
```diff
+
+    import pyarrow as pa  # local import for types
+
+    compute_hist = compute_min_k
+    if pa.types.is_boolean(field_type):
+        compute_hist = True
+
+    # Use draken.compress() to get canonical int64 per value
+    compressed = list(vec.compress())
+    null_count = sum(1 for m in compressed if m == NULL_FLAG)
+
+    non_nulls_compressed = [m for m in compressed if m != NULL_FLAG]
+    if non_nulls_compressed:
+        vmin = min(non_nulls_compressed)
+        vmax = max(non_nulls_compressed)
+        col_min = int(vmin)
+        col_max = int(vmax)
+        if compute_hist:
+            # Special-case boolean histograms
+            if pa.types.is_boolean(field_type):
                 try:
-                col_py
+                    if col_py is None:
+                        try:
+                            col_py = col.to_pylist()
+                        except Exception:
+                            col_py = None
+                    if col_py is not None:
+                        non_nulls_bool = [v for v in col_py if v is not None]
+                        false_count = sum(1 for v in non_nulls_bool if v is False)
+                        true_count = sum(1 for v in non_nulls_bool if v is True)
+                    else:
+                        # Fallback: infer from compressed mapping (assume 0/1)
+                        false_count = sum(1 for m in non_nulls_compressed if m == 0)
+                        true_count = sum(1 for m in non_nulls_compressed if m != 0)
                 except Exception:
-
+                    false_count = 0
+                    true_count = 0

-
-                lens = [len(x) for x in col_py if x is not None]
-                if lens:
-                    avg_len = sum(lens) / len(lens)
-                    if avg_len <= 16:
-                        compute_min_k = True
-
-        # KMV: take K smallest unique hashes when allowed; otherwise
-        # store an empty list for this column. Deduplicate hashes so
-        # the KMV sketch contains unique hashes (avoids duplicates
-        # skewing cardinality estimates).
-        if compute_min_k:
-            unique_hashes = set(hashes)
-            smallest = heapq.nsmallest(MIN_K_HASHES, unique_hashes)
-            col_min_k = sorted(smallest)
+                col_hist = [int(true_count), int(false_count)]
             else:
-
-
-            # For histogram decisions follow the same rule as min-k
-            compute_hist = compute_min_k
-
-            # Use draken.compress() to get canonical int64 per value
-            mapped = list(vec.compress())
-            # Compute null count from compressed representation
-            null_count = sum(1 for m in mapped if m == NULL_FLAG)
-            null_counts.append(int(null_count))
-            non_nulls_mapped = [m for m in mapped if m != NULL_FLAG]
-            if non_nulls_mapped:
-                vmin = min(non_nulls_mapped)
-                vmax = max(non_nulls_mapped)
-                col_min = int(vmin)
-                col_max = int(vmax)
-                if compute_hist:
-                    if vmin == vmax:
-                        col_hist = [0] * HISTOGRAM_BINS
-                        col_hist[-1] = len(non_nulls_mapped)
-                    else:
+                if vmin == vmax:
+                    col_hist = []
                 else:
                     col_hist = [0] * HISTOGRAM_BINS
- [... 7 deleted lines; content not preserved in this extract ...]
-                b = (h >> (64 - 5)) & 0x1F
+                    span = float(vmax - vmin)
+                    for m in non_nulls_compressed:
+                        b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
+                        if b < 0:
+                            b = 0
+                        if b >= HISTOGRAM_BINS:
+                            b = HISTOGRAM_BINS - 1
                         col_hist[b] += 1
-
-
+        else:
+            col_hist = []
+    else:
+        # no non-null values
+        col_min = NULL_FLAG
+        col_max = NULL_FLAG
+        col_hist = []
```
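The non-boolean branch builds an equal-width histogram over the compressed int64 values using the clamped binning formula above. A small standalone illustration of the same formula (the `HISTOGRAM_BINS` value is assumed; the real constant lives in manifest.py):

```python
HISTOGRAM_BINS = 32  # assumed for illustration

def equal_width_histogram(values: list[int]) -> list[int]:
    """Bin integer values into HISTOGRAM_BINS equal-width buckets."""
    vmin, vmax = min(values), max(values)
    if vmin == vmax:
        return []  # degenerate column: the new code emits an empty histogram
    hist = [0] * HISTOGRAM_BINS
    span = float(vmax - vmin)
    for m in values:
        b = int(((float(m) - float(vmin)) / span) * (HISTOGRAM_BINS - 1))
        b = max(0, min(b, HISTOGRAM_BINS - 1))  # clamp, as the diff does
        hist[b] += 1
    return hist

# values 0..31 land one per bucket; the max value maps to the last bucket
print(equal_width_histogram(list(range(32))))  # [1, 1, ..., 1]
```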
```diff

- [... 4 deleted lines; content not preserved in this extract ...]
-    # end for
-    except Exception:
-        # Draken not available or failed; leave min_k_hashes/histograms empty
-        min_k_hashes = [[] for _ in table.columns]
-        histograms = [[] for _ in table.columns]
-        # Attempt to compute per-column min/max from the table directly
-        try:
-            for col in table.columns:
+    # display values
+    try:
+        if pa.types.is_string(field_type) or pa.types.is_large_string(field_type):
+            if col_py is None:
                 try:
                     col_py = col.to_pylist()
-                    non_nulls = [v for v in col_py if v is not None]
-                    null_count = len(col_py) - len(non_nulls)
-                    null_counts.append(int(null_count))
-                    if non_nulls:
-                        try:
-                            min_values.append(min(non_nulls))
-                            max_values.append(max(non_nulls))
-                        except Exception:
-                            min_values.append(None)
-                            max_values.append(None)
-                    else:
-                        min_values.append(None)
-                        max_values.append(None)
                 except Exception:
- [... 41 deleted lines; content not preserved in this extract ...]
+                    col_py = None
+            if col_py is not None:
+                non_nulls_str = [x for x in col_py if x is not None]
+                if non_nulls_str:
+                    min_value = min(non_nulls_str)
+                    max_value = max(non_nulls_str)
+                    if len(min_value) > 16:
+                        min_value = min_value[:16] + "..."
+                    if len(max_value) > 16:
+                        max_value = max_value[:16] + "..."
+                    min_display = min_value
+                    max_display = max_value
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+        elif pa.types.is_binary(field_type) or pa.types.is_large_binary(field_type):
+            if col_py is None:
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+            if col_py is not None:
+                non_nulls = [x for x in col_py if x is not None]
+                if non_nulls:
+                    min_value = min(non_nulls)
+                    max_value = max(non_nulls)
+                    if len(min_value) > 16:
+                        min_value = min_value[:16] + "..."
+                    if len(max_value) > 16:
+                        max_value = max_value[:16] + "..."
+                    if any(ord(b) < 32 or ord(b) > 126 for b in min_value):
+                        min_value = min_value.hex()
+                        min_value = min_value[:16] + "..."
+                    if any(ord(b) < 32 or ord(b) > 126 for b in max_value):
+                        max_value = max_value.hex()
+                        max_value = max_value[:16] + "..."
+                    min_display = min_value
+                    max_display = max_value
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+        else:
+            if col_py is None:
+                try:
+                    col_py = col.to_pylist()
+                except Exception:
+                    col_py = None
+            if col_py is not None:
+                non_nulls = [x for x in col_py if x is not None]
+                if non_nulls:
+                    min_display = min(non_nulls)
+                    max_display = max(non_nulls)
+                else:
+                    min_display = None
+                    max_display = None
+            else:
+                min_display = None
+                max_display = None
+    except Exception:
+        min_display = None
+        max_display = None
+
+    return (
+        col_min_k,
+        col_hist,
+        int(col_min),
+        int(col_max),
+        min_display,
+        max_display,
+        int(null_count),
     )
```
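For display, string values longer than 16 characters are truncated with a `"..."` suffix, and binary values containing non-printable bytes are re-rendered as hex before truncating. A compact sketch of that policy (helper name hypothetical, not part of the diff):

```python
def display_value(value):
    """Render a stats value for human display, truncating long strings."""
    if isinstance(value, str):
        return value[:16] + "..." if len(value) > 16 else value
    if isinstance(value, (bytes, bytearray)):
        if any(b < 32 or b > 126 for b in value):  # non-printable bytes
            value = value.hex()
        else:
            value = value.decode("ascii")
        return value[:16] + "..." if len(value) > 16 else value
    return value  # numbers, dates, etc. pass through unchanged

print(display_value("a" * 40))         # 'aaaaaaaaaaaaaaaa...'
print(display_value(b"\x00\x01loaf"))  # '00016c6f6166'
```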
```diff


-def
-
+def build_parquet_manifest_entry_from_bytes(
+    data_bytes: bytes,
+    file_path: str,
+    file_size_in_bytes: int | None = None,
+    orig_table: Any | None = None,
+) -> ParquetManifestEntry:
+    """Build a manifest entry by reading a parquet file as bytes and scanning column-by-column.

-    This
-
+    This reads the compressed file once and materializes one full column at a time
+    (combine_chunks) which keeps peak memory low while letting per-column
+    stat calculation (draken) operate on contiguous arrays.
+    """
+    import pyarrow as pa
+    import pyarrow.parquet as pq

-
-
-
+    t_start = time.perf_counter()
+    _manifest_metrics["files_read"] += 1
+    _manifest_metrics["bytes_read"] += len(data_bytes)

-
-
-
-    file_size = len(data)
+    buf = pa.BufferReader(data_bytes)
+    pf = pq.ParquetFile(buf)
+    meta = pf.metadata

-    #
-    #
+    # Try to read rugo metadata early so we can compute sizes without
+    # materializing the table later. This is zero-copy and fast.
     try:
-
-        from opteryx.compiled.structures.relation_statistics import to_int
+        from opteryx.rugo.parquet import read_metadata_from_memoryview

-
-
-
-        metadata = parquet_meta.read_metadata_from_memoryview(
-            memoryview(data), include_statistics=True
-        )
+        rmeta = read_metadata_from_memoryview(memoryview(data_bytes))
+    except Exception:
+        rmeta = None

- [... 18 deleted lines; content not preserved in this extract ...]
-            col_entry["min"] = getattr(stats, "min", None)
-            col_entry["max"] = getattr(stats, "max", None)
-            rg_entry["columns"].append(col_entry)
-        # total_byte_size may not be available; leave out to trigger full-table calculation later
-        metadata["row_groups"].append(rg_entry)
-
-    # Define a simple to_int fallback for the pyarrow path
-    def to_int(v: object) -> int:
+    # Prepare result containers
+    min_k_hashes: list[list[int]] = []
+    histograms: list[list[int]] = []
+    min_values: list[int] = []
+    null_counts: list[int] = []
+    max_values: list[int] = []
+    min_values_display: list = []
+    max_values_display: list = []
+
+    # iterate schema fields and process each column independently
+    schema = pf.schema_arrow
+    for col_idx, field in enumerate(schema):
+        col_name = field.name
+        try:
+            col_table = pf.read(columns=[col_name])
+            col = col_table.column(0).combine_chunks()
+        except Exception:
+            # fallback: try reading the row group column (more granular)
            try:
-
+                tbl = pf.read_row_group(0, columns=[col_name])
+                col = tbl.column(0).combine_chunks()
            except Exception:
- [... 39 deleted lines; content not preserved in this extract ...]
-        # Extract min/max values (filter out None)
-        min_values = [stats["min"] for stats in column_stats.values() if stats["min"] is not None]
-        max_values = [stats["max"] for stats in column_stats.values() if stats["max"] is not None]
-
-    # Attempt to gather null counts from metadata row groups if available
-    column_nulls: dict = {}
-    for row_group in metadata["row_groups"]:
-        for column in row_group["columns"]:
-            cname = column["name"]
-            if cname not in column_nulls:
-                column_nulls[cname] = 0
-            nc = column.get("null_count")
-            if nc is not None:
-                try:
-                    column_nulls[cname] += int(nc)
-                except Exception:
-                    pass
-
-    if column_nulls:
-        null_counts = [column_nulls.get(n, 0) for n in column_stats.keys()]
-    else:
-        null_counts = []
+                # Last resort: read entire file and then take the column
+                tbl = pf.read()
+                col = tbl.column(col_idx).combine_chunks()
+
+        # compute stats using existing logic encapsulated in helper
+        (
+            col_min_k,
+            col_hist,
+            col_min,
+            col_max,
+            col_min_display,
+            col_max_display,
+            null_count,
+        ) = _compute_stats_for_arrow_column(col, field.type, file_path)
+
+        # free the table-level reference if present so memory can be reclaimed
+        try:
+            del col_table
+        except Exception:
+            pass
+        try:
+            del tbl
+        except Exception:
+            pass
+
+        min_k_hashes.append(col_min_k)
+        histograms.append(col_hist)
+        min_values.append(col_min)
+        max_values.append(col_max)
+        min_values_display.append(col_min_display)
+        max_values_display.append(col_max_display)
+        null_counts.append(null_count)
```
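The loop above is the heart of the low-memory path: it never materializes the whole table, only one column at a time. A minimal standalone version of the same access pattern, using only public pyarrow APIs:

```python
import io

import pyarrow as pa
import pyarrow.parquet as pq

# Build a tiny parquet file in memory so the example is self-contained
table = pa.table({"a": [1, 2, 3], "b": ["x", "y", None]})
sink = io.BytesIO()
pq.write_table(table, sink)
data_bytes = sink.getvalue()

pf = pq.ParquetFile(pa.BufferReader(data_bytes))
for field in pf.schema_arrow:
    # read just this column, then flatten its chunks into one contiguous array
    col = pf.read(columns=[field.name]).column(0).combine_chunks()
    print(field.name, col.null_count, len(col))
```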
```diff
+
+    # Calculate uncompressed sizes. When the original in-memory table is
+    # available (we just wrote it), prefer using it so sizes match the
+    # table-based builder exactly. Otherwise materialize the table from
+    # bytes and compute sizes the same way.
+    import pyarrow as pa
+    import pyarrow.parquet as pq

-    # Get uncompressed size from metadata; if missing, read full table and
-    # compute accurate uncompressed size from buffers. Also attempt to
-    # compute per-column uncompressed byte counts when reading the table.
-    uncompressed_size = 0
     column_uncompressed: list[int] = []
-
-    for row_group in metadata["row_groups"]:
-        v = row_group.get("total_byte_size", None)
-        if v is None:
-            missing = True
-            break
-        uncompressed_size += v
-
-    if missing or uncompressed_size == 0:
-        try:
-            import pyarrow as pa
-            import pyarrow.parquet as pq
+    uncompressed_size = 0

- [... 28 deleted lines; content not preserved in this extract ...]
+    # Free references to large objects we no longer need so memory can be reclaimed
+    try:
+        del buf
+    except Exception:
+        pass
+    try:
+        del pf
+    except Exception:
+        pass
+    try:
+        del data_bytes
+    except Exception:
+        pass
+
+    if orig_table is not None:
+        # Use the original table buffers so results match the table-based route
+        for col in orig_table.columns:
+            col_total = 0
+            for chunk in col.chunks:
+                try:
+                    buffs = chunk.buffers()
+                except Exception as exc:
+                    raise RuntimeError(
+                        f"Unable to access chunk buffers to calculate uncompressed size for {file_path}: {exc}"
+                    ) from exc
+                for buffer in buffs:
+                    if buffer is not None:
+                        col_total += buffer.size
+            column_uncompressed.append(int(col_total))
+            uncompressed_size += col_total
     else:
-        #
-        if
-
+        # Use rugo metadata (if available) to compute per-column uncompressed sizes
+        if rmeta:
+            rgs = rmeta.get("row_groups", [])
+            if rgs:
+                ncols = len(rgs[0].get("columns", []))
+                for cidx in range(ncols):
+                    col_total = 0
+                    for rg in rgs:
+                        cols = rg.get("columns", [])
+                        if cidx < len(cols):
+                            col_total += int(cols[cidx].get("total_byte_size", 0) or 0)
+                    column_uncompressed.append(int(col_total))
+                    uncompressed_size += col_total
+                _manifest_metrics["sizes_from_rugo"] += 1
+            else:
+                column_uncompressed = [0] * len(schema)
+                uncompressed_size = 0
+                _manifest_metrics["sizes_from_rugo_missing"] += 1
+        else:
+            # If rugo metadata isn't available, avoid materializing the table;
+            # emit zero sizes (safe and memory-light) and track that we lacked
+            # metadata for sizes.
+            column_uncompressed = [0] * len(schema)
+            uncompressed_size = 0
+            _manifest_metrics["sizes_from_rugo_unavailable"] += 1
+            logger.debug(
+                "rugo metadata unavailable for %s; emitting zero column sizes to avoid materializing table",
+                file_path,
+            )
```
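Summing `Buffer.size` over every chunk's buffers is how Arrow exposes the in-memory (uncompressed) footprint of a column, including validity bitmaps and string offsets. A self-contained illustration of the same calculation:

```python
import pyarrow as pa

table = pa.table({"a": [1, 2, 3, None], "b": ["hello", "world", None, "!"]})

for name, col in zip(table.column_names, table.columns):
    col_total = 0
    for chunk in col.chunks:
        for buffer in chunk.buffers():
            if buffer is not None:  # the validity buffer may be absent
                col_total += buffer.size
    print(name, col_total, "bytes in memory")
```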
```diff

-
+    entry = ParquetManifestEntry(
         file_path=file_path,
         file_format="parquet",
-        record_count=int(
-        file_size_in_bytes=
+        record_count=int(meta.num_rows),
+        file_size_in_bytes=int(file_size_in_bytes or len(data_bytes)),
         uncompressed_size_in_bytes=uncompressed_size,
         column_uncompressed_sizes_in_bytes=column_uncompressed,
         null_counts=null_counts,
-        min_k_hashes=
-        histogram_counts=
-        histogram_bins=
+        min_k_hashes=min_k_hashes,
+        histogram_counts=histograms,
+        histogram_bins=HISTOGRAM_BINS,
         min_values=min_values,
         max_values=max_values,
+        min_values_display=min_values_display,
+        max_values_display=max_values_display,
+    )
+
+    logger.debug(
+        "build_parquet_manifest_entry_from_bytes %s files=%d dur=%.3fs",
+        file_path,
+        _manifest_metrics["files_read"],
+        time.perf_counter() - t_start,
+    )
+    return entry
+
+
+# Backwards-compatible wrapper that keeps the original calling convention
+# when a pyarrow Table is already provided (tests and some scripts rely on it).
+def build_parquet_manifest_entry(
+    table: Any, file_path: str, file_size_in_bytes: int | None = None
+) -> ParquetManifestEntry:
+    """DEPRECATED: explicit table-based manifest building is removed.
+
+    The implementation previously accepted a PyArrow ``table`` and performed
+    the same per-column statistics calculation. That behavior hid a different
+    IO/scan path and led to inconsistent performance characteristics.
+
+    Use ``build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=None)``
+    instead. If you have an in-memory table you can serialize it and call the
+    bytes-based builder, or pass ``orig_table`` to preserve exact uncompressed
+    size calculations.
+
+    This function now fails fast to avoid silently using the removed path.
+    """
+    raise RuntimeError(
+        "table-based manifest builder removed: use build_parquet_manifest_entry_from_bytes(data_bytes, file_path, file_size_in_bytes, orig_table=table) instead"
     )
```
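Callers that still hold a pyarrow Table migrate by serializing it and calling the bytes-based builder, passing `orig_table` so uncompressed sizes come from the original buffers, as the docstring advises. A sketch of that call pattern (the path is illustrative):

```python
import io

import pyarrow as pa
import pyarrow.parquet as pq

from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes

table = pa.table({"id": [1, 2, 3], "name": ["a", "b", "c"]})

# Serialize the in-memory table, then hand the bytes to the builder
sink = io.BytesIO()
pq.write_table(table, sink)
data_bytes = sink.getvalue()

entry = build_parquet_manifest_entry_from_bytes(
    data_bytes,
    "warehouse/example.parquet",     # illustrative file path
    file_size_in_bytes=len(data_bytes),
    orig_table=table,                # preserves exact uncompressed sizes
)
print(entry.record_count)
```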
```diff
+
+
+def get_manifest_metrics() -> dict:
+    """Return a snapshot of manifest instrumentation counters (for tests/benchmarks)."""
+    return dict(_manifest_metrics)
+
+
+def reset_manifest_metrics() -> None:
+    """Reset the manifest metrics counters to zero."""
+    _manifest_metrics.clear()
```