opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl
This diff shows the contents of publicly available package versions as published to a supported registry; it is provided for informational purposes only.
- opteryx_catalog/catalog/compaction.py +15 -8
- opteryx_catalog/catalog/dataset.py +449 -111
- opteryx_catalog/catalog/manifest.py +390 -330
- opteryx_catalog/catalog/metadata.py +3 -0
- opteryx_catalog/iops/fileio.py +13 -0
- opteryx_catalog/maki_nage/__init__.py +8 -0
- opteryx_catalog/maki_nage/distogram.py +558 -0
- opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
- opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
- opteryx_catalog/maki_nage/tests/test_count.py +19 -0
- opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
- opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
- opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
- opteryx_catalog/maki_nage/tests/test_update.py +44 -0
- opteryx_catalog/opteryx_catalog.py +82 -54
- opteryx_catalog/webhooks/__init__.py +230 -0
- opteryx_catalog/webhooks/events.py +177 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
- opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
- scripts/collect_byte_counts.py +42 -0
- scripts/emit_full_single_file.py +81 -0
- scripts/inspect_manifest_dryrun.py +322 -0
- scripts/inspect_single_file.py +147 -0
- scripts/inspect_single_file_gcs.py +124 -0
- tests/test_collections.py +37 -0
- tests/test_describe_uncompressed.py +127 -0
- tests/test_refresh_manifest.py +275 -0
- tests/test_webhooks.py +177 -0
- opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
- {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0
+++ scripts/inspect_manifest_dryrun.py
@@ -0,0 +1,322 @@
+"""Dry-run manifest inspector
+
+Usage:
+    python scripts/inspect_manifest_dryrun.py <dataset_identifier> <output_jsonl_path>
+
+Example:
+    python scripts/inspect_manifest_dryrun.py opteryx.ops.audit_log /tmp/audit_log_manifest_dryrun.jsonl
+
+This script is non-mutating: it reads the manifest and referenced data files (when readable),
+recomputes per-file statistics via `build_parquet_manifest_entry_from_bytes`, and writes per-file results
+and a dataset-level summary to the output JSON-lines file.
+"""
+
+import json
+import sys
+import time
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes
+from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+
+
+def safe_read_manifest(ds) -> list:
+    snap = ds.snapshot()
+    if snap is None or not getattr(snap, "manifest_list", None):
+        raise ValueError("No manifest available for this dataset/snapshot")
+    manifest_path = snap.manifest_list
+    try:
+        inp = ds.io.new_input(manifest_path)
+        with inp.open() as f:
+            data = f.read()
+        if not data:
+            return []
+        table = pq.read_table(pa.BufferReader(data))
+        rows = table.to_pylist()
+        return rows
+    except Exception as e:
+        return [{"__error": f"failed to read manifest: {e}"}]
+
+
+def inspect_dataset(dataset_identifier: str, output_path: str, catalog_kwargs: dict):
+    out_file = open(output_path, "w", encoding="utf-8")
+
+    meta = {"dataset": dataset_identifier, "timestamp": int(time.time() * 1000)}
+    out_file.write(json.dumps({"_meta": meta}) + "\n")
+
+    # Allow dataset identifiers in forms:
+    # - 'collection.dataset'
+    # - 'workspace.collection.dataset' (fully-qualified)
+    # If fully-qualified, override workspace from identifier.
+    parts = dataset_identifier.split(".")
+    if len(parts) == 3:
+        wk, collection, dname = parts
+        catalog_kwargs = dict(catalog_kwargs)
+        catalog_kwargs["workspace"] = wk
+        dataset_identifier = f"{collection}.{dname}"
+
+    catalog = OpteryxCatalog(**catalog_kwargs)
+
+    try:
+        ds = catalog.load_dataset(dataset_identifier)
+    except Exception as e:
+        out_file.write(json.dumps({"error": f"failed to load dataset: {e}"}) + "\n")
+        out_file.close()
+        return
+
+    # Describe before
+    try:
+        describe_before = ds.describe()
+    except Exception as e:
+        describe_before = {"__error": f"describe failed: {e}"}
+
+    out_file.write(json.dumps({"describe_before": describe_before}) + "\n")
+
+    # Read manifest rows
+    rows = safe_read_manifest(ds)
+    out_file.write(json.dumps({"manifest_rows_count": len(rows)}) + "\n")
+
+    # Get schema mapping
+    try:
+        orso_schema = ds.schema()
+        col_to_idx = {c.name: i for i, c in enumerate(orso_schema.columns)} if orso_schema else {}
+    except Exception:
+        col_to_idx = {}
+
+    # Aggregators for recomputed per-column uncompressed bytes
+    recomputed_col_bytes = {name: 0 for name in col_to_idx}
+
+    for i, ent in enumerate(rows):
+        rec = {"file_index": i}
+        if not isinstance(ent, dict):
+            rec["error"] = "manifest row not a dict"
+            out_file.write(json.dumps(rec) + "\n")
+            continue
+        rec["file_path"] = ent.get("file_path")
+
+        def _preview(lst, n=6):
+            if lst is None:
+                return None
+            if isinstance(lst, list):
+                if len(lst) > n:
+                    return {"len": len(lst), "preview": lst[:n], "truncated": True}
+                return {"len": len(lst), "preview": lst, "truncated": False}
+            return lst
+
+        rec["manifest_entry_summary"] = {
+            "uncompressed_size": int(ent.get("uncompressed_size_in_bytes") or 0),
+            "column_uncompressed_sizes": _preview(ent.get("column_uncompressed_sizes_in_bytes")),
+            "null_counts": _preview(ent.get("null_counts")),
+            "min_values": _preview(ent.get("min_values")),
+            "max_values": _preview(ent.get("max_values")),
+            "min_values_display": _preview(ent.get("min_values_display")),
+            "max_values_display": _preview(ent.get("max_values_display")),
+            "min_k_hashes": _preview(ent.get("min_k_hashes")),
+            "histogram_counts_len": len(ent.get("histogram_counts") or []),
+        }
+
+        fp = ent.get("file_path")
+        if not fp:
+            rec["recomputed"] = {"error": "no file_path"}
+            out_file.write(json.dumps(rec) + "\n")
+            continue
+
+        # Try to read the data file and recompute stats
+        try:
+            inp = ds.io.new_input(fp)
+            with inp.open() as f:
+                data = f.read()
+            if not data:
+                rec["recomputed"] = {"error": "empty file"}
+                out_file.write(json.dumps(rec) + "\n")
+                continue
+            # Use bytes-based builder for deterministic behavior
+            recomputed_entry = build_parquet_manifest_entry_from_bytes(
+                data, fp, len(data)
+            ).to_dict()
+
+            # Build a compact summary for output
+            recomputed_summary = {
+                "uncompressed_size": int(recomputed_entry.get("uncompressed_size_in_bytes") or 0),
+                "column_uncompressed_sizes": _preview(
+                    recomputed_entry.get("column_uncompressed_sizes_in_bytes")
+                ),
+                "null_counts": _preview(recomputed_entry.get("null_counts")),
+                "min_values": _preview(recomputed_entry.get("min_values")),
+                "max_values": _preview(recomputed_entry.get("max_values")),
+                "min_values_display": _preview(recomputed_entry.get("min_values_display")),
+                "max_values_display": _preview(recomputed_entry.get("max_values_display")),
+                "min_k_hashes": _preview(recomputed_entry.get("min_k_hashes")),
+                "histogram_counts_len": len(recomputed_entry.get("histogram_counts") or []),
+            }
+
+            rec["recomputed"] = recomputed_summary
+
+            # Compare some fields safely and do per-column comparisons (sampled)
+            diffs = {}
+            try:
+                manifest_us = int(ent.get("uncompressed_size_in_bytes") or 0)
+                recomputed_us = recomputed_summary["uncompressed_size"]
+                if manifest_us != recomputed_us:
+                    diffs["uncompressed_size_mismatch"] = {
+                        "manifest": manifest_us,
+                        "recomputed": recomputed_us,
+                    }
+            except Exception:
+                pass
+
+            try:
+                manifest_cols = len(ent.get("column_uncompressed_sizes_in_bytes") or [])
+                recomputed_cols = recomputed_entry.get("column_uncompressed_sizes_in_bytes") or []
+                if manifest_cols != len(recomputed_cols):
+                    diffs["column_uncompressed_length_mismatch"] = {
+                        "manifest_len": manifest_cols,
+                        "recomputed_len": len(recomputed_cols),
+                    }
+            except Exception:
+                pass
+
+            # Per-column array comparisons: sample up to N mismatches
+            def _cmp_lists(manifest_dict, recomputed_dict, field, max_samples=5):
+                man_list = manifest_dict.get(field) or []
+                rec_list = recomputed_dict.get(field) or []
+                mismatches = []
+                for idx in range(min(len(man_list), len(rec_list))):
+                    if man_list[idx] != rec_list[idx]:
+                        mismatches.append(
+                            {"index": idx, "manifest": man_list[idx], "recomputed": rec_list[idx]}
+                        )
+                        if len(mismatches) >= max_samples:
+                            break
+                if mismatches or len(man_list) != len(rec_list):
+                    return {
+                        "mismatch_count": len(mismatches),
+                        "sample_mismatches": mismatches,
+                        "manifest_len": len(man_list),
+                        "recomputed_len": len(rec_list),
+                    }
+                return None
+
+            for field in (
+                "null_counts",
+                "min_values",
+                "max_values",
+                "min_values_display",
+                "max_values_display",
+            ):
+                cmp_res = _cmp_lists(ent, recomputed_entry, field)
+                if cmp_res:
+                    diffs[f"{field}_mismatch"] = cmp_res
+
+            # Compare column uncompressed sizes (sample mismatches)
+            try:
+                man_cols = ent.get("column_uncompressed_sizes_in_bytes") or []
+                rec_cols = recomputed_entry.get("column_uncompressed_sizes_in_bytes") or []
+                col_mismatches = []
+                for idx in range(min(len(man_cols), len(rec_cols))):
+                    if int(man_cols[idx]) != int(rec_cols[idx]):
+                        col_mismatches.append(
+                            {
+                                "index": idx,
+                                "manifest": int(man_cols[idx]),
+                                "recomputed": int(rec_cols[idx]),
+                            }
+                        )
+                        if len(col_mismatches) >= 5:
+                            break
+                if col_mismatches or len(man_cols) != len(rec_cols):
+                    diffs["column_uncompressed_size_mismatch"] = {
+                        "count": len(col_mismatches),
+                        "sample": col_mismatches,
+                        "manifest_len": len(man_cols),
+                        "recomputed_len": len(rec_cols),
+                    }
+            except Exception:
+                pass
+
+            rec["diffs"] = diffs
+
+            # Accumulate recomputed per-column bytes by index -> by name when schema available
+            col_sizes = recomputed_entry.get("column_uncompressed_sizes_in_bytes") or []
+            for cname, cidx in col_to_idx.items():
+                try:
+                    val = int((col_sizes or [0])[cidx])
+                except Exception:
+                    val = 0
+                recomputed_col_bytes[cname] = recomputed_col_bytes.get(cname, 0) + val
+
+        except Exception as e:
+            rec["recomputed"] = {"error": f"failed to read/recompute: {e}"}
+
+        out_file.write(json.dumps(rec) + "\n")
+
+    # Write recomputed per-column summary
+    out_file.write(
+        json.dumps({"recomputed_column_uncompressed_bytes": recomputed_col_bytes}) + "\n"
+    )
+
+    # Attempt to build a recomputed describe-like summary for comparison
+    recompute_describe = {}
+    try:
+        for cname in col_to_idx:
+            recompute_describe[cname] = {"uncompressed_bytes": recomputed_col_bytes.get(cname, 0)}
+    except Exception as e:
+        recompute_describe = {"__error": str(e)}
+
+    out_file.write(json.dumps({"describe_recomputed": recompute_describe}) + "\n")
+
+    out_file.close()
+
+
+if __name__ == "__main__":
+    # Usage: second arg optional; if omitted or outside the repo we write into ./artifacts/
+    if len(sys.argv) < 2:
+        print(
+            "Usage: python scripts/inspect_manifest_dryrun.py <dataset_identifier> [output_jsonl_path]"
+        )
+        sys.exit(2)
+    dataset_identifier = sys.argv[1]
+    out_arg = sys.argv[2] if len(sys.argv) >= 3 else ""
+    # Use environment defaults; allow overriding via env or pass-through edits.
+    import os
+    import sys
+
+    # Allow using local packages (same logic as tests) so we can exercise local OpteryxCatalog
+    sys.path.insert(0, os.path.join(sys.path[0], ".."))  # parent dir (pyiceberg_firestore_gcs)
+    sys.path.insert(1, os.path.join(sys.path[0], "../opteryx-core"))
+    sys.path.insert(1, os.path.join(sys.path[0], "../pyiceberg-firestore-gcs"))
+
+    # Default to the same test-friendly defaults when env vars are unset
+    catalog_kwargs = {
+        "workspace": os.getenv("OPTERYX_WORKSPACE", "opteryx"),
+        "firestore_project": os.getenv("OPTERYX_FIRESTORE_PROJECT", "mabeldev"),
+        "firestore_database": os.getenv("OPTERYX_FIRESTORE_DATABASE", "catalogs"),
+        "gcs_bucket": os.getenv("OPTERYX_GCS_BUCKET", "opteryx_data"),
+    }
+
+    # Always write into the repository's `artifacts/` directory unless an explicit repo-local path was provided
+    repo_root = os.getcwd()
+    artifacts_dir = os.path.join(repo_root, "artifacts")
+    os.makedirs(artifacts_dir, exist_ok=True)
+
+    if out_arg:
+        candidate = os.path.abspath(out_arg)
+        if candidate.startswith(repo_root):
+            output_path = candidate
+        else:
+            output_path = os.path.join(
+                artifacts_dir, f"{dataset_identifier.replace('.', '_')}_manifest_dryrun.jsonl"
+            )
+            print(f"Provided output path {out_arg} is outside the repo; writing to {output_path}")
+    else:
+        output_path = os.path.join(
+            artifacts_dir, f"{dataset_identifier.replace('.', '_')}_manifest_dryrun.jsonl"
+        )
+
+    print(
+        f"Using catalog workspace={catalog_kwargs['workspace']} firestore_project={catalog_kwargs['firestore_project']} gcs_bucket={catalog_kwargs['gcs_bucket']}"
+    )
+    print(f"Writing results to {output_path}")
+    inspect_dataset(dataset_identifier, output_path, catalog_kwargs)
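The dry-run inspector emits JSON Lines: a `_meta` header, `describe_before`, a `manifest_rows_count` line, one record per manifest entry (with a `diffs` mapping when the data file could be re-read), and dataset-level summaries at the end. A minimal sketch of consuming that output to surface files whose recomputed statistics disagree with the manifest; the path below is illustrative, not produced by the package:

    import json

    # Illustrative path; use whatever path the dry-run script printed.
    path = "artifacts/opteryx_ops_audit_log_manifest_dryrun.jsonl"

    with open(path, encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]

    # Per-file records carry "file_path"; "diffs" is present (possibly empty)
    # only when the data file was read and its stats recomputed.
    for r in records:
        if r.get("diffs"):
            print(r.get("file_path"), sorted(r["diffs"].keys()))

    # Dataset-level summaries are emitted as their own lines near the end.
    totals = next((r for r in records if "recomputed_column_uncompressed_bytes" in r), None)
    if totals:
        print(totals["recomputed_column_uncompressed_bytes"])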
+++ scripts/inspect_single_file.py
@@ -0,0 +1,147 @@
+"""Inspect a single manifest file entry and recompute its stats.
+
+Usage:
+    python scripts/inspect_single_file.py <dataset_identifier> <file_path> <output_jsonl_path>
+
+Example:
+    python scripts/inspect_single_file.py opteryx.ops.audit_log gs://.../data-...parquet artifacts/single_file.jsonl
+"""
+
+import json
+import os
+import sys
+import time
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes
+from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+
+
+def _preview(lst, n=6):
+    if lst is None:
+        return None
+    if isinstance(lst, list):
+        if len(lst) > n:
+            return {"len": len(lst), "preview": lst[:n], "truncated": True}
+        return {"len": len(lst), "preview": lst, "truncated": False}
+    return lst
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        print(
+            "Usage: python scripts/inspect_single_file.py <dataset_identifier> <file_path> <output_jsonl_path>"
+        )
+        sys.exit(2)
+
+    dataset_identifier = sys.argv[1]
+    target_fp = sys.argv[2]
+    out_path = sys.argv[3]
+
+    catalog_kwargs = {
+        "workspace": os.getenv("OPTERYX_WORKSPACE", "opteryx"),
+        "firestore_project": os.getenv("OPTERYX_FIRESTORE_PROJECT", "mabeldev"),
+        "gcs_bucket": os.getenv("OPTERYX_GCS_BUCKET", "opteryx_data"),
+    }
+
+    repo_root = os.getcwd()
+    artifacts_dir = os.path.join(repo_root, "artifacts")
+    os.makedirs(artifacts_dir, exist_ok=True)
+
+    if not os.path.abspath(out_path).startswith(repo_root):
+        out_path = os.path.join(artifacts_dir, os.path.basename(out_path))
+
+    # Allow fully-qualified identifiers like 'workspace.collection.dataset'
+    parts = dataset_identifier.split(".")
+    if len(parts) == 3:
+        wk, collection, dname = parts
+        catalog_kwargs = dict(catalog_kwargs)
+        catalog_kwargs["workspace"] = wk
+        dataset_identifier = f"{collection}.{dname}"
+
+    cat = OpteryxCatalog(**catalog_kwargs)
+    ds = cat.load_dataset(dataset_identifier)
+
+    snap = ds.snapshot()
+    if snap is None or not getattr(snap, "manifest_list", None):
+        print("No manifest available", file=sys.stderr)
+        sys.exit(3)
+
+    inp = ds.io.new_input(snap.manifest_list)
+    with inp.open() as f:
+        mbytes = f.read()
+
+    rows = pq.read_table(pa.BufferReader(mbytes)).to_pylist()
+
+    match = None
+    match_idx = None
+    for i, r in enumerate(rows):
+        if r.get("file_path") == target_fp:
+            match = r
+            match_idx = i
+            break
+
+    if match is None:
+        print("Target file not found in manifest", file=sys.stderr)
+        sys.exit(4)
+
+    rec = {"file_index": match_idx, "file_path": target_fp}
+    ent = match
+    rec["manifest_entry_summary"] = {
+        "uncompressed_size": int(ent.get("uncompressed_size_in_bytes") or 0),
+        "column_uncompressed_sizes": _preview(ent.get("column_uncompressed_sizes_in_bytes")),
+        "null_counts": _preview(ent.get("null_counts")),
+        "min_values": _preview(ent.get("min_values")),
+        "max_values": _preview(ent.get("max_values")),
+        "min_values_display": _preview(ent.get("min_values_display")),
+        "max_values_display": _preview(ent.get("max_values_display")),
+        "min_k_hashes": _preview(ent.get("min_k_hashes")),
+    }
+
+    inp2 = ds.io.new_input(target_fp)
+    with inp2.open() as f:
+        data = f.read()
+
+    if not data:
+        rec["recomputed"] = {"error": "empty file"}
+    else:
+        recomputed = build_parquet_manifest_entry_from_bytes(data, target_fp, len(data)).to_dict()
+        rec["recomputed"] = {
+            "uncompressed_size": int(recomputed.get("uncompressed_size_in_bytes") or 0),
+            "column_uncompressed_sizes": _preview(
+                recomputed.get("column_uncompressed_sizes_in_bytes")
+            ),
+            "null_counts": _preview(recomputed.get("null_counts")),
+            "min_values": _preview(recomputed.get("min_values")),
+            "max_values": _preview(recomputed.get("max_values")),
+            "min_values_display": _preview(recomputed.get("min_values_display")),
+            "max_values_display": _preview(recomputed.get("max_values_display")),
+            "min_k_hashes": _preview(recomputed.get("min_k_hashes")),
+        }
+        # diffs
+        diffs = {}
+        if (
+            rec["manifest_entry_summary"]["uncompressed_size"]
+            != rec["recomputed"]["uncompressed_size"]
+        ):
+            diffs["uncompressed_size_mismatch"] = {
+                "manifest": rec["manifest_entry_summary"]["uncompressed_size"],
+                "recomputed": rec["recomputed"]["uncompressed_size"],
+            }
+        man_k = ent.get("min_k_hashes") or []
+        rec_k = recomputed.get("min_k_hashes") or []
+        diffs["min_k_hashes_nonempty_counts"] = {
+            "manifest_nonempty": sum(1 for x in man_k if x),
+            "recomputed_nonempty": sum(1 for x in rec_k if x),
+        }
+        rec["diffs"] = diffs
+
+    with open(out_path, "w", encoding="utf-8") as of:
+        of.write(
+            json.dumps(
+                {"_meta": {"dataset": dataset_identifier, "timestamp": int(time.time() * 1000)}}
+            )
+            + "\n"
+        )
+        of.write(json.dumps(rec) + "\n")
+
+    print("WROTE", out_path)
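The single-file report is two JSON lines: a `_meta` header and one record holding `manifest_entry_summary`, `recomputed`, and (when the data file was non-empty) `diffs`. A small sketch of reading it back and checking the size comparison, assuming the illustrative output path below:

    import json

    path = "artifacts/single_file.jsonl"  # illustrative output path
    with open(path, encoding="utf-8") as f:
        header, record = [json.loads(line) for line in f if line.strip()]

    print("dataset:", header["_meta"]["dataset"])
    mismatch = record.get("diffs", {}).get("uncompressed_size_mismatch")
    if mismatch:
        print("uncompressed size differs:", mismatch)
    else:
        print("uncompressed sizes agree for", record["file_path"])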
+++ scripts/inspect_single_file_gcs.py
@@ -0,0 +1,124 @@
+"""Inspect a single file by scanning manifests in GCS and recomputing stats for that file only.
+
+Usage:
+    python scripts/inspect_single_file_gcs.py <bucket> <manifest_prefix> <target_gs_path> <output_jsonl>
+
+Example:
+    python scripts/inspect_single_file_gcs.py opteryx_data opteryx/ops/audit_log/metadata/ gs://opteryx_data/opteryx/ops/audit_log/data/188f... artifacts/out.jsonl
+"""
+
+import json
+import os
+import sys
+import time
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+from google.cloud import storage
+
+from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes
+
+
+def _preview(lst, n=6):
+    if lst is None:
+        return None
+    if isinstance(lst, list):
+        if len(lst) > n:
+            return {"len": len(lst), "preview": lst[:n], "truncated": True}
+        return {"len": len(lst), "preview": lst, "truncated": False}
+    return lst
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 5:
+        print(
+            "Usage: python scripts/inspect_single_file_gcs.py <bucket> <manifest_prefix> <target_gs_path> <output_jsonl>"
+        )
+        sys.exit(2)
+
+    bucket_name = sys.argv[1]
+    manifest_prefix = sys.argv[2]
+    target_fp = sys.argv[3]
+    out_path = sys.argv[4]
+
+    repo_root = os.getcwd()
+    artifacts = os.path.join(repo_root, "artifacts")
+    os.makedirs(artifacts, exist_ok=True)
+    if not os.path.abspath(out_path).startswith(repo_root):
+        out_path = os.path.join(artifacts, os.path.basename(out_path))
+
+    client = storage.Client()
+
+    blobs = list(client.list_blobs(bucket_name, prefix=manifest_prefix))
+    if not blobs:
+        print("No manifest blobs found", file=sys.stderr)
+        sys.exit(3)
+
+    blobs.sort(key=lambda b: b.updated or b.time_created or 0, reverse=True)
+
+    match_row = None
+    match_idx = None
+    match_manifest = None
+
+    for b in blobs:
+        data = b.download_as_bytes()
+        try:
+            table = pq.read_table(pa.BufferReader(data))
+            rows = table.to_pylist()
+        except Exception:
+            continue
+        for i, r in enumerate(rows):
+            if r.get("file_path") == target_fp:
+                match_row = r
+                match_idx = i
+                match_manifest = b
+                break
+        if match_row:
+            break
+
+    if match_row is None:
+        print("Target file not found in manifests", file=sys.stderr)
+        sys.exit(4)
+
+    # download target file
+    _, rest = target_fp.split("://", 1)
+    bucket2, path = rest.split("/", 1)
+    blob2 = client.bucket(bucket2).blob(path)
+    data = blob2.download_as_bytes()
+
+    recomputed = build_parquet_manifest_entry_from_bytes(data, target_fp, len(data)).to_dict()
+
+    rec = {"file_index": match_idx, "file_path": target_fp}
+    ent = match_row
+    rec["manifest_entry_summary"] = {
+        "uncompressed_size": int(ent.get("uncompressed_size_in_bytes") or 0),
+        "min_k_hashes": _preview(ent.get("min_k_hashes")),
+    }
+    rec["recomputed"] = {
+        "uncompressed_size": int(recomputed.get("uncompressed_size_in_bytes") or 0),
+        "min_k_hashes": _preview(recomputed.get("min_k_hashes")),
+    }
+    man_k = ent.get("min_k_hashes") or []
+    rec_k = recomputed.get("min_k_hashes") or []
+    rec["diffs"] = {
+        "min_k_hashes_nonempty_counts": {
+            "manifest_nonempty": sum(1 for x in man_k if x),
+            "recomputed_nonempty": sum(1 for x in rec_k if x),
+        }
+    }
+
+    with open(out_path, "w", encoding="utf-8") as of:
+        of.write(
+            json.dumps(
+                {
+                    "_meta": {
+                        "source": "gcs-manifest-scan",
+                        "manifest_blob": match_manifest.name,
+                        "timestamp": int(time.time() * 1000),
+                    }
+                }
+            )
+            + "\n"
+        )
+        of.write(json.dumps(rec) + "\n")
+
+    print("WROTE", out_path)
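The GCS variant resolves the target object by splitting the `gs://bucket/key` URI the same way the script does (`split("://", 1)` then `split("/", 1)`). A tiny sketch of that parsing, assuming the URI always carries a scheme, bucket, and key; the helper name is illustrative:

    def split_object_uri(uri: str) -> tuple[str, str]:
        # "gs://opteryx_data/opteryx/ops/audit_log/data/x.parquet"
        # -> ("opteryx_data", "opteryx/ops/audit_log/data/x.parquet")
        _, rest = uri.split("://", 1)
        bucket, key = rest.split("/", 1)
        return bucket, key

    assert split_object_uri("gs://opteryx_data/opteryx/ops/x.parquet") == (
        "opteryx_data",
        "opteryx/ops/x.parquet",
    )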
+++ tests/test_collections.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+
+
+class _Doc:
+    def __init__(self, id_):
+        self.id = id_
+
+
+def test_list_collections_excludes_properties():
+    # Construct catalog without calling __init__ to avoid external I/O
+    c = object.__new__(OpteryxCatalog)
+    c.workspace = "w"
+
+    class MockColl:
+        def stream(self):
+            return [_Doc("$properties"), _Doc("col_a"), _Doc("col_b")]
+
+    c._catalog_ref = MockColl()
+
+    cols = list(c.list_collections())
+    assert "$properties" not in cols
+    assert set(cols) == {"col_a", "col_b"}
+
+
+def test_list_collections_handles_errors():
+    c = object.__new__(OpteryxCatalog)
+    c.workspace = "w"
+
+    class BadColl:
+        def stream(self):
+            raise RuntimeError("boom")
+
+    c._catalog_ref = BadColl()
+
+    assert list(c.list_collections()) == []
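Both tests allocate an `OpteryxCatalog` with `object.__new__` so that `__init__` (and its Firestore/GCS setup) never runs, then attach only the attributes `list_collections` reads. The same pattern in isolation, against a hypothetical class, as a sketch of why skipping the initializer is safe when the method under test only touches attributes you provide:

    class Client:
        def __init__(self, url: str):
            # Imagine this opened a network connection we don't want in unit tests.
            raise RuntimeError("no network in unit tests")

        def label(self) -> str:
            return self._label  # the only attribute this method needs

    c = object.__new__(Client)  # allocate the instance without running __init__
    c._label = "stub"
    assert c.label() == "stub"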