opteryx-catalog 0.4.11__py3-none-any.whl → 0.4.26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. opteryx_catalog/catalog/compaction.py +15 -8
  2. opteryx_catalog/catalog/dataset.py +449 -111
  3. opteryx_catalog/catalog/manifest.py +390 -330
  4. opteryx_catalog/catalog/metadata.py +3 -0
  5. opteryx_catalog/iops/fileio.py +13 -0
  6. opteryx_catalog/maki_nage/__init__.py +8 -0
  7. opteryx_catalog/maki_nage/distogram.py +558 -0
  8. opteryx_catalog/maki_nage/tests/_test_histogram.py +52 -0
  9. opteryx_catalog/maki_nage/tests/test_bounds.py +24 -0
  10. opteryx_catalog/maki_nage/tests/test_count.py +19 -0
  11. opteryx_catalog/maki_nage/tests/test_count_at.py +89 -0
  12. opteryx_catalog/maki_nage/tests/test_quantile.py +81 -0
  13. opteryx_catalog/maki_nage/tests/test_stats.py +25 -0
  14. opteryx_catalog/maki_nage/tests/test_update.py +44 -0
  15. opteryx_catalog/opteryx_catalog.py +82 -54
  16. opteryx_catalog/webhooks/__init__.py +230 -0
  17. opteryx_catalog/webhooks/events.py +177 -0
  18. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/METADATA +15 -18
  19. opteryx_catalog-0.4.26.dist-info/RECORD +45 -0
  20. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/WHEEL +1 -1
  21. scripts/collect_byte_counts.py +42 -0
  22. scripts/emit_full_single_file.py +81 -0
  23. scripts/inspect_manifest_dryrun.py +322 -0
  24. scripts/inspect_single_file.py +147 -0
  25. scripts/inspect_single_file_gcs.py +124 -0
  26. tests/test_collections.py +37 -0
  27. tests/test_describe_uncompressed.py +127 -0
  28. tests/test_refresh_manifest.py +275 -0
  29. tests/test_webhooks.py +177 -0
  30. opteryx_catalog-0.4.11.dist-info/RECORD +0 -25
  31. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/licenses/LICENSE +0 -0
  32. {opteryx_catalog-0.4.11.dist-info → opteryx_catalog-0.4.26.dist-info}/top_level.txt +0 -0

scripts/inspect_manifest_dryrun.py
@@ -0,0 +1,322 @@
+"""Dry-run manifest inspector
+
+Usage:
+    python scripts/inspect_manifest_dryrun.py <dataset_identifier> <output_jsonl_path>
+
+Example:
+    python scripts/inspect_manifest_dryrun.py opteryx.ops.audit_log /tmp/audit_log_manifest_dryrun.jsonl
+
+This script is non-mutating: it reads the manifest and referenced data files (when readable),
+recomputes per-file statistics via `build_parquet_manifest_entry_from_bytes`, and writes per-file results
+and a dataset-level summary to the output JSON-lines file.
+"""
+
+import json
+import sys
+import time
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes
+from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+
+
+def safe_read_manifest(ds) -> list:
+    snap = ds.snapshot()
+    if snap is None or not getattr(snap, "manifest_list", None):
+        raise ValueError("No manifest available for this dataset/snapshot")
+    manifest_path = snap.manifest_list
+    try:
+        inp = ds.io.new_input(manifest_path)
+        with inp.open() as f:
+            data = f.read()
+        if not data:
+            return []
+        table = pq.read_table(pa.BufferReader(data))
+        rows = table.to_pylist()
+        return rows
+    except Exception as e:
+        return [{"__error": f"failed to read manifest: {e}"}]
+
+
+def inspect_dataset(dataset_identifier: str, output_path: str, catalog_kwargs: dict):
+    out_file = open(output_path, "w", encoding="utf-8")
+
+    meta = {"dataset": dataset_identifier, "timestamp": int(time.time() * 1000)}
+    out_file.write(json.dumps({"_meta": meta}) + "\n")
+
+    # Allow dataset identifiers in forms:
+    #   - 'collection.dataset'
+    #   - 'workspace.collection.dataset' (fully-qualified)
+    # If fully-qualified, override workspace from identifier.
+    parts = dataset_identifier.split(".")
+    if len(parts) == 3:
+        wk, collection, dname = parts
+        catalog_kwargs = dict(catalog_kwargs)
+        catalog_kwargs["workspace"] = wk
+        dataset_identifier = f"{collection}.{dname}"
+
+    catalog = OpteryxCatalog(**catalog_kwargs)
+
+    try:
+        ds = catalog.load_dataset(dataset_identifier)
+    except Exception as e:
+        out_file.write(json.dumps({"error": f"failed to load dataset: {e}"}) + "\n")
+        out_file.close()
+        return
+
+    # Describe before
+    try:
+        describe_before = ds.describe()
+    except Exception as e:
+        describe_before = {"__error": f"describe failed: {e}"}
+
+    out_file.write(json.dumps({"describe_before": describe_before}) + "\n")
+
+    # Read manifest rows
+    rows = safe_read_manifest(ds)
+    out_file.write(json.dumps({"manifest_rows_count": len(rows)}) + "\n")
+
+    # Get schema mapping
+    try:
+        orso_schema = ds.schema()
+        col_to_idx = {c.name: i for i, c in enumerate(orso_schema.columns)} if orso_schema else {}
+    except Exception:
+        col_to_idx = {}
+
+    # Aggregators for recomputed per-column uncompressed bytes
+    recomputed_col_bytes = {name: 0 for name in col_to_idx}
+
+    for i, ent in enumerate(rows):
+        rec = {"file_index": i}
+        if not isinstance(ent, dict):
+            rec["error"] = "manifest row not a dict"
+            out_file.write(json.dumps(rec) + "\n")
+            continue
+        rec["file_path"] = ent.get("file_path")
+
+        def _preview(lst, n=6):
+            if lst is None:
+                return None
+            if isinstance(lst, list):
+                if len(lst) > n:
+                    return {"len": len(lst), "preview": lst[:n], "truncated": True}
+                return {"len": len(lst), "preview": lst, "truncated": False}
+            return lst
+
+        rec["manifest_entry_summary"] = {
+            "uncompressed_size": int(ent.get("uncompressed_size_in_bytes") or 0),
+            "column_uncompressed_sizes": _preview(ent.get("column_uncompressed_sizes_in_bytes")),
+            "null_counts": _preview(ent.get("null_counts")),
+            "min_values": _preview(ent.get("min_values")),
+            "max_values": _preview(ent.get("max_values")),
+            "min_values_display": _preview(ent.get("min_values_display")),
+            "max_values_display": _preview(ent.get("max_values_display")),
+            "min_k_hashes": _preview(ent.get("min_k_hashes")),
+            "histogram_counts_len": len(ent.get("histogram_counts") or []),
+        }
+
+        fp = ent.get("file_path")
+        if not fp:
+            rec["recomputed"] = {"error": "no file_path"}
+            out_file.write(json.dumps(rec) + "\n")
+            continue
+
+        # Try to read the data file and recompute stats
+        try:
+            inp = ds.io.new_input(fp)
+            with inp.open() as f:
+                data = f.read()
+            if not data:
+                rec["recomputed"] = {"error": "empty file"}
+                out_file.write(json.dumps(rec) + "\n")
+                continue
+            # Use bytes-based builder for deterministic behavior
+            recomputed_entry = build_parquet_manifest_entry_from_bytes(
+                data, fp, len(data)
+            ).to_dict()
+
+            # Build a compact summary for output
+            recomputed_summary = {
+                "uncompressed_size": int(recomputed_entry.get("uncompressed_size_in_bytes") or 0),
+                "column_uncompressed_sizes": _preview(
+                    recomputed_entry.get("column_uncompressed_sizes_in_bytes")
+                ),
+                "null_counts": _preview(recomputed_entry.get("null_counts")),
+                "min_values": _preview(recomputed_entry.get("min_values")),
+                "max_values": _preview(recomputed_entry.get("max_values")),
+                "min_values_display": _preview(recomputed_entry.get("min_values_display")),
+                "max_values_display": _preview(recomputed_entry.get("max_values_display")),
+                "min_k_hashes": _preview(recomputed_entry.get("min_k_hashes")),
+                "histogram_counts_len": len(recomputed_entry.get("histogram_counts") or []),
+            }
+
+            rec["recomputed"] = recomputed_summary
+
+            # Compare some fields safely and do per-column comparisons (sampled)
+            diffs = {}
+            try:
+                manifest_us = int(ent.get("uncompressed_size_in_bytes") or 0)
+                recomputed_us = recomputed_summary["uncompressed_size"]
+                if manifest_us != recomputed_us:
+                    diffs["uncompressed_size_mismatch"] = {
+                        "manifest": manifest_us,
+                        "recomputed": recomputed_us,
+                    }
+            except Exception:
+                pass
+
+            try:
+                manifest_cols = len(ent.get("column_uncompressed_sizes_in_bytes") or [])
+                recomputed_cols = recomputed_entry.get("column_uncompressed_sizes_in_bytes") or []
+                if manifest_cols != len(recomputed_cols):
+                    diffs["column_uncompressed_length_mismatch"] = {
+                        "manifest_len": manifest_cols,
+                        "recomputed_len": len(recomputed_cols),
+                    }
+            except Exception:
+                pass
+
+            # Per-column array comparisons: sample up to N mismatches
+            def _cmp_lists(manifest_dict, recomputed_dict, field, max_samples=5):
+                man_list = manifest_dict.get(field) or []
+                rec_list = recomputed_dict.get(field) or []
+                mismatches = []
+                for idx in range(min(len(man_list), len(rec_list))):
+                    if man_list[idx] != rec_list[idx]:
+                        mismatches.append(
+                            {"index": idx, "manifest": man_list[idx], "recomputed": rec_list[idx]}
+                        )
+                        if len(mismatches) >= max_samples:
+                            break
+                if mismatches or len(man_list) != len(rec_list):
+                    return {
+                        "mismatch_count": len(mismatches),
+                        "sample_mismatches": mismatches,
+                        "manifest_len": len(man_list),
+                        "recomputed_len": len(rec_list),
+                    }
+                return None
+
+            for field in (
+                "null_counts",
+                "min_values",
+                "max_values",
+                "min_values_display",
+                "max_values_display",
+            ):
+                cmp_res = _cmp_lists(ent, recomputed_entry, field)
+                if cmp_res:
+                    diffs[f"{field}_mismatch"] = cmp_res
+
+            # Compare column uncompressed sizes (sample mismatches)
+            try:
+                man_cols = ent.get("column_uncompressed_sizes_in_bytes") or []
+                rec_cols = recomputed_entry.get("column_uncompressed_sizes_in_bytes") or []
+                col_mismatches = []
+                for idx in range(min(len(man_cols), len(rec_cols))):
+                    if int(man_cols[idx]) != int(rec_cols[idx]):
+                        col_mismatches.append(
+                            {
+                                "index": idx,
+                                "manifest": int(man_cols[idx]),
+                                "recomputed": int(rec_cols[idx]),
+                            }
+                        )
+                        if len(col_mismatches) >= 5:
+                            break
+                if col_mismatches or len(man_cols) != len(rec_cols):
+                    diffs["column_uncompressed_size_mismatch"] = {
+                        "count": len(col_mismatches),
+                        "sample": col_mismatches,
+                        "manifest_len": len(man_cols),
+                        "recomputed_len": len(rec_cols),
+                    }
+            except Exception:
+                pass
+
+            rec["diffs"] = diffs
+
+            # Accumulate recomputed per-column bytes by index -> by name when schema available
+            col_sizes = recomputed_entry.get("column_uncompressed_sizes_in_bytes") or []
+            for cname, cidx in col_to_idx.items():
+                try:
+                    val = int((col_sizes or [0])[cidx])
+                except Exception:
+                    val = 0
+                recomputed_col_bytes[cname] = recomputed_col_bytes.get(cname, 0) + val
+
+        except Exception as e:
+            rec["recomputed"] = {"error": f"failed to read/recompute: {e}"}
+
+        out_file.write(json.dumps(rec) + "\n")
+
+    # Write recomputed per-column summary
+    out_file.write(
+        json.dumps({"recomputed_column_uncompressed_bytes": recomputed_col_bytes}) + "\n"
+    )
+
+    # Attempt to build a recomputed describe-like summary for comparison
+    recompute_describe = {}
+    try:
+        for cname in col_to_idx:
+            recompute_describe[cname] = {"uncompressed_bytes": recomputed_col_bytes.get(cname, 0)}
+    except Exception as e:
+        recompute_describe = {"__error": str(e)}
+
+    out_file.write(json.dumps({"describe_recomputed": recompute_describe}) + "\n")
+
+    out_file.close()
+
+
+if __name__ == "__main__":
+    # Usage: second arg optional; if omitted or outside the repo we write into ./artifacts/
+    if len(sys.argv) < 2:
+        print(
+            "Usage: python scripts/inspect_manifest_dryrun.py <dataset_identifier> [output_jsonl_path]"
+        )
+        sys.exit(2)
+    dataset_identifier = sys.argv[1]
+    out_arg = sys.argv[2] if len(sys.argv) >= 3 else ""
+    # Use environment defaults; allow overriding via env or pass-through edits.
+    import os
+    import sys
+
+    # Allow using local packages (same logic as tests) so we can exercise local OpteryxCatalog
+    sys.path.insert(0, os.path.join(sys.path[0], ".."))  # parent dir (pyiceberg_firestore_gcs)
+    sys.path.insert(1, os.path.join(sys.path[0], "../opteryx-core"))
+    sys.path.insert(1, os.path.join(sys.path[0], "../pyiceberg-firestore-gcs"))
+
+    # Default to the same test-friendly defaults when env vars are unset
+    catalog_kwargs = {
+        "workspace": os.getenv("OPTERYX_WORKSPACE", "opteryx"),
+        "firestore_project": os.getenv("OPTERYX_FIRESTORE_PROJECT", "mabeldev"),
+        "firestore_database": os.getenv("OPTERYX_FIRESTORE_DATABASE", "catalogs"),
+        "gcs_bucket": os.getenv("OPTERYX_GCS_BUCKET", "opteryx_data"),
+    }
+
+    # Always write into the repository's `artifacts/` directory unless an explicit repo-local path was provided
+    repo_root = os.getcwd()
+    artifacts_dir = os.path.join(repo_root, "artifacts")
+    os.makedirs(artifacts_dir, exist_ok=True)
+
+    if out_arg:
+        candidate = os.path.abspath(out_arg)
+        if candidate.startswith(repo_root):
+            output_path = candidate
+        else:
+            output_path = os.path.join(
+                artifacts_dir, f"{dataset_identifier.replace('.', '_')}_manifest_dryrun.jsonl"
+            )
+            print(f"Provided output path {out_arg} is outside the repo; writing to {output_path}")
+    else:
+        output_path = os.path.join(
+            artifacts_dir, f"{dataset_identifier.replace('.', '_')}_manifest_dryrun.jsonl"
+        )
+
+    print(
+        f"Using catalog workspace={catalog_kwargs['workspace']} firestore_project={catalog_kwargs['firestore_project']} gcs_bucket={catalog_kwargs['gcs_bucket']}"
+    )
+    print(f"Writing results to {output_path}")
+    inspect_dataset(dataset_identifier, output_path, catalog_kwargs)
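
Each line of the dry-run output is a standalone JSON object: a `_meta` header, `describe_before`, `manifest_rows_count`, one record per data file (carrying `manifest_entry_summary`, `recomputed`, and `diffs`), and finally the recomputed per-column byte totals. A minimal sketch of consuming that output, assuming the default artifacts path produced by the script above, might look like:

    import json

    # Example path only; it follows the default naming the script uses under artifacts/.
    path = "artifacts/opteryx_ops_audit_log_manifest_dryrun.jsonl"

    with open(path, encoding="utf-8") as f:
        for line in f:
            rec = json.loads(line)
            # Per-file records expose a "diffs" dict; flag any file whose recomputed
            # statistics disagree with the stored manifest entry.
            if rec.get("diffs"):
                print(rec.get("file_path"), sorted(rec["diffs"].keys()))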

scripts/inspect_single_file.py
@@ -0,0 +1,147 @@
+"""Inspect a single manifest file entry and recompute its stats.
+
+Usage:
+    python scripts/inspect_single_file.py <dataset_identifier> <file_path> <output_jsonl_path>
+
+Example:
+    python scripts/inspect_single_file.py opteryx.ops.audit_log gs://.../data-...parquet artifacts/single_file.jsonl
+"""
+
+import json
+import os
+import sys
+import time
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes
+from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+
+
+def _preview(lst, n=6):
+    if lst is None:
+        return None
+    if isinstance(lst, list):
+        if len(lst) > n:
+            return {"len": len(lst), "preview": lst[:n], "truncated": True}
+        return {"len": len(lst), "preview": lst, "truncated": False}
+    return lst
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 4:
+        print(
+            "Usage: python scripts/inspect_single_file.py <dataset_identifier> <file_path> <output_jsonl_path>"
+        )
+        sys.exit(2)
+
+    dataset_identifier = sys.argv[1]
+    target_fp = sys.argv[2]
+    out_path = sys.argv[3]
+
+    catalog_kwargs = {
+        "workspace": os.getenv("OPTERYX_WORKSPACE", "opteryx"),
+        "firestore_project": os.getenv("OPTERYX_FIRESTORE_PROJECT", "mabeldev"),
+        "gcs_bucket": os.getenv("OPTERYX_GCS_BUCKET", "opteryx_data"),
+    }
+
+    repo_root = os.getcwd()
+    artifacts_dir = os.path.join(repo_root, "artifacts")
+    os.makedirs(artifacts_dir, exist_ok=True)
+
+    if not os.path.abspath(out_path).startswith(repo_root):
+        out_path = os.path.join(artifacts_dir, os.path.basename(out_path))
+
+    # Allow fully-qualified identifiers like 'workspace.collection.dataset'
+    parts = dataset_identifier.split(".")
+    if len(parts) == 3:
+        wk, collection, dname = parts
+        catalog_kwargs = dict(catalog_kwargs)
+        catalog_kwargs["workspace"] = wk
+        dataset_identifier = f"{collection}.{dname}"
+
+    cat = OpteryxCatalog(**catalog_kwargs)
+    ds = cat.load_dataset(dataset_identifier)
+
+    snap = ds.snapshot()
+    if snap is None or not getattr(snap, "manifest_list", None):
+        print("No manifest available", file=sys.stderr)
+        sys.exit(3)
+
+    inp = ds.io.new_input(snap.manifest_list)
+    with inp.open() as f:
+        mbytes = f.read()
+
+    # Parse the manifest parquet from the downloaded bytes
+    rows = pq.read_table(pa.BufferReader(mbytes)).to_pylist()
+
+    match = None
+    match_idx = None
+    for i, r in enumerate(rows):
+        if r.get("file_path") == target_fp:
+            match = r
+            match_idx = i
+            break
+
+    if match is None:
+        print("Target file not found in manifest", file=sys.stderr)
+        sys.exit(4)
+
+    rec = {"file_index": match_idx, "file_path": target_fp}
+    ent = match
+    rec["manifest_entry_summary"] = {
+        "uncompressed_size": int(ent.get("uncompressed_size_in_bytes") or 0),
+        "column_uncompressed_sizes": _preview(ent.get("column_uncompressed_sizes_in_bytes")),
+        "null_counts": _preview(ent.get("null_counts")),
+        "min_values": _preview(ent.get("min_values")),
+        "max_values": _preview(ent.get("max_values")),
+        "min_values_display": _preview(ent.get("min_values_display")),
+        "max_values_display": _preview(ent.get("max_values_display")),
+        "min_k_hashes": _preview(ent.get("min_k_hashes")),
+    }
+
+    inp2 = ds.io.new_input(target_fp)
+    with inp2.open() as f:
+        data = f.read()
+
+    if not data:
+        rec["recomputed"] = {"error": "empty file"}
+    else:
+        recomputed = build_parquet_manifest_entry_from_bytes(data, target_fp, len(data)).to_dict()
+        rec["recomputed"] = {
+            "uncompressed_size": int(recomputed.get("uncompressed_size_in_bytes") or 0),
+            "column_uncompressed_sizes": _preview(
+                recomputed.get("column_uncompressed_sizes_in_bytes")
+            ),
+            "null_counts": _preview(recomputed.get("null_counts")),
+            "min_values": _preview(recomputed.get("min_values")),
+            "max_values": _preview(recomputed.get("max_values")),
+            "min_values_display": _preview(recomputed.get("min_values_display")),
+            "max_values_display": _preview(recomputed.get("max_values_display")),
+            "min_k_hashes": _preview(recomputed.get("min_k_hashes")),
+        }
+        # diffs
+        diffs = {}
+        if (
+            rec["manifest_entry_summary"]["uncompressed_size"]
+            != rec["recomputed"]["uncompressed_size"]
+        ):
+            diffs["uncompressed_size_mismatch"] = {
+                "manifest": rec["manifest_entry_summary"]["uncompressed_size"],
+                "recomputed": rec["recomputed"]["uncompressed_size"],
+            }
+        man_k = ent.get("min_k_hashes") or []
+        rec_k = recomputed.get("min_k_hashes") or []
+        diffs["min_k_hashes_nonempty_counts"] = {
+            "manifest_nonempty": sum(1 for x in man_k if x),
+            "recomputed_nonempty": sum(1 for x in rec_k if x),
+        }
+        rec["diffs"] = diffs
+
+    with open(out_path, "w", encoding="utf-8") as of:
+        of.write(
+            json.dumps(
+                {"_meta": {"dataset": dataset_identifier, "timestamp": int(time.time() * 1000)}}
+            )
+            + "\n"
+        )
+        of.write(json.dumps(rec) + "\n")
+
+    print("WROTE", out_path)
1
+ """Inspect a single file by scanning manifests in GCS and recomputing stats for that file only.
2
+
3
+ Usage:
4
+ python scripts/inspect_single_file_gcs.py <bucket> <manifest_prefix> <target_gs_path> <output_jsonl>
5
+
6
+ Example:
7
+ python scripts/inspect_single_file_gcs.py opteryx_data opteryx/ops/audit_log/metadata/ gs://opteryx_data/opteryx/ops/audit_log/data/188f... artifacts/out.jsonl
8
+ """
9
+
10
+ import json
11
+ import os
12
+ import sys
13
+ import time
14
+
15
+ import pyarrow.parquet as pq
16
+ from google.cloud import storage
17
+
18
+ from opteryx_catalog.catalog.manifest import build_parquet_manifest_entry_from_bytes
19
+
20
+
21
+ def _preview(lst, n=6):
22
+ if lst is None:
23
+ return None
24
+ if isinstance(lst, list):
25
+ if len(lst) > n:
26
+ return {"len": len(lst), "preview": lst[:n], "truncated": True}
27
+ return {"len": len(lst), "preview": lst, "truncated": False}
28
+ return lst
29
+
30
+
31
+ if __name__ == "__main__":
32
+ if len(sys.argv) < 5:
33
+ print(
34
+ "Usage: python scripts/inspect_single_file_gcs.py <bucket> <manifest_prefix> <target_gs_path> <output_jsonl>"
35
+ )
36
+ sys.exit(2)
37
+
38
+ bucket_name = sys.argv[1]
39
+ manifest_prefix = sys.argv[2]
40
+ target_fp = sys.argv[3]
41
+ out_path = sys.argv[4]
42
+
43
+ repo_root = os.getcwd()
44
+ artifacts = os.path.join(repo_root, "artifacts")
45
+ os.makedirs(artifacts, exist_ok=True)
46
+ if not os.path.abspath(out_path).startswith(repo_root):
47
+ out_path = os.path.join(artifacts, os.path.basename(out_path))
48
+
49
+ client = storage.Client()
50
+
51
+ blobs = list(client.list_blobs(bucket_name, prefix=manifest_prefix))
52
+ if not blobs:
53
+ print("No manifest blobs found", file=sys.stderr)
54
+ sys.exit(3)
55
+
56
+ blobs.sort(key=lambda b: b.updated or b.time_created or 0, reverse=True)
57
+
58
+ match_row = None
59
+ match_idx = None
60
+ match_manifest = None
61
+
62
+ for b in blobs:
63
+ data = b.download_as_bytes()
64
+ try:
65
+ table = pq.read_table(data)
66
+ rows = table.to_pylist()
67
+ except Exception:
68
+ continue
69
+ for i, r in enumerate(rows):
70
+ if r.get("file_path") == target_fp:
71
+ match_row = r
72
+ match_idx = i
73
+ match_manifest = b
74
+ break
75
+ if match_row:
76
+ break
77
+
78
+ if match_row is None:
79
+ print("Target file not found in manifests", file=sys.stderr)
80
+ sys.exit(4)
81
+
82
+ # download target file
83
+ _, rest = target_fp.split("://", 1)
84
+ bucket2, path = rest.split("/", 1)
85
+ blob2 = client.bucket(bucket2).blob(path)
86
+ data = blob2.download_as_bytes()
87
+
88
+ recomputed = build_parquet_manifest_entry_from_bytes(data, target_fp, len(data)).to_dict()
89
+
90
+ rec = {"file_index": match_idx, "file_path": target_fp}
91
+ ent = match_row
92
+ rec["manifest_entry_summary"] = {
93
+ "uncompressed_size": int(ent.get("uncompressed_size_in_bytes") or 0),
94
+ "min_k_hashes": _preview(ent.get("min_k_hashes")),
95
+ }
96
+ rec["recomputed"] = {
97
+ "uncompressed_size": int(recomputed.get("uncompressed_size_in_bytes") or 0),
98
+ "min_k_hashes": _preview(recomputed.get("min_k_hashes")),
99
+ }
100
+ man_k = ent.get("min_k_hashes") or []
101
+ rec_k = recomputed.get("min_k_hashes") or []
102
+ rec["diffs"] = {
103
+ "min_k_hashes_nonempty_counts": {
104
+ "manifest_nonempty": sum(1 for x in man_k if x),
105
+ "recomputed_nonempty": sum(1 for x in rec_k if x),
106
+ }
107
+ }
108
+
109
+ with open(out_path, "w", encoding="utf-8") as of:
110
+ of.write(
111
+ json.dumps(
112
+ {
113
+ "_meta": {
114
+ "source": "gcs-manifest-scan",
115
+ "manifest_blob": match_manifest.name,
116
+ "timestamp": int(time.time() * 1000),
117
+ }
118
+ }
119
+ )
120
+ + "\n"
121
+ )
122
+ of.write(json.dumps(rec) + "\n")
123
+
124
+ print("WROTE", out_path)

tests/test_collections.py
@@ -0,0 +1,37 @@
+from __future__ import annotations
+
+from opteryx_catalog.opteryx_catalog import OpteryxCatalog
+
+
+class _Doc:
+    def __init__(self, id_):
+        self.id = id_
+
+
+def test_list_collections_excludes_properties():
+    # Construct catalog without calling __init__ to avoid external I/O
+    c = object.__new__(OpteryxCatalog)
+    c.workspace = "w"
+
+    class MockColl:
+        def stream(self):
+            return [_Doc("$properties"), _Doc("col_a"), _Doc("col_b")]
+
+    c._catalog_ref = MockColl()
+
+    cols = list(c.list_collections())
+    assert "$properties" not in cols
+    assert set(cols) == {"col_a", "col_b"}
+
+
+def test_list_collections_handles_errors():
+    c = object.__new__(OpteryxCatalog)
+    c.workspace = "w"
+
+    class BadColl:
+        def stream(self):
+            raise RuntimeError("boom")
+
+    c._catalog_ref = BadColl()
+
+    assert list(c.list_collections()) == []