@ictechgy/context-guard 0.4.6 → 0.4.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2339 @@
1
+ #!/usr/bin/env python3
2
+ """Default-off ContextGuard experimental feature registry.
3
+
4
+ The registry is intentionally passive: it records explicit project-local opt-in
5
+ state for experimental lanes, but it does not activate runtime behavior by
6
+ itself. Individual helpers must still require their own explicit experimental
7
+ flags before changing stable behavior.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ from dataclasses import asdict, dataclass
13
+ from datetime import datetime, timezone
14
+ import hashlib
15
+ import ipaddress
16
+ import json
17
+ import math
18
+ import os
19
+ import re
20
+ import secrets
21
+ import shlex
22
+ from pathlib import Path
23
+ import stat
24
+ import sys
25
+ from typing import Any, NoReturn
26
+ import unicodedata
27
+ from urllib.parse import urlparse
28
+
29
# Prefix for diagnostics printed by fail().
TOOL_NAME = "context-guard-experiments"
# Schema tag written by write_config and the only non-null value load_config accepts.
CONFIG_SCHEMA_VERSION = "contextguard.experiments.v1"
# Relative config location used by resolve_config_path when no explicit path is given.
DEFAULT_CONFIG = Path(".context-guard") / "experiments.json"
# load_config rejects config files larger than this many bytes.
MAX_CONFIG_BYTES = 64_000
# read_bounded_input reads at most this many bytes of diff input and flags any excess as truncated.
MAX_CONTEXT_DIFF_INPUT_BYTES = 256_000
# Size caps for other plan lanes; their consumers are outside this excerpt (TODO confirm usage).
MAX_VISUAL_OCR_TEXT_BYTES = 64_000
MAX_LEARNED_COMPRESSION_INPUT_BYTES = 128_000
MAX_SELF_HOSTED_METRICS_INPUT_BYTES = 64_000
# Identifiers for the self-hosted metrics lane (presumably schema/key names in emitted JSON — verify against callers).
SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
# Upper bounds for self-hosted metric values (presumably validation limits — verify against callers).
MAX_SELF_HOSTED_LABEL_CHARS = 120
# Seven days expressed in milliseconds.
MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000
MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
MAX_SELF_HOSTED_ENERGY_WH = 1_000_000
MAX_SELF_HOSTED_LOCAL_COST_USD = 1_000_000
MAX_SELF_HOSTED_TOKENS_PER_SECOND = 10_000_000
# NOTE(review): looks like a bytes-per-token estimate for proxy token counts — confirm.
TOKEN_PROXY_BYTES_PER_TOKEN = 4
MAX_SELF_HOSTED_JSON_DEPTH = 100
MAX_SELF_HOSTED_JSON_NODES = 10_000
# Defaults for the local-proxy plan lane; both hosts default to the IPv4 loopback address.
LOCAL_PROXY_SCHEMA_VERSION = "contextguard.experiments.local-proxy-plan.v1"
LOCAL_PROXY_DEFAULT_BIND_HOST = "127.0.0.1"
LOCAL_PROXY_DEFAULT_BIND_PORT = 0
LOCAL_PROXY_DEFAULT_TARGET_HOST = "127.0.0.1"
LOCAL_PROXY_DEFAULT_TARGET_PORT = 0
LOCAL_PROXY_LOCALHOST_NAMES = {"localhost"}
# First path components that may be symlinks, mapped to the only link target that
# normalize_allowed_first_absolute_symlink accepts for them.
# NOTE(review): looks like the macOS /tmp and /var layout — confirm.
ALLOWED_FIRST_COMPONENT_SYMLINKS = {
    "tmp": Path("/private/tmp"),
    "var": Path("/private/var"),
}
# Platform capability probes; getattr guards against os builds that lack the support sets.
DIR_FD_OPEN_SUPPORTED = os.open in getattr(os, "supports_dir_fd", set())
DIR_FD_MKDIR_SUPPORTED = os.mkdir in getattr(os, "supports_dir_fd", set())
# os.stat must accept both dir_fd and follow_symlinks=False for the leaf pre-check.
DIR_FD_STAT_NOFOLLOW_SUPPORTED = (
    os.stat in getattr(os, "supports_dir_fd", set())
    and os.stat in getattr(os, "supports_follow_symlinks", set())
)
# True when the os module exposes O_NOFOLLOW.
NO_FOLLOW_SUPPORTED = hasattr(os, "O_NOFOLLOW")
67
+
68
+
69
@dataclass(frozen=True)
class Experiment:
    """Immutable metadata record describing one experimental lane.

    The record only carries descriptive fields; ``to_json`` is its sole
    behavior and produces a JSON-friendly dict.
    """

    id: str
    name: str
    summary: str
    stability: str
    default_enabled: bool
    risk_level: str
    claim_boundary: str
    gate_requirements: tuple[str, ...]
    runtime_status: str = "metadata-only"
    commands: tuple[str, ...] = ()
    opt_in_flags: tuple[str, ...] = ()
    config_effect: str = (
        "Registry enablement records project-local intent only; helpers still require explicit experimental flags."
    )
    evidence_contract: str = "Evidence is local metadata only unless a later story adds a measured runtime gate."

    def to_json(self, *, enabled: bool = False) -> dict[str, Any]:
        """Return the fields as a dict with tuples turned into lists.

        Args:
            enabled: Value stored (coerced to bool) under the ``enabled`` key.
        """
        payload = asdict(self)
        # Updating existing keys keeps field order; "enabled" is appended last.
        payload.update(
            gate_requirements=list(self.gate_requirements),
            commands=list(self.commands),
            opt_in_flags=list(self.opt_in_flags),
            enabled=bool(enabled),
        )
        return payload
93
+
94
+
95
+ EXPERIMENTS: tuple[Experiment, ...] = (
96
+ Experiment(
97
+ id="output-receipt-trim",
98
+ name="Receipt-backed output trimming",
99
+ summary="Opt-in digest output with local artifact receipts and exact re-expand instructions.",
100
+ stability="experimental",
101
+ default_enabled=False,
102
+ risk_level="low",
103
+ claim_boundary="Local output-size reduction only; no hosted API token/cost savings claim without provider-measured matched tasks.",
104
+ gate_requirements=("explicit opt-in", "local artifact receipt", "exact re-expand command"),
105
+ runtime_status="available-explicit-flags",
106
+ commands=(
107
+ "context-guard-trim-output --digest markdown --artifact-receipt -- <command>",
108
+ "context-guard-trim-output --digest json --artifact-receipt -- <command>",
109
+ ),
110
+ opt_in_flags=("--digest markdown|json", "--artifact-receipt"),
111
+ config_effect=(
112
+ "Registry enablement records project-local intent only; output trimming still runs only when the helper is "
113
+ "invoked with --digest markdown|json plus --artifact-receipt."
114
+ ),
115
+ evidence_contract=(
116
+ "Stores the exact sanitized full output as a local context-guard-artifact receipt and emits an exact "
117
+ "re-expand command before omitted details are relied on."
118
+ ),
119
+ ),
120
+ Experiment(
121
+ id="protected-zone-policy",
122
+ name="Protected-zone transform policy",
123
+ summary="Metadata policy that denies semantic rewrites for code, diffs, identifiers, hashes, paths, and other exact evidence.",
124
+ stability="experimental",
125
+ default_enabled=False,
126
+ risk_level="low",
127
+ claim_boundary="Policy metadata only; it does not prove provider cache or token savings.",
128
+ gate_requirements=("explicit opt-in", "protected-zone detection", "exact retrieval fallback"),
129
+ runtime_status="available-explicit-flags",
130
+ commands=(
131
+ "context-guard-compress --json --protected-policy",
132
+ "context-guard cost compile --json",
133
+ "context-guard-cost compile --json",
134
+ ),
135
+ opt_in_flags=("--protected-policy", "protected=true manifest sections for cost compile"),
136
+ config_effect=(
137
+ "Registry enablement records project-local intent only; protected-zone policy metadata still appears only "
138
+ "when explicit helper flags or protected manifest sections are used."
139
+ ),
140
+ evidence_contract=(
141
+ "Denies semantic/paraphrase rewrites for protected classes and requires structural transforms plus exact "
142
+ "artifact retrieval guidance for protected evidence."
143
+ ),
144
+ ),
145
+ Experiment(
146
+ id="context-diff-compaction",
147
+ name="Reviewable context-diff compaction",
148
+ summary="Dry-run advisory lane for human-reviewable compaction plans with stable exact handles.",
149
+ stability="experimental",
150
+ default_enabled=False,
151
+ risk_level="medium",
152
+ claim_boundary="Smaller local diffs are proxy evidence only; hosted savings require provider-measured matched tasks.",
153
+ gate_requirements=("explicit opt-in", "human-reviewable diff", "local receipt", "exact re-expand handle"),
154
+ runtime_status="available-dry-run",
155
+ commands=("context-guard experiments plan context-diff-compaction",),
156
+ opt_in_flags=("plan context-diff-compaction", "--receipt-id", "--reexpand-command"),
157
+ config_effect=(
158
+ "Registry enablement records project-local intent only; context-diff compaction remains a dry-run plan "
159
+ "unless a future story adds an explicit replacement command."
160
+ ),
161
+ evidence_contract=(
162
+ "Dry-run plans require human-reviewable hunks plus user-supplied exact receipt and re-expand handles before "
163
+ "any future lossy replacement can be reviewed."
164
+ ),
165
+ ),
166
+ Experiment(
167
+ id="visual-crop-ocr",
168
+ name="Visual crop/OCR evidence planning",
169
+ summary="Dry-run fixture lane for comparing full visual evidence with cropped or OCR-derived evidence.",
170
+ stability="experimental",
171
+ default_enabled=False,
172
+ risk_level="medium",
173
+ claim_boundary="Image/OCR byte reductions are proxy evidence until provider image/text token fields are measured.",
174
+ gate_requirements=("explicit opt-in", "original evidence preserved", "confidence/error notes", "missed-context guardrail"),
175
+ runtime_status="available-dry-run",
176
+ commands=("context-guard experiments plan visual-crop-ocr",),
177
+ opt_in_flags=(
178
+ "plan visual-crop-ocr",
179
+ "--full-evidence-receipt",
180
+ "--crop-bounds",
181
+ "--image-size",
182
+ "--ocr-text|--ocr-text-file",
183
+ "--ocr-confidence",
184
+ "--ocr-error-note",
185
+ "--missed-context-note",
186
+ ),
187
+ config_effect=(
188
+ "Registry enablement records project-local intent only; visual crop/OCR planning remains a dry-run "
189
+ "metadata surface and does not run OCR, crop images, call providers, or change stable behavior."
190
+ ),
191
+ evidence_contract=(
192
+ "Dry-run plans require retrievable full visual evidence plus crop/OCR confidence, error, and "
193
+ "missed-context guardrails before human review."
194
+ ),
195
+ ),
196
+ Experiment(
197
+ id="learned-compression",
198
+ name="Learned/synthetic compression safe gate",
199
+ summary="Deny-by-default dry-run safety gate for already-sanitized unprotected prose only.",
200
+ stability="experimental",
201
+ default_enabled=False,
202
+ risk_level="high",
203
+ claim_boundary="Semantic compression cannot claim savings or correctness without matched-task quality and provider token evidence.",
204
+ gate_requirements=("explicit opt-in", "sanitized unprotected prose only", "protected-zone denial", "exact fallback or receipt"),
205
+ runtime_status="available-dry-run",
206
+ commands=("context-guard experiments plan learned-compression",),
207
+ opt_in_flags=("plan learned-compression", "--sanitized", "--trusted-source", "--exact-fallback-receipt", "--reexpand-command"),
208
+ config_effect=(
209
+ "Registry enablement records project-local intent only; learned compression remains a dry-run policy check "
210
+ "and does not run learned compressors, embeddings, model calls, or replacements."
211
+ ),
212
+ evidence_contract=(
213
+ "Dry-run eligibility requires caller-asserted sanitized trusted prose, exact local fallback handles, and "
214
+ "denial of protected or prompt-like signals."
215
+ ),
216
+ ),
217
+ Experiment(
218
+ id="self-hosted-metrics-ledger",
219
+ name="Self-hosted metrics ledger",
220
+ summary="Dry-run checker for self-hosted/local metrics ledger sidecars kept separate from hosted API claims.",
221
+ stability="experimental",
222
+ default_enabled=False,
223
+ risk_level="low",
224
+ claim_boundary="Self-hosted memory/latency metrics must stay separate from hosted API token/cost claims.",
225
+ gate_requirements=("explicit opt-in", "separate ledger fields", "shifted-cost accounting"),
226
+ runtime_status="available-dry-run",
227
+ commands=("context-guard experiments plan self-hosted-metrics-ledger",),
228
+ opt_in_flags=(
229
+ "plan self-hosted-metrics-ledger",
230
+ "--input",
231
+ "--latency-ms",
232
+ "--peak-memory-mb",
233
+ "--quality-score",
234
+ "--energy-wh",
235
+ "--local-cost-usd",
236
+ "--tokens-per-second",
237
+ "--model-server",
238
+ "--optimization",
239
+ ),
240
+ config_effect=(
241
+ "Registry enablement records project-local intent only; self-hosted metrics planning remains a dry-run "
242
+ "ledger-preview surface and does not write ledgers or alter benchmark/report behavior."
243
+ ),
244
+ evidence_contract=(
245
+ "Real evidence belongs in context-guard-bench JSONL ledger sidecars; self-hosted metrics remain separate "
246
+ "from hosted API token/cost savings."
247
+ ),
248
+ ),
249
+ Experiment(
250
+ id="local-proxy",
251
+ name="Local proxy advisory lane",
252
+ summary="Dry-run localhost-only proxy advisory plan with no hidden forwarding or API-key persistence.",
253
+ stability="experimental",
254
+ default_enabled=False,
255
+ risk_level="high",
256
+ claim_boundary="Proxy metrics are diagnostic only; no hosted savings claim without provider-measured evidence.",
257
+ gate_requirements=("explicit opt-in", "localhost-only default", "no API-key persistence", "no hidden external forwarding"),
258
+ runtime_status="available-dry-run",
259
+ commands=("context-guard experiments plan local-proxy",),
260
+ opt_in_flags=(
261
+ "plan local-proxy",
262
+ "--bind-host",
263
+ "--bind-port",
264
+ "--target-host",
265
+ "--target-port",
266
+ "--upstream-url",
267
+ "--runtime-gate-ack",
268
+ "--external-forwarding-intent",
269
+ "--persist-api-key",
270
+ ),
271
+ config_effect=(
272
+ "Registry enablement records project-local intent only; local proxy planning remains a dry-run advisory "
273
+ "surface and does not bind sockets, forward traffic, persist API keys, or write ledgers."
274
+ ),
275
+ evidence_contract=(
276
+ "Dry-run plans require localhost-only bind/target metadata, explicit runtime gate acknowledgement before "
277
+ "any future forwarding, and no raw API-key persistence."
278
+ ),
279
+ ),
280
+ )
281
+
282
# Lookup table from experiment id to its Experiment record, built from EXPERIMENTS.
REGISTRY = {experiment.id: experiment for experiment in EXPERIMENTS}
283
+
284
+
285
class RegistryError(RuntimeError):
    """Error raised by the registry helpers for config and filesystem failures."""
287
+
288
+
289
def fail(message: str, code: int = 2) -> NoReturn:
    """Write a tool-prefixed diagnostic to stderr and exit.

    Args:
        message: Text appended after the ``TOOL_NAME`` prefix.
        code: Process exit status carried by the raised ``SystemExit``.

    Raises:
        SystemExit: Always.
    """
    diagnostic = f"{TOOL_NAME}: {message}"
    print(diagnostic, file=sys.stderr)
    sys.exit(code)
292
+
293
+
294
def os_error_detail(exc: OSError) -> str:
    """Return a short human-readable description of an ``OSError``.

    Prefers ``exc.strerror``. When the error was built from a bare message
    (no errno, so ``strerror`` is ``None``) the message itself is used, so
    text such as ``"short write"`` is not lost. Falls back to the exception
    class name when nothing else is available.

    Args:
        exc: The error to describe.

    Returns:
        The description, suffixed with ``" (errno N)"`` when an errno is set.
    """
    detail = exc.strerror
    if not detail and exc.errno is None:
        # Message-only errors keep their text in str(exc); with an errno set,
        # str(exc) would repeat the errno, so it is only used here.
        detail = str(exc)
    detail = detail or exc.__class__.__name__
    if exc.errno is not None:
        return f"{detail} (errno {exc.errno})"
    return detail
299
+
300
+
301
def _no_follow_flag(*, label: str) -> int:
    """Return ``os.O_NOFOLLOW``.

    Raises:
        RegistryError: If the module-level probe found no ``O_NOFOLLOW``;
            ``label`` names the operation in the message.
    """
    if NO_FOLLOW_SUPPORTED:
        return os.O_NOFOLLOW
    raise RegistryError(f"{label} requires O_NOFOLLOW support")
305
+
306
+
307
+ def _directory_open_flags(*, follow_final: bool = False, label: str) -> int:
308
+ flags = os.O_RDONLY
309
+ if hasattr(os, "O_CLOEXEC"):
310
+ flags |= os.O_CLOEXEC
311
+ if hasattr(os, "O_DIRECTORY"):
312
+ flags |= os.O_DIRECTORY
313
+ if not follow_final:
314
+ flags |= _no_follow_flag(label=label)
315
+ return flags
316
+
317
+
318
def _file_open_flags(*, label: str, write: bool = False) -> int:
    """Build ``os.open`` flags for a no-follow open of a file.

    Args:
        label: Operation name used in the error message.
        write: When true use write/create/truncate access, else read-only.

    Raises:
        RegistryError: Via ``_no_follow_flag`` when ``O_NOFOLLOW`` is
            unavailable.
    """
    if write:
        combined = os.O_WRONLY | os.O_CREAT | os.O_TRUNC
    else:
        combined = os.O_RDONLY
    combined |= _no_follow_flag(label=label)
    for optional in ("O_CLOEXEC", "O_NONBLOCK", "O_NOCTTY"):
        # Missing constants contribute 0, i.e. nothing.
        combined |= getattr(os, optional, 0)
    return combined
328
+
329
+
330
def _temp_file_open_flags(*, label: str) -> int:
    """Build ``os.open`` flags for exclusively creating a temp file.

    ``O_EXCL`` makes the open fail if the name already exists.

    Raises:
        RegistryError: Via ``_no_follow_flag`` when ``O_NOFOLLOW`` is
            unavailable.
    """
    combined = os.O_WRONLY | os.O_CREAT | os.O_EXCL
    combined |= _no_follow_flag(label=label)
    for optional in ("O_CLOEXEC", "O_NOCTTY"):
        # Missing constants contribute 0, i.e. nothing.
        combined |= getattr(os, optional, 0)
    return combined
338
+
339
+
340
+ def _leaf_name(path: Path, *, label: str) -> str:
341
+ name = path.name
342
+ if name in {"", ".", ".."}:
343
+ raise RegistryError(f"{label} must name a regular file")
344
+ return name
345
+
346
+
347
+ def _normalized_link_target(anchor: Path, raw_target: str) -> Path:
348
+ target = Path(raw_target)
349
+ if target.is_absolute():
350
+ return Path(os.path.normpath(str(target)))
351
+ return Path(os.path.normpath(str(anchor / target)))
352
+
353
+
354
def normalize_allowed_first_absolute_symlink(path: Path) -> Path:
    """Rewrite an allow-listed top-level symlink to its expected target.

    Only absolute paths whose first component is a key of
    ``ALLOWED_FIRST_COMPONENT_SYMLINKS`` are considered. The rewrite happens
    only when that component is a symlink whose normalized target equals the
    mapped path. Every other case, including any ``OSError`` while
    inspecting the link, returns ``path`` unchanged.
    """
    if not path.is_absolute():
        return path
    components = path.parts
    if len(components) < 2:
        return path
    head = components[1]
    canonical = ALLOWED_FIRST_COMPONENT_SYMLINKS.get(head)
    if canonical is None:
        return path
    root_anchor = Path(path.anchor)
    candidate_link = root_anchor / head
    try:
        if not candidate_link.is_symlink():
            return path
        resolved = _normalized_link_target(root_anchor, os.readlink(candidate_link))
    except OSError:
        return path
    if resolved == canonical:
        return canonical.joinpath(*components[2:])
    return path
371
+
372
+
373
def normalize_local_path(path: Path) -> Path:
    """Return ``path`` as an absolute, lexically normalized path.

    ``~`` is expanded and relative paths are anchored at the current working
    directory. The result is passed through
    ``normalize_allowed_first_absolute_symlink``.
    """
    expanded = path.expanduser()
    absolute = expanded if expanded.is_absolute() else Path.cwd() / expanded
    collapsed = Path(os.path.normpath(str(absolute)))
    return normalize_allowed_first_absolute_symlink(collapsed)
378
+
379
+
380
def normalize_project_path(root: Path, candidate: Path, *, label: str) -> Path:
    """Normalize ``candidate`` and require it to sit under ``root``.

    Relative candidates are joined onto ``root``. The containment check is
    lexical, done with ``Path.relative_to`` on the normalized path.

    Raises:
        RegistryError: If the normalized path is not inside ``root``.
    """
    expanded = candidate.expanduser()
    absolute = expanded if expanded.is_absolute() else root / expanded
    normalized = normalize_allowed_first_absolute_symlink(Path(os.path.normpath(str(absolute))))
    try:
        normalized.relative_to(root)
    except ValueError as exc:
        raise RegistryError(f"{label} must stay inside project root: {normalized}") from exc
    return normalized
390
+
391
+
392
def open_directory_no_follow(path: Path, *, label: str, create: bool = False, missing_ok: bool = False) -> int | None:
    """Open ``path`` as a directory fd, walking one component at a time.

    Each component is opened relative to the previous component's fd using
    the flags from ``_directory_open_flags`` (which include ``O_NOFOLLOW``),
    so the walk does not go through symlinked components.

    Args:
        path: Directory to open; absolute paths start at the path anchor,
            relative paths at ``"."``.
        label: Name used in error messages.
        create: Create missing components with mode ``0o755``.
        missing_ok: Return ``None`` when a component is missing. This is
            checked before ``create``.

    Returns:
        An open directory fd owned by the caller, or ``None`` when a
        component is missing and ``missing_ok`` is true.

    Raises:
        RegistryError: For missing platform support, ``..`` components,
            non-directory components, or OS errors from open/mkdir.
    """
    path = normalize_allowed_first_absolute_symlink(path)
    if not DIR_FD_OPEN_SUPPORTED:
        raise RegistryError(f"{label} requires dir_fd open support")
    if create and not DIR_FD_MKDIR_SUPPORTED:
        raise RegistryError(f"{label} requires dir_fd mkdir support")
    flags = _directory_open_flags(label=label)
    if path.is_absolute():
        anchor = path.anchor or os.sep
        parts = path.parts[1:]
        try:
            # The anchor itself is opened with follow_final=True (no O_NOFOLLOW).
            current_fd = os.open(anchor, _directory_open_flags(follow_final=True, label=label))
        except OSError as exc:
            raise RegistryError(f"could not inspect {label}: {os_error_detail(exc)}") from exc
    else:
        parts = path.parts
        try:
            current_fd = os.open(".", flags)
        except OSError as exc:
            raise RegistryError(f"could not inspect {label}: {os_error_detail(exc)}") from exc
    try:
        for part in parts:
            if part in {"", "."}:
                continue
            if part == "..":
                raise RegistryError(f"{label} must not contain parent traversal")
            next_fd = -1
            try:
                next_fd = os.open(part, flags, dir_fd=current_fd)
            except FileNotFoundError:
                if missing_ok:
                    os.close(current_fd)
                    # Mark as released so the finally block does not close it again.
                    current_fd = -1
                    return None
                if not create:
                    raise RegistryError(f"could not inspect {label}: missing directory component") from None
                try:
                    os.mkdir(part, mode=0o755, dir_fd=current_fd)
                except FileExistsError:
                    # Already exists by the time mkdir ran; the open below still validates it.
                    pass
                except OSError as exc:
                    raise RegistryError(f"could not create {label}: {os_error_detail(exc)}") from exc
                try:
                    next_fd = os.open(part, flags, dir_fd=current_fd)
                except OSError as exc:
                    raise RegistryError(f"could not inspect {label}: {os_error_detail(exc)}") from exc
            except OSError as exc:
                raise RegistryError(f"could not inspect {label}: {os_error_detail(exc)}") from exc
            try:
                if not stat.S_ISDIR(os.fstat(next_fd).st_mode):
                    raise RegistryError(f"{label} must not traverse non-directory components")
            except Exception:
                # Close the freshly opened fd before propagating; current_fd is handled by finally.
                if next_fd >= 0:
                    try:
                        os.close(next_fd)
                    except OSError:
                        pass
                raise
            try:
                os.close(current_fd)
            except OSError:
                pass
            current_fd = next_fd
        # Hand ownership to the caller and disarm the finally-block close.
        owned_fd = current_fd
        current_fd = -1
        return owned_fd
    finally:
        if current_fd >= 0:
            try:
                os.close(current_fd)
            except OSError:
                pass
464
+
465
+
466
def _precheck_regular_leaf(parent_fd: int, leaf_name: str, *, label: str, missing_ok: bool = False) -> bool:
    """Check, without following symlinks, that a leaf is a regular file.

    Args:
        parent_fd: Open directory fd that ``leaf_name`` is resolved against.
        leaf_name: Name of the entry to inspect.
        label: Name used in error messages.
        missing_ok: Return ``False`` instead of raising when the entry is
            absent.

    Returns:
        ``True`` if the entry exists and is a regular file; ``False`` if it
        is absent and ``missing_ok`` is true.

    Raises:
        RegistryError: For missing platform support, an absent entry (when
            not tolerated), a stat failure, or a non-regular entry.
    """
    if not DIR_FD_STAT_NOFOLLOW_SUPPORTED:
        raise RegistryError(f"{label} requires dir_fd stat support")
    try:
        info = os.stat(leaf_name, dir_fd=parent_fd, follow_symlinks=False)
    except FileNotFoundError:
        if not missing_ok:
            raise RegistryError(f"could not inspect {label}: missing file") from None
        return False
    except OSError as exc:
        raise RegistryError(f"could not inspect {label}: {os_error_detail(exc)}") from exc
    if stat.S_ISREG(info.st_mode):
        return True
    raise RegistryError(f"{label} must be a regular file")
480
+
481
+
482
def read_bounded_regular_file(path: Path, *, max_bytes: int, label: str, missing_ok: bool = False) -> tuple[bytes, bool] | None:
    """Read at most ``max_bytes`` from a regular file without following symlinks.

    Args:
        path: File to read; normalized with ``normalize_local_path``.
        max_bytes: Maximum number of bytes returned.
        label: Name used in error messages.
        missing_ok: Return ``None`` when the parent directory or the file is
            missing instead of raising.

    Returns:
        ``(data, truncated)`` where ``truncated`` is true if the file held
        more than ``max_bytes``; ``None`` when missing and ``missing_ok``.

    Raises:
        RegistryError: If the target is not a regular file or an OS error
            occurs while opening or reading.
    """
    path = normalize_local_path(path)
    parent_fd = open_directory_no_follow(path.parent, label=f"{label} parent", missing_ok=missing_ok)
    if parent_fd is None:
        return None
    fd = -1
    try:
        leaf = _leaf_name(path, label=label)
        exists = _precheck_regular_leaf(parent_fd, leaf, label=label, missing_ok=missing_ok)
        if not exists:
            return None
        fd = os.open(leaf, _file_open_flags(label=label), dir_fd=parent_fd)
        # Re-check on the opened fd: the entry could have changed since the pre-check.
        if not stat.S_ISREG(os.fstat(fd).st_mode):
            raise RegistryError(f"{label} must be a regular file")
        chunks: list[bytes] = []
        # Read one byte past the limit so oversize files can be detected.
        remaining = max_bytes + 1
        while remaining > 0:
            chunk = os.read(fd, min(64 * 1024, remaining))
            if not chunk:
                break
            chunks.append(chunk)
            remaining -= len(chunk)
        raw = b"".join(chunks)
        truncated = len(raw) > max_bytes
        return raw[:max_bytes], truncated
    except OSError as exc:
        raise RegistryError(f"could not read {label}: {os_error_detail(exc)}") from exc
    finally:
        # Both fds are closed on every exit path; close errors are ignored.
        if fd >= 0:
            try:
                os.close(fd)
            except OSError:
                pass
        try:
            os.close(parent_fd)
        except OSError:
            pass
519
+
520
+
521
def write_all_fd(fd: int, data: bytes) -> None:
    """Write every byte of ``data`` to ``fd``, retrying after partial writes.

    Raises:
        OSError: ``"short write"`` if ``os.write`` reports zero or fewer
            bytes written; any error raised by ``os.write`` propagates.
    """
    pending = memoryview(data)
    while len(pending) > 0:
        count = os.write(fd, pending)
        if count <= 0:
            raise OSError("short write")
        # Slicing a memoryview does not copy the remaining bytes.
        pending = pending[count:]
529
+
530
+
531
def write_regular_file_no_follow(path: Path, data: bytes, *, label: str) -> None:
    """Replace ``path`` with ``data`` via a temp file in the same directory.

    The parent directory is opened (and created if needed) with
    ``open_directory_no_follow``. Data is written to an exclusively created
    temp file, then moved over the target with ``os.replace`` relative to
    the parent fd.

    Args:
        path: Destination file; normalized with ``normalize_local_path``.
        data: Bytes to write.
        label: Name used in error messages.

    Raises:
        RegistryError: If the existing target is not a regular file, no temp
            file could be created, or an OS error occurs while writing.
    """
    path = normalize_local_path(path)
    parent_fd = open_directory_no_follow(path.parent, label=f"{label} parent", create=True)
    if parent_fd is None:  # pragma: no cover - create=True never returns None.
        raise RegistryError(f"could not inspect {label} parent")
    fd = -1
    temp_leaf: str | None = None
    try:
        leaf = _leaf_name(path, label=label)
        exists = _precheck_regular_leaf(parent_fd, leaf, label=label, missing_ok=True)
        mode = 0o644
        if exists:
            try:
                # Reuse the existing file's permission bits; a zero mode falls back to 0o644.
                mode = stat.S_IMODE(os.stat(leaf, dir_fd=parent_fd, follow_symlinks=False).st_mode) or 0o644
            except OSError:
                mode = 0o644
        # Retry with a fresh random suffix if the temp name already exists.
        for _attempt in range(20):
            candidate = _leaf_name(Path(f".{leaf}.{os.getpid()}.{secrets.token_hex(8)}.tmp"), label=f"{label} temp")
            try:
                fd = os.open(candidate, _temp_file_open_flags(label=f"{label} temp"), mode, dir_fd=parent_fd)
                temp_leaf = candidate
                break
            except FileExistsError:
                continue
        if fd < 0 or temp_leaf is None:
            raise RegistryError(f"could not create temporary {label}")
        if not stat.S_ISREG(os.fstat(fd).st_mode):
            raise RegistryError(f"{label} temp must be a regular file")
        write_all_fd(fd, data)
        try:
            # Best-effort flush; fsync failures are ignored.
            os.fsync(fd)
        except OSError:
            pass
        try:
            os.close(fd)
        except OSError:
            pass
        fd = -1
        os.replace(temp_leaf, leaf, src_dir_fd=parent_fd, dst_dir_fd=parent_fd)
        # The temp name no longer exists after the replace; skip the unlink in finally.
        temp_leaf = None
    except OSError as exc:
        raise RegistryError(f"could not write {label}: {os_error_detail(exc)}") from exc
    finally:
        if fd >= 0:
            try:
                os.close(fd)
            except OSError:
                pass
        if temp_leaf is not None:
            # Remove a leftover temp file when the write did not complete.
            try:
                os.unlink(temp_leaf, dir_fd=parent_fd)
            except OSError:
                pass
        try:
            os.fsync(parent_fd)
        except OSError:
            pass
        try:
            os.close(parent_fd)
        except OSError:
            pass
592
+
593
+
594
def resolve_root(raw_root: str | None) -> Path:
    """Return the project root as a resolved absolute path.

    Args:
        raw_root: Root directory; a falsy value selects the current working
            directory.

    Raises:
        RegistryError: If resolving the path raises ``OSError``.
    """
    root = Path.cwd() if not raw_root else Path(raw_root)
    try:
        resolved = root.expanduser().resolve()
    except OSError as exc:
        raise RegistryError(f"could not resolve root: {root}: {exc}") from exc
    return resolved
600
+
601
+
602
def resolve_config_path(root: Path, raw_config: str | None) -> Path:
    """Return the config path, constrained to stay inside ``root``.

    A falsy ``raw_config`` selects ``DEFAULT_CONFIG``.

    Raises:
        RegistryError: Via ``normalize_project_path`` when the path escapes
            ``root``.
    """
    candidate = Path(raw_config) if raw_config else DEFAULT_CONFIG
    return normalize_project_path(root, candidate, label="config path")
608
+
609
+
610
def load_config(path: Path) -> dict[str, Any]:
    """Load and validate the experiments config file.

    Args:
        path: Config file location.

    Returns:
        A dict with ``schema_version`` and a sorted, de-duplicated
        ``enabled`` list. A missing file yields an empty ``enabled`` list.

    Raises:
        RegistryError: If the file is too large, not UTF-8, not valid JSON
            (including JSON nested too deeply to parse), not an object, has
            an unsupported ``schema_version``, or has a malformed
            ``enabled`` value.
    """
    loaded = read_bounded_regular_file(path, max_bytes=MAX_CONFIG_BYTES, label="config", missing_ok=True)
    if loaded is None:
        return {"schema_version": CONFIG_SCHEMA_VERSION, "enabled": []}
    raw, truncated = loaded
    if truncated:
        raise RegistryError("config exceeded max bytes")
    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError as exc:
        raise RegistryError(f"could not decode config UTF-8: {path}: {exc.reason}") from exc
    try:
        data = json.loads(text)
    except json.JSONDecodeError as exc:
        raise RegistryError(f"could not parse config JSON: {path}: {exc.msg}") from exc
    except RecursionError as exc:
        # Deeply nested JSON overflows the parser's recursion; report it as a
        # config error rather than letting a traceback escape.
        raise RegistryError(f"could not parse config JSON: {path}: nesting too deep") from exc
    if not isinstance(data, dict):
        raise RegistryError(f"config must be a JSON object: {path}")
    schema = data.get("schema_version")
    # A missing schema_version is accepted and treated as the current one.
    if schema not in (None, CONFIG_SCHEMA_VERSION):
        raise RegistryError(f"unsupported config schema_version: {schema!r}")
    enabled = data.get("enabled", [])
    if not isinstance(enabled, list) or not all(isinstance(item, str) for item in enabled):
        raise RegistryError("config enabled must be a list of experiment ids")
    return {"schema_version": CONFIG_SCHEMA_VERSION, "enabled": sorted(set(enabled))}
636
+
637
+
638
def write_config(path: Path, enabled: set[str]) -> dict[str, Any]:
    """Write the config file and return the document that was written.

    The document holds the schema version, a UTC ``updated_at`` timestamp
    with second precision and a ``Z`` suffix, and the sorted enabled ids.

    Raises:
        RegistryError: Via ``write_regular_file_no_follow`` on write failure.
    """
    timestamp = datetime.now(timezone.utc).replace(microsecond=0).isoformat().replace("+00:00", "Z")
    document = {
        "schema_version": CONFIG_SCHEMA_VERSION,
        "updated_at": timestamp,
        "enabled": sorted(enabled),
    }
    serialized = json.dumps(document, indent=2, sort_keys=True) + "\n"
    write_regular_file_no_follow(path, serialized.encode("utf-8"), label="config")
    return document
647
+
648
+
649
def configured_enabled_set(config: dict[str, Any]) -> set[str]:
    """Return every id listed under ``enabled``, known to the registry or not."""
    listed = config.get("enabled", [])
    return set(listed)
651
+
652
+
653
def enabled_set(config: dict[str, Any]) -> set[str]:
    """Return the configured ids that are also present in ``REGISTRY``."""
    return configured_enabled_set(config).intersection(REGISTRY)
655
+
656
+
657
def unknown_enabled(config: dict[str, Any]) -> list[str]:
    """Return, sorted, the configured ids that are not in ``REGISTRY``."""
    listed = set(config.get("enabled", []))
    return sorted(listed.difference(REGISTRY))
659
+
660
+
661
def registry_payload(*, config_path: Path, config: dict[str, Any], root: Path) -> dict[str, Any]:
    """Build the report dict shared by the list/status/enable/disable commands.

    Each experiment in ``EXPERIMENTS`` is serialized with its enabled state
    taken from ``config``; configured ids not in the registry are listed
    under ``unknown_enabled``.
    """
    active = enabled_set(config)
    unknown = unknown_enabled(config)
    entries = [item.to_json(enabled=item.id in active) for item in EXPERIMENTS]
    return {
        "tool": TOOL_NAME,
        "schema_version": CONFIG_SCHEMA_VERSION,
        "root": str(root),
        "config_path": str(config_path),
        "default_off": True,
        "note": "Experiments are opt-in metadata gates; enabling an experiment does not activate stable runtime behavior by itself.",
        "unknown_enabled": unknown,
        "experiments": entries,
    }
673
+
674
+
675
def emit_json(payload: dict[str, Any]) -> None:
    """Print ``payload`` to stdout as indented JSON with sorted keys."""
    rendered = json.dumps(payload, indent=2, sort_keys=True)
    print(rendered)
677
+
678
+
679
def emit_human(payload: dict[str, Any], *, include_details: bool = False) -> None:
    """Print a plain-text report of the registry payload to stdout.

    Args:
        payload: Dict shaped like the result of ``registry_payload``.
        include_details: Also print summary, runtime status, commands,
            opt-in flags, config effect, evidence contract and claim
            boundary for each experiment.
    """
    print("ContextGuard experiments (default off; explicit opt-in required)")
    print(f"Config: {payload['config_path']}")
    print("Enabling an experiment records project-local intent only; helpers still require explicit experimental use.")
    for entry in payload["experiments"]:
        state = "disabled" if not entry["enabled"] else "enabled"
        print(f"- {entry['id']}: {state} [{entry['stability']}, risk={entry['risk_level']}]")
        if not include_details:
            continue
        print(f" {entry['summary']}")
        print(f" Runtime: {entry['runtime_status']}")
        # Empty command/flag lists are omitted rather than printed blank.
        if entry["commands"]:
            print(" Commands: " + "; ".join(entry["commands"]))
        if entry["opt_in_flags"]:
            print(" Opt-in flags: " + ", ".join(entry["opt_in_flags"]))
        print(f" Config effect: {entry['config_effect']}")
        print(f" Evidence contract: {entry['evidence_contract']}")
        print(f" Claim boundary: {entry['claim_boundary']}")
    unknown = payload["unknown_enabled"]
    if unknown:
        print("Unknown enabled ids in config: " + ", ".join(unknown))
698
+
699
+
700
def require_known(experiment_id: str) -> Experiment:
    """Return the registry entry for ``experiment_id``.

    An unknown id is reported through ``fail``, which exits the process
    after listing the known ids.
    """
    try:
        experiment = REGISTRY[experiment_id]
    except KeyError:
        known = ", ".join(sorted(REGISTRY))
        fail(f"unknown experiment id {experiment_id!r}; known ids: {known}")
    return experiment
706
+
707
+
708
def command_list(args: argparse.Namespace) -> int:
    """Handle ``list``: print the registry with full details, as JSON when ``args.json`` is set.

    Returns:
        Exit status ``0``.
    """
    root, config_path, config = load_args_context(args)
    report = registry_payload(config_path=config_path, config=config, root=root)
    if not args.json:
        emit_human(report, include_details=True)
        return 0
    emit_json(report)
    return 0
716
+
717
+
718
def command_status(args: argparse.Namespace) -> int:
    """Handle ``status``: print a one-line-per-experiment summary, as JSON when ``args.json`` is set.

    Returns:
        Exit status ``0``.
    """
    root, config_path, config = load_args_context(args)
    report = registry_payload(config_path=config_path, config=config, root=root)
    if not args.json:
        emit_human(report, include_details=False)
        return 0
    emit_json(report)
    return 0
726
+
727
+
728
def command_enable(args: argparse.Namespace) -> int:
    """Handle ``enable``: add ``args.experiment_id`` to the config and report.

    The config is rewritten even when the id was already enabled; the
    ``changed`` key of the JSON report records whether it was newly added.

    Returns:
        Exit status ``0``.
    """
    target = args.experiment_id
    require_known(target)
    root, config_path, config = load_args_context(args)
    active = configured_enabled_set(config)
    newly_enabled = target not in active
    active.add(target)
    written = write_config(config_path, active)
    report = registry_payload(config_path=config_path, config=written, root=root)
    report.update(changed=newly_enabled, experiment_id=target)
    if not args.json:
        print(f"enabled {target} in {config_path}")
        return 0
    emit_json(report)
    return 0
743
+
744
+
745
def command_disable(args: argparse.Namespace) -> int:
    """Handle ``disable``: remove ``args.experiment_id`` from the config and report.

    The config is rewritten even when the id was not enabled; the
    ``changed`` key of the JSON report records whether it was removed.

    Returns:
        Exit status ``0``.
    """
    target = args.experiment_id
    require_known(target)
    root, config_path, config = load_args_context(args)
    active = configured_enabled_set(config)
    was_enabled = target in active
    active.discard(target)
    written = write_config(config_path, active)
    report = registry_payload(config_path=config_path, config=written, root=root)
    report.update(changed=was_enabled, experiment_id=target)
    if not args.json:
        print(f"disabled {target} in {config_path}")
        return 0
    emit_json(report)
    return 0
760
+
761
+
762
+
763
# Matches a "diff --git <old> <new>" header line; both paths must be free of whitespace.
DIFF_GIT_RE = re.compile(r"^diff --git (?P<old>\S+) (?P<new>\S+)$")
# Matches a unified-diff hunk header "@@ -old_start[,old_count] +new_start[,new_count] @@ section".
HUNK_RE = re.compile(r"^@@\s+-(?P<old_start>\d+)(?:,(?P<old_count>\d+))?\s+\+(?P<new_start>\d+)(?:,(?P<new_count>\d+))?\s+@@(?P<section>.*)$")
765
+
766
+
767
def read_bounded_input(args: argparse.Namespace) -> tuple[str, dict[str, Any]]:
    """Read diff input from ``args.input`` or stdin, capped in size.

    At most ``MAX_CONTEXT_DIFF_INPUT_BYTES`` bytes are kept. Bytes are
    decoded as UTF-8 with undecodable sequences replaced.

    Returns:
        ``(text, metadata)`` where metadata records the source label, byte
        and line counts, the SHA-256 of the kept bytes, the truncation flag
        and the byte limit.

    Raises:
        RegistryError: If no input bytes were read, or the input file
            cannot be read.
    """
    limit = MAX_CONTEXT_DIFF_INPUT_BYTES
    label = args.source_label
    if not args.input:
        label = label or "stdin"
        # One extra byte reveals whether stdin held more than the limit.
        data = sys.stdin.buffer.read(limit + 1)
        truncated = len(data) > limit
        data = data[:limit]
    else:
        input_path = Path(args.input)
        label = label or str(input_path)
        loaded = read_bounded_regular_file(input_path, max_bytes=limit, label="input")
        assert loaded is not None
        data, truncated = loaded
    if not data:
        raise RegistryError("context-diff-compaction plan requires diff input on stdin or --input")
    text = data.decode("utf-8", errors="replace")
    return text, {
        "source_label": label,
        "bytes": len(data),
        "lines": len(text.splitlines()),
        "sha256": hashlib.sha256(data).hexdigest(),
        "truncated": truncated,
        "max_bytes": limit,
    }
792
+
793
+
794
def strip_diff_prefix(path: str) -> str:
    """Return *path* without a single leading ``a/`` or ``b/`` prefix, if present."""
    for prefix in ("a/", "b/"):
        if path.startswith(prefix):
            return path[len(prefix):]
    return path
798
+
799
+
800
def summarize_diff(text: str, *, max_files: int = 50, max_hunks: int = 200) -> dict[str, Any]:
    """Summarize the file and hunk headers found in diff text.

    Only header lines are inspected; diff bodies are not retained.

    Args:
        text: Diff text to scan line by line.
        max_files: Maximum number of file entries to record. Further
            ``diff --git`` headers are counted as truncated and their hunks
            are not attached to any entry.
        max_hunks: Maximum number of hunk records kept per file entry. Hunks
            beyond this still count toward ``hunk_count``.

    Returns:
        A dict with ``file_count`` (recorded entries), ``hunk_count`` (every
        hunk header seen), ``truncated_files`` (``diff --git`` headers skipped
        because ``max_files`` was reached) and ``files`` (the entries).
    """
    files: list[dict[str, Any]] = []
    current: dict[str, Any] | None = None
    total_hunks = 0
    truncated_files = 0
    for line_number, line in enumerate(text.splitlines(), start=1):
        header = DIFF_GIT_RE.match(line)
        if header:
            if len(files) >= max_files:
                # Count skipped headers directly. `files` may also contain
                # header-less entries (hunks seen before any `diff --git`
                # line), so "headers seen minus entries recorded" would
                # undercount the files that were dropped.
                truncated_files += 1
                current = None
                continue
            current = {
                "old_path": strip_diff_prefix(header.group("old")),
                "new_path": strip_diff_prefix(header.group("new")),
                "diff_header_line": line_number,
                "hunks": [],
            }
            files.append(current)
            continue
        hunk = HUNK_RE.match(line)
        if not hunk:
            continue
        total_hunks += 1
        if current is None:
            if len(files) >= max_files:
                continue
            # Hunk without a preceding `diff --git` header: record it under
            # an entry with unknown paths.
            current = {"old_path": None, "new_path": None, "diff_header_line": None, "hunks": []}
            files.append(current)
        if len(current["hunks"]) < max_hunks:
            current["hunks"].append(
                {
                    "line": line_number,
                    "old_start": int(hunk.group("old_start")),
                    # An omitted count in a hunk header means one line.
                    "old_count": int(hunk.group("old_count") or "1"),
                    "new_start": int(hunk.group("new_start")),
                    "new_count": int(hunk.group("new_count") or "1"),
                    "section": hunk.group("section").strip()[:120],
                }
            )
    return {
        "file_count": len(files),
        "hunk_count": total_hunks,
        "truncated_files": truncated_files,
        "files": files,
    }
846
+
847
+
848
def context_diff_plan_payload(args: argparse.Namespace) -> dict[str, Any]:
    """Assemble the dry-run review plan for the context-diff-compaction experiment.

    Reads the bounded diff input, summarizes its headers, and records the
    user-supplied receipt id and re-expand command. No compacted replacement
    is produced; ``compacted_replacement`` is always ``None``.
    """
    text, input_meta = read_bounded_input(args)
    summary = summarize_diff(text)
    receipt_id = None
    if args.receipt_id:
        receipt_id = args.receipt_id.strip()
    reexpand_command = None
    if args.reexpand_command:
        reexpand_command = args.reexpand_command.strip()
    # Both handles must be non-empty after stripping to count as available.
    has_exact_handle = bool(receipt_id and reexpand_command)

    readiness_blockers: list[str] = []
    if not has_exact_handle:
        readiness_blockers.append("missing_exact_receipt_or_reexpand_command")
    if input_meta["truncated"]:
        readiness_blockers.append("input_truncated")
    if summary["file_count"] == 0 or summary["hunk_count"] == 0:
        readiness_blockers.append("no_reviewable_diff_hunks")

    if not readiness_blockers:
        status = "ready_for_human_review"
    elif has_exact_handle:
        status = "blocked_until_reviewable_diff"
    else:
        status = "blocked_until_exact_receipt"

    transform_policy = {
        "automatic_compaction": False,
        "lossy_replacement_allowed": False,
        "semantic_rewrite_allowed": False,
        "human_review_required": True,
        "stable_runtime_behavior_changed": False,
    }
    exact_retrieval = {
        "required": True,
        "available": has_exact_handle,
        "artifact_id": receipt_id,
        "cli": reexpand_command,
        "verified": False,
        "note": "G003 records user-supplied handles for human review only; it does not verify local receipt storage.",
    }
    review_plan = {
        "summary": summary,
        "readiness_blockers": readiness_blockers,
        "bounded_loss_disclosure": (
            "No compacted replacement was produced. Any future lossy replacement must keep this diff reviewable "
            "and provide exact receipt/re-expand handles before use."
        ),
        "next_steps": [
            "Store exact original evidence with context-guard-artifact or another local receipt before compacting.",
            "Review file and hunk summaries against the original diff.",
            "Do not claim hosted token/cost savings from this dry-run plan.",
        ],
    }
    return {
        "tool": TOOL_NAME,
        "schema_version": CONFIG_SCHEMA_VERSION,
        "experiment_id": "context-diff-compaction",
        "mode": "dry_run",
        "status": status,
        "input": input_meta,
        "transform_policy": transform_policy,
        "exact_retrieval": exact_retrieval,
        "review_plan": review_plan,
        "claim_boundary": "Dry-run local planning only; no hosted API token/cost savings claim without provider-measured matched successful tasks.",
        "compacted_replacement": None,
    }
906
+
907
+
908
def command_plan_context_diff_compaction(args: argparse.Namespace) -> int:
    """Emit the context-diff compaction dry-run plan as JSON or as text lines.

    Returns:
        Always ``0``.
    """
    payload = context_diff_plan_payload(args)
    if args.json:
        emit_json(payload)
        return 0

    input_meta = payload["input"]
    review_plan = payload["review_plan"]
    summary = review_plan["summary"]
    blockers = review_plan["readiness_blockers"]

    print("ContextGuard context-diff compaction plan (dry-run only)")
    print("No compaction was performed and no replacement text was emitted.")
    print(f"Status: {payload['status']}")
    print(f"Input: {input_meta['source_label']} lines={input_meta['lines']} sha256={input_meta['sha256']}")
    print(f"Review summary: files={summary['file_count']} hunks={summary['hunk_count']}")
    if payload["exact_retrieval"]["available"]:
        print("Exact retrieval handle supplied for human review only; verified=false.")
    else:
        print("Exact receipt/re-expand command required before any lossy replacement can be reviewed.")
    if blockers:
        print(f"Readiness blockers: {', '.join(blockers)}")
    print(payload["claim_boundary"])
    return 0
929
+
930
+
931
def clean_values(values: list[str] | None) -> list[str]:
    """Return the stripped, non-empty entries of *values* (``None`` yields ``[]``)."""
    cleaned: list[str] = []
    for entry in values or []:
        stripped = entry.strip()
        if stripped:
            cleaned.append(stripped)
    return cleaned
933
+
934
+
935
def parse_int_tuple(raw: str | None, *, count: int) -> tuple[int, ...] | None:
    """Parse a comma-separated list of exactly *count* base-10 integers.

    Returns:
        The parsed tuple, or ``None`` when *raw* is missing/blank, has a
        different number of parts, or any part is not an integer.
    """
    if raw is None:
        return None
    if not raw.strip():
        return None
    pieces = raw.split(",")
    if len(pieces) != count:
        return None
    numbers: list[int] = []
    for piece in pieces:
        try:
            numbers.append(int(piece.strip(), 10))
        except ValueError:
            return None
    return tuple(numbers)
945
+
946
+
947
def crop_payload(bounds: tuple[int, ...] | None, image_size: tuple[int, ...] | None) -> tuple[dict[str, Any] | None, dict[str, Any] | None]:
    """Convert crop bounds and image size tuples into labelled dicts.

    Returns:
        ``(bounds_dict, image_dict)``; each element is ``None`` when the
        corresponding input is ``None``.
    """
    if bounds is None:
        bounds_payload = None
    else:
        left, top, crop_width, crop_height = bounds
        bounds_payload = {"x": left, "y": top, "width": crop_width, "height": crop_height}

    if image_size is None:
        image_payload = None
    else:
        image_width, image_height = image_size
        image_payload = {"width": image_width, "height": image_height}

    return bounds_payload, image_payload
957
+
958
+
959
def valid_crop_geometry(bounds: tuple[int, ...] | None, image_size: tuple[int, ...] | None) -> tuple[bool, bool]:
    """Check crop bounds against the image size.

    Returns:
        ``(geometry_valid, exceeds_image)``. Geometry is invalid (and the
        second flag is ``False``) when either input is missing, the origin is
        negative, or any width/height is not positive. Otherwise the second
        flag tells whether the crop extends past the image edges.
    """
    if bounds is None or image_size is None:
        return False, False
    left, top, crop_width, crop_height = bounds
    image_width, image_height = image_size
    origin_ok = left >= 0 and top >= 0
    sizes_ok = crop_width > 0 and crop_height > 0 and image_width > 0 and image_height > 0
    if not (origin_ok and sizes_ok):
        return False, False
    exceeds = left + crop_width > image_width or top + crop_height > image_height
    return True, exceeds
969
+
970
+
971
def parse_confidence(raw: str | None) -> tuple[float | None, str | None]:
    """Parse an OCR confidence string into a float within ``[0.0, 1.0]``.

    Returns:
        ``(value, None)`` on success, ``(None, "missing")`` when *raw* is
        ``None`` or blank, and ``(None, "invalid")`` when it is not a number
        or lies outside the closed unit interval (NaN fails the range check).
    """
    if raw is None or not raw.strip():
        return None, "missing"
    try:
        parsed = float(raw)
    except ValueError:
        return None, "invalid"
    if 0.0 <= parsed <= 1.0:
        return parsed, None
    return None, "invalid"
981
+
982
+
983
def read_visual_ocr_text(args: argparse.Namespace) -> dict[str, Any]:
    """Collect user-supplied OCR text (inline or from a file) with size and encoding metadata.

    At most ``MAX_VISUAL_OCR_TEXT_BYTES`` bytes are retained. When neither
    ``--ocr-text`` nor ``--ocr-text-file`` was given, an empty record is
    returned (``source_type`` is ``None`` and ``has_text`` is ``False``).

    Returns:
        A dict with the source type/label, byte and line counts, SHA-256 of
        the retained bytes (``None`` when empty), the truncation flag, the
        byte limit, whether the bytes are valid UTF-8, the decoded text under
        ``text_preview``, and ``has_text``.

    Raises:
        RegistryError: if both ``--ocr-text`` and ``--ocr-text-file`` are set.
    """
    if args.ocr_text is not None and args.ocr_text_file is not None:
        raise RegistryError("--ocr-text and --ocr-text-file are mutually exclusive")
    if args.ocr_text_file is not None:
        path = Path(args.ocr_text_file)
        source_label = args.ocr_source_label.strip() if args.ocr_source_label else path.name
        loaded = read_bounded_regular_file(path, max_bytes=MAX_VISUAL_OCR_TEXT_BYTES, label="OCR text file")
        assert loaded is not None
        raw, truncated = loaded
        source_type = "file"
    elif args.ocr_text is not None:
        # Command-line strings can carry lone surrogates (undecodable argv
        # bytes are mapped to them). A strict encode would raise
        # UnicodeEncodeError; "surrogatepass" keeps the bytes so the strict
        # decode below reports the problem through `valid_utf8` instead.
        raw = args.ocr_text.encode("utf-8", errors="surrogatepass")
        source_label = args.ocr_source_label.strip() if args.ocr_source_label else "inline"
        source_type = "inline"
        truncated = len(raw) > MAX_VISUAL_OCR_TEXT_BYTES
        raw = raw[:MAX_VISUAL_OCR_TEXT_BYTES]
    else:
        raw = b""
        source_label = args.ocr_source_label.strip() if args.ocr_source_label else None
        source_type = None
        truncated = False
    try:
        text = raw.decode("utf-8")
        valid_encoding = True
    except UnicodeDecodeError:
        # Keep a readable rendering for review while flagging the encoding.
        text = raw.decode("utf-8", errors="replace")
        valid_encoding = False
    return {
        "source_type": source_type,
        "source_label": source_label,
        "bytes": len(raw),
        "lines": len(text.splitlines()),
        "sha256": hashlib.sha256(raw).hexdigest() if raw else None,
        "truncated": truncated,
        "max_bytes": MAX_VISUAL_OCR_TEXT_BYTES,
        "valid_utf8": valid_encoding,
        "text_preview": text,
        "has_text": bool(text.strip()),
    }
1022
+
1023
+
1024
def visual_crop_ocr_plan_payload(args: argparse.Namespace) -> dict[str, Any]:
    """Assemble the dry-run review plan for the visual-crop-ocr experiment.

    Only user-supplied metadata is recorded: a full-evidence receipt, optional
    crop geometry, optional OCR text with confidence and error notes, and
    missed-context notes. The plan is ready for human review only when no
    readiness blocker was collected.
    """

    def stripped_or_none(value: Any) -> Any:
        # Falsy values (None, "") become None; everything else is stripped.
        return value.strip() if value else None

    full_receipt = stripped_or_none(args.full_evidence_receipt)
    full_label = stripped_or_none(args.full_evidence_label)
    missed_context_notes = clean_values(args.missed_context_note)
    ocr_error_notes = clean_values(args.ocr_error_note)
    crop_label = stripped_or_none(args.crop_label)

    bounds = parse_int_tuple(args.crop_bounds, count=4)
    image_size = parse_int_tuple(args.image_size, count=2)
    bounds_payload, image_payload = crop_payload(bounds, image_size)
    crop_inputs = (args.crop_label, args.crop_bounds, args.image_size)
    crop_fields_present = any(item is not None and str(item).strip() for item in crop_inputs)
    crop_geometry_valid, crop_exceeds = valid_crop_geometry(bounds, image_size)
    crop_complete = bool(crop_label and crop_geometry_valid and not crop_exceeds)

    ocr_text = read_visual_ocr_text(args)
    confidence, confidence_error = parse_confidence(args.ocr_confidence)
    ocr_fields_present = (
        args.ocr_text is not None
        or args.ocr_text_file is not None
        or args.ocr_confidence is not None
        or bool(ocr_error_notes)
    )
    ocr_complete = bool(
        ocr_text["has_text"]
        and ocr_text["valid_utf8"]
        and not ocr_text["truncated"]
        and confidence_error is None
        and ocr_error_notes
    )

    blockers: list[str] = []
    if not full_receipt:
        blockers.append("missing_full_evidence_receipt")
    if not missed_context_notes:
        blockers.append("missing_missed_context_note")
    if not (crop_complete or ocr_complete):
        blockers.append("missing_derived_evidence")

    if crop_fields_present:
        if not (crop_label and crop_geometry_valid):
            blockers.append("invalid_crop_bounds")
        elif crop_exceeds:
            blockers.append("crop_exceeds_image_bounds")

    if ocr_fields_present:
        confidence_blocker = {
            "missing": "missing_ocr_confidence",
            "invalid": "invalid_ocr_confidence",
        }.get(confidence_error)
        if confidence_blocker is not None:
            blockers.append(confidence_blocker)
        if not ocr_error_notes:
            blockers.append("missing_ocr_error_note")
        if not ocr_text["has_text"]:
            blockers.append("missing_ocr_text")
        if not ocr_text["valid_utf8"]:
            blockers.append("invalid_ocr_text_encoding")
        if ocr_text["truncated"]:
            blockers.append("ocr_text_truncated")

    # De-duplicate while keeping first-seen order, in case incomplete derived
    # evidence also contributed path-specific blockers.
    blockers = list(dict.fromkeys(blockers))
    if blockers:
        status = "blocked_until_visual_evidence"
    else:
        status = "ready_for_human_review"

    crop_section = {
        "available": crop_complete,
        "label": crop_label,
        "bounds": bounds_payload,
        "image_size": image_payload,
        "source": "user_supplied_metadata" if crop_fields_present else None,
    }
    ocr_section = {
        "available": ocr_complete,
        "source_type": ocr_text["source_type"],
        "source_label": ocr_text["source_label"],
        "text_preview": ocr_text["text_preview"] if ocr_text["has_text"] else None,
        "metadata": {
            "bytes": ocr_text["bytes"],
            "lines": ocr_text["lines"],
            "sha256": ocr_text["sha256"],
            "truncated": ocr_text["truncated"],
            "max_bytes": ocr_text["max_bytes"],
            "valid_utf8": ocr_text["valid_utf8"],
        },
        "confidence": confidence,
        "error_notes": ocr_error_notes,
    }
    return {
        "tool": TOOL_NAME,
        "schema_version": CONFIG_SCHEMA_VERSION,
        "experiment_id": "visual-crop-ocr",
        "mode": "dry_run",
        "status": status,
        "external_services": {
            "called": False,
            "ocr_service": None,
            "image_service": None,
            "network": False,
        },
        "full_visual_evidence": {
            "required": True,
            "available": bool(full_receipt),
            "receipt_id": full_receipt,
            "label": full_label,
            "verified": False,
            "note": "G004 records user-supplied full visual evidence handles only; it does not verify receipt storage.",
        },
        "derived_evidence": {
            "crop": crop_section,
            "ocr": ocr_section,
        },
        "guardrails": {
            "original_evidence_required": True,
            "full_visual_evidence_must_remain_available": True,
            "external_ocr_service_allowed": False,
            "external_image_service_allowed": False,
            "human_review_required": True,
            "missed_context_review_required": True,
            "confidence_error_notes_required_for_ocr": True,
            "stable_runtime_behavior_changed": False,
            "candidate_replacement_allowed": False,
        },
        "review_plan": {
            "readiness_blockers": blockers,
            "missed_context_notes": missed_context_notes,
            "next_steps": [
                "Keep full visual evidence retrievable before relying on cropped or OCR-derived evidence.",
                "Review crop bounds and OCR text against the original evidence for missed context.",
                "Do not claim hosted image/text token or cost savings from this dry-run plan.",
            ],
        },
        "claim_boundary": (
            "Dry-run visual/OCR fixture planning only; no hosted visual/text token or cost savings claim without "
            "provider-measured matched successful tasks."
        ),
        "candidate_replacement": None,
    }
1159
+
1160
+
1161
def command_plan_visual_crop_ocr(args: argparse.Namespace) -> int:
    """Emit the visual crop/OCR dry-run plan as JSON or as text lines.

    Returns:
        Always ``0``.
    """
    payload = visual_crop_ocr_plan_payload(args)
    if args.json:
        emit_json(payload)
        return 0

    derived = payload["derived_evidence"]
    blockers = payload["review_plan"]["readiness_blockers"]

    print("ContextGuard visual crop/OCR plan (dry-run only)")
    print("No external OCR/image service was called and no replacement evidence was emitted.")
    print(f"Status: {payload['status']}")
    print(f"Full evidence available: {payload['full_visual_evidence']['available']} verified=false")
    print(f"Derived evidence: crop={derived['crop']['available']} ocr={derived['ocr']['available']}")
    if blockers:
        print(f"Readiness blockers: {', '.join(blockers)}")
    print(payload["claim_boundary"])
    return 0
1179
+
1180
+
1181
# Regex fragment (a plain string, interpolated into the patterns below) for a
# key name that contains one of the listed secret-related words, optionally
# surrounded by further `[A-Za-z0-9_.-]` characters.
SECRET_LABEL_KEY_RE = (
    r"[A-Za-z0-9_.-]*(?:"
    r"api[-_]?key|apikey|token|secret|password|passwd|pwd|client[-_]?secret|"
    r"auth|authorization|bearer|basic|pass|credential|credentials|signature|sig|"
    r"x[-_]?amz[-_]?[a-z0-9_.-]*|aws[a-z0-9_.-]*|(?:aws[-_]?)?access[-_]?key(?:[-_]?id)?|"
    r"private[-_]?key|privatekey|pgp[-_]?private[-_]?key|pgpprivatekey|ssh[-_]?key|sshkey"
    r")[A-Za-z0-9_.-]*"
)
# Regex fragment for a value: single-quoted, double-quoted, or a bare run that
# stops at whitespace or one of `, } & # ;`.
SECRET_LABEL_VALUE_RE = r"(?:'[^']*'|\"[^\"]*\"|[^\s,}&#;]+)"
# Ordered (compiled pattern, replacement) pairs. sanitize_self_hosted_text
# applies them one after another, so earlier entries see the text first and
# later entries see already-redacted text.
SECRET_LABEL_PATTERNS: tuple[tuple[re.Pattern[str], str], ...] = (
    # `Authorization: <scheme> <credentials>` headers, including trailing `k=v` parameters.
    (re.compile(r"(?i)\bAuthorization\s*:\s*(?:Bearer|Basic|AWS|AWS4-HMAC-SHA256)\s+[^\s,}\]]+(?:\s+[A-Za-z0-9_-]+=[^\s,}\]]+)*"), "Authorization: [REDACTED]"),
    # The words Bearer/Basic followed (with optional whitespace and `:`/`=`) by token characters.
    (re.compile(r"(?i)\b(?:Bearer|Basic)\s*(?:[:=]\s*)?[A-Za-z0-9._~+/=-]+"), "[REDACTED]"),
    # The words AWS / AWS4-HMAC-SHA256 followed by a credential-like run.
    (re.compile(r"(?i)\b(?:AWS|AWS4-HMAC-SHA256)\s+[A-Za-z0-9,=:/+._~%-]+"), "[REDACTED]"),
    # `?key=value` / `&key=value` style parameters: the delimiter and key are kept, the value is redacted.
    (re.compile(rf"(?i)([?&#;]({SECRET_LABEL_KEY_RE})=)[^\s?&#;]+"), r"\1[REDACTED]"),
    # `key: value` / `key=value` assignments (key optionally quoted): prefix and key kept, value redacted.
    (
        re.compile(rf"(?i)(^|[\s{{,?&#;])([\"']?(?:{SECRET_LABEL_KEY_RE})[\"']?\s*[:=]\s*){SECRET_LABEL_VALUE_RE}"),
        r"\1\2[REDACTED]",
    ),
    # `--secret-ish-flag value` / `--secret-ish-flag=value` command-line options.
    (
        re.compile(rf"(?i)(^|[\s\"'])(--(?:{SECRET_LABEL_KEY_RE})(?:\s+|=))(?:'[^']*'|\"[^\"]*\"|[^\s\"']+)"),
        r"\1\2[REDACTED]",
    ),
    # `-u value` / `--user value` options: the argument is redacted.
    (re.compile(r"(?i)(^|[\s\"'])((?:-u|--user)(?:\s+|=))(?:'[^']*'|\"[^\"]*\"|[^\s\"']+)"), r"\1\2[REDACTED]"),
    # A secret-ish key name on its own (optionally with `:`/`=` and a value) after a delimiter:
    # the whole key, and value if any, is replaced.
    (re.compile(rf"(?i)(^|[/\\\s{{,?&#;\[\(<])({SECRET_LABEL_KEY_RE}(?:[:=][^\s,}}&#;\]\)\\/]*)?)"), r"\1[REDACTED]"),
    # Fixed-prefix token shapes (presumably vendor-issued API tokens/keys); matched case-sensitively.
    (re.compile(r"gh[pousr]_[A-Za-z0-9_]{20,}"), "[REDACTED]"),
    (re.compile(r"github_pat_[A-Za-z0-9_]{20,}"), "[REDACTED]"),
    (re.compile(r"glpat-[A-Za-z0-9_-]{12,}"), "[REDACTED]"),
    (re.compile(r"xox[abprs]-[A-Za-z0-9-]{10,}"), "[REDACTED]"),
    (re.compile(r"(?:AKIA|ASIA)[0-9A-Z]{16}"), "[REDACTED]"),
    (re.compile(r"(?:sk|pk|rk)_(?:live|test)_[A-Za-z0-9]{16,}"), "[REDACTED]"),
    (re.compile(r"sk-(?:ant|proj)-[A-Za-z0-9_-]{12,}"), "[REDACTED]"),
    (re.compile(r"npm_[A-Za-z0-9]{20,}"), "[REDACTED]"),
    (re.compile(r"AIza[0-9A-Za-z_\-]{20,}"), "[REDACTED]"),
    (re.compile(r"SG\.[A-Za-z0-9_-]{16,}\.[A-Za-z0-9_-]{16,}"), "[REDACTED]"),
    # Three dot-separated base64url-style segments starting with `eyJ`.
    (re.compile(r"eyJ[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+\.[A-Za-z0-9_-]+"), "[REDACTED]"),
    # `scheme://userinfo@` in URLs: the userinfo part is redacted, the scheme is kept.
    (re.compile(r"([a-z][a-z0-9+.-]*://)[^/\s@]+@", re.IGNORECASE), r"\1[REDACTED]@"),
)
1218
+
1219
+
1220
def sanitize_self_hosted_text(value: Any) -> str:
    """Turn *value* into a single-line, redacted, length-bounded string.

    Characters in any Unicode "C" category become spaces, whitespace runs are
    collapsed, every pattern in ``SECRET_LABEL_PATTERNS`` is applied in order,
    adjacent redaction markers are merged, and the result is cut to at most
    ``MAX_SELF_HOSTED_LABEL_CHARS`` characters with a truncation suffix.
    """
    source = "" if value is None else str(value)
    visible: list[str] = []
    for ch in source:
        visible.append(" " if unicodedata.category(ch).startswith("C") else ch)
    text = " ".join("".join(visible).split())
    for pattern, replacement in SECRET_LABEL_PATTERNS:
        text = pattern.sub(replacement, text)
    # Tidy leftovers from overlapping redactions: stray closing brackets
    # after a marker, then runs of consecutive markers.
    text = re.sub(r"\[REDACTED\]\]+", "[REDACTED]", text)
    text = re.sub(r"(?:\[REDACTED\]\s*){2,}", "[REDACTED]", text)
    if len(text) <= MAX_SELF_HOSTED_LABEL_CHARS:
        return text
    # The suffix is 12 characters long, so the result stays within the limit.
    return text[: MAX_SELF_HOSTED_LABEL_CHARS - 12].rstrip() + "…[truncated]"
1231
+
1232
+
1233
def sanitize_self_hosted_label(value: Any) -> str | None:
    """Return the sanitized label for a string *value*, or ``None``.

    ``None`` is returned for non-string input and for strings that sanitize
    to an empty result.
    """
    if isinstance(value, str):
        cleaned = sanitize_self_hosted_text(value)
        return cleaned or None
    return None
1240
+
1241
+
1242
def sanitize_self_hosted_ignored_key(value: Any) -> str:
    """Return a safe name for an ignored input key.

    Non-string keys map to ``"non_string_key"``, keys that sanitize to nothing
    map to ``"empty_key"``, keys whose sanitized form contains a redaction
    marker map to ``"redacted_key"``; otherwise the sanitized key is returned.
    """
    if not isinstance(value, str):
        return "non_string_key"
    cleaned = sanitize_self_hosted_text(value)
    if cleaned == "":
        return "empty_key"
    return "redacted_key" if "[REDACTED]" in cleaned else cleaned
1251
+
1252
+
1253
def normalize_self_hosted_metric(value: Any, *, maximum: float) -> float | None:
    """Validate one numeric metric and return it as a float.

    Args:
        value: Candidate metric value.
        maximum: Inclusive upper bound for the metric.

    Returns:
        ``float(value)`` when *value* is an ``int`` or ``float`` (not a
        ``bool``), finite, and within ``[0, maximum]``; otherwise ``None``.
    """
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    try:
        number = float(value)
    except OverflowError:
        # Integers too large for a float (e.g. a very long JSON integer
        # literal) are out of range by definition, not a crash.
        return None
    if not math.isfinite(number) or number < 0 or number > maximum:
        return None
    return number
1260
+
1261
+
1262
# Inclusive upper bound accepted for each recognized numeric metric key.
SELF_HOSTED_METRIC_LIMITS: dict[str, float] = dict(
    latency_ms=MAX_SELF_HOSTED_LATENCY_MS,
    peak_memory_mb=MAX_SELF_HOSTED_MEMORY_MB,
    quality_score=1.0,
    energy_wh=MAX_SELF_HOSTED_ENERGY_WH,
    local_cost_usd=MAX_SELF_HOSTED_LOCAL_COST_USD,
    tokens_per_second=MAX_SELF_HOSTED_TOKENS_PER_SECOND,
)
# Recognized free-text label keys.
SELF_HOSTED_LABEL_KEYS = (
    "model_server",
    "optimization",
    "quality_metric",
    "hardware",
    "runtime",
    "dataset",
)
1271
+
1272
+
1273
def normalize_self_hosted_metrics(raw: Any, *, source: str) -> tuple[dict[str, Any] | None, list[str], list[str]]:
    """Validate a raw self-hosted metrics mapping and build its sidecar record.

    Keys in ``SELF_HOSTED_METRIC_LIMITS`` are validated as numeric metrics,
    keys in ``SELF_HOSTED_LABEL_KEYS`` as sanitized labels (a ``None`` label
    is skipped silently), and every other key is reported as ignored.

    Returns:
        ``(sidecar, invalid_keys, ignored_keys)``. ``sidecar`` is ``None``
        when *raw* is not a dict or no metric validated.
    """
    invalid_keys: list[str] = []
    ignored_keys: list[str] = []
    if not isinstance(raw, dict):
        return None, ["self_hosted_metrics_not_object"], ignored_keys

    metrics: dict[str, float] = {}
    labels: dict[str, str] = {}
    availability = dict.fromkeys(SELF_HOSTED_METRIC_LIMITS, False)
    for key, value in raw.items():
        if key in SELF_HOSTED_METRIC_LIMITS:
            checked = normalize_self_hosted_metric(value, maximum=SELF_HOSTED_METRIC_LIMITS[key])
            if checked is None:
                invalid_keys.append(key)
                continue
            metrics[key] = checked
            availability[key] = True
            continue
        if key not in SELF_HOSTED_LABEL_KEYS:
            ignored_keys.append(sanitize_self_hosted_ignored_key(key))
            continue
        label = sanitize_self_hosted_label(value)
        if label is not None:
            labels[key] = label
        elif value is not None:
            invalid_keys.append(key)

    if not metrics:
        return None, invalid_keys, ignored_keys

    claim_boundary = {
        "id": SELF_HOSTED_METRICS_CLAIM_BOUNDARY,
        "hosted_api_token_savings_claim_allowed": False,
        "hosted_api_cost_savings_claim_allowed": False,
        "requires_provider_measured_matched_tasks_for_hosted_claims": True,
        "reason": (
            "Self-hosted local/model-server latency, memory, quality, energy, and local cost metrics "
            "are not hosted API token or cost telemetry."
        ),
    }
    sidecar = {
        "schema_version": SELF_HOSTED_METRICS_SCHEMA_VERSION,
        "source": source,
        "metrics": metrics,
        "labels": labels,
        "measurement_availability": availability,
        "claim_boundary": claim_boundary,
    }
    return sidecar, invalid_keys, ignored_keys
1316
+
1317
+
1318
def cli_self_hosted_metrics(args: argparse.Namespace) -> dict[str, Any]:
    """Collect the metric and label values supplied as command-line flags.

    Returns:
        A dict keyed by metric/label name holding every attribute of *args*
        that is not ``None``: the six metric names first, then the names in
        ``SELF_HOSTED_LABEL_KEYS``.
    """
    metric_names = (
        "latency_ms",
        "peak_memory_mb",
        "quality_score",
        "energy_wh",
        "local_cost_usd",
        "tokens_per_second",
    )
    collected: dict[str, Any] = {}
    for name in (*metric_names, *SELF_HOSTED_LABEL_KEYS):
        supplied = getattr(args, name)
        if supplied is not None:
            collected[name] = supplied
    return collected
1336
+
1337
+
1338
def reject_non_finite_json_constant(value: str) -> NoReturn:
    """Always raise ``ValueError`` naming the non-finite JSON constant *value*.

    Intended as a ``parse_constant`` hook for ``json.loads``.
    """
    message = f"non-finite JSON value {value}"
    raise ValueError(message)
1340
+
1341
+
1342
def has_non_finite_json_number(value: Any) -> bool:
    """Report whether a decoded JSON value must be rejected.

    The value is walked iteratively through lists and dict values. ``True``
    is returned when a non-finite float is found, and also when the walk goes
    deeper than ``MAX_SELF_HOSTED_JSON_DEPTH`` or visits more than
    ``MAX_SELF_HOSTED_JSON_NODES`` nodes.
    """
    pending: list[tuple[Any, int]] = [(value, 0)]
    seen = 0
    while pending:
        node, level = pending.pop()
        seen += 1
        if level > MAX_SELF_HOSTED_JSON_DEPTH or seen > MAX_SELF_HOSTED_JSON_NODES:
            return True
        if isinstance(node, float):
            if not math.isfinite(node):
                return True
            continue
        if isinstance(node, list):
            children = node
        elif isinstance(node, dict):
            children = node.values()
        else:
            # Scalars other than float (including bool) need no inspection.
            continue
        pending.extend((child, level + 1) for child in children)
    return False
1360
+
1361
+
1362
def read_self_hosted_payload(args: argparse.Namespace) -> tuple[Any, dict[str, Any]]:
    """Read and parse bounded self-hosted metrics JSON from ``--input`` or stdin.

    Returns:
        ``(payload, metadata)``. ``payload`` is ``None`` when the input was
        truncated at ``MAX_SELF_HOSTED_METRICS_INPUT_BYTES`` or is blank;
        otherwise it is the decoded JSON value.

    Raises:
        RegistryError: if the input file cannot be read, the JSON cannot be
            parsed, or ``has_non_finite_json_number`` rejects the payload.
    """
    limit = MAX_SELF_HOSTED_METRICS_INPUT_BYTES
    label = sanitize_self_hosted_text(args.source_label) if args.source_label else None
    if args.input:
        input_path = Path(args.input)
        label = label or sanitize_self_hosted_text(input_path)
        try:
            result = read_bounded_regular_file(input_path, max_bytes=limit, label=f"self-hosted metrics input: {label}")
        except RegistryError as exc:
            raise RegistryError(f"could not read self-hosted metrics input: {label}: {exc}") from exc
        assert result is not None
        data, over_limit = result
    else:
        label = label or "stdin"
        # Ask for one byte beyond the limit so an oversized stream is detectable.
        data = sys.stdin.buffer.read(limit + 1)
        over_limit = len(data) > limit
        data = data[:limit]

    def describe(size: int, truncated: bool) -> dict[str, Any]:
        # Every return path shares the same metadata shape.
        return {
            "source_label": label,
            "bytes": size,
            "sha256": hashlib.sha256(data).hexdigest(),
            "truncated": truncated,
            "max_bytes": limit,
            "envelope_source": None,
            "invalid_metric_keys": [],
            "ignored_keys": [],
        }

    if over_limit:
        return None, describe(limit, True)
    if not data.strip():
        return None, describe(len(data), False)

    decoded = data.decode("utf-8", errors="replace")
    try:
        payload = json.loads(decoded, parse_constant=reject_non_finite_json_constant)
    except json.JSONDecodeError as exc:
        raise RegistryError(f"could not parse self-hosted metrics JSON: {exc.msg}") from exc
    except ValueError as exc:
        raise RegistryError(f"could not parse self-hosted metrics JSON: {exc}") from exc
    except RecursionError as exc:
        raise RegistryError("could not parse self-hosted metrics JSON: nesting too deep") from exc
    if has_non_finite_json_number(payload):
        raise RegistryError("could not parse self-hosted metrics JSON: non-finite JSON number")
    return payload, describe(len(data), False)
1421
+
1422
+
1423
def select_self_hosted_envelope(payload: Any) -> tuple[Any, str | None, list[str]]:
    """Pick the explicit self-hosted metrics envelope out of a decoded payload.

    The envelope is looked up under ``SELF_HOSTED_METRICS_KEY`` at the top
    level first, then inside a dict-valued ``"metrics"`` entry.

    Returns:
        ``(envelope, source_name, ignored)``. When no envelope is found the
        first two items are ``None``; ``ignored`` then holds
        ``"input_not_object"`` for non-dict payloads, or
        ``"incidental_self_hosted_keys"`` when some top-level key starts with
        ``self_hosted_``.
    """
    if not isinstance(payload, dict):
        return None, None, ["input_not_object"]
    envelope_key = SELF_HOSTED_METRICS_KEY
    if envelope_key in payload:
        return payload.get(envelope_key), f"explicit_provider_payload.{envelope_key}", []
    nested = payload.get("metrics")
    if isinstance(nested, dict) and envelope_key in nested:
        return nested.get(envelope_key), f"explicit_provider_payload.metrics.{envelope_key}", []
    ignored: list[str] = []
    for name in payload:
        if isinstance(name, str) and name.startswith("self_hosted_"):
            ignored.append("incidental_self_hosted_keys")
            break
    return None, None, ignored
1435
+
1436
+
1437
def self_hosted_metrics_plan_payload(args: argparse.Namespace) -> dict[str, Any]:
    """Assemble the dry-run ledger preview for the self-hosted-metrics-ledger experiment.

    Metrics come from command-line flags when any were given, otherwise from
    ``--input`` or piped stdin, otherwise nothing. No ledger file is written;
    ``ledger_preview`` is filled only when a metrics sidecar validated.
    """

    def flag_only_input_meta(envelope_source: str | None) -> dict[str, Any]:
        # Metadata used when no input bytes were read.
        return {
            "source_label": sanitize_self_hosted_text(args.source_label) if args.source_label else "cli_flags",
            "bytes": 0,
            "sha256": None,
            "truncated": False,
            "max_bytes": MAX_SELF_HOSTED_METRICS_INPUT_BYTES,
            "envelope_source": envelope_source,
            "invalid_metric_keys": [],
            "ignored_keys": [],
        }

    flag_metrics = cli_self_hosted_metrics(args)
    ignored_envelope_keys: list[str]
    if flag_metrics:
        raw_metrics: Any = flag_metrics
        source: str | None = "cli_flags"
        ignored_envelope_keys = []
        input_meta = flag_only_input_meta(source)
    elif args.input or not sys.stdin.isatty():
        loaded_payload, input_meta = read_self_hosted_payload(args)
        raw_metrics, source, ignored_envelope_keys = select_self_hosted_envelope(loaded_payload)
    else:
        raw_metrics = {}
        source = None
        ignored_envelope_keys = []
        input_meta = flag_only_input_meta(source)

    invalid_keys: list[str]
    if input_meta["truncated"] or raw_metrics is None:
        sidecar = None
        invalid_keys = []
        ignored_keys = ignored_envelope_keys
    else:
        sidecar, invalid_keys, ignored_keys = normalize_self_hosted_metrics(
            raw_metrics, source=source or "missing_explicit_envelope"
        )
    input_meta["envelope_source"] = source
    input_meta["invalid_metric_keys"] = sorted(set(invalid_keys))
    input_meta["ignored_keys"] = sorted(set(ignored_keys + ignored_envelope_keys))

    blockers: list[str] = []
    if input_meta["truncated"]:
        blockers.append("input_truncated")
    if source is None:
        blockers.append("missing_explicit_self_hosted_metrics_envelope")
    if sidecar is None:
        blockers.append("missing_self_hosted_metrics")
    if invalid_keys:
        blockers.append("invalid_self_hosted_metrics")
    blockers = list(dict.fromkeys(blockers))
    status = "blocked_until_metrics" if blockers else "ready_for_ledger_review"

    ledger_preview = None
    if sidecar is not None:
        # Placeholder run-evidence row: every hosted token/cost field is
        # marked unmeasured; only the self-hosted sidecar carries data.
        measurement_availability = {
            "primary_tokens": False,
            "primary_cost": False,
            "external_tokens": False,
            "external_cost": False,
            "shifted_cost": False,
            "provider_cache": False,
            "byte_metrics": False,
            "wall_time": False,
            "self_hosted_metrics": True,
        }
        proxy_metrics = {
            "byte_metrics_observed": False,
            "token_proxy": "chars_div_4",
            "bytes_per_token": TOKEN_PROXY_BYTES_PER_TOKEN,
            "claim_boundary": "proxy_only_not_hosted_token_savings",
        }
        ledger_preview = {
            "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
            "date": datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "claude_version": "dry-run",
            "task_id": "self-hosted-metrics-dry-run",
            "variant": "self-hosted-metrics-ledger",
            "transform_id": "self-hosted-metrics-ledger",
            "success": None,
            "primary_tokens_measured": False,
            "primary_tokens": 0,
            "primary_cost_measured": False,
            "primary_cost_usd": 0.0,
            "provider_cached_tokens": None,
            "provider_cached_tokens_measured": False,
            "wall_time_seconds": 0.0,
            "external_tokens_measured": False,
            "external_tokens": 0,
            "external_cost_measured": False,
            "external_cost_usd": 0.0,
            "total_cost_with_shift_usd": None,
            "artifacts_used": 0,
            "bytes_before": 0,
            "bytes_after": 0,
            "hook_triggers": 0,
            "turns": 0,
            "notes": "dry-run preview; no ledger file written",
            "measurement_availability": measurement_availability,
            "self_hosted_metrics": sidecar,
            "proxy_metrics": proxy_metrics,
        }

    return {
        "tool": TOOL_NAME,
        "schema_version": CONFIG_SCHEMA_VERSION,
        "experiment_id": "self-hosted-metrics-ledger",
        "mode": "dry_run",
        "status": status,
        "input": input_meta,
        "policy": {
            "default_off": True,
            "ledger_write_performed": False,
            "hosted_api_token_savings_claim_allowed": False,
            "hosted_api_cost_savings_claim_allowed": False,
            "stable_runtime_behavior_changed": False,
        },
        "self_hosted_metrics": sidecar,
        "ledger_preview": ledger_preview,
        "review_plan": {
            "readiness_blockers": blockers,
            "next_steps": [
                "Record real run evidence with context-guard-bench --ledger-jsonl when benchmark data exists.",
                "Keep self-hosted local metrics out of hosted API token/cost savings claims.",
                "Use provider-measured matched successful tasks for hosted API savings claims.",
            ],
        },
        "claim_boundary": (
            "Dry-run self-hosted metrics ledger preview only; local/model-server metrics are diagnostic sidecars "
            "and are not hosted API token or cost savings evidence."
        ),
    }
1570
+
1571
+
1572
def command_plan_self_hosted_metrics_ledger(args: argparse.Namespace) -> int:
    """Emit the self-hosted metrics ledger preview as JSON or as text lines.

    Returns:
        Always ``0``.
    """
    payload = self_hosted_metrics_plan_payload(args)
    if args.json:
        emit_json(payload)
        return 0

    blockers = payload["review_plan"]["readiness_blockers"]
    print("ContextGuard self-hosted metrics ledger preview (dry-run only)")
    print("No ledger file was written and no hosted API token/cost savings claim is allowed from these metrics.")
    print(f"Status: {payload['status']}")
    if blockers:
        print(f"Readiness blockers: {', '.join(blockers)}")
    print(payload["claim_boundary"])
    return 0
1584
+
1585
+
1586
def sanitize_local_proxy_value(value: Any) -> str:
    """Sanitize a local-proxy field by delegating to ``sanitize_self_hosted_text``."""
    cleaned = sanitize_self_hosted_text(value)
    return cleaned
1588
+
1589
+
1590
def local_proxy_secret_like(value: Any) -> bool:
    """Report whether sanitizing ``value`` yields a ``[REDACTED]`` marker.

    ``None`` is never secret-like. For any other value the sanitized text is
    searched for the literal ``[REDACTED]`` (presumably inserted by the
    sanitizer when it redacts secret-looking content -- confirm there).
    """
    return value is not None and "[REDACTED]" in sanitize_local_proxy_value(value)
1594
+
1595
+
1596
def is_localhost_host(value: Any) -> bool:
    """Return True when ``value`` names a localhost/loopback host.

    Non-string values are rejected. The string is normalized (surrounding
    whitespace and square brackets removed, lowercased, trailing dots dropped)
    and accepted if it is in ``LOCAL_PROXY_LOCALHOST_NAMES`` or parses as a
    loopback IP address. No DNS lookup is performed here.
    """
    if not isinstance(value, str):
        return False

    candidate = value.strip()
    candidate = candidate.strip("[]")  # tolerate bracketed IPv6 literals
    candidate = candidate.lower().rstrip(".")
    if candidate in LOCAL_PROXY_LOCALHOST_NAMES:
        return True

    try:
        address = ipaddress.ip_address(candidate)
    except ValueError:
        # Not an IP literal and not a known localhost name.
        return False
    return address.is_loopback
1606
+
1607
+
1608
def normalize_local_proxy_host(value: Any, *, default: str) -> tuple[str, bool, bool]:
    """Normalize a host value into ``(sanitized_host, is_localhost, secret_like)``.

    ``None`` or a blank value falls back to ``default``. Otherwise the value is
    stringified, stripped of surrounding whitespace and square brackets. The
    localhost check runs on the unsanitized host; ``secret_like`` is True when
    the sanitized host contains ``[REDACTED]``.
    """
    text = "" if value is None else str(value).strip()
    host = text.strip("[]") if text else default
    safe_host = sanitize_local_proxy_value(host)
    loopback = is_localhost_host(host)
    return safe_host, loopback, "[REDACTED]" in safe_host
1615
+
1616
+
1617
def normalize_local_proxy_port(value: Any, *, default: int) -> tuple[int, bool]:
    """Normalize a port value into ``(port, valid)``.

    Args:
        value: Raw port from CLI flags or JSON input (str, int, float, ...).
        default: Port returned when ``value`` is missing or unusable.

    Returns:
        ``(default, True)`` when ``value`` is ``None`` or ``""``;
        ``(default, False)`` when ``value`` is a bool, a non-integral or
        non-finite float, or cannot be converted to ``int``;
        otherwise ``(port, 0 <= port <= 65535)``.
    """
    if value is None or value == "":
        return default, True
    if isinstance(value, bool):
        # bool is an int subclass; True/False are not meaningful ports.
        return default, False
    if isinstance(value, float) and not value.is_integer():
        # Reject 80.5 instead of silently truncating to 80. is_integer() is
        # also False for nan/inf, so those never reach int().
        return default, False
    try:
        port = int(value)
    except (TypeError, ValueError, OverflowError):
        # OverflowError covers infinite numeric types such as Decimal("Infinity").
        return default, False
    return port, 0 <= port <= 65535
1627
+
1628
+
1629
def read_local_proxy_payload(args: argparse.Namespace) -> tuple[dict[str, Any], dict[str, Any]]:
    """Load the optional local-proxy JSON input named by ``args.input``.

    Returns ``(values, input_meta)``. ``values`` is the ``local_proxy`` object
    from the file (or the top-level object when that key is absent), or an
    empty dict when no usable object is available. ``input_meta`` records the
    sanitized source label, byte count, SHA-256, truncation flag and the
    sorted list of ignored keys.

    Raises:
        RegistryError: when the file cannot be read, is not parseable JSON,
            nests too deeply, or contains a non-finite number.
    """

    def describe(label: Any, size: int, digest: Any, *, truncated: bool = False, ignored: Any = ()) -> dict[str, Any]:
        # Single place that fixes the key order of the metadata record.
        return {
            "source_label": label,
            "bytes": size,
            "sha256": digest,
            "truncated": truncated,
            "ignored_keys": list(ignored),
        }

    if not args.input:
        # Values come from CLI flags only; there is no file to hash.
        return {}, describe("cli_flags", 0, None)

    path = Path(args.input)
    safe_path = sanitize_local_proxy_value(path)
    try:
        loaded = read_bounded_regular_file(
            path,
            max_bytes=MAX_SELF_HOSTED_METRICS_INPUT_BYTES,
            label=f"local-proxy input: {safe_path}",
        )
    except RegistryError as exc:
        raise RegistryError(f"could not read local-proxy input: {safe_path}: {exc}") from exc
    assert loaded is not None
    raw, loaded_truncated = loaded
    digest = hashlib.sha256(raw).hexdigest()

    if loaded_truncated:
        # A truncated file is never parsed; the byte count reports the cap.
        return {}, describe(safe_path, MAX_SELF_HOSTED_METRICS_INPUT_BYTES, digest, truncated=True)
    if not raw.strip():
        return {}, describe(safe_path, len(raw), digest)

    text = raw.decode("utf-8", errors="replace")
    try:
        payload = json.loads(text, parse_constant=reject_non_finite_json_constant)
    except json.JSONDecodeError as exc:
        raise RegistryError(f"could not parse local-proxy JSON: {exc.msg}") from exc
    except ValueError as exc:
        raise RegistryError(f"could not parse local-proxy JSON: {exc}") from exc
    except RecursionError as exc:
        raise RegistryError("could not parse local-proxy JSON: nesting too deep") from exc
    if has_non_finite_json_number(payload):
        raise RegistryError("could not parse local-proxy JSON: non-finite JSON number")

    if not isinstance(payload, dict):
        return {}, describe(safe_path, len(raw), digest, ignored=["input_not_object"])

    envelope = payload.get("local_proxy", payload)
    ignored: list[Any] = []
    if not isinstance(envelope, dict):
        envelope = {}
        ignored.append("local_proxy_not_object")

    allowed = {
        "bind_host",
        "bind_port",
        "target_host",
        "target_port",
        "upstream_url",
        "ledger_jsonl",
        "proxy_label",
        "api_key",
        "authorization_header",
        "persist_api_key",
        "external_forwarding_intent",
        "runtime_gate_ack",
    }
    for key in envelope:
        if key not in allowed:
            # Unknown keys are reported (sanitized) rather than rejected.
            ignored.append(sanitize_self_hosted_ignored_key(key))
    return dict(envelope), describe(safe_path, len(raw), digest, ignored=sorted(set(ignored)))
1708
+
1709
+
1710
def coalesce_local_proxy_value(args: argparse.Namespace, payload: dict[str, Any], attr: str, key: str) -> Any:
    """Return the CLI value ``args.<attr>`` unless it is None, else ``payload.get(key)``."""
    cli_value = getattr(args, attr)
    if cli_value is None:
        return payload.get(key)
    return cli_value
1713
+
1714
+
1715
def coalesce_local_proxy_bool(args: argparse.Namespace, payload: dict[str, Any], attr: str, key: str) -> bool:
    """Return True when either ``args.<attr>`` or ``payload.get(key)`` is truthy."""
    return bool(getattr(args, attr) or payload.get(key))
1719
+
1720
+
1721
def local_proxy_plan_payload(args: argparse.Namespace) -> dict[str, Any]:
    """Assemble the dry-run ``local-proxy`` advisory plan.

    Values come from CLI flags, falling back to the optional JSON input. The
    function only inspects and sanitizes the supplied values and reports
    readiness blockers; the returned payload hard-codes that no listener,
    forwarding, DNS lookup, API-key persistence or ledger write happened.
    Raw ``api_key`` / ``authorization_header`` values are never placed in the
    payload, only booleans saying whether they were provided.
    """
    file_values, input_meta = read_local_proxy_payload(args)

    def pick(name: str) -> Any:
        # CLI flag wins when it is not None; otherwise use the input value.
        return coalesce_local_proxy_value(args, file_values, name, name)

    def flag(name: str) -> bool:
        return coalesce_local_proxy_bool(args, file_values, name, name)

    def provided(raw: Any) -> bool:
        return raw is not None and str(raw).strip() != ""

    bind_host_raw = pick("bind_host")
    bind_port_raw = pick("bind_port")
    target_host_raw = pick("target_host")
    target_port_raw = pick("target_port")
    upstream_url_raw = pick("upstream_url")
    ledger_jsonl_raw = pick("ledger_jsonl")
    proxy_label_raw = pick("proxy_label")
    api_key_raw = pick("api_key")
    authorization_raw = pick("authorization_header")
    persist_api_key = flag("persist_api_key")
    external_forwarding_intent = flag("external_forwarding_intent")
    runtime_gate_ack = flag("runtime_gate_ack")

    # --- upstream URL: parse only; derive target host/port when not given ---
    upstream_url = sanitize_local_proxy_value(upstream_url_raw) if upstream_url_raw else None
    upstream_host = None
    upstream_url_valid = True
    upstream_localhost = True
    upstream_secret_like = False
    if upstream_url_raw:
        upstream_secret_like = local_proxy_secret_like(upstream_url_raw)
        try:
            parsed = urlparse(str(upstream_url_raw))
            upstream_host = parsed.hostname
        except ValueError:
            upstream_url_valid = False
            upstream_host = None
        else:
            if not upstream_host:
                # A URL without a host is treated as invalid and non-local.
                upstream_url_valid = False
            upstream_localhost = bool(upstream_host) and is_localhost_host(upstream_host)
            try:
                upstream_port = parsed.port
            except ValueError:
                upstream_url_valid = False
                upstream_port = None
            if upstream_port is not None and target_port_raw is None:
                target_port_raw = upstream_port
        if upstream_host and target_host_raw is None:
            target_host_raw = upstream_host

    # --- normalize hosts, ports and labels ---
    bind_host, bind_localhost, bind_secret_like = normalize_local_proxy_host(
        bind_host_raw, default=LOCAL_PROXY_DEFAULT_BIND_HOST
    )
    target_host, target_localhost, target_secret_like = normalize_local_proxy_host(
        target_host_raw, default=LOCAL_PROXY_DEFAULT_TARGET_HOST
    )
    bind_port, bind_port_valid = normalize_local_proxy_port(bind_port_raw, default=LOCAL_PROXY_DEFAULT_BIND_PORT)
    target_port, target_port_valid = normalize_local_proxy_port(
        target_port_raw, default=LOCAL_PROXY_DEFAULT_TARGET_PORT
    )
    ledger_jsonl = sanitize_local_proxy_value(ledger_jsonl_raw) if ledger_jsonl_raw else None
    proxy_label = sanitize_local_proxy_value(proxy_label_raw) if proxy_label_raw else "local-proxy-dry-run"
    api_key_provided = provided(api_key_raw)
    authorization_header_provided = provided(authorization_raw)

    # --- which raw fields look secret-like after sanitization ---
    raw_fields = {
        "bind_host": bind_host_raw,
        "bind_port": bind_port_raw,
        "target_host": target_host_raw,
        "target_port": target_port_raw,
        "upstream_url": upstream_url_raw,
        "ledger_jsonl": ledger_jsonl_raw,
        "proxy_label": proxy_label_raw,
        "api_key": api_key_raw,
        "authorization_header": authorization_raw,
    }
    flagged = {name for name, raw in raw_fields.items() if raw is not None and local_proxy_secret_like(raw)}
    if bind_secret_like:
        flagged.add("bind_host")
    if target_secret_like:
        flagged.add("target_host")
    if upstream_secret_like:
        flagged.add("upstream_url")

    # --- readiness blockers, in reporting order (names are unique) ---
    checks = (
        (input_meta["truncated"], "input_truncated"),
        (not bind_port_valid, "invalid_bind_port"),
        (not target_port_valid, "invalid_target_port"),
        (upstream_url_raw and not upstream_url_valid, "invalid_upstream_url"),
        (not bind_localhost, "non_localhost_bind_host"),
        (not target_localhost, "non_localhost_target_host"),
        (upstream_url_raw and not upstream_localhost, "non_localhost_upstream_url"),
        (api_key_provided or authorization_header_provided, "api_key_material_provided"),
        (persist_api_key, "api_key_persistence_requested"),
        (external_forwarding_intent, "external_forwarding_intent_not_allowed"),
        (not runtime_gate_ack, "missing_runtime_gate_ack"),
        (bool(flagged), "secret_like_proxy_metadata"),
    )
    blockers = [name for failed, name in checks if failed]
    ready = not blockers

    return {
        "tool": TOOL_NAME,
        "schema_version": CONFIG_SCHEMA_VERSION,
        "experiment_id": "local-proxy",
        "mode": "dry_run",
        "status": "ready_for_runtime_review" if ready else "blocked_until_local_proxy_constraints",
        "input": input_meta,
        "policy": {
            "default_off": True,
            "dry_run_only": True,
            "localhost_only": True,
            "runtime_gate_required_before_forwarding": True,
            "runtime_gate_acknowledged": runtime_gate_ack,
            "stable_runtime_behavior_changed": False,
        },
        "bind": {
            "host": bind_host,
            "port": bind_port,
            "localhost_only": bind_localhost,
        },
        "target": {
            "host": target_host,
            "port": target_port,
            "upstream_url": upstream_url,
            "localhost_only": target_localhost,
        },
        "network_actions": {
            "listener_started": False,
            "outbound_forwarding_attempted": False,
            "dns_lookup_attempted": False,
            "external_services_called": False,
        },
        "api_key_persistence": {
            "api_key_material_provided": api_key_provided,
            "authorization_header_provided": authorization_header_provided,
            "requested": persist_api_key,
            "performed": False,
            "allowed_by_default": False,
        },
        "ledger_preview": {
            "schema_version": LOCAL_PROXY_SCHEMA_VERSION,
            "ledger_jsonl": ledger_jsonl,
            "ledger_write_performed": False,
            "proxy_label": proxy_label,
            "claim_boundary": "local_proxy_advisory_only_not_hosted_token_or_cost_savings",
        },
        "forwarding": {
            "external_forwarding_intent": external_forwarding_intent,
            "hidden_external_forwarding": False,
            "runtime_gate_acknowledged": runtime_gate_ack,
            "future_runtime_gate_required": True,
        },
        "redaction": {
            "secret_like_fields": sorted(flagged),
            "raw_api_key_output": False,
        },
        "review_plan": {
            "readiness_blockers": blockers,
            "next_steps": [
                "Keep any real proxy runtime behind a separate future runtime gate.",
                "Use localhost-only bind and target defaults for advisory review.",
                "Do not persist API keys or forward externally from this dry-run planner.",
            ],
        },
        "claim_boundary": (
            "Dry-run local proxy advisory preview only; no listener, forwarding, API-key persistence, ledger write, "
            "or hosted API token/cost savings claim is performed."
        ),
    }
1902
+
1903
+
1904
def command_plan_local_proxy(args: argparse.Namespace) -> int:
    """Run the local-proxy dry-run planner and report the result.

    Emits the plan payload as JSON when ``args.json`` is set; otherwise prints
    a human-readable summary of status, bind/target endpoints and blockers.
    Always returns exit code 0.
    """
    plan = local_proxy_plan_payload(args)
    if args.json:
        emit_json(plan)
        return 0

    bind = plan["bind"]
    target = plan["target"]
    pending = plan["review_plan"]["readiness_blockers"]

    print("ContextGuard local proxy plan (dry-run only)")
    print("No listener was started, no traffic was forwarded, no API key was persisted, and no ledger was written.")
    print(f"Status: {plan['status']}")
    print(f"Bind: {bind['host']}:{bind['port']} localhost_only={bind['localhost_only']}")
    print(f"Target: {target['host']}:{target['port']} localhost_only={target['localhost_only']}")
    if pending:
        print(f"Readiness blockers: {', '.join(pending)}")
    print(plan["claim_boundary"])
    return 0
1921
+
1922
+
1923
# Signal patterns used by the learned-compression dry-run scan. Each pattern's
# match count is reported per signal; any non-zero count is a readiness blocker.

# Line that opens or closes a Markdown code fence (``` or ~~~).
LEARNED_CODE_FENCE_RE = re.compile(r"(?m)^\s*(?:```|~~~)")

# Unified-diff headers, hunk markers, and any line starting with "+" or "-".
LEARNED_DIFF_RE = re.compile(r"(?m)^\s*(diff --git |@@\s+-|--- |\+\+\+ |[+-].*)")

# snake_case, camelCase, PascalCase, dotted.names and UPPER_CASE tokens.
LEARNED_IDENTIFIER_RE = re.compile(
    r"\b(?:"
    r"_*[A-Za-z]+_[A-Za-z0-9_]*"
    r"|_*[a-z]+[A-Z][A-Za-z0-9]*"
    r"|_*[A-Z][a-z]+[A-Z][A-Za-z0-9]*"
    r"|_*[A-Za-z_][A-Za-z0-9_]*(?:\.[A-Za-z_][A-Za-z0-9_]*)+"
    r"|_*[A-Z][A-Z0-9_]{2,}"
    r")\b"
)

# Absolute POSIX paths, Windows drive paths, ./ and ../ relative paths,
# multi-segment relative paths, and bare file names with a known extension.
LEARNED_PATH_RE = re.compile(
    r"(?x)(?:"
    r"(?<![\w.-])/(?:[A-Za-z0-9._@%+=:-]+/)*[A-Za-z0-9._@%+=:-]+"
    r"|"
    r"\b[A-Za-z]:\\(?:[^\\\s:\"'<>|]+\\)*[^\\\s:\"'<>|]+"
    r"|"
    r"(?<![\w.-])(?:\.{1,2}/)+[A-Za-z0-9._@%+=:-]+(?:/[A-Za-z0-9._@%+=:-]+)*\b"
    r"|"
    r"\b(?:\.{1,2}/)?(?:[A-Za-z0-9._@%+=:-]+/)+[A-Za-z0-9._@%+=:-]+\b"
    r"|"
    r"\b[A-Za-z0-9._-]+\.(?:py|js|ts|tsx|jsx|go|rs|java|kt|swift|json|ya?ml|toml|md|txt|log|sh|bash|zsh|sql|html|css)\b"
    r")"
)

# "sha256:<hex>" digests or any standalone run of 7-64 hex characters.
LEARNED_HASH_RE = re.compile(r"\b(?:sha256:[0-9a-fA-F]{32,64}|[0-9a-fA-F]{7,64})\b")

# Python-traceback style 'File "...", line N, in name' frames and
# 'at name (...:line[:col])' style frames.
LEARNED_STACK_FRAME_RE = re.compile(
    r"(?m)^\s*(?:File\s+\"[^\"]+\",\s+line\s+\d+,\s+in\s+\S+|at\s+\S+.*\([^)]*:\d+(?::\d+)?\))"
)

# A single- or double-quoted string immediately followed by a colon.
LEARNED_JSON_KEY_RE = re.compile(r"""(?x)"(?:[^"\\]|\\.)*"\s*:|'(?:[^'\\]|\\.)*'\s*:""")

# Triple-quoted and ordinary single-/double-quoted string literals.
LEARNED_QUOTED_STRING_RE = re.compile(
    r'''(?x)"""(?:.|\n)*?"""|''' + r"""'''(?:.|\n)*?'''|"(?:[^"\\]|\\.)*"|'(?:[^'\\]|\\.)*'"""
)

# Decimal / dotted version numbers (optionally v-prefixed) and hex literals.
LEARNED_NUMERIC_CONSTANT_RE = re.compile(
    r"(?<![\w.])(?:[vV]?\d+(?:\.\d+)*|[-+]?0x[0-9A-Fa-f]+)(?![\w.])"
)

# Phrases that read like instructions addressed to a model or role markers.
LEARNED_PROMPT_LIKE_RE = re.compile(
    r"(?imx)(?:"
    r"\b(?:ignore|disregard|forget)\s+(?:all\s+)?(?:the\s+)?(?:above|earlier|previous|prior)\s+instructions?\b"
    r"|^\s*(?:system|developer|user|assistant)\s*:"
    r"|\b(?:system|developer|user|assistant)\s+instructions?\b"
    r"|\b(?:system|developer)\s+message\b"
    r"|\byou\s+are\s+(?:now\s+)?(?:chatgpt|a\s+\w+|\w+)\b"
    r"|\bact\s+as\b"
    r"|\bjailbreak\b"
    r"|\bdo\s+not\s+follow\b"
    r"|\boverride\s+instructions\b"
    r")"
)

# http(s):// prefixes or dotted host names ending in an alphabetic TLD.
LEARNED_URL_RE = re.compile(
    r"(?i)\b(?:https?://|(?:[A-Za-z0-9](?:[A-Za-z0-9-]{0,61}[A-Za-z0-9])?\.)+[A-Za-z]{2,24})(?:/|\b)"
)

# Lines that look like source code, shell commands, or markup tags.
LEARNED_CODE_LIKE_RE = re.compile(
    r"(?mx)^\s*(?:"
    r"(?:from\s+\S+\s+import\s+\S+|import\s+\S+|def\s+[A-Za-z_]\w*\s*\(|class\s+[A-Za-z_]\w*\s*(?:\(|:)|"
    r"function\s+[A-Za-z_$][\w$]*\s*\(|(?:const|let|var)\s+[A-Za-z_$][\w$]*\s*=)"
    r"|(?:if|elif|else|for|while|try|except|finally|with)\b.*:"
    r"|(?:print|raise|return|yield|assert)\b(?:\s*\(|\s+\S+)"
    r"|[A-Za-z_][A-Za-z0-9_]*\s*(?:=|==|!=|<=|>=|\+=|-=|\*=|/=)\s*\S+"
    r"|.*[{};]\s*$"
    r"|(?:ls|cp|mv|rm|sudo|curl|wget|chmod|chown|git|npm|npx|pnpm|yarn|python3?|pip|node|bash|sh|zsh|cat|grep|sed|awk|make|cargo|pytest|tox|uv|ruff|mypy|pyright|docker|kubectl)(?:\s+(?:-\S+|\S+))*"
    r"|<[/!]?[A-Za-z][A-Za-z0-9-]*(?:\s+[^<>]*)?>"
    r")"
)

# Backtick-delimited inline code on a single line.
LEARNED_INLINE_CODE_RE = re.compile(r"`[^`\n]+`")

# C0/C1 control characters (excluding tab, LF, CR) and U+FFFD.
LEARNED_NON_TEXT_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f\ufffd]")

# Word-like tokens; used as the denominator for numeric density.
LEARNED_WORD_RE = re.compile(r"\b[\w.-]+\b")

# Receipt/artifact ids: 16-64 lowercase hex characters.
LEARNED_ARTIFACT_ID_RE = re.compile(r"^[a-f0-9]{16,64}$")
1990
+
1991
+
1992
def read_learned_input(args: argparse.Namespace) -> tuple[str, dict[str, Any]]:
    """Read bounded learned-compression input from ``args.input`` or stdin.

    Returns ``(text, metadata)``. The bytes are decoded as UTF-8 with
    replacement of undecodable sequences. ``metadata`` carries the source
    label, byte and line counts, SHA-256 (``None`` for empty input), the
    truncation flag and the byte cap.
    """
    limit = MAX_LEARNED_COMPRESSION_INPUT_BYTES
    label = args.source_label
    if args.input:
        path = Path(args.input)
        label = label or path.name
        loaded = read_bounded_regular_file(path, max_bytes=limit, label="learned-compression input")
        assert loaded is not None
        raw, truncated = loaded
    else:
        label = label or "stdin"
        # Read one byte past the cap so over-long input can be detected.
        chunk = sys.stdin.buffer.read(limit + 1)
        truncated = len(chunk) > limit
        raw = chunk[:limit]

    text = raw.decode("utf-8", errors="replace")
    digest = hashlib.sha256(raw).hexdigest() if raw else None
    return text, {
        "source_label": label,
        "bytes": len(raw),
        "lines": len(text.splitlines()),
        "sha256": digest,
        "truncated": truncated,
        "max_bytes": limit,
    }
2015
+
2016
+
2017
def learned_content_type(text: str, counts: dict[str, int]) -> str:
    """Classify ``text`` from its signal ``counts``.

    Returns the first matching label in this order: ``"empty"`` (blank text),
    ``"non_text"``, ``"json"``, ``"diff"``, ``"code"`` (any code fence, any
    code-like line, or at least three identifiers), else ``"prose"``.
    """
    if not text.strip():
        return "empty"

    # Checked in priority order; the first non-zero signal decides.
    ordered_signals = (
        ("non_text_input", "non_text"),
        ("protected_json_key", "json"),
        ("protected_diff", "diff"),
    )
    for signal, label in ordered_signals:
        if counts[signal]:
            return label

    looks_like_code = (
        counts["protected_code_fence"]
        or counts["protected_code_like"]
        or counts["protected_identifier"] >= 3
    )
    return "code" if looks_like_code else "prose"
2030
+
2031
+
2032
def learned_signal_counts(text: str) -> dict[str, int]:
    """Count matches of each ``LEARNED_*_RE`` signal pattern in ``text``.

    ``protected_code_like`` sums code-like lines and inline-code spans.
    ``numeric_density_high`` is 1 when there are at least three numeric
    constants and they make up at least 20% of word tokens, else 0.
    """

    def hits(pattern: Any) -> int:
        return len(pattern.findall(text))

    word_total = hits(LEARNED_WORD_RE)
    numeric_total = hits(LEARNED_NUMERIC_CONSTANT_RE)
    code_like_total = hits(LEARNED_CODE_LIKE_RE) + hits(LEARNED_INLINE_CODE_RE)
    if word_total and numeric_total >= 3 and numeric_total / word_total >= 0.20:
        dense_numbers = 1
    else:
        dense_numbers = 0

    return {
        "protected_code_fence": hits(LEARNED_CODE_FENCE_RE),
        "protected_diff": hits(LEARNED_DIFF_RE),
        "protected_identifier": hits(LEARNED_IDENTIFIER_RE),
        "protected_path": hits(LEARNED_PATH_RE),
        "protected_hash": hits(LEARNED_HASH_RE),
        "protected_stack_frame": hits(LEARNED_STACK_FRAME_RE),
        "protected_json_key": hits(LEARNED_JSON_KEY_RE),
        "protected_numeric_constant": numeric_total,
        "protected_quoted_string": hits(LEARNED_QUOTED_STRING_RE),
        "prompt_like_instruction": hits(LEARNED_PROMPT_LIKE_RE),
        "url_or_endpoint": hits(LEARNED_URL_RE),
        "protected_code_like": code_like_total,
        "non_text_input": hits(LEARNED_NON_TEXT_RE),
        "numeric_density_high": dense_numbers,
    }
2053
+
2054
+
2055
def valid_learned_reexpand_command(receipt_id: str | None, command: str | None) -> tuple[bool, str | None]:
    """Validate an exact-fallback receipt id and its re-expand command.

    Returns ``(True, None)`` only when ``receipt_id`` matches
    ``LEARNED_ARTIFACT_ID_RE`` and ``command`` tokenizes to exactly one of the
    two accepted ``... get <receipt_id> --full`` forms. Otherwise returns
    ``(False, blocker)`` where blocker is ``"missing_exact_fallback"`` (either
    argument missing) or ``"invalid_reexpand_command"``. The command is only
    parsed, never executed.
    """
    if not (receipt_id and command):
        return False, "missing_exact_fallback"

    rejected = (False, "invalid_reexpand_command")
    if LEARNED_ARTIFACT_ID_RE.fullmatch(receipt_id) is None:
        return rejected

    # Shell metacharacters and newlines are refused before tokenizing.
    forbidden = (";", "|", "&", ">", "<", "`", "$", "\n", "\r")
    if any(token in command for token in forbidden):
        return rejected

    try:
        argv = shlex.split(command)
    except ValueError:
        return rejected
    if len(argv) < 4:
        return rejected

    accepted_forms = (
        ["context-guard-artifact", "get", receipt_id, "--full"],
        ["context-guard", "artifact", "get", receipt_id, "--full"],
    )
    if argv in accepted_forms:
        return True, None
    return rejected
2073
+
2074
+
2075
def learned_compression_plan_payload(args: argparse.Namespace) -> dict[str, Any]:
    """Build the dry-run learned-compression policy-check payload.

    Reads the bounded input, validates the exact-fallback receipt and
    re-expand command, scans the text for protected signals and collects
    readiness blockers. The payload never carries replacement text
    (``candidate_replacement`` is always ``None``) and marks sanitization,
    trust and fallback as caller-asserted, not verified.
    """
    text, input_meta = read_learned_input(args)
    receipt_id = args.exact_fallback_receipt.strip() if args.exact_fallback_receipt else None
    reexpand_command = args.reexpand_command.strip() if args.reexpand_command else None
    reexpand_valid, fallback_blocker = valid_learned_reexpand_command(receipt_id, reexpand_command)
    counts = learned_signal_counts(text)
    content_type = learned_content_type(text, counts)

    has_text = bool(text.strip())
    # Every signal with a non-zero count is both reported and a blocker.
    triggered_signals = [name for name, count in counts.items() if count]

    gate_checks = (
        (not has_text, "missing_input"),
        (input_meta["truncated"], "input_truncated"),
        (not args.sanitized, "missing_sanitized_assertion"),
        (not args.trusted_source, "untrusted_input"),
        (bool(fallback_blocker), fallback_blocker),
        (has_text and content_type != "prose", "non_prose_input"),
    )
    gate_blockers = [name for failed, name in gate_checks if failed]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    blockers = list(dict.fromkeys(gate_blockers + triggered_signals))
    ready = not blockers

    return {
        "tool": TOOL_NAME,
        "schema_version": CONFIG_SCHEMA_VERSION,
        "experiment_id": "learned-compression",
        "mode": "dry_run",
        "status": "ready_for_human_review" if ready else "blocked_until_safe_input",
        "input": input_meta,
        "policy": {
            "deny_by_default": True,
            "runtime_compression_allowed": False,
            "eligible_for_human_review": ready,
            "human_review_required": True,
            "stable_runtime_behavior_changed": False,
        },
        "sanitization": {
            "required": True,
            "caller_asserted": bool(args.sanitized),
            "verified": False,
        },
        "trust": {
            "required": True,
            "caller_asserted": bool(args.trusted_source),
            "verified": False,
        },
        "exact_fallback": {
            "required": True,
            "available": bool(receipt_id and reexpand_command and reexpand_valid),
            "receipt_id": receipt_id,
            "cli": reexpand_command,
            "verified": False,
        },
        "protected_signal_scan": {
            "content_type": content_type,
            "counts": counts,
        },
        "review_plan": {
            "readiness_blockers": blockers,
            "protected_signals": triggered_signals,
            "next_steps": [
                "Keep exact fallback receipt and re-expand command available before considering any future summary.",
                "Reject learned compression for protected, prompt-like, untrusted, or non-prose input.",
                "Do not claim hosted token/cost savings from this dry-run policy check.",
            ],
        },
        "claim_boundary": (
            "Dry-run learned-compression policy check only; no hosted token/cost savings claim without "
            "provider-measured matched successful tasks."
        ),
        "candidate_replacement": None,
    }
2151
+
2152
+
2153
def command_plan_learned_compression(args: argparse.Namespace) -> int:
    """Run the learned-compression dry-run gate and report the result.

    Emits the payload as JSON when ``args.json`` is set; otherwise prints a
    human-readable summary of status, input identity and blockers. Always
    returns exit code 0.
    """
    plan = learned_compression_plan_payload(args)
    if args.json:
        emit_json(plan)
        return 0

    source = plan["input"]
    pending = plan["review_plan"]["readiness_blockers"]

    print("ContextGuard learned/synthetic compression gate (dry-run only)")
    print("No learned compressor/model/provider was called and no replacement text was emitted.")
    print(f"Status: {plan['status']}")
    print(f"Input: {source['source_label']} lines={source['lines']} sha256={source['sha256']}")
    if pending:
        print(f"Readiness blockers: {', '.join(pending)}")
    print(plan["claim_boundary"])
    return 0
2166
+
2167
+
2168
def add_common_args(parser: argparse.ArgumentParser) -> None:
    """Register the shared ``--root``, ``--config`` and ``--json`` options on ``parser``."""
    option_specs = (
        ("--root", {"help": "Project root for default project-local experiment config (default: cwd)."}),
        (
            "--config",
            {
                "help": "Project-local config path. Relative paths resolve under --root; absolute paths must stay inside --root."
            },
        ),
        ("--json", {"action": "store_true", "help": "Emit JSON output."}),
    )
    for option, settings in option_specs:
        parser.add_argument(option, **settings)
2172
+
2173
+
2174
def load_args_context(args: argparse.Namespace) -> tuple[Path, Path, dict[str, Any]]:
    """Resolve the root and config path from ``args`` and load the config.

    Returns ``(root, config_path, config)`` using ``resolve_root``,
    ``resolve_config_path`` and ``load_config``.
    """
    project_root = resolve_root(args.root)
    config_file = resolve_config_path(project_root, args.config)
    config = load_config(config_file)
    return project_root, config_file, config
2178
+
2179
+
2180
+ def build_parser() -> argparse.ArgumentParser:
2181
+ parser = argparse.ArgumentParser(
2182
+ prog=TOOL_NAME,
2183
+ description="Inspect and manage default-off ContextGuard experimental feature opt-ins.",
2184
+ )
2185
+ sub = parser.add_subparsers(dest="command", required=True)
2186
+
2187
+ list_parser = sub.add_parser("list", help="List known experiments and metadata.")
2188
+ add_common_args(list_parser)
2189
+ list_parser.set_defaults(func=command_list)
2190
+
2191
+ status_parser = sub.add_parser("status", help="Show project-local experiment enablement status.")
2192
+ add_common_args(status_parser)
2193
+ status_parser.set_defaults(func=command_status)
2194
+
2195
+ enable_parser = sub.add_parser("enable", help="Enable one experiment in project-local config.")
2196
+ enable_parser.add_argument("experiment_id")
2197
+ add_common_args(enable_parser)
2198
+ enable_parser.set_defaults(func=command_enable)
2199
+
2200
+ disable_parser = sub.add_parser("disable", help="Disable one experiment in project-local config.")
2201
+ disable_parser.add_argument("experiment_id")
2202
+ add_common_args(disable_parser)
2203
+ disable_parser.set_defaults(func=command_disable)
2204
+
2205
+ plan_parser = sub.add_parser("plan", help="Run read-only dry-run planners for experimental lanes.")
2206
+ plan_sub = plan_parser.add_subparsers(dest="plan_command", required=True)
2207
+
2208
+ context_diff = plan_sub.add_parser(
2209
+ "context-diff-compaction",
2210
+ help="Dry-run a reviewable context-diff compaction plan without emitting a replacement.",
2211
+ )
2212
+ context_diff.add_argument("--input", help="Read diff text from a file instead of stdin.")
2213
+ context_diff.add_argument("--source-label", help="Safe label to use for the input source in reports.")
2214
+ context_diff.add_argument("--receipt-id", help="User-supplied exact receipt/artifact id for human review readiness.")
2215
+ context_diff.add_argument("--reexpand-command", help="User-supplied exact re-expand command for human review readiness.")
2216
+ context_diff.add_argument("--json", action="store_true", help="Emit JSON output.")
2217
+ context_diff.set_defaults(func=command_plan_context_diff_compaction)
2218
+
2219
+ visual_ocr = plan_sub.add_parser(
2220
+ "visual-crop-ocr",
2221
+ help="Dry-run visual crop/OCR evidence metadata without calling OCR or image services.",
2222
+ )
2223
+ visual_ocr.add_argument("--full-evidence-receipt", help="User-supplied receipt/id for the original full visual evidence.")
2224
+ visual_ocr.add_argument("--full-evidence-label", help="Safe label for the full visual evidence.")
2225
+ visual_ocr.add_argument("--crop-label", help="Safe label for the cropped region or crop fixture.")
2226
+ visual_ocr.add_argument("--crop-bounds", help="Crop bounds as x,y,width,height integers.")
2227
+ visual_ocr.add_argument("--image-size", help="Original image size as width,height integers.")
2228
+ visual_ocr.add_argument("--ocr-text", help="Bounded OCR fixture text supplied inline.")
2229
+ visual_ocr.add_argument("--ocr-text-file", help="Read bounded OCR fixture text from a UTF-8 text file.")
2230
+ visual_ocr.add_argument("--ocr-source-label", help="Safe label for OCR text source; defaults to inline or file basename.")
2231
+ visual_ocr.add_argument("--ocr-confidence", help="OCR confidence as a finite decimal from 0.0 to 1.0.")
2232
+ visual_ocr.add_argument("--ocr-error-note", action="append", help="Known OCR error/uncertainty note. Repeatable.")
2233
+ visual_ocr.add_argument("--missed-context-note", action="append", help="Potential context outside crop/OCR text. Repeatable.")
2234
+ visual_ocr.add_argument("--json", action="store_true", help="Emit JSON output.")
2235
+ visual_ocr.set_defaults(func=command_plan_visual_crop_ocr)
2236
+
2237
+ self_hosted = plan_sub.add_parser(
2238
+ "self-hosted-metrics-ledger",
2239
+ help="Dry-run self-hosted/local metrics ledger sidecar evidence without writing a ledger.",
2240
+ )
2241
+ self_hosted.add_argument("--input", help="Read an explicit self_hosted_metrics JSON envelope from a file instead of stdin.")
2242
+ self_hosted.add_argument("--source-label", help="Safe label to use for the input source in reports.")
2243
+ self_hosted.add_argument("--latency-ms", type=float, default=None, help="Local/model-server latency in milliseconds.")
2244
+ self_hosted.add_argument("--peak-memory-mb", type=float, default=None, help="Peak local/model-server memory in MiB/MB.")
2245
+ self_hosted.add_argument("--quality-score", type=float, default=None, help="Quality score from 0.0 to 1.0.")
2246
+ self_hosted.add_argument("--energy-wh", type=float, default=None, help="Diagnostic local energy use in watt-hours.")
2247
+ self_hosted.add_argument("--local-cost-usd", type=float, default=None, help="Diagnostic local/self-hosted cost in USD.")
2248
+ self_hosted.add_argument("--tokens-per-second", type=float, default=None, help="Diagnostic local throughput.")
2249
+ self_hosted.add_argument("--model-server", help="Sanitized label for local model server/runtime.")
2250
+ self_hosted.add_argument("--optimization", help="Sanitized label for the local optimization under test.")
2251
+ self_hosted.add_argument("--quality-metric", help="Sanitized label for quality metric.")
2252
+ self_hosted.add_argument("--hardware", help="Sanitized local hardware label.")
2253
+ self_hosted.add_argument("--runtime", help="Sanitized local runtime label.")
2254
+ self_hosted.add_argument("--dataset", help="Sanitized dataset label.")
2255
+ self_hosted.add_argument("--json", action="store_true", help="Emit JSON output.")
2256
+ self_hosted.set_defaults(func=command_plan_self_hosted_metrics_ledger)
2257
+
2258
+ local_proxy = plan_sub.add_parser(
2259
+ "local-proxy",
2260
+ help="Dry-run a localhost-only local proxy advisory plan without starting a proxy.",
2261
+ )
2262
+ local_proxy.add_argument("--input", help="Read a local_proxy JSON envelope from a file instead of CLI flags.")
2263
+ local_proxy.add_argument("--bind-host", help="Advisory bind host; must be localhost/loopback.")
2264
+ local_proxy.add_argument("--bind-port", default=None, help="Advisory bind port; 0 means unspecified/ephemeral.")
2265
+ local_proxy.add_argument("--target-host", help="Advisory target host; must be localhost/loopback.")
2266
+ local_proxy.add_argument("--target-port", default=None, help="Advisory target port; 0 means unspecified.")
2267
+ local_proxy.add_argument("--upstream-url", help="Advisory upstream URL; host must be localhost/loopback.")
2268
+ local_proxy.add_argument("--ledger-jsonl", help="Advisory ledger path preview; dry-run only, not written.")
2269
+ local_proxy.add_argument("--proxy-label", help="Safe label for this local proxy plan.")
2270
+ local_proxy.add_argument("--api-key", help="Blocked/redacted API key material; never persisted or emitted raw.")
2271
+ local_proxy.add_argument("--authorization-header", help="Blocked/redacted Authorization header; never persisted or emitted raw.")
2272
+ local_proxy.add_argument("--persist-api-key", action="store_true", help="Declare API-key persistence intent; blocked by default.")
2273
+ local_proxy.add_argument(
2274
+ "--external-forwarding-intent",
2275
+ action="store_true",
2276
+ help="Declare future external forwarding intent; blocked in this dry-run planner.",
2277
+ )
2278
+ local_proxy.add_argument(
2279
+ "--runtime-gate-ack",
2280
+ action="store_true",
2281
+ help="Acknowledge that any future forwarding needs a separate runtime gate.",
2282
+ )
2283
+ local_proxy.add_argument("--json", action="store_true", help="Emit JSON output.")
2284
+ local_proxy.set_defaults(func=command_plan_local_proxy)
2285
+
2286
+ learned = plan_sub.add_parser(
2287
+ "learned-compression",
2288
+ help="Dry-run a deny-by-default learned/synthetic compression safety gate.",
2289
+ )
2290
+ learned.add_argument("--input", help="Read candidate prose from a text file instead of stdin.")
2291
+ learned.add_argument("--source-label", help="Safe label to use for the input source in reports.")
2292
+ learned.add_argument("--sanitized", action="store_true", help="Assert input is already sanitized.")
2293
+ learned.add_argument("--trusted-source", action="store_true", help="Assert input came from a trusted source.")
2294
+ learned.add_argument("--exact-fallback-receipt", help="Local exact fallback receipt id for the original text.")
2295
+ learned.add_argument("--reexpand-command", help="Local exact re-expand command bound to the receipt id.")
2296
+ learned.add_argument("--json", action="store_true", help="Emit JSON output.")
2297
+ learned.set_defaults(func=command_plan_learned_compression)
2298
+
2299
+ return parser
2300
+
2301
+
2302
def normalize_negative_csv_option_values(argv: list[str] | None) -> list[str] | None:
    """Rewrite ``--crop-bounds VALUE`` pairs into the ``--crop-bounds=VALUE`` form.

    Python 3.11/3.12 argparse reads a value such as ``-1,0,20,10`` that
    follows an option as another option token instead of as that option's
    value, while Python 3.14 accepts the same input. Gluing the value onto
    the option with ``=`` keeps the small set of CSV-valued options that
    deliberately accept negative numbers (so validation can reject them)
    working the same way across versions.

    Args:
        argv: Command-line tokens to normalize. ``None`` means
            ``sys.argv[1:]``.

    Returns:
        A new list of tokens; the input sequence is not modified. The token
        that follows a CSV option is always joined to it, whatever it looks
        like (including another option). A CSV option that is the last
        token is passed through unchanged.
    """
    csv_flags = {"--crop-bounds"}
    remaining = iter(sys.argv[1:] if argv is None else argv)
    rewritten: list[str] = []
    for current in remaining:
        if current not in csv_flags:
            rewritten.append(current)
            continue
        # Pull the option's value from the same iterator so the loop does
        # not visit it a second time.
        try:
            value = next(remaining)
        except StopIteration:
            # The option was the final token: keep it bare and let argparse
            # report the missing value.
            rewritten.append(current)
        else:
            rewritten.append(f"{current}={value}")
    return rewritten
2327
+
2328
+
2329
def main(argv: list[str] | None = None) -> int:
    """Parse command-line arguments and run the selected subcommand.

    Args:
        argv: Command-line tokens; ``None`` lets the normalizer fall back to
            ``sys.argv[1:]``.

    Returns:
        The subcommand handler's result coerced to ``int``.

    A ``RegistryError`` raised while running the handler is not propagated;
    its message is handed to ``fail`` instead.
    """
    cli_parser = build_parser()
    # Normalize negative CSV values before argparse sees them.
    namespace = cli_parser.parse_args(normalize_negative_csv_option_values(argv))
    handler = namespace.func
    try:
        exit_code = int(handler(namespace))
    except RegistryError as error:
        fail(str(error))
    else:
        return exit_code
2336
+
2337
+
2338
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())