@ictechgy/context-guard 0.4.9 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +28 -0
  2. package/README.ko.md +59 -31
  3. package/README.md +85 -36
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  8. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  9. package/docs/benchmark-workflow-examples.md +3 -0
  10. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  11. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  12. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  13. package/docs/distribution.md +10 -7
  14. package/docs/experimental-benchmark-fixtures.md +30 -6
  15. package/package.json +4 -6
  16. package/packaging/homebrew/context-guard.rb.template +1 -1
  17. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  18. package/plugins/context-guard/README.ko.md +20 -14
  19. package/plugins/context-guard/README.md +26 -17
  20. package/plugins/context-guard/bin/context-guard +147 -25
  21. package/plugins/context-guard/bin/context-guard-artifact +884 -79
  22. package/plugins/context-guard/bin/context-guard-audit +33 -2
  23. package/plugins/context-guard/bin/context-guard-bench +1542 -31
  24. package/plugins/context-guard/bin/context-guard-cache-score +665 -0
  25. package/plugins/context-guard/bin/context-guard-compress +146 -1
  26. package/plugins/context-guard/bin/context-guard-cost +790 -6
  27. package/plugins/context-guard/bin/context-guard-experiments +463 -26
  28. package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
  29. package/plugins/context-guard/bin/context-guard-filter +163 -7
  30. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  31. package/plugins/context-guard/bin/context-guard-pack +892 -49
  32. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  33. package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
  34. package/plugins/context-guard/bin/context-guard-setup +165 -31
  35. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  36. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  37. package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
  38. package/plugins/context-guard/bin/context-guard-trim-output +288 -41
  39. package/plugins/context-guard/brief/README.md +5 -5
  40. package/plugins/context-guard/lib/context_guard_commands.py +230 -0
  41. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  42. package/context-guard-kit/README.md +0 -91
  43. package/context-guard-kit/benchmark_runner.py +0 -2401
  44. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  45. package/context-guard-kit/context_compress.py +0 -695
  46. package/context-guard-kit/context_escrow.py +0 -935
  47. package/context-guard-kit/context_filter.py +0 -637
  48. package/context-guard-kit/context_guard_cli.py +0 -325
  49. package/context-guard-kit/context_guard_diet.py +0 -1711
  50. package/context-guard-kit/context_pack.py +0 -2713
  51. package/context-guard-kit/cost_guard.py +0 -2349
  52. package/context-guard-kit/experimental_registry.py +0 -4348
  53. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  54. package/context-guard-kit/guard_large_read.py +0 -690
  55. package/context-guard-kit/hook_secret_patterns.py +0 -43
  56. package/context-guard-kit/read_symbol.py +0 -483
  57. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  58. package/context-guard-kit/sanitize_output.py +0 -725
  59. package/context-guard-kit/settings.example.json +0 -67
  60. package/context-guard-kit/setup_wizard.py +0 -2515
  61. package/context-guard-kit/statusline.sh +0 -362
  62. package/context-guard-kit/statusline_merged.sh +0 -157
  63. package/context-guard-kit/tool_schema_pruner.py +0 -837
  64. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -178,19 +178,137 @@ EXTERNAL_SOURCE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...], tuple[str, ...]],
178
178
  )
179
179
  MAX_USAGE_TOKEN_COUNT = 10**12
180
180
  MAX_USAGE_COST_USD = 10**9
181
+ MAX_EVIDENCE_JSONL_BYTES = 5_000_000
182
+ MAX_EVIDENCE_JSONL_LINES = 100_000
181
183
  # Byte -> token proxy 환산 계수. 측정된 모델 토큰이 아니라 byte delta 기반 보수적
182
184
  # 추정치이며, report에서 evidence="inferred"로 분명히 라벨링한다. 영어 텍스트 기준
183
185
  # ~4 bytes/token의 통용 근사값을 사용한다.
184
186
  TOKEN_PROXY_BYTES_PER_TOKEN = 4
185
187
  BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
186
188
  MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
189
+ MEASUREMENT_BASELINE_SCHEMA_VERSION = "contextguard.bench.measurement-baseline.v1"
190
+ DEFAULT_MATRIX_SCHEMA_VERSION = "contextguard.bench.default-matrix.v1"
191
+ PUBLIC_CLAIM_READINESS_SCHEMA_VERSION = "contextguard.bench.public-claim-readiness.v1"
187
192
  SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
188
193
  SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
189
194
  SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
195
+ EVIDENCE_REPLAY_SOURCE_TYPES = frozenset({"synthetic_fixture", "provider_export", "manual_audit"})
196
+ PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES = frozenset({
197
+ "provider_measured_matched_task",
198
+ "provider_measured_matched_task_public_claim",
199
+ "hosted_api_provider_measured_matched_task",
200
+ })
201
+ REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS = "provider_export_public_claim_candidate"
202
+ REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS = "provider_export_claim_gates_not_met"
203
+ REPLAY_NOT_PUBLIC_CLAIM_STATUS = "replay_only_not_public_claim"
204
+ REPLAY_UNKNOWN_MIXED_CSV_STATUS = "unknown_mixed_csv"
205
+ REPLAY_PUBLIC_CLAIM_ELIGIBLE_RAW_STATUSES = frozenset({
206
+ "token_and_shifted_cost_savings_observed",
207
+ })
208
+ REPLAY_CLAIM_BOUNDARY = (
209
+ "Evidence replay is an import/replay mode. Synthetic fixtures and manual audits are never "
210
+ "hosted API token/cost savings evidence; public claims require complete provider_export "
211
+ "provenance for every report row plus the normal matched-task quality, token, cost, and "
212
+ "shifted-cost gates."
213
+ )
214
+ DEFAULT_MATRIX_CLASSIFICATIONS = ("default-on", "advisory", "experimental", "reject/rework")
215
+ DEFAULT_MATRIX_CLASSIFICATION_STRENGTH = {
216
+ "experimental": 0,
217
+ "advisory": 1,
218
+ "default-on": 2,
219
+ }
220
+ DEFAULT_MATRIX_LANES: tuple[dict[str, Any], ...] = (
221
+ {
222
+ "id": "trimming",
223
+ "label": "Trimming / digest output",
224
+ "policy_ceiling": "default-on",
225
+ "task_keywords": ("long_log_analysis", "output_transform", "trim", "trimming", "sanitize_output", "digest"),
226
+ "variant_keywords": ("trim", "trimming", "sanitize", "digest", "brief"),
227
+ },
228
+ {
229
+ "id": "artifact_escrow",
230
+ "label": "Artifact escrow / receipt handles",
231
+ "policy_ceiling": "default-on",
232
+ "task_keywords": ("artifact_receipt", "artifact", "receipt", "escrow", "output_sandbox", "response_sandbox"),
233
+ "variant_keywords": ("artifact", "receipt", "escrow", "output_sandbox", "response_sandbox"),
234
+ },
235
+ {
236
+ "id": "tool_pruning",
237
+ "label": "Tool/MCP schema pruning",
238
+ "policy_ceiling": "default-on",
239
+ "task_keywords": ("tool_schema", "tool_prune", "tool_pruning", "mcp_schema", "defer_report"),
240
+ "variant_keywords": ("tool_prune", "tool_pruning", "tool_schema", "mcp", "defer"),
241
+ },
242
+ {
243
+ "id": "cache_advice",
244
+ "label": "Cache layout advice",
245
+ "policy_ceiling": "advisory",
246
+ "task_keywords": ("cache_layout", "cache_advice", "cache_score", "provider_cache"),
247
+ "variant_keywords": ("cache_layout", "cache_advice", "cache_score", "provider_cache", "cache"),
248
+ },
249
+ {
250
+ "id": "adaptive_k",
251
+ "label": "Adaptive-k context packing",
252
+ "policy_ceiling": "advisory",
253
+ "task_keywords": ("adaptive_k", "adaptive", "top_k", "context_pack"),
254
+ "variant_keywords": ("adaptive_k", "adaptive", "top_k", "pack_adaptive"),
255
+ },
256
+ {
257
+ "id": "optional_compression",
258
+ "label": "Optional compression",
259
+ "policy_ceiling": "advisory",
260
+ "task_keywords": ("learned_compression", "compression", "compress", "context_diff"),
261
+ "variant_keywords": ("learned_compression", "compression", "compress", "context_diff"),
262
+ },
263
+ )
264
+ DEFAULT_MATRIX_LANE_IDS = tuple(str(item["id"]) for item in DEFAULT_MATRIX_LANES)
265
+ DEFAULT_MATRIX_LANE_BY_ID = {str(item["id"]): item for item in DEFAULT_MATRIX_LANES}
266
+ MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS = 20
267
+ DEFAULT_MATRIX_CLAIM_BOUNDARY = {
268
+ "id": "default_matrix_reporting_only_not_runtime_default_or_savings_claim",
269
+ "reporting_only": True,
270
+ "changes_runtime_defaults": False,
271
+ "hosted_api_token_savings_claim_allowed": False,
272
+ "hosted_api_cost_savings_claim_allowed": False,
273
+ "public_claims_must_use_report_claim_status_and_matched_pair_evidence": True,
274
+ "reason": (
275
+ "The default matrix classifies local benchmark lanes for review only; it does not "
276
+ "turn features on by default and does not authorize hosted API savings claims."
277
+ ),
278
+ }
279
+ PUBLIC_CLAIM_READINESS_GATE_IDS = (
280
+ "matched_successful_tasks",
281
+ "provider_measured_token_cost",
282
+ "quality_non_inferiority",
283
+ "shifted_cost_accounting",
284
+ "confidence_failure_notes",
285
+ "provider_export_provenance",
286
+ )
287
+ PUBLIC_CLAIM_READINESS_CLAIM_BOUNDARY = {
288
+ "id": "public_claim_readiness_authoritative_release_gate",
289
+ "reporting_only": True,
290
+ "claim_allowed_field": "public_claim_readiness.claim_allowed",
291
+ "unsupported_claims_forbidden": True,
292
+ "hosted_api_token_savings_claim_without_claim_allowed_forbidden": True,
293
+ "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": True,
294
+ "fixed_percent_savings_claim_without_matched_provider_report_forbidden": True,
295
+ "requires_matched_successful_tasks": True,
296
+ "requires_provider_measured_tokens_and_cost": True,
297
+ "requires_quality_non_inferiority": True,
298
+ "requires_shifted_cost_accounting": True,
299
+ "requires_confidence_and_failure_notes": True,
300
+ "requires_provider_export_provenance": True,
301
+ "reason": (
302
+ "Public hosted token/cost savings claims are forbidden unless every readiness gate passes "
303
+ "and public_claim_readiness.claim_allowed is true."
304
+ ),
305
+ }
190
306
  MAX_SELF_HOSTED_LABEL_CHARS = 120
191
307
  MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000
192
308
  MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
193
309
  MAX_VARIANT_PROMPT_FILE_BYTES = 128_000
310
+ MAX_FIXTURE_FILE_BYTES = 1_000_000
311
+ MAX_CLAUDE_PROMPT_ARG_BYTES = MAX_VARIANT_PROMPT_FILE_BYTES
194
312
  CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
195
313
  SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
196
314
  VERSION_OUTPUT_MAX_BYTES = 16_000
@@ -317,12 +435,18 @@ def _open_regular_no_symlink(
317
435
  os.close(parent_fd)
318
436
 
319
437
 
320
- def _read_text_no_follow(path: Path) -> str:
438
+ def _read_text_no_follow(path: Path, *, max_bytes: int = MAX_FIXTURE_FILE_BYTES) -> str:
321
439
  fd = _open_regular_no_symlink(path)
322
440
  try:
323
- with os.fdopen(fd, "r", encoding="utf-8") as handle:
441
+ with os.fdopen(fd, "rb") as handle:
324
442
  fd = -1
325
- return handle.read()
443
+ raw = handle.read(max_bytes + 1)
444
+ if len(raw) > max_bytes:
445
+ raise SystemExit(f"fixture file exceeds {max_bytes} bytes: {path}")
446
+ try:
447
+ return raw.decode("utf-8")
448
+ except UnicodeDecodeError as exc:
449
+ raise SystemExit(f"fixture file must be UTF-8 text: {path}: {exc.reason}") from None
326
450
  finally:
327
451
  if fd != -1:
328
452
  os.close(fd)
@@ -400,6 +524,38 @@ class RunResult:
400
524
  self_hosted_metrics: dict[str, Any] | None = None
401
525
 
402
526
 
527
@dataclass
class EvidenceReplayRow:
    """A replayed benchmark result paired with the provenance parsed from its JSONL row."""

    # Benchmark result reconstructed from the evidence row.
    result: RunResult
    # Declared evidence source type of the row.
    source_type: str
    # Optional provider name taken from the row's provenance.
    provider_name: str | None
    # Optional capture command or export identifier taken from the row's provenance.
    capture_command_or_export_id: str | None
    # Declared claim scope of the row.
    claim_scope: str
    # Whether the row carried complete provider-export provenance.
    provider_export_provenance_complete: bool
    # Whether the row is currently marked eligible for a public claim.
    public_claim_eligible: bool
    # Whether the row supplied its own notes.
    explicit_notes: bool
    # 1-based line number of the row in the evidence file.
    line_number: int

    @property
    def key(self) -> tuple[str, str]:
        """Return the ``(task_id, variant)`` pair of the wrapped result."""
        run = self.result
        return run.task_id, run.variant

    def provenance_payload(self) -> dict[str, Any]:
        """Return the provenance fields of this row as a plain dictionary."""
        payload: dict[str, Any] = dict(
            schema_version=BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
            mode="evidence_jsonl_replay",
            evidence_source_type=self.source_type,
            provider_name=self.provider_name,
            capture_command_or_export_id=self.capture_command_or_export_id,
            claim_scope=self.claim_scope,
            provider_export_provenance_complete=self.provider_export_provenance_complete,
            public_claim_eligible=self.public_claim_eligible,
            explicit_notes=self.explicit_notes,
            line_number=self.line_number,
            claim_boundary=REPLAY_CLAIM_BOUNDARY,
        )
        return payload
557
+
558
+
403
559
  @dataclass
404
560
  class BoundedProcessResult:
405
561
  returncode: int
@@ -470,6 +626,17 @@ def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[st
470
626
  return extra_args
471
627
 
472
628
 
629
def require_argv_safe_prompt(text: str, *, owner: str) -> str:
    """Return *text* unchanged when its UTF-8 size fits the prompt argv budget.

    Keeping prompt-bearing argv bounded avoids E2BIG failures when the prompt
    is passed on a command line.

    Raises:
        SystemExit: if the encoded prompt is larger than
            ``MAX_CLAUDE_PROMPT_ARG_BYTES``; *owner* prefixes the message.
    """
    encoded_length = len(text.encode("utf-8", errors="replace"))
    if encoded_length <= MAX_CLAUDE_PROMPT_ARG_BYTES:
        return text
    message = (
        f"{owner} prompt exceeds argv-safe limit "
        f"({encoded_length} bytes > {MAX_CLAUDE_PROMPT_ARG_BYTES}); use a smaller fixture prompt"
    )
    raise SystemExit(message)
638
+
639
+
473
640
  def validate_variant_prompt_file_path(raw_path: str, *, owner: str) -> Path:
474
641
  """Return a safe relative prompt-file path, or fail before any file read."""
475
642
  rel_path = Path(raw_path)
@@ -522,26 +689,28 @@ def read_variant_prompt_file(path: Path, *, owner: str, display_path: str | None
522
689
  f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes: {label}"
523
690
  )
524
691
  try:
525
- with os.fdopen(fd, "r", encoding="utf-8") as handle:
692
+ with os.fdopen(fd, "rb") as handle:
526
693
  fd = -1
527
- text = handle.read()
528
- except UnicodeDecodeError as exc:
529
- raise SystemExit(
530
- f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
531
- f"{label}: {exc.reason}"
532
- ) from None
694
+ raw = handle.read(MAX_VARIANT_PROMPT_FILE_BYTES + 1)
533
695
  except OSError as exc:
534
696
  detail = exc.strerror or exc.__class__.__name__
535
697
  raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
536
698
  finally:
537
699
  if fd != -1:
538
700
  os.close(fd)
539
- if len(text.encode("utf-8", errors="replace")) > MAX_VARIANT_PROMPT_FILE_BYTES:
701
+ if len(raw) > MAX_VARIANT_PROMPT_FILE_BYTES:
540
702
  raise SystemExit(
541
703
  f"{owner} variant_prompt_files prompt text exceeds "
542
704
  f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes after decoding: {label}"
543
705
  )
544
- return text
706
+ try:
707
+ text = raw.decode("utf-8")
708
+ except UnicodeDecodeError as exc:
709
+ raise SystemExit(
710
+ f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
711
+ f"{label}: {exc.reason}"
712
+ ) from None
713
+ return require_argv_safe_prompt(text, owner=f"{owner} variant_prompt_files")
545
714
 
546
715
 
547
716
  def load_variant_prompt_files_for_targets(
@@ -977,7 +1146,11 @@ def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> l
977
1146
  argv.extend(["--allowedTools", ",".join(task.allowed_tools)])
978
1147
  argv.extend(variant.extra_args)
979
1148
  argv.append("--")
980
- argv.append(task.variant_prompt_texts.get(variant.name, task.prompt))
1149
+ prompt = require_argv_safe_prompt(
1150
+ task.variant_prompt_texts.get(variant.name, task.prompt),
1151
+ owner=f"task {task.id} variant {variant.name}",
1152
+ )
1153
+ argv.append(prompt)
981
1154
  return argv
982
1155
 
983
1156
 
@@ -1361,7 +1534,13 @@ def write_text_no_follow(path: Path, text: str) -> None:
1361
1534
  os.close(fd)
1362
1535
 
1363
1536
 
1364
- def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
1537
+ def append_cost_shift_ledger(
1538
+ path: Path,
1539
+ claude_ver: str,
1540
+ result: RunResult,
1541
+ *,
1542
+ replay_provenance: dict[str, Any] | None = None,
1543
+ ) -> None:
1365
1544
  shifted_cost_known = cost_shift_measured(result)
1366
1545
  byte_metrics_observed = bool(result.bytes_before or result.bytes_after)
1367
1546
  payload = {
@@ -1412,6 +1591,10 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
1412
1591
  }
1413
1592
  if result.self_hosted_metrics is not None:
1414
1593
  payload["self_hosted_metrics"] = result.self_hosted_metrics
1594
+ if replay_provenance is not None:
1595
+ payload["replay_provenance"] = replay_provenance
1596
+ payload["evidence_source_type"] = replay_provenance.get("evidence_source_type")
1597
+ payload["public_claim_eligible"] = bool(replay_provenance.get("public_claim_eligible"))
1415
1598
  with csv_file_lock(path, create_parent=True):
1416
1599
  fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
1417
1600
  try:
@@ -1435,7 +1618,9 @@ def _read_existing_keys_unlocked(csv_path: Path) -> set[tuple[str, str]]:
1435
1618
  reader = csv.DictReader(f)
1436
1619
  fieldnames = list(reader.fieldnames) if reader.fieldnames is not None else None
1437
1620
  validate_csv_schema(csv_path, fieldnames)
1438
- for row in reader:
1621
+ for index, row in enumerate(reader, start=1):
1622
+ if index > MAX_CSV_ROWS:
1623
+ raise SystemExit(f"CSV row limit exceeded for {csv_path}: > {MAX_CSV_ROWS}")
1439
1624
  tid = row.get("task_id") or ""
1440
1625
  var = row.get("variant") or ""
1441
1626
  if tid and var:
@@ -1487,6 +1672,356 @@ def read_csv_rows(csv_path: Path) -> list[dict[str, str]]:
1487
1672
  os.close(fd)
1488
1673
 
1489
1674
 
1675
def file_has_content_no_follow(path: Path) -> bool:
    """Report whether *path* exists and has a non-zero size.

    The file is opened through ``_open_regular_no_symlink``; a missing file
    counts as "no content" instead of being an error.
    """
    try:
        descriptor = _open_regular_no_symlink(path)
    except FileNotFoundError:
        return False
    try:
        byte_count = os.fstat(descriptor).st_size
    finally:
        # Always release the descriptor, even if fstat fails.
        os.close(descriptor)
    return byte_count > 0
1684
+
1685
+
1686
def require_evidence_object(raw: Any, *, owner: str) -> dict[str, Any]:
    """Return *raw* if it is a dict; otherwise exit with a message prefixed by *owner*."""
    if isinstance(raw, dict):
        return raw
    raise SystemExit(f"{owner} evidence row must be a JSON object")
1690
+
1691
+
1692
def evidence_non_empty_string(raw: Any, *, field: str, owner: str, required: bool = True) -> str | None:
    """Validate a string field of an evidence row and return its sanitized text.

    Non-``None`` values must be ``str`` and are passed through
    ``sanitize_note_text``. A missing or empty-after-sanitizing value yields
    ``None`` when *required* is false.

    Raises:
        SystemExit: if the value is not a string, or if it is missing/empty
            while *required* is true.
    """
    if raw is not None and not isinstance(raw, str):
        raise SystemExit(f"{owner} {field} must be a string")
    cleaned = sanitize_note_text(raw) if raw is not None else None
    if cleaned:
        return cleaned
    if required:
        raise SystemExit(f"{owner} {field} must be a non-empty string")
    return None
1705
+
1706
+
1707
def evidence_bool(raw: Any, *, field: str, owner: str, default: bool = False) -> bool:
    """Return *raw* when it is a real ``bool``, or *default* when it is ``None``.

    Raises:
        SystemExit: for any other value (including ints such as ``0``/``1``).
    """
    if isinstance(raw, bool):
        return raw
    if raw is None:
        return default
    raise SystemExit(f"{owner} {field} must be a boolean")
1713
+
1714
+
1715
def evidence_nonnegative_int(
    raw: Any,
    *,
    field: str,
    owner: str,
    default: int = 0,
    maximum: int = MAX_USAGE_TOKEN_COUNT,
) -> int:
    """Parse an optional integer evidence field.

    ``None`` yields *default*. Any other value is normalized with
    ``normalize_usage_token`` and must not exceed *maximum*.

    Raises:
        SystemExit: if normalization rejects the value or it is above *maximum*.
    """
    if raw is None:
        return default
    parsed = normalize_usage_token(raw)
    if parsed is not None and parsed <= maximum:
        return parsed
    raise SystemExit(f"{owner} {field} must be a finite non-negative integer")
1729
+
1730
+
1731
def evidence_nonnegative_float(
    raw: Any,
    *,
    field: str,
    owner: str,
    default: float = 0.0,
    maximum: float = MAX_USAGE_COST_USD,
) -> float:
    """Parse an optional numeric evidence field as a bounded, finite float.

    ``None`` yields *default*. Booleans and non-numeric values are rejected,
    as are negative, non-finite, or above-*maximum* numbers.

    Raises:
        SystemExit: if the value is not an acceptable number.
    """
    if raw is None:
        return default
    message = f"{owner} {field} must be a finite non-negative number"
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(raw, bool) or not isinstance(raw, (int, float)):
        raise SystemExit(message)
    try:
        value = float(raw)
    except OverflowError:
        # A JSON integer too large for a float would otherwise escape as a
        # traceback instead of the regular validation error.
        raise SystemExit(message) from None
    if not math.isfinite(value) or value < 0 or value > maximum:
        raise SystemExit(message)
    return value
1747
+
1748
+
1749
def evidence_first(raw: dict[str, Any], *keys: str) -> Any:
    """Return the value of the first of *keys* present in *raw*, else ``None``.

    Presence is what matters: a key that maps to ``None`` still wins over
    later keys.
    """
    return next((raw[name] for name in keys if name in raw), None)
1754
+
1755
+
1756
def parse_evidence_provenance(raw: dict[str, Any], *, owner: str) -> dict[str, Any]:
    """Extract and validate the provenance fields of one evidence row.

    Each field is read from the optional ``provenance`` object when that
    object contains the key, and from the top level of the row otherwise.
    The same lookup rule is applied to all four fields so a row cannot be
    accepted or rejected depending on which field happens to be nested.

    Returns:
        A dict with ``source_type``, ``provider_name``,
        ``capture_command_or_export_id``, ``claim_scope`` and
        ``provider_public_claim_authority``. The last one is true only for a
        ``provider_export`` row that names a provider, names a capture/export
        id, and uses one of ``PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES``.

    Raises:
        SystemExit: if ``provenance`` is not an object, a required field is
            missing or empty, or the source type is not recognised.
    """
    provenance = raw.get("provenance")
    if provenance is not None and not isinstance(provenance, dict):
        raise SystemExit(f"{owner} provenance must be a JSON object")
    nested: dict[str, Any] = provenance if isinstance(provenance, dict) else {}

    def lookup(key: str) -> Any:
        # Nested value wins when the key is present there; otherwise fall
        # back to the top-level row.
        return nested[key] if key in nested else raw.get(key)

    source_type = evidence_non_empty_string(
        lookup("evidence_source_type"),
        field="evidence_source_type",
        owner=owner,
    )
    assert source_type is not None
    if source_type not in EVIDENCE_REPLAY_SOURCE_TYPES:
        raise SystemExit(
            f"{owner} evidence_source_type must be one of: {', '.join(sorted(EVIDENCE_REPLAY_SOURCE_TYPES))}"
        )
    provider_name = evidence_non_empty_string(
        lookup("provider_name"),
        field="provider_name",
        owner=owner,
        required=False,
    )
    capture_id = evidence_non_empty_string(
        lookup("capture_command_or_export_id"),
        field="capture_command_or_export_id",
        owner=owner,
        required=False,
    )
    claim_scope = evidence_non_empty_string(
        lookup("claim_scope"),
        field="claim_scope",
        owner=owner,
    )
    assert claim_scope is not None
    provider_authority = (
        source_type == "provider_export"
        and provider_name is not None
        and capture_id is not None
        and claim_scope in PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES
    )
    return {
        "source_type": source_type,
        "provider_name": provider_name,
        "capture_command_or_export_id": capture_id,
        "claim_scope": claim_scope,
        "provider_public_claim_authority": provider_authority,
    }
1806
+
1807
+
1808
def parse_evidence_tokens(raw: dict[str, Any], *, owner: str) -> tuple[dict[str, int], set[str]]:
    """Collect token counts for every bucket named in ``USAGE_KEY_GROUPS``.

    A bucket is read from the optional ``tokens`` object when present there,
    otherwise from the top level of the row.

    Returns:
        ``(counts, observed)`` where *counts* has an entry for every bucket
        and *observed* holds only the buckets whose value was not ``None``.

    Raises:
        SystemExit: if ``tokens`` is not an object or a count is invalid.
    """
    token_block = raw.get("tokens")
    if not (token_block is None or isinstance(token_block, dict)):
        raise SystemExit(f"{owner} tokens must be a JSON object")
    nested: dict[str, Any] = token_block or {}
    counts: dict[str, int] = {}
    seen: set[str] = set()
    for bucket, _aliases in USAGE_KEY_GROUPS:
        candidate = nested[bucket] if bucket in nested else raw.get(bucket)
        if candidate is not None:
            seen.add(bucket)
        counts[bucket] = evidence_nonnegative_int(candidate, field=bucket, owner=owner)
    return counts, seen
1821
+
1822
+
1823
def parse_evidence_row(raw_value: Any, *, owner: str, line_number: int) -> EvidenceReplayRow:
    """Validate one decoded evidence JSONL value and build its replay row.

    Args:
        raw_value: The decoded JSON value of a single line.
        owner: Prefix used in every validation error message.
        line_number: Line number stored on the returned row.

    Returns:
        An ``EvidenceReplayRow`` whose ``public_claim_eligible`` is ``False``.

    Raises:
        SystemExit: on any schema, provenance, or value validation failure.
    """
    raw = require_evidence_object(raw_value, owner=owner)
    schema = evidence_non_empty_string(raw.get("schema_version"), field="schema_version", owner=owner)
    if schema != BENCH_RUN_EVIDENCE_SCHEMA_VERSION:
        raise SystemExit(
            f"{owner} schema_version must be {BENCH_RUN_EVIDENCE_SCHEMA_VERSION}"
        )
    task_id = evidence_non_empty_string(raw.get("task_id"), field="task_id", owner=owner)
    variant = evidence_non_empty_string(raw.get("variant"), field="variant", owner=owner)
    assert task_id is not None and variant is not None
    provenance = parse_evidence_provenance(raw, owner=owner)
    provider_authority = bool(provenance["provider_public_claim_authority"])
    raw_primary_tokens_measured = evidence_bool(
        raw.get("primary_tokens_measured"),
        field="primary_tokens_measured",
        owner=owner,
    )
    raw_cost_measured = evidence_bool(
        evidence_first(raw, "cost_measured", "primary_cost_measured"),
        field="cost_measured",
        owner=owner,
    )
    if provenance["source_type"] in {"synthetic_fixture", "manual_audit"}:
        # Synthetic and manual rows never count as measured, whatever they claim.
        primary_tokens_measured = False
        cost_measured = False
    elif provider_authority:
        primary_tokens_measured = raw_primary_tokens_measured
        cost_measured = raw_cost_measured
    else:
        # provider_export row with incomplete provenance: measured flags are an error.
        if raw_primary_tokens_measured or raw_cost_measured:
            raise SystemExit(
                f"{owner} provider_export measured flags require provider_name, "
                "capture_command_or_export_id, and a provider-measured matched-task claim_scope"
            )
        primary_tokens_measured = False
        cost_measured = False

    tokens, observed_token_buckets = parse_evidence_tokens(raw, owner=owner)
    if primary_tokens_measured and not {"input_tokens", "output_tokens"}.issubset(observed_token_buckets):
        raise SystemExit(
            f"{owner} primary_tokens_measured=true requires input_tokens and output_tokens evidence"
        )
    raw_cost = evidence_first(raw, "cost_usd", "primary_cost_usd")
    cost_usd = evidence_nonnegative_float(
        raw_cost,
        field="cost_usd",
        owner=owner,
    )
    # A key that is present but null is not evidence: without this check a
    # "measured" cost of 0.0 would be recorded for ``"cost_usd": null``.
    if cost_measured and raw_cost is None:
        raise SystemExit(f"{owner} cost_measured=true requires cost_usd evidence")

    if "success" not in raw:
        raise SystemExit(f"{owner} success must be a boolean")
    success = evidence_bool(raw.get("success"), field="success", owner=owner)
    notes = evidence_non_empty_string(raw.get("notes"), field="notes", owner=owner, required=False)
    explicit_notes = notes is not None
    model = evidence_non_empty_string(raw.get("model"), field="model", owner=owner, required=False) or "evidence-replay"
    effort = evidence_non_empty_string(raw.get("effort"), field="effort", owner=owner, required=False) or ""
    self_hosted_metrics = None
    if SELF_HOSTED_METRICS_KEY in raw:
        self_hosted_metrics = normalize_self_hosted_metrics(
            raw.get(SELF_HOSTED_METRICS_KEY),
            source="evidence_jsonl.self_hosted_metrics",
        )
        if self_hosted_metrics is None:
            raise SystemExit(f"{owner} self_hosted_metrics must be normalized explicit metrics")

    result = RunResult(
        task_id=task_id,
        variant=variant,
        model=model,
        effort=effort,
        tokens=tokens,
        cost_usd=cost_usd,
        success=success,
        notes=notes or f"evidence replay ({provenance['source_type']})",
        corrections=evidence_nonnegative_int(raw.get("corrections"), field="corrections", owner=owner),
        cost_measured=cost_measured,
        wall_time_seconds=evidence_nonnegative_float(
            raw.get("wall_time_seconds"),
            field="wall_time_seconds",
            owner=owner,
            maximum=MAX_SELF_HOSTED_LATENCY_MS / 1000,
        ),
        turns=evidence_nonnegative_int(raw.get("turns"), field="turns", owner=owner),
        hook_triggers=evidence_nonnegative_int(raw.get("hook_triggers"), field="hook_triggers", owner=owner),
        bytes_before=evidence_nonnegative_int(raw.get("bytes_before"), field="bytes_before", owner=owner),
        bytes_after=evidence_nonnegative_int(raw.get("bytes_after"), field="bytes_after", owner=owner),
        artifacts_used=evidence_nonnegative_int(raw.get("artifacts_used"), field="artifacts_used", owner=owner),
        external_tokens=evidence_nonnegative_int(raw.get("external_tokens"), field="external_tokens", owner=owner),
        external_tokens_measured=evidence_bool(
            raw.get("external_tokens_measured"),
            field="external_tokens_measured",
            owner=owner,
        ),
        external_cost_usd=evidence_nonnegative_float(
            raw.get("external_cost_usd"),
            field="external_cost_usd",
            owner=owner,
        ),
        external_cost_measured=evidence_bool(
            raw.get("external_cost_measured"),
            field="external_cost_measured",
            owner=owner,
        ),
        provider_cached_tokens=evidence_nonnegative_int(
            raw.get("provider_cached_tokens"),
            field="provider_cached_tokens",
            owner=owner,
        ),
        provider_cached_tokens_measured=evidence_bool(
            raw.get("provider_cached_tokens_measured"),
            field="provider_cached_tokens_measured",
            owner=owner,
        ),
        primary_tokens_measured=primary_tokens_measured,
        self_hosted_metrics=self_hosted_metrics,
    )
    return EvidenceReplayRow(
        result=result,
        source_type=str(provenance["source_type"]),
        provider_name=provenance["provider_name"],
        capture_command_or_export_id=provenance["capture_command_or_export_id"],
        claim_scope=str(provenance["claim_scope"]),
        provider_export_provenance_complete=provider_authority,
        public_claim_eligible=False,
        explicit_notes=explicit_notes,
        line_number=line_number,
    )
1951
+
1952
+
1953
def read_evidence_jsonl(path: Path) -> list[EvidenceReplayRow]:
    """Read and validate every non-blank line of an evidence JSONL file.

    The file is opened through ``_open_regular_no_symlink`` and rejected when
    its size exceeds ``MAX_EVIDENCE_JSONL_BYTES`` or it has more than
    ``MAX_EVIDENCE_JSONL_LINES`` lines.

    Raises:
        SystemExit: if the file is too large, is not UTF-8 text, contains a
            line that cannot be decoded as JSON, contains an invalid row, or
            contains no rows at all.
    """
    fd = _open_regular_no_symlink(path)
    try:
        size = os.fstat(fd).st_size
        if size > MAX_EVIDENCE_JSONL_BYTES:
            raise SystemExit(
                f"evidence JSONL exceeds {MAX_EVIDENCE_JSONL_BYTES} bytes: {path}"
            )
        rows: list[EvidenceReplayRow] = []
        with os.fdopen(fd, "r", encoding="utf-8") as handle:
            # The file object now owns the descriptor; do not close it twice.
            fd = -1
            for line_number, line in enumerate(handle, start=1):
                if line_number > MAX_EVIDENCE_JSONL_LINES:
                    raise SystemExit(
                        f"evidence JSONL line limit exceeded for {path}: > {MAX_EVIDENCE_JSONL_LINES}"
                    )
                if not line.strip():
                    continue
                try:
                    payload = json.loads(line)
                except (ValueError, RecursionError) as exc:
                    # JSONDecodeError is a ValueError; oversized integer
                    # literals raise a plain ValueError and deeply nested
                    # input raises RecursionError. All are malformed rows.
                    detail = getattr(exc, "msg", None) or str(exc) or exc.__class__.__name__
                    raise SystemExit(
                        f"{path}:{line_number} evidence row must be JSON: {detail}"
                    ) from None
                rows.append(parse_evidence_row(payload, owner=f"{path}:{line_number}", line_number=line_number))
    except UnicodeDecodeError as exc:
        # Decoding happens lazily while iterating the text handle.
        raise SystemExit(f"evidence JSONL must be UTF-8 text: {path}: {exc.reason}") from None
    finally:
        if fd != -1:
            os.close(fd)
    if not rows:
        raise SystemExit(f"evidence JSONL contains no rows: {path}")
    return rows
1984
+
1985
+
1986
def validate_evidence_coverage(
    evidence_rows: list[EvidenceReplayRow],
    runnable_targets: list[tuple[TaskFixture, Variant]],
) -> dict[tuple[str, str], EvidenceReplayRow]:
    """Match evidence rows to the selected task/variant targets.

    Returns:
        A mapping from ``(task.id, variant.name)`` to its evidence row,
        restricted to *runnable_targets* and in their order.

    Raises:
        SystemExit: if two rows share a key, or a target has no row.
    """
    indexed: dict[tuple[str, str], EvidenceReplayRow] = {}
    for candidate in evidence_rows:
        earlier = indexed.get(candidate.key)
        if earlier is not None:
            raise SystemExit(
                f"duplicate evidence row for {candidate.key[0]}/{candidate.key[1]} "
                f"(lines {earlier.line_number} and {candidate.line_number})"
            )
        indexed[candidate.key] = candidate

    wanted = [(task.id, variant.name) for task, variant in runnable_targets]
    missing = [f"{task_id}/{variant_name}" for task_id, variant_name in wanted if (task_id, variant_name) not in indexed]
    if missing:
        raise SystemExit(f"missing evidence row(s) for selected targets: {', '.join(missing)}")
    return {pair: indexed[pair] for pair in wanted}
2009
+
2010
+
2011
def run_evidence_fixture(task: TaskFixture, variant: Variant, evidence: EvidenceReplayRow) -> RunResult:
    """Return the replayed result for *task*/*variant*, filling in defaults.

    The result object held by *evidence* is updated in place: the placeholder
    model ``"evidence-replay"`` is replaced by the task's model, and an empty
    effort is replaced by the task's effort (or ``""``).

    Raises:
        SystemExit: if the evidence row belongs to a different task/variant.
    """
    replayed = evidence.result
    if (replayed.task_id, replayed.variant) != (task.id, variant.name):
        raise SystemExit(
            f"evidence target mismatch: expected {task.id}/{variant.name}, "
            f"got {replayed.task_id}/{replayed.variant}"
        )
    if replayed.model == "evidence-replay":
        replayed.model = task.model
    if not replayed.effort:
        replayed.effort = task.effort or ""
    return replayed
2023
+
2024
+
1490
2025
  def row_int(row: dict[str, str], key: str) -> int:
1491
2026
  try:
1492
2027
  return int(float(row.get(key) or 0))
@@ -1546,6 +2081,77 @@ def row_cost_shift_measured(row: dict[str, str]) -> bool:
1546
2081
  )
1547
2082
 
1548
2083
 
2084
def measurement_baseline_contract() -> dict[str, Any]:
    """Return the descriptive measurement-baseline block for benchmark reports.

    The block is purely descriptive: it leaves the CSV schema untouched and
    does not, on its own, authorize token or cost savings claims. Those stay
    gated by matched successful tasks, measured primary tokens/costs,
    shifted-cost accounting, and quality gates.
    """
    matched_tasks_requirement = "matched successful baseline and variant tasks"
    quality_requirement = "quality_gate=pass"

    captured_fields = {
        "task_identity": ["task_id", "variant"],
        "run_configuration": ["model", "effort", "claude_version"],
        "primary_token_buckets": [
            "input_tokens",
            "output_tokens",
            "cache_read",
            "cache_creation",
            "total_tokens",
            "primary_tokens_measured",
        ],
        "primary_cost": ["cost_usd", "cost_measured"],
        "provider_cache_telemetry": ["provider_cached_tokens", "provider_cached_tokens_measured"],
        "latency": ["wall_time_seconds"],
        "quality_and_result": ["success", "corrections", "notes"],
        "tooling_and_proxy_metrics": ["turns", "hook_triggers", "bytes_before", "bytes_after", "artifacts_used"],
        "shifted_cost_accounting": [
            "external_tokens",
            "external_tokens_measured",
            "external_cost_usd",
            "external_cost_measured",
            "total_cost_with_shift_usd",
        ],
    }
    claim_eligible_fields = {
        "token_savings": [
            matched_tasks_requirement,
            "primary_tokens_measured=true on both sides",
            quality_requirement,
        ],
        "shifted_cost_savings": [
            matched_tasks_requirement,
            "cost_measured=true on both sides",
            "external_cost_measured=true when external_tokens are present",
            quality_requirement,
        ],
    }
    proxy_only_fields = {
        "byte_metrics": ["bytes_before", "bytes_after"],
        "token_proxy": "chars_div_4_proxy_only",
        "provider_cache": "diagnostic_telemetry_not_contextguard_token_reduction",
    }
    missing_identity_fields = [
        "repo_revision",
        "agent_harness",
        "feature_flags",
        "provider_name",
        "success_command_identity",
    ]
    claim_boundary = {
        "descriptive_contract_only": True,
        "enables_savings_claims_by_itself": False,
        "requires_matched_successful_tasks": True,
        "requires_shifted_cost_accounting_for_cost_claims": True,
        "raw_proxy_estimates_are_not_hosted_api_token_savings": True,
    }
    return {
        "schema_version": MEASUREMENT_BASELINE_SCHEMA_VERSION,
        "csv_schema_unchanged": True,
        "csv_columns": list(CSV_COLUMNS),
        "captured_fields": captured_fields,
        "claim_eligible_fields": claim_eligible_fields,
        "proxy_only_fields": proxy_only_fields,
        "missing_future_run_identity_fields": missing_identity_fields,
        "claim_boundary": claim_boundary,
    }
2153
+
2154
+
1549
2155
  def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str) -> dict[str, Any]:
1550
2156
  by_variant: dict[str, dict[str, Any]] = {}
1551
2157
  successful_rows_by_variant_task: dict[str, dict[str, list[dict[str, str]]]] = {}
@@ -2187,10 +2793,11 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
2187
2793
  claim_status = "token_savings_observed_cost_unmeasured"
2188
2794
  elif token_savings_observed:
2189
2795
  claim_status = "token_savings_observed_cost_shift_watch"
2190
- return {
2796
+ report = {
2191
2797
  "schema": "context-guard-bench-report-v1",
2192
2798
  "baseline_variant": baseline_variant,
2193
2799
  "row_count": len(rows),
2800
+ "measurement_baseline": measurement_baseline_contract(),
2194
2801
  "summary_by_variant": by_variant,
2195
2802
  "comparisons": comparisons,
2196
2803
  "matched_pair_evidence": matched_pair_evidence,
@@ -2200,22 +2807,854 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
2200
2807
  "shifted cost savings require measured primary cost and measured external cost when "
2201
2808
  "external tokens are present. Wall time and provider cached-token fields are diagnostic "
2202
2809
  "telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache "
2203
- "discounts must stay separate from token-reduction claims."
2810
+ "discounts must stay separate from token-reduction claims. Public hosted savings "
2811
+ "claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden."
2204
2812
  ),
2205
2813
  }
2814
+ report["public_claim_readiness"] = build_public_claim_readiness(report)
2815
+ report["default_matrix"] = build_default_matrix(report)
2816
+ return report
2817
+
2818
def annotate_replay_report(
    report: dict[str, Any],
    replay_rows: list[EvidenceReplayRow],
    *,
    mixed_csv: bool,
) -> dict[str, Any]:
    """Attach replay provenance and public-claim status to ``report`` in place.

    The report's original ``claim_status`` is preserved under
    ``raw_metric_claim_status``; when the report is not public-claim eligible
    the ``claim_status`` key is overwritten with the public claim status.
    ``public_claim_readiness`` and ``default_matrix`` are rebuilt afterwards so
    they observe the annotated report. The same ``report`` object is returned.
    """

    def pair_allows_both_claims(item: Any) -> bool:
        # A matched pair counts only when its claim boundary allows both the
        # token-savings claim and the shifted-cost claim.
        if not isinstance(item, dict):
            return False
        boundary = item.get("claim_boundary")
        if not isinstance(boundary, dict):
            return False
        return bool(boundary.get("token_savings_claim_allowed")) and bool(
            boundary.get("shifted_cost_claim_allowed")
        )

    raw_claim_status = str(report.get("claim_status") or "")
    reported_row_count = int(report.get("row_count") or 0)
    same_run_complete = (not mixed_csv) and len(replay_rows) == reported_row_count
    all_provider_claim_authority = bool(replay_rows) and all(
        row.provider_export_provenance_complete for row in replay_rows
    )

    matched_pair_evidence = report.get("matched_pair_evidence")
    matched_claim_gates_allow_public_claim = (
        isinstance(matched_pair_evidence, list)
        and bool(matched_pair_evidence)
        and all(pair_allows_both_claims(item) for item in matched_pair_evidence)
    )
    report_claim_gates_allow_public_claim = (
        raw_claim_status in REPLAY_PUBLIC_CLAIM_ELIGIBLE_RAW_STATUSES
        and matched_claim_gates_allow_public_claim
    )

    # Only the candidate branch yields an eligible report.
    public_claim_eligible = False
    if not same_run_complete:
        public_claim_status = REPLAY_UNKNOWN_MIXED_CSV_STATUS
    elif not all_provider_claim_authority:
        public_claim_status = REPLAY_NOT_PUBLIC_CLAIM_STATUS
    elif report_claim_gates_allow_public_claim:
        public_claim_status = REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
        public_claim_eligible = True
    else:
        public_claim_status = REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS

    report["raw_metric_claim_status"] = raw_claim_status
    report["public_claim_status"] = public_claim_status
    report["public_claim_eligible"] = public_claim_eligible
    if not public_claim_eligible:
        report["claim_status"] = public_claim_status

    source_types = sorted({row.source_type for row in replay_rows})
    provider_names = sorted({row.provider_name for row in replay_rows if row.provider_name})
    claim_scopes = sorted({row.claim_scope for row in replay_rows})
    target_keys = [f"{row.result.task_id}/{row.result.variant}" for row in replay_rows]
    report["replay_evidence"] = {
        "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
        "mode": "evidence_jsonl_replay",
        "row_count": len(replay_rows),
        "source_types": source_types,
        "provider_names": provider_names,
        "claim_scopes": claim_scopes,
        "same_run_complete": same_run_complete,
        "mixed_csv": mixed_csv,
        "provider_export_provenance_complete": all_provider_claim_authority,
        "report_claim_gates_allow_public_claim": report_claim_gates_allow_public_claim,
        "public_claim_status": public_claim_status,
        "public_claim_eligible": public_claim_eligible,
        "target_keys": target_keys,
        "claim_boundary": REPLAY_CLAIM_BOUNDARY,
    }

    readiness = build_public_claim_readiness(
        report,
        replay_rows=replay_rows,
        mixed_csv=mixed_csv,
    )
    report["public_claim_readiness"] = readiness
    report["default_matrix"] = build_default_matrix(report)
    return report
2888
+
2889
+
2890
def report_public_claim_status(report: dict[str, Any]) -> tuple[str, bool | None]:
    """Return the report's public claim status and eligibility flag.

    When the report has no ``public_claim_status`` key, a fixed
    provenance-unknown status is returned with eligibility ``None``.
    """
    if "public_claim_status" not in report:
        unknown_status = "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
        return unknown_status, None
    status_text = str(report.get("public_claim_status"))
    eligible = bool(report.get("public_claim_eligible"))
    return status_text, eligible
2897
+
2898
+
2899
+
2900
def public_claim_readiness_gate(
    gate_id: str,
    label: str,
    passed: bool,
    reason: str,
    evidence: dict[str, Any] | None = None,
    *,
    unknown: bool = False,
) -> dict[str, Any]:
    """Build one required readiness-gate record.

    ``status`` is ``"unknown"`` when ``unknown`` is set, otherwise ``"pass"``
    or ``"fail"`` according to ``passed``. An unknown gate never counts as
    passed. A missing or empty ``evidence`` becomes an empty dict.
    """
    if unknown:
        status = "unknown"
    elif passed:
        status = "pass"
    else:
        status = "fail"

    gate: dict[str, Any] = {"id": gate_id, "label": label, "required": True}
    gate["status"] = status
    gate["passed"] = passed and not unknown
    gate["reason"] = reason
    gate["evidence"] = evidence or {}
    return gate
2919
+
2920
+
2921
def public_claim_pair_side_measured(pair: dict[str, Any], side: str, metric: str) -> bool:
    """Report whether ``pair["measurements"][side][metric]["measured"]`` is truthy.

    Any missing or non-dict level along that path yields ``False``.
    """
    node: Any = pair
    for key in ("measurements", side, metric):
        node = node.get(key)
        if not isinstance(node, dict):
            return False
    return bool(node.get("measured"))
2926
+
2927
+
2928
def public_claim_numeric_values(items: list[Any]) -> list[float]:
    """Return the finite numeric entries of ``items`` as floats, in order.

    Booleans, non-numeric values, NaN and infinities are skipped. Integers too
    large to be represented as a float are skipped as well instead of raising
    ``OverflowError``; callers compare the result length against the input
    length, so a skipped value makes the corresponding gate fail closed.
    """
    values: list[float] = []
    for item in items:
        if isinstance(item, bool) or not isinstance(item, (int, float)):
            continue
        try:
            numeric = float(item)
        except OverflowError:
            # Arbitrary-precision int beyond the float range: not usable.
            continue
        if math.isfinite(numeric):
            values.append(numeric)
    return values
2937
+
2938
+
2939
def public_claim_readiness_evidence_text(evidence: dict[str, Any]) -> str:
    """Render gate evidence as a compact ``key=value; key=value`` string.

    List and dict values show at most their first five entries, followed by
    ``,…`` when more were present; every other value is rendered with ``str``.
    """

    def render(value: Any) -> str:
        if isinstance(value, list):
            shown = [str(item) for item in value[:5]]
        elif isinstance(value, dict):
            shown = [f"{k}={v}" for k, v in list(value.items())[:5]]
        else:
            return str(value)
        text = ",".join(shown)
        if len(value) > 5:
            text += ",…"
        return text

    return "; ".join(f"{key}={render(value)}" for key, value in evidence.items())
2954
+
2955
+
2956
def build_public_claim_readiness(
    report: dict[str, Any],
    *,
    replay_rows: list[EvidenceReplayRow] | None = None,
    mixed_csv: bool = False,
) -> dict[str, Any]:
    """Evaluate the required public-claim readiness gates for ``report``.

    Six gates are built in order: matched successful tasks, provider-measured
    token and primary cost, quality non-inferiority, shifted-cost accounting,
    confidence/failure notes, and provider-export provenance. The last two are
    marked ``unknown`` when no replay rows are supplied. ``claim_allowed`` is
    true only when every gate passed and the report's observed public claim
    status is the replay candidate status with a truthy eligibility flag.
    """

    def dict_entries(key: str) -> list[dict[str, Any]]:
        # Tolerate a missing or malformed list and drop non-dict members.
        raw = report.get(key)
        if not isinstance(raw, list):
            return []
        return [entry for entry in raw if isinstance(entry, dict)]

    comparisons = dict_entries("comparisons")
    pairs = dict_entries("matched_pair_evidence")
    comparison_total = len(comparisons)
    pair_total = len(pairs)

    def comparison_numbers(field: str) -> list[float]:
        return public_claim_numeric_values([entry.get(field) for entry in comparisons])

    def both_sides_measured(pair: dict[str, Any], metric: str) -> bool:
        return public_claim_pair_side_measured(pair, "baseline", metric) and public_claim_pair_side_measured(
            pair, "variant", metric
        )

    rows = replay_rows or []
    replay_count = len(rows)
    has_replay = bool(rows)
    row_count = int(report.get("row_count") or 0)
    replay_evidence = report.get("replay_evidence")
    if not isinstance(replay_evidence, dict):
        replay_evidence = {}
    public_claim_status, public_claim_eligible = report_public_claim_status(report)
    raw_metric_claim_status = report.get("raw_metric_claim_status", report.get("claim_status"))

    gates: list[dict[str, Any]] = []

    # Gate 1: matched successful tasks.
    comparison_variants = [str(entry.get("variant")) for entry in comparisons if entry.get("variant")]
    matched_counts = comparison_numbers("matched_successful_task_count")
    missing_baseline_successes = [
        task
        for entry in comparisons
        for task in (entry.get("missing_baseline_success_tasks") or [])
    ]
    baseline_success_counts = comparison_numbers("baseline_successful_task_count")
    matched_tasks_pass = (
        bool(comparisons)
        and bool(pairs)
        and len(matched_counts) == comparison_total
        and all(count > 0 for count in matched_counts)
        and len(baseline_success_counts) == comparison_total
        and all(count > 0 for count in baseline_success_counts)
        and not missing_baseline_successes
    )
    if matched_tasks_pass:
        matched_reason = "matched_successful_tasks_present"
    else:
        matched_reason = "missing_or_regressed_matched_successful_tasks"
    gates.append(public_claim_readiness_gate(
        "matched_successful_tasks",
        "Matched successful tasks",
        matched_tasks_pass,
        matched_reason,
        {
            "comparison_count": comparison_total,
            "matched_pair_count": pair_total,
            "variants": comparison_variants[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
            "min_matched_successful_task_count": min(matched_counts) if matched_counts else None,
            "missing_baseline_success_task_count": len(missing_baseline_successes),
        },
    ))

    # Gate 2: provider-measured primary tokens and primary cost on both sides.
    provider_measured_token_cost_pass = bool(pairs) and all(
        both_sides_measured(pair, "primary_tokens") and both_sides_measured(pair, "primary_cost_usd")
        for pair in pairs
    )
    if provider_measured_token_cost_pass:
        measured_reason = "provider_measured_primary_tokens_and_cost"
    else:
        measured_reason = "missing_provider_measured_primary_tokens_or_cost"
    gates.append(public_claim_readiness_gate(
        "provider_measured_token_cost",
        "Provider-measured token and primary cost",
        provider_measured_token_cost_pass,
        measured_reason,
        {
            "matched_pair_count": pair_total,
            "required_fields": [
                "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
                "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
                "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
                "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured",
            ],
        },
    ))

    # Gate 3: quality non-inferiority.
    quality_gates = sorted({str(entry.get("quality_gate") or "unknown") for entry in comparisons})
    failure_deltas = comparison_numbers("failure_rate_delta_pp")
    correction_deltas = comparison_numbers("corrections_delta_per_successful_task")
    quality_pass = bool(comparisons) and all(entry.get("quality_gate") == "pass" for entry in comparisons)
    gates.append(public_claim_readiness_gate(
        "quality_non_inferiority",
        "Quality non-inferiority",
        quality_pass,
        "all_quality_gates_pass" if quality_pass else "quality_gate_not_pass",
        {
            "quality_gates": quality_gates,
            "max_failure_rate_delta_pp": max(failure_deltas) if failure_deltas else None,
            "max_corrections_delta_per_successful_task": max(correction_deltas) if correction_deltas else None,
        },
    ))

    # Gate 4: shifted-cost accounting.
    def shifted_cost_ok(pair: dict[str, Any]) -> bool:
        boundary = pair.get("claim_boundary")
        if not isinstance(boundary, dict) or not boundary.get("shifted_cost_claim_allowed"):
            return False
        return both_sides_measured(pair, "total_cost_with_shift_usd")

    shifted_cost_pass = bool(pairs) and all(shifted_cost_ok(pair) for pair in pairs)
    if shifted_cost_pass:
        shifted_reason = "shifted_cost_claim_gates_pass"
    else:
        shifted_reason = "missing_shifted_cost_claim_accounting"
    gates.append(public_claim_readiness_gate(
        "shifted_cost_accounting",
        "Shifted-cost accounting",
        shifted_cost_pass,
        shifted_reason,
        {
            "matched_pair_count": pair_total,
            "required_fields": [
                "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
                "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
                "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured",
            ],
        },
    ))

    # Gate 5: every replay row carries explicit notes (including every failed
    # row) and every comparison exposes the failure-rate fields.
    explicit_note_count = sum(1 for row in rows if row.explicit_notes)
    failed_rows = [row for row in rows if not row.result.success]
    failed_rows_with_notes = sum(1 for row in failed_rows if row.explicit_notes)
    failure_field_names = (
        "baseline_failure_rate",
        "variant_failure_rate",
        "failure_rate_delta_pp",
        "paired_corrections_task_count",
    )
    comparison_failure_fields_present = bool(comparisons) and all(
        all(name in entry for name in failure_field_names)
        for entry in comparisons
    )
    confidence_notes_pass = (
        has_replay
        and explicit_note_count == replay_count
        and failed_rows_with_notes == len(failed_rows)
        and comparison_failure_fields_present
    )
    if confidence_notes_pass:
        confidence_reason = "explicit_replay_notes_and_failure_rate_evidence_present"
    else:
        confidence_reason = "missing_explicit_replay_notes_or_failure_evidence"
    gates.append(public_claim_readiness_gate(
        "confidence_failure_notes",
        "Confidence and failure notes",
        confidence_notes_pass,
        confidence_reason,
        {
            "replay_row_count": replay_count,
            "explicit_note_count": explicit_note_count,
            "failed_row_count": len(failed_rows),
            "failed_rows_with_notes": failed_rows_with_notes,
            "comparison_failure_fields_present": comparison_failure_fields_present,
        },
        unknown=not has_replay,
    ))

    # Gate 6: provider-export provenance. Prefer the completeness flag already
    # recorded in replay_evidence; otherwise derive it from the inputs.
    if replay_evidence:
        same_run_complete = bool(replay_evidence.get("same_run_complete"))
    else:
        same_run_complete = has_replay and not mixed_csv and replay_count == row_count
    source_types = sorted({row.source_type for row in rows})
    provider_names = sorted({row.provider_name for row in rows if row.provider_name})
    provider_export_pass = (
        has_replay
        and not mixed_csv
        and same_run_complete
        and replay_count == row_count
        and all(row.provider_export_provenance_complete for row in rows)
    )
    if provider_export_pass:
        provenance_reason = "complete_provider_export_same_run_provenance"
    else:
        provenance_reason = "missing_or_mixed_provider_export_provenance"
    gates.append(public_claim_readiness_gate(
        "provider_export_provenance",
        "Provider-export provenance",
        provider_export_pass,
        provenance_reason,
        {
            "replay_row_count": replay_count,
            "report_row_count": row_count,
            "mixed_csv": mixed_csv,
            "same_run_complete": same_run_complete,
            "source_types": source_types,
            "provider_names": provider_names[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
        },
        unknown=not has_replay,
    ))

    blocking_gate_ids = [str(gate["id"]) for gate in gates if not gate["passed"]]
    passed_required_gate_count = len(gates) - len(blocking_gate_ids)
    claim_allowed = (
        passed_required_gate_count == len(gates)
        and public_claim_status == REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
        and bool(public_claim_eligible)
    )

    if claim_allowed:
        readiness_status, reason = REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS, "all_required_public_claim_gates_pass"
    elif not has_replay:
        readiness_status = "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
        reason = "replay_evidence_required_for_public_claim"
    elif provider_export_pass:
        readiness_status = REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS
        reason = "provider_export_present_but_readiness_gates_failed"
    else:
        readiness_status, reason = "public_claim_blocked", "unsupported_public_savings_claim_forbidden"

    return {
        "schema_version": PUBLIC_CLAIM_READINESS_SCHEMA_VERSION,
        "generated_from": "matched_pair_evidence_and_replay_provenance",
        "status": readiness_status,
        "reason": reason,
        "claim_allowed": claim_allowed,
        "public_claim_status_observed": public_claim_status,
        "public_claim_eligible_observed": public_claim_eligible,
        "raw_metric_claim_status_observed": raw_metric_claim_status,
        "required_gate_ids": list(PUBLIC_CLAIM_READINESS_GATE_IDS),
        "required_gate_count": len(gates),
        "passed_required_gate_count": passed_required_gate_count,
        "blocking_gate_ids": blocking_gate_ids,
        "gates": gates,
        "claim_boundary": PUBLIC_CLAIM_READINESS_CLAIM_BOUNDARY,
    }
3171
+
3172
+
3173
def default_matrix_normalized_key(value: Any) -> str:
    """Lower-case ``value`` and collapse non-alphanumeric runs into ``_``.

    Falsy values normalize to the empty string; leading and trailing
    underscores are stripped from the result.
    """
    lowered = str(value or "").lower()
    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
    return collapsed.strip("_")
3176
+
3177
+
3178
def default_matrix_contains_key(haystack: str, needle: str) -> bool:
    """Report whether the normalized ``needle`` occurs as a substring of ``haystack``.

    A needle that normalizes to the empty string never matches. ``haystack``
    is used as given; it is not normalized here.
    """
    normalized = default_matrix_normalized_key(needle)
    return bool(normalized) and normalized in haystack
3183
+
3184
+
3185
def infer_default_matrix_lanes(pair: dict[str, Any]) -> list[tuple[str, str]]:
    """Return ``(lane id, match method)`` for every lane the pair falls into.

    For each configured lane, a task keyword found in the normalized task id
    yields ``"exact_key"``; otherwise a variant keyword found in the
    normalized variant name yields ``"name_heuristic"``. A lane contributes at
    most one entry.
    """
    task_key = default_matrix_normalized_key(pair.get("task_id"))
    variant_key = default_matrix_normalized_key(pair.get("variant"))

    def mentions(haystack: str, keywords: tuple[str, ...]) -> bool:
        return any(default_matrix_contains_key(haystack, keyword) for keyword in keywords)

    found: list[tuple[str, str]] = []
    for lane in DEFAULT_MATRIX_LANES:
        lane_id = str(lane["id"])
        task_keywords = tuple(str(item) for item in lane.get("task_keywords", ()))
        variant_keywords = tuple(str(item) for item in lane.get("variant_keywords", ()))
        if mentions(task_key, task_keywords):
            method = "exact_key"
        elif mentions(variant_key, variant_keywords):
            method = "name_heuristic"
        else:
            continue
        found.append((lane_id, method))
    return found
3198
+
3199
+
3200
def default_matrix_number(value: Any) -> float | None:
    """Return ``value`` as a finite float, or ``None`` when it is not usable.

    Booleans, non-numeric values, NaN and infinities give ``None``. An integer
    too large to be represented as a float also gives ``None`` instead of
    raising ``OverflowError``.
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    try:
        numeric = float(value)
    except OverflowError:
        # Arbitrary-precision int beyond the float range: treat as unusable.
        return None
    return numeric if math.isfinite(numeric) else None
3207
+
3208
+
3209
def default_matrix_unique(values: list[Any]) -> list[Any]:
    """Return ``values`` without duplicates, keeping first-seen order.

    Equality (not hashing) decides duplicates, so unhashable items work.
    """
    kept: list[Any] = []
    for candidate in values:
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept
3215
+
3216
+
3217
def default_matrix_cap(values: list[Any]) -> list[Any]:
    """Deduplicate ``values`` and keep at most ``MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS`` of them."""
    deduplicated = default_matrix_unique(values)
    return deduplicated[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS]
3219
+
3220
+
3221
def default_matrix_lane_match_method(methods: set[str]) -> str:
    """Pick the lane's match method: ``exact_key`` first, then ``name_heuristic``, else ``absent``."""
    for method in ("exact_key", "name_heuristic"):
        if method in methods:
            return method
    return "absent"
3227
+
3228
+
3229
def default_matrix_clamp_classification(classification: str, ceiling: str) -> tuple[str, bool]:
    """Cap ``classification`` at the lane's policy ``ceiling``.

    Returns the (possibly lowered) classification and whether clamping
    happened. ``"reject/rework"`` and ceilings missing from
    ``DEFAULT_MATRIX_CLASSIFICATION_STRENGTH`` are passed through unchanged; a
    classification missing from that table is treated as strength 0.
    """
    unchanged = (classification, False)
    if classification == "reject/rework" or ceiling not in DEFAULT_MATRIX_CLASSIFICATION_STRENGTH:
        return unchanged
    exceeds_ceiling = (
        DEFAULT_MATRIX_CLASSIFICATION_STRENGTH.get(classification, 0)
        > DEFAULT_MATRIX_CLASSIFICATION_STRENGTH[ceiling]
    )
    return (ceiling, True) if exceeds_ceiling else unchanged
3239
+
3240
+
3241
def default_matrix_token_evidence(token_values: list[float], pair_count: int, byte_proxy_positive: bool) -> str:
    """Label the strength of token evidence for a lane.

    ``measured_positive`` requires one positive value per pair. Any other
    measured values are a regression if one is negative, otherwise
    incomplete/mixed. With no measured values the label falls back to the
    byte proxy, or ``unavailable``.
    """
    fully_measured = bool(pair_count) and len(token_values) == pair_count
    if fully_measured and all(value > 0 for value in token_values):
        return "measured_positive"
    if token_values:
        has_regression = any(value < 0 for value in token_values)
        return "measured_regression" if has_regression else "measured_incomplete_or_mixed"
    return "byte_proxy_only" if byte_proxy_positive else "unavailable"
3251
+
3252
+
3253
def classify_default_matrix_lane(
    lane_id: str,
    pairs: list[dict[str, Any]],
    methods: set[str],
) -> dict[str, Any]:
    """Classify one default-matrix lane from its matched-pair evidence.

    With no pairs the lane is ``experimental``. Otherwise a non-passing
    quality gate or a measured token/shifted-cost regression gives
    ``reject/rework``; fully measured positive token savings with
    non-negative shifted cost and both claim gates give ``default-on``;
    measured token savings alone, or a positive byte proxy, give
    ``advisory``; anything else is ``experimental``. The result is then
    capped at the lane's policy ceiling. The entry is reporting-only:
    ``public_claim_allowed`` is always ``False``.
    """
    lane = DEFAULT_MATRIX_LANE_BY_ID[lane_id]
    policy_ceiling = str(lane["policy_ceiling"])

    def reporting_only_boundary() -> dict[str, bool]:
        return {
            "classification_is_reporting_only": True,
            "hosted_api_savings_claim_allowed": False,
            "requires_report_claim_status_and_matched_pair_evidence": True,
        }

    if not pairs:
        return {
            "lane": lane_id,
            "label": lane["label"],
            "classification": "experimental",
            "policy_ceiling": policy_ceiling,
            "policy_clamped": False,
            "lane_match_method": "absent",
            "matched_task_count": 0,
            "matched_tasks": [],
            "matched_variants": [],
            "quality_gate": "insufficient_evidence",
            "quality_gates": [],
            "token_evidence": "unavailable",
            "shifted_cost_evidence": "unavailable",
            "byte_proxy_evidence": "unavailable",
            "matched_pair_claim_gates": {
                "token_savings_claim_allowed": False,
                "shifted_cost_claim_allowed": False,
            },
            "public_claim_allowed": False,
            "reason_codes": ["no_matched_lane_evidence"],
            "claim_boundary": reporting_only_boundary(),
        }

    def delta_numbers(field: str) -> list[float]:
        # Finite numeric values of one delta field across pairs with a dict delta.
        collected: list[float] = []
        for pair in pairs:
            delta = pair.get("delta")
            if not isinstance(delta, dict):
                continue
            number = default_matrix_number(delta.get(field))
            if number is not None:
                collected.append(number)
        return collected

    def every_pair_allows(flag: str) -> bool:
        for pair in pairs:
            boundary = pair.get("claim_boundary")
            if not isinstance(boundary, dict) or not boundary.get(flag):
                return False
        return True

    pair_total = len(pairs)
    quality_gates = sorted({str(pair.get("quality_gate") or "unknown") for pair in pairs})
    quality_gate = quality_gates[0] if len(quality_gates) == 1 else "mixed"
    token_values = delta_numbers("token_savings_pct")
    cost_values = delta_numbers("cost_savings_pct_with_shift")
    byte_after_deltas = delta_numbers("bytes_after_total")
    byte_proxy_positive = bool(byte_after_deltas) and any(value < 0 for value in byte_after_deltas)
    token_claim_gate = every_pair_allows("token_savings_claim_allowed")
    shifted_cost_claim_gate = every_pair_allows("shifted_cost_claim_allowed")

    failing_gates = [gate for gate in quality_gates if gate != "pass"]
    token_regression = any(value < 0 for value in token_values)
    cost_regression = any(value < 0 for value in cost_values)
    tokens_all_positive = len(token_values) == pair_total and all(value > 0 for value in token_values)
    costs_all_non_negative = len(cost_values) == pair_total and all(value >= 0 for value in cost_values)

    reason_codes: list[str] = []
    if failing_gates:
        classification = "reject/rework"
        reason_codes.extend(f"quality_gate_{gate}" for gate in failing_gates)
    elif token_regression:
        classification = "reject/rework"
        reason_codes.append("measured_token_regression")
    elif cost_regression:
        classification = "reject/rework"
        reason_codes.append("measured_shifted_cost_regression")
    elif tokens_all_positive and costs_all_non_negative and token_claim_gate and shifted_cost_claim_gate:
        classification = "default-on"
        reason_codes.append("quality_pass_measured_token_and_shifted_cost_non_regression")
    elif tokens_all_positive and token_claim_gate:
        classification = "advisory"
        reason_codes.append("quality_pass_measured_token_savings_shifted_cost_unproven")
    elif byte_proxy_positive:
        classification = "advisory"
        reason_codes.append("quality_pass_byte_proxy_only")
    else:
        classification = "experimental"
        reason_codes.append("quality_pass_but_no_positive_measured_or_proxy_savings")

    # Optional compression needs measured token values to be advisory.
    if lane_id == "optional_compression" and classification == "advisory" and not token_values:
        classification = "experimental"
        reason_codes.append("optional_compression_requires_provider_token_evidence_for_advisory")

    classification, policy_clamped = default_matrix_clamp_classification(classification, policy_ceiling)
    if policy_clamped:
        reason_codes.append(f"policy_ceiling_{policy_ceiling}")

    if cost_values and costs_all_non_negative:
        shifted_cost_evidence = "measured_non_regression"
    elif cost_regression:
        shifted_cost_evidence = "measured_regression"
    else:
        shifted_cost_evidence = "unavailable"

    if byte_proxy_positive:
        byte_proxy_evidence = "observed_positive"
    elif byte_after_deltas:
        byte_proxy_evidence = "observed_non_positive"
    else:
        byte_proxy_evidence = "unavailable"

    return {
        "lane": lane_id,
        "label": lane["label"],
        "classification": classification,
        "policy_ceiling": policy_ceiling,
        "policy_clamped": policy_clamped,
        "lane_match_method": default_matrix_lane_match_method(methods),
        "matched_task_count": len({str(pair.get("task_id")) for pair in pairs}),
        "matched_tasks": default_matrix_cap([pair.get("task_id") for pair in pairs if pair.get("task_id")]),
        "matched_variants": default_matrix_cap([pair.get("variant") for pair in pairs if pair.get("variant")]),
        "quality_gate": quality_gate,
        "quality_gates": quality_gates,
        "token_evidence": default_matrix_token_evidence(token_values, pair_total, byte_proxy_positive),
        "shifted_cost_evidence": shifted_cost_evidence,
        "byte_proxy_evidence": byte_proxy_evidence,
        "matched_pair_claim_gates": {
            "token_savings_claim_allowed": token_claim_gate,
            "shifted_cost_claim_allowed": shifted_cost_claim_gate,
        },
        "public_claim_allowed": False,
        "reason_codes": default_matrix_unique(reason_codes),
        "claim_boundary": reporting_only_boundary(),
    }
3400
+
3401
+
3402
def build_default_matrix(report: dict[str, Any]) -> dict[str, Any]:
    """Build the reporting-only default matrix from the report's matched pairs.

    Each dict entry of ``matched_pair_evidence`` is routed to the lanes
    inferred for it; pairs that match no lane contribute their variant name to
    ``unmatched_variants``. Every configured lane is then classified, and the
    classifications are tallied.
    """
    lane_pairs: dict[str, list[dict[str, Any]]] = {lane_id: [] for lane_id in DEFAULT_MATRIX_LANE_IDS}
    lane_methods: dict[str, set[str]] = {lane_id: set() for lane_id in DEFAULT_MATRIX_LANE_IDS}
    unmatched_variants: set[str] = set()

    evidence = report.get("matched_pair_evidence")
    if not isinstance(evidence, list):
        evidence = []
    for pair in evidence:
        if not isinstance(pair, dict):
            continue
        lane_matches = infer_default_matrix_lanes(pair)
        if lane_matches:
            for lane_id, method in lane_matches:
                lane_pairs[lane_id].append(pair)
                lane_methods[lane_id].add(method)
        elif pair.get("variant"):
            unmatched_variants.add(str(pair.get("variant")))

    lanes = []
    for lane_id in DEFAULT_MATRIX_LANE_IDS:
        lanes.append(classify_default_matrix_lane(lane_id, lane_pairs[lane_id], lane_methods[lane_id]))

    # Tally lanes per known classification; unknown labels are not counted.
    classification_counts = dict.fromkeys(DEFAULT_MATRIX_CLASSIFICATIONS, 0)
    for lane in lanes:
        label = lane.get("classification")
        if label in classification_counts:
            classification_counts[label] += 1

    summary = {
        "lane_count": len(lanes),
        "classification_counts": classification_counts,
        "unmatched_variants": sorted(unmatched_variants)[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
    }
    return {
        "schema_version": DEFAULT_MATRIX_SCHEMA_VERSION,
        "classification_set": list(DEFAULT_MATRIX_CLASSIFICATIONS),
        "generated_from": "matched_pair_evidence",
        "reporting_only": True,
        "claim_status_observed": report.get("claim_status"),
        "public_claim_allowed": False,
        "claim_boundary": DEFAULT_MATRIX_CLAIM_BOUNDARY,
        "lanes": lanes,
        "summary": summary,
    }
3441
+
3442
+
3443
def markdown_value(value: Any) -> str:
    """Format ``value`` for a Markdown table cell.

    ``None`` becomes ``n/a``, booleans become ``true``/``false``, floats use
    six significant digits, and everything else goes through
    ``sanitize_note_text`` with ``|`` escaped; an empty result becomes ``n/a``.
    """
    if value is None:
        return "n/a"
    if value is True:
        return "true"
    if value is False:
        return "false"
    if isinstance(value, float):
        return format(value, ".6g")
    escaped = sanitize_note_text(value).replace("|", "\\|")
    return escaped if escaped else "n/a"
3452
+
3453
+
3454
def _dashboard_table_row(values: list[Any]) -> str:
    """Format *values* as one Markdown table row, rendering each cell with ``markdown_value``."""
    return "| " + " | ".join(markdown_value(value) for value in values) + " |"


def render_dashboard_markdown(report: dict[str, Any]) -> str:
    """Render a benchmark report dictionary as a Markdown dashboard.

    The output contains, in order: a header with schema/baseline/claim status,
    a claim-boundary note, a per-variant summary table, a comparisons table,
    an optional "Public claim readiness" section (when
    ``report["public_claim_readiness"]`` is a dict), an optional "Default
    matrix" section (when ``report["default_matrix"]`` is a dict), either a
    "Replay evidence provenance" section (when ``report["replay_evidence"]`` is
    a dict) or a generic provenance note, and a fixed "Re-run context" section.

    Report sections with an unexpected type are treated as absent instead of
    raising, so a partially populated report still renders.

    Args:
        report: Benchmark report mapping to render.

    Returns:
        The dashboard as a single newline-terminated Markdown string.
    """
    public_claim_status, public_claim_eligible = report_public_claim_status(report)
    metric_claim_status = report.get("raw_metric_claim_status", report.get("claim_status"))

    # Normalize comparisons once. The per-variant lookup and the comparisons
    # table both read this list, so a missing, None, or otherwise malformed
    # "comparisons" value degrades to "no comparisons" in both places.
    raw_comparisons = report.get("comparisons")
    comparisons = (
        [item for item in raw_comparisons if isinstance(item, dict)]
        if isinstance(raw_comparisons, list)
        else []
    )
    # Later entries win when two comparisons name the same variant.
    comparison_by_variant = {item.get("variant"): item for item in comparisons}

    lines = [
        "# ContextGuard Benchmark Dashboard",
        "",
        f"- Schema: `{markdown_value(report.get('schema'))}`",
        f"- Baseline variant: `{markdown_value(report.get('baseline_variant'))}`",
        f"- Rows: {markdown_value(report.get('row_count'))}",
        f"- Metric claim status: `{markdown_value(metric_claim_status)}`",
        f"- Public claim status: `{markdown_value(public_claim_status)}`",
        f"- Public claim eligible: `{markdown_value(public_claim_eligible)}`",
        "",
        "> Claim boundary: this dashboard is not a hosted savings claim unless report claim gates "
        "allow it and public-claim provenance is complete. Proxy byte reductions are diagnostic "
        "and are not hosted API token savings.",
        "",
        "## Variant summary",
        "",
        "| Variant | Runs | Successes | Failure rate | Tokens/success | Bytes saved | Token proxy saved | Quality notes |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: | --- |",
    ]

    summaries = report.get("summary_by_variant") if isinstance(report.get("summary_by_variant"), dict) else {}
    for variant, summary in sorted(summaries.items()):
        if not isinstance(summary, dict):
            continue
        quality = comparison_by_variant.get(variant, {}).get("quality_gate")
        # The baseline has no comparison against itself; label it explicitly.
        if quality is None and summary.get("is_baseline_strategy"):
            quality = "baseline"
        lines.append(
            _dashboard_table_row([
                variant,
                summary.get("runs"),
                summary.get("successful_runs"),
                summary.get("failure_rate"),
                summary.get("tokens_per_successful_task"),
                summary.get("bytes_saved_successful"),
                summary.get("token_proxy_saved_successful"),
                quality,
            ])
        )

    lines.extend([
        "",
        "## Comparisons",
        "",
        "| Variant | Quality gate | Matched tasks | Token paired tasks | Token savings % | Shifted cost savings % |",
        "| --- | --- | ---: | ---: | ---: | ---: |",
    ])
    if comparisons:
        for item in comparisons:
            lines.append(
                _dashboard_table_row([
                    item.get("variant"),
                    item.get("quality_gate"),
                    item.get("matched_successful_task_count"),
                    item.get("paired_token_task_count"),
                    item.get("token_savings_pct"),
                    item.get("cost_savings_pct_with_shift"),
                ])
            )
    else:
        # Placeholder row so the table is never rendered with a header only.
        lines.append("| n/a | n/a | 0 | 0 | n/a | n/a |")

    readiness = report.get("public_claim_readiness") if isinstance(report.get("public_claim_readiness"), dict) else None
    if readiness is not None:
        lines.extend([
            "",
            "## Public claim readiness",
            "",
            f"- Status: `{markdown_value(readiness.get('status'))}`",
            f"- Claim allowed: `{markdown_value(readiness.get('claim_allowed'))}`",
            "",
            "| Gate | Status | Reason | Evidence |",
            "| --- | --- | --- | --- |",
        ])
        gates = readiness.get("gates") if isinstance(readiness.get("gates"), list) else []
        for gate in gates:
            if not isinstance(gate, dict):
                continue
            evidence = gate.get("evidence") if isinstance(gate.get("evidence"), dict) else {}
            lines.append(
                _dashboard_table_row([
                    gate.get("id"),
                    gate.get("status"),
                    gate.get("reason"),
                    public_claim_readiness_evidence_text(evidence),
                ])
            )
        boundary = readiness.get("claim_boundary")
        if isinstance(boundary, dict):
            lines.extend([
                "",
                f"- Public claim boundary: {markdown_value(boundary.get('reason'))}",
            ])

    default_matrix = report.get("default_matrix") if isinstance(report.get("default_matrix"), dict) else None
    if default_matrix is not None:
        lines.extend([
            "",
            "## Default matrix",
            "",
            "| Lane | Classification | Matched Tasks | Quality Gate | Token Evidence | Public Claim | Reason |",
            "| --- | --- | ---: | --- | --- | --- | --- |",
        ])
        lanes = default_matrix.get("lanes") if isinstance(default_matrix.get("lanes"), list) else []
        for lane in lanes:
            if not isinstance(lane, dict):
                continue
            reasons = lane.get("reason_codes") if isinstance(lane.get("reason_codes"), list) else []
            lines.append(
                _dashboard_table_row([
                    lane.get("lane"),
                    lane.get("classification"),
                    lane.get("matched_task_count"),
                    lane.get("quality_gate"),
                    lane.get("token_evidence"),
                    lane.get("public_claim_allowed"),
                    # Only the first three reason codes are shown per lane.
                    ", ".join(str(item) for item in reasons[:3]),
                ])
            )
        boundary = default_matrix.get("claim_boundary")
        if isinstance(boundary, dict):
            lines.extend([
                "",
                f"- Matrix boundary: {markdown_value(boundary.get('reason'))}",
            ])

    replay = report.get("replay_evidence") if isinstance(report.get("replay_evidence"), dict) else None
    if replay is not None:
        lines.extend([
            "",
            "## Replay evidence provenance",
            "",
            f"- Source types: `{markdown_value(', '.join(replay.get('source_types') or []))}`",
            f"- Claim scopes: `{markdown_value(', '.join(replay.get('claim_scopes') or []))}`",
            f"- Same-run complete: `{markdown_value(replay.get('same_run_complete'))}`",
            f"- Mixed/pre-existing CSV: `{markdown_value(replay.get('mixed_csv'))}`",
            f"- Boundary: {markdown_value(replay.get('claim_boundary'))}",
        ])
    else:
        lines.extend([
            "",
            "## Provenance note",
            "",
            "- CSV-only dashboards have unknown public-claim provenance unless regenerated from "
            "the original evidence JSONL or a future trusted provenance ledger.",
        ])

    lines.extend([
        "",
        "## Re-run context",
        "",
        "- Evidence replay: `context-guard-bench --tasks <tasks.json> --variants <variants.json> "
        "--evidence-jsonl <evidence.jsonl> --csv <results.csv> --report-json <report.json> "
        "--dashboard-md <dashboard.md>`",
    ])
    return "\n".join(lines) + "\n"
3624
+
3625
+
3626
def write_report_outputs(
    csv_path: Path,
    report_path: Path | None,
    dashboard_path: Path | None,
    baseline_variant: str,
    *,
    replay_rows: list[EvidenceReplayRow] | None = None,
    mixed_csv: bool = False,
) -> dict[str, Any]:
    """Summarize the benchmark CSV and write the requested derived outputs.

    The report is built from the rows of ``csv_path`` and, when ``replay_rows``
    is given, annotated with replay provenance. The JSON report and the
    Markdown dashboard are each written only when their path is not ``None``.

    Lock order must stay stable across all derived writes: source CSV first,
    then report, then dashboard. Do not introduce a derived-output -> CSV
    path.

    Args:
        csv_path: Source CSV of benchmark rows.
        report_path: Destination for the JSON report, or ``None`` to skip it.
        dashboard_path: Destination for the Markdown dashboard, or ``None`` to
            skip it.
        baseline_variant: Variant name used as the report baseline.
        replay_rows: Evidence rows forwarded to ``annotate_replay_report``;
            ``None`` skips the annotation step.
        mixed_csv: Forwarded to ``annotate_replay_report``.

    Returns:
        The report dictionary that was (optionally) written.
    """
    with csv_file_lock(csv_path, create_parent=True):
        rows = read_csv_rows(csv_path)
        report = summarize_benchmark_rows(rows, baseline_variant)
        if replay_rows is not None:
            report = annotate_replay_report(report, replay_rows, mixed_csv=mixed_csv)

        # (destination, renderer) pairs in the required lock order. Rendering is
        # deferred so that it happens while the destination's lock is held.
        derived_outputs = (
            (
                report_path,
                lambda: json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
            ),
            (dashboard_path, lambda: render_dashboard_markdown(report)),
        )
        for destination, render in derived_outputs:
            if destination is None:
                continue
            with csv_file_lock(destination, create_parent=True):
                write_text_no_follow(destination, render())
    return report
3651
+
2206
3652
 
2207
3653
def write_report_json(csv_path: Path, report_path: Path, baseline_variant: str) -> dict[str, Any]:
    """Write only the JSON report for ``csv_path`` and return the report dict.

    Delegates to ``write_report_outputs`` with no dashboard path.

    Lock order must stay stable across all report writes: source CSV first,
    derived report second. Do not introduce a report -> CSV path; that can
    deadlock concurrent report generation.
    """
    no_dashboard = None
    return write_report_outputs(csv_path, report_path, no_dashboard, baseline_variant)
2219
3658
 
2220
3659
 
2221
3660
  def sanitize_note_text(value: Any) -> str:
@@ -2278,8 +3717,18 @@ def existing_file_identity(path: Path) -> tuple[int, int] | None:
2278
3717
  os.close(fd)
2279
3718
 
2280
3719
 
2281
- def validate_distinct_output_paths(csv_path: Path, ledger_path: Path | None, report_path: Path | None) -> None:
2282
- outputs = [("csv", csv_path), ("ledger-jsonl", ledger_path), ("report-json", report_path)]
3720
+ def validate_distinct_output_paths(
3721
+ csv_path: Path,
3722
+ ledger_path: Path | None,
3723
+ report_path: Path | None,
3724
+ dashboard_path: Path | None = None,
3725
+ ) -> None:
3726
+ outputs = [
3727
+ ("csv", csv_path),
3728
+ ("ledger-jsonl", ledger_path),
3729
+ ("report-json", report_path),
3730
+ ("dashboard-md", dashboard_path),
3731
+ ]
2283
3732
  seen: dict[Path, str] = {}
2284
3733
  seen_identity: dict[tuple[int, int], str] = {}
2285
3734
  for label, path in outputs:
@@ -2318,12 +3767,16 @@ def main() -> int:
2318
3767
  help="optional JSONL ledger path for cost-shift accounting per run")
2319
3768
  parser.add_argument("--report-json", default=None, type=Path,
2320
3769
  help="optional A/B summary report JSON path generated from --csv after real runs")
3770
+ parser.add_argument("--dashboard-md", default=None, type=Path,
3771
+ help="optional Markdown dashboard path generated from the benchmark report")
3772
+ parser.add_argument("--evidence-jsonl", default=None, type=Path,
3773
+ help="optional validated run-evidence JSONL replay input; skips provider invocation")
2321
3774
  parser.add_argument("--baseline-variant", default="baseline",
2322
3775
  help="variant name used as the report baseline (default: baseline)")
2323
3776
  args = parser.parse_args()
2324
3777
 
2325
3778
  require_no_follow_file_ops_supported()
2326
- validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
3779
+ validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json, args.dashboard_md)
2327
3780
 
2328
3781
  variants = parse_variants(args.variants)
2329
3782
  tasks = parse_tasks(args.tasks, variants=variants)
@@ -2338,6 +3791,61 @@ def main() -> int:
2338
3791
  for task, variant in targets
2339
3792
  if (task.id, variant.name) not in skip_keys
2340
3793
  ]
3794
+ if args.evidence_jsonl is not None:
3795
+ if args.dry_run:
3796
+ for task, variant in targets:
3797
+ if (task.id, variant.name) in skip_keys:
3798
+ print(f"skip {task.id}/{variant.name} (already in {args.csv})")
3799
+ continue
3800
+ print(f"evidence replay dry-run: {task.id}/{variant.name} <- {args.evidence_jsonl}")
3801
+ print("completed 0 run(s); results in (dry-run; no CSV writes)")
3802
+ return 0
3803
+ csv_had_preexisting_content = file_has_content_no_follow(args.csv)
3804
+ evidence_rows = read_evidence_jsonl(args.evidence_jsonl)
3805
+ evidence_by_key = validate_evidence_coverage(evidence_rows, runnable_targets)
3806
+ claude_ver = "evidence-replay"
3807
+ completed = 0
3808
+ replay_rows_written: list[EvidenceReplayRow] = []
3809
+ for task, variant in targets:
3810
+ if (task.id, variant.name) in skip_keys:
3811
+ print(f"skip {task.id}/{variant.name} (already in {args.csv})")
3812
+ continue
3813
+ evidence = evidence_by_key[(task.id, variant.name)]
3814
+ print(f"replay {task.id}/{variant.name} ...", flush=True)
3815
+ result = run_evidence_fixture(task, variant, evidence)
3816
+ wrote = append_csv(args.csv, claude_ver, result, skip_existing=args.resume)
3817
+ if wrote:
3818
+ replay_rows_written.append(evidence)
3819
+ if args.ledger_jsonl is not None:
3820
+ append_cost_shift_ledger(
3821
+ args.ledger_jsonl,
3822
+ claude_ver,
3823
+ result,
3824
+ replay_provenance=evidence.provenance_payload(),
3825
+ )
3826
+ completed += 1
3827
+ status = "ok" if result.success else "FAIL"
3828
+ suffix = "" if wrote else " (CSV not updated; row already present)"
3829
+ print(
3830
+ f" {status} tokens={sum(result.tokens.values())} cost=${result.cost_usd:.4f} "
3831
+ f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
3832
+ )
3833
+ if args.report_json is not None or args.dashboard_md is not None:
3834
+ report = write_report_outputs(
3835
+ args.csv,
3836
+ args.report_json,
3837
+ args.dashboard_md,
3838
+ args.baseline_variant,
3839
+ replay_rows=replay_rows_written,
3840
+ mixed_csv=csv_had_preexisting_content or bool(skip_keys) or len(replay_rows_written) != int(completed),
3841
+ )
3842
+ if args.report_json is not None:
3843
+ print(f"report {args.report_json}: {report['claim_status']}")
3844
+ if args.dashboard_md is not None:
3845
+ print(f"dashboard {args.dashboard_md}: {report_public_claim_status(report)[0]}")
3846
+ print(f"completed {completed} run(s); results in {args.csv}")
3847
+ return 0
3848
+
2341
3849
  placeholder_targets = [
2342
3850
  f"{task.id}/{variant.name}"
2343
3851
  for task, variant in runnable_targets
@@ -2390,9 +3898,12 @@ def main() -> int:
2390
3898
  f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
2391
3899
  )
2392
3900
  target = args.csv if not args.dry_run else "(dry-run; no CSV writes)"
2393
- if args.report_json is not None and not args.dry_run:
2394
- report = write_report_json(args.csv, args.report_json, args.baseline_variant)
2395
- print(f"report {args.report_json}: {report['claim_status']}")
3901
+ if (args.report_json is not None or args.dashboard_md is not None) and not args.dry_run:
3902
+ report = write_report_outputs(args.csv, args.report_json, args.dashboard_md, args.baseline_variant)
3903
+ if args.report_json is not None:
3904
+ print(f"report {args.report_json}: {report['claim_status']}")
3905
+ if args.dashboard_md is not None:
3906
+ print(f"dashboard {args.dashboard_md}: {report_public_claim_status(report)[0]}")
2396
3907
  print(f"completed {completed} run(s); results in {target}")
2397
3908
  return 0
2398
3909