npm - @ictechgy/context-guard - Versions diffs - 0.4.9 → 0.4.11 - Mend

@ictechgy/context-guard 0.4.9 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

package/CHANGELOG.md +28 -0
package/README.ko.md +59 -31
package/README.md +85 -36
package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
package/docs/benchmark-workflow-examples.md +3 -0
package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
package/docs/distribution.md +10 -7
package/docs/experimental-benchmark-fixtures.md +30 -6
package/package.json +4 -6
package/packaging/homebrew/context-guard.rb.template +1 -1
package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
package/plugins/context-guard/README.ko.md +20 -14
package/plugins/context-guard/README.md +26 -17
package/plugins/context-guard/bin/context-guard +147 -25
package/plugins/context-guard/bin/context-guard-artifact +884 -79
package/plugins/context-guard/bin/context-guard-audit +33 -2
package/plugins/context-guard/bin/context-guard-bench +1542 -31
package/plugins/context-guard/bin/context-guard-cache-score +665 -0
package/plugins/context-guard/bin/context-guard-compress +146 -1
package/plugins/context-guard/bin/context-guard-cost +790 -6
package/plugins/context-guard/bin/context-guard-experiments +463 -26
package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
package/plugins/context-guard/bin/context-guard-filter +163 -7
package/plugins/context-guard/bin/context-guard-guard-read +3 -0
package/plugins/context-guard/bin/context-guard-pack +892 -49
package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
package/plugins/context-guard/bin/context-guard-setup +165 -31
package/plugins/context-guard/bin/context-guard-statusline +490 -283
package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
package/plugins/context-guard/bin/context-guard-trim-output +288 -41
package/plugins/context-guard/brief/README.md +5 -5
package/plugins/context-guard/lib/context_guard_commands.py +230 -0
package/plugins/context-guard/skills/setup/SKILL.md +1 -0
package/context-guard-kit/README.md +0 -91
package/context-guard-kit/benchmark_runner.py +0 -2401
package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
package/context-guard-kit/context_compress.py +0 -695
package/context-guard-kit/context_escrow.py +0 -935
package/context-guard-kit/context_filter.py +0 -637
package/context-guard-kit/context_guard_cli.py +0 -325
package/context-guard-kit/context_guard_diet.py +0 -1711
package/context-guard-kit/context_pack.py +0 -2713
package/context-guard-kit/cost_guard.py +0 -2349
package/context-guard-kit/experimental_registry.py +0 -4348
package/context-guard-kit/failed_attempt_nudge.py +0 -567
package/context-guard-kit/guard_large_read.py +0 -690
package/context-guard-kit/hook_secret_patterns.py +0 -43
package/context-guard-kit/read_symbol.py +0 -483
package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
package/context-guard-kit/sanitize_output.py +0 -725
package/context-guard-kit/settings.example.json +0 -67
package/context-guard-kit/setup_wizard.py +0 -2515
package/context-guard-kit/statusline.sh +0 -362
package/context-guard-kit/statusline_merged.sh +0 -157
package/context-guard-kit/tool_schema_pruner.py +0 -837
package/context-guard-kit/trim_command_output.py +0 -1449

package/plugins/context-guard/bin/context-guard-bench CHANGED Viewed

@@ -178,19 +178,137 @@ EXTERNAL_SOURCE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...], tuple[str, ...]],
 )
 MAX_USAGE_TOKEN_COUNT = 10**12
 MAX_USAGE_COST_USD = 10**9
+MAX_EVIDENCE_JSONL_BYTES = 5_000_000
+MAX_EVIDENCE_JSONL_LINES = 100_000
 # Byte -> token proxy 환산 계수. 측정된 모델 토큰이 아니라 byte delta 기반 보수적
 # 추정치이며, report에서 evidence="inferred"로 분명히 라벨링한다. 영어 텍스트 기준
 # ~4 bytes/token의 통용 근사값을 사용한다.
 TOKEN_PROXY_BYTES_PER_TOKEN = 4
 BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
 MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
+MEASUREMENT_BASELINE_SCHEMA_VERSION = "contextguard.bench.measurement-baseline.v1"
+DEFAULT_MATRIX_SCHEMA_VERSION = "contextguard.bench.default-matrix.v1"
+PUBLIC_CLAIM_READINESS_SCHEMA_VERSION = "contextguard.bench.public-claim-readiness.v1"
 SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
 SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
 SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
+EVIDENCE_REPLAY_SOURCE_TYPES = frozenset({"synthetic_fixture", "provider_export", "manual_audit"})
+PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES = frozenset({
+    "provider_measured_matched_task",
+    "provider_measured_matched_task_public_claim",
+    "hosted_api_provider_measured_matched_task",
+})
+REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS = "provider_export_public_claim_candidate"
+REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS = "provider_export_claim_gates_not_met"
+REPLAY_NOT_PUBLIC_CLAIM_STATUS = "replay_only_not_public_claim"
+REPLAY_UNKNOWN_MIXED_CSV_STATUS = "unknown_mixed_csv"
+REPLAY_PUBLIC_CLAIM_ELIGIBLE_RAW_STATUSES = frozenset({
+    "token_and_shifted_cost_savings_observed",
+})
+REPLAY_CLAIM_BOUNDARY = (
+    "Evidence replay is an import/replay mode. Synthetic fixtures and manual audits are never "
+    "hosted API token/cost savings evidence; public claims require complete provider_export "
+    "provenance for every report row plus the normal matched-task quality, token, cost, and "
+    "shifted-cost gates."
+)
+DEFAULT_MATRIX_CLASSIFICATIONS = ("default-on", "advisory", "experimental", "reject/rework")
+DEFAULT_MATRIX_CLASSIFICATION_STRENGTH = {
+    "experimental": 0,
+    "advisory": 1,
+    "default-on": 2,
+}
+DEFAULT_MATRIX_LANES: tuple[dict[str, Any], ...] = (
+    {
+        "id": "trimming",
+        "label": "Trimming / digest output",
+        "policy_ceiling": "default-on",
+        "task_keywords": ("long_log_analysis", "output_transform", "trim", "trimming", "sanitize_output", "digest"),
+        "variant_keywords": ("trim", "trimming", "sanitize", "digest", "brief"),
+    },
+    {
+        "id": "artifact_escrow",
+        "label": "Artifact escrow / receipt handles",
+        "policy_ceiling": "default-on",
+        "task_keywords": ("artifact_receipt", "artifact", "receipt", "escrow", "output_sandbox", "response_sandbox"),
+        "variant_keywords": ("artifact", "receipt", "escrow", "output_sandbox", "response_sandbox"),
+    },
+    {
+        "id": "tool_pruning",
+        "label": "Tool/MCP schema pruning",
+        "policy_ceiling": "default-on",
+        "task_keywords": ("tool_schema", "tool_prune", "tool_pruning", "mcp_schema", "defer_report"),
+        "variant_keywords": ("tool_prune", "tool_pruning", "tool_schema", "mcp", "defer"),
+    },
+    {
+        "id": "cache_advice",
+        "label": "Cache layout advice",
+        "policy_ceiling": "advisory",
+        "task_keywords": ("cache_layout", "cache_advice", "cache_score", "provider_cache"),
+        "variant_keywords": ("cache_layout", "cache_advice", "cache_score", "provider_cache", "cache"),
+    },
+    {
+        "id": "adaptive_k",
+        "label": "Adaptive-k context packing",
+        "policy_ceiling": "advisory",
+        "task_keywords": ("adaptive_k", "adaptive", "top_k", "context_pack"),
+        "variant_keywords": ("adaptive_k", "adaptive", "top_k", "pack_adaptive"),
+    },
+    {
+        "id": "optional_compression",
+        "label": "Optional compression",
+        "policy_ceiling": "advisory",
+        "task_keywords": ("learned_compression", "compression", "compress", "context_diff"),
+        "variant_keywords": ("learned_compression", "compression", "compress", "context_diff"),
+    },
+)
+DEFAULT_MATRIX_LANE_IDS = tuple(str(item["id"]) for item in DEFAULT_MATRIX_LANES)
+DEFAULT_MATRIX_LANE_BY_ID = {str(item["id"]): item for item in DEFAULT_MATRIX_LANES}
+MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS = 20
+DEFAULT_MATRIX_CLAIM_BOUNDARY = {
+    "id": "default_matrix_reporting_only_not_runtime_default_or_savings_claim",
+    "reporting_only": True,
+    "changes_runtime_defaults": False,
+    "hosted_api_token_savings_claim_allowed": False,
+    "hosted_api_cost_savings_claim_allowed": False,
+    "public_claims_must_use_report_claim_status_and_matched_pair_evidence": True,
+    "reason": (
+        "The default matrix classifies local benchmark lanes for review only; it does not "
+        "turn features on by default and does not authorize hosted API savings claims."
+    ),
+}
+PUBLIC_CLAIM_READINESS_GATE_IDS = (
+    "matched_successful_tasks",
+    "provider_measured_token_cost",
+    "quality_non_inferiority",
+    "shifted_cost_accounting",
+    "confidence_failure_notes",
+    "provider_export_provenance",
+)
+PUBLIC_CLAIM_READINESS_CLAIM_BOUNDARY = {
+    "id": "public_claim_readiness_authoritative_release_gate",
+    "reporting_only": True,
+    "claim_allowed_field": "public_claim_readiness.claim_allowed",
+    "unsupported_claims_forbidden": True,
+    "hosted_api_token_savings_claim_without_claim_allowed_forbidden": True,
+    "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": True,
+    "fixed_percent_savings_claim_without_matched_provider_report_forbidden": True,
+    "requires_matched_successful_tasks": True,
+    "requires_provider_measured_tokens_and_cost": True,
+    "requires_quality_non_inferiority": True,
+    "requires_shifted_cost_accounting": True,
+    "requires_confidence_and_failure_notes": True,
+    "requires_provider_export_provenance": True,
+    "reason": (
+        "Public hosted token/cost savings claims are forbidden unless every readiness gate passes "
+        "and public_claim_readiness.claim_allowed is true."
+    ),
+}
 MAX_SELF_HOSTED_LABEL_CHARS = 120
 MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000
 MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
 MAX_VARIANT_PROMPT_FILE_BYTES = 128_000
+MAX_FIXTURE_FILE_BYTES = 1_000_000
+MAX_CLAUDE_PROMPT_ARG_BYTES = MAX_VARIANT_PROMPT_FILE_BYTES
 CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
 SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
 VERSION_OUTPUT_MAX_BYTES = 16_000
@@ -317,12 +435,18 @@ def _open_regular_no_symlink(
         os.close(parent_fd)
-def _read_text_no_follow(path: Path) -> str:
+def _read_text_no_follow(path: Path, *, max_bytes: int = MAX_FIXTURE_FILE_BYTES) -> str:
     fd = _open_regular_no_symlink(path)
     try:
-        with os.fdopen(fd, "r", encoding="utf-8") as handle:
+        with os.fdopen(fd, "rb") as handle:
             fd = -1
-            return handle.read()
+            raw = handle.read(max_bytes + 1)
+            if len(raw) > max_bytes:
+                raise SystemExit(f"fixture file exceeds {max_bytes} bytes: {path}")
+            try:
+                return raw.decode("utf-8")
+            except UnicodeDecodeError as exc:
+                raise SystemExit(f"fixture file must be UTF-8 text: {path}: {exc.reason}") from None
     finally:
         if fd != -1:
             os.close(fd)
@@ -400,6 +524,38 @@ class RunResult:
     self_hosted_metrics: dict[str, Any] | None = None
+@dataclass
+class EvidenceReplayRow:
+    result: RunResult
+    source_type: str
+    provider_name: str | None
+    capture_command_or_export_id: str | None
+    claim_scope: str
+    provider_export_provenance_complete: bool
+    public_claim_eligible: bool
+    explicit_notes: bool
+    line_number: int
+    @property
+    def key(self) -> tuple[str, str]:
+        return (self.result.task_id, self.result.variant)
+    def provenance_payload(self) -> dict[str, Any]:
+        return {
+            "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
+            "mode": "evidence_jsonl_replay",
+            "evidence_source_type": self.source_type,
+            "provider_name": self.provider_name,
+            "capture_command_or_export_id": self.capture_command_or_export_id,
+            "claim_scope": self.claim_scope,
+            "provider_export_provenance_complete": self.provider_export_provenance_complete,
+            "public_claim_eligible": self.public_claim_eligible,
+            "explicit_notes": self.explicit_notes,
+            "line_number": self.line_number,
+            "claim_boundary": REPLAY_CLAIM_BOUNDARY,
+        }
 @dataclass
 class BoundedProcessResult:
     returncode: int
@@ -470,6 +626,17 @@ def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[st
     return extra_args
+def require_argv_safe_prompt(text: str, *, owner: str) -> str:
+    """Keep prompt-bearing argv below a bounded size to avoid E2BIG failures."""
+    size = len(text.encode("utf-8", errors="replace"))
+    if size > MAX_CLAUDE_PROMPT_ARG_BYTES:
+        raise SystemExit(
+            f"{owner} prompt exceeds argv-safe limit "
+            f"({size} bytes > {MAX_CLAUDE_PROMPT_ARG_BYTES}); use a smaller fixture prompt"
+        )
+    return text
 def validate_variant_prompt_file_path(raw_path: str, *, owner: str) -> Path:
     """Return a safe relative prompt-file path, or fail before any file read."""
     rel_path = Path(raw_path)
@@ -522,26 +689,28 @@ def read_variant_prompt_file(path: Path, *, owner: str, display_path: str | None
                 f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes: {label}"
             )
         try:
-            with os.fdopen(fd, "r", encoding="utf-8") as handle:
+            with os.fdopen(fd, "rb") as handle:
                 fd = -1
-                text = handle.read()
-        except UnicodeDecodeError as exc:
-            raise SystemExit(
-                f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
-                f"{label}: {exc.reason}"
-            ) from None
+                raw = handle.read(MAX_VARIANT_PROMPT_FILE_BYTES + 1)
         except OSError as exc:
             detail = exc.strerror or exc.__class__.__name__
             raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
     finally:
         if fd != -1:
             os.close(fd)
-    if len(text.encode("utf-8", errors="replace")) > MAX_VARIANT_PROMPT_FILE_BYTES:
+    if len(raw) > MAX_VARIANT_PROMPT_FILE_BYTES:
         raise SystemExit(
             f"{owner} variant_prompt_files prompt text exceeds "
             f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes after decoding: {label}"
         )
-    return text
+    try:
+        text = raw.decode("utf-8")
+    except UnicodeDecodeError as exc:
+        raise SystemExit(
+            f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
+            f"{label}: {exc.reason}"
+        ) from None
+    return require_argv_safe_prompt(text, owner=f"{owner} variant_prompt_files")
 def load_variant_prompt_files_for_targets(
@@ -977,7 +1146,11 @@ def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> l
         argv.extend(["--allowedTools", ",".join(task.allowed_tools)])
     argv.extend(variant.extra_args)
     argv.append("--")
-    argv.append(task.variant_prompt_texts.get(variant.name, task.prompt))
+    prompt = require_argv_safe_prompt(
+        task.variant_prompt_texts.get(variant.name, task.prompt),
+        owner=f"task {task.id} variant {variant.name}",
+    )
+    argv.append(prompt)
     return argv
@@ -1361,7 +1534,13 @@ def write_text_no_follow(path: Path, text: str) -> None:
             os.close(fd)
-def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
+def append_cost_shift_ledger(
+    path: Path,
+    claude_ver: str,
+    result: RunResult,
+    *,
+    replay_provenance: dict[str, Any] | None = None,
+) -> None:
     shifted_cost_known = cost_shift_measured(result)
     byte_metrics_observed = bool(result.bytes_before or result.bytes_after)
     payload = {
@@ -1412,6 +1591,10 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
     }
     if result.self_hosted_metrics is not None:
         payload["self_hosted_metrics"] = result.self_hosted_metrics
+    if replay_provenance is not None:
+        payload["replay_provenance"] = replay_provenance
+        payload["evidence_source_type"] = replay_provenance.get("evidence_source_type")
+        payload["public_claim_eligible"] = bool(replay_provenance.get("public_claim_eligible"))
     with csv_file_lock(path, create_parent=True):
         fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
         try:
@@ -1435,7 +1618,9 @@ def _read_existing_keys_unlocked(csv_path: Path) -> set[tuple[str, str]]:
             reader = csv.DictReader(f)
             fieldnames = list(reader.fieldnames) if reader.fieldnames is not None else None
             validate_csv_schema(csv_path, fieldnames)
-            for row in reader:
+            for index, row in enumerate(reader, start=1):
+                if index > MAX_CSV_ROWS:
+                    raise SystemExit(f"CSV row limit exceeded for {csv_path}: > {MAX_CSV_ROWS}")
                 tid = row.get("task_id") or ""
                 var = row.get("variant") or ""
                 if tid and var:
@@ -1487,6 +1672,356 @@ def read_csv_rows(csv_path: Path) -> list[dict[str, str]]:
             os.close(fd)
+def file_has_content_no_follow(path: Path) -> bool:
+    try:
+        fd = _open_regular_no_symlink(path)
+    except FileNotFoundError:
+        return False
+    try:
+        return os.fstat(fd).st_size > 0
+    finally:
+        os.close(fd)
+def require_evidence_object(raw: Any, *, owner: str) -> dict[str, Any]:
+    if not isinstance(raw, dict):
+        raise SystemExit(f"{owner} evidence row must be a JSON object")
+    return raw
+def evidence_non_empty_string(raw: Any, *, field: str, owner: str, required: bool = True) -> str | None:
+    if raw is None:
+        if required:
+            raise SystemExit(f"{owner} {field} must be a non-empty string")
+        return None
+    if not isinstance(raw, str):
+        raise SystemExit(f"{owner} {field} must be a string")
+    text = sanitize_note_text(raw)
+    if not text:
+        if required:
+            raise SystemExit(f"{owner} {field} must be a non-empty string")
+        return None
+    return text
+def evidence_bool(raw: Any, *, field: str, owner: str, default: bool = False) -> bool:
+    if raw is None:
+        return default
+    if not isinstance(raw, bool):
+        raise SystemExit(f"{owner} {field} must be a boolean")
+    return raw
+def evidence_nonnegative_int(
+    raw: Any,
+    *,
+    field: str,
+    owner: str,
+    default: int = 0,
+    maximum: int = MAX_USAGE_TOKEN_COUNT,
+) -> int:
+    if raw is None:
+        return default
+    value = normalize_usage_token(raw)
+    if value is None or value > maximum:
+        raise SystemExit(f"{owner} {field} must be a finite non-negative integer")
+    return value
+def evidence_nonnegative_float(
+    raw: Any,
+    *,
+    field: str,
+    owner: str,
+    default: float = 0.0,
+    maximum: float = MAX_USAGE_COST_USD,
+) -> float:
+    if raw is None:
+        return default
+    if isinstance(raw, bool) or not isinstance(raw, (int, float)):
+        raise SystemExit(f"{owner} {field} must be a finite non-negative number")
+    value = float(raw)
+    if not math.isfinite(value) or value < 0 or value > maximum:
+        raise SystemExit(f"{owner} {field} must be a finite non-negative number")
+    return value
+def evidence_first(raw: dict[str, Any], *keys: str) -> Any:
+    for key in keys:
+        if key in raw:
+            return raw[key]
+    return None
+def parse_evidence_provenance(raw: dict[str, Any], *, owner: str) -> dict[str, Any]:
+    provenance = raw.get("provenance")
+    if provenance is not None and not isinstance(provenance, dict):
+        raise SystemExit(f"{owner} provenance must be a JSON object")
+    source_raw = (
+        provenance.get("evidence_source_type")
+        if isinstance(provenance, dict) and "evidence_source_type" in provenance
+        else raw.get("evidence_source_type")
+    )
+    source_type = evidence_non_empty_string(source_raw, field="evidence_source_type", owner=owner)
+    assert source_type is not None
+    if source_type not in EVIDENCE_REPLAY_SOURCE_TYPES:
+        raise SystemExit(
+            f"{owner} evidence_source_type must be one of: {', '.join(sorted(EVIDENCE_REPLAY_SOURCE_TYPES))}"
+        )
+    provider_name = evidence_non_empty_string(
+        provenance.get("provider_name") if isinstance(provenance, dict) else raw.get("provider_name"),
+        field="provider_name",
+        owner=owner,
+        required=False,
+    )
+    capture_id = evidence_non_empty_string(
+        (
+            provenance.get("capture_command_or_export_id")
+            if isinstance(provenance, dict) and "capture_command_or_export_id" in provenance
+            else raw.get("capture_command_or_export_id")
+        ),
+        field="capture_command_or_export_id",
+        owner=owner,
+        required=False,
+    )
+    claim_scope = evidence_non_empty_string(
+        provenance.get("claim_scope") if isinstance(provenance, dict) else raw.get("claim_scope"),
+        field="claim_scope",
+        owner=owner,
+    )
+    assert claim_scope is not None
+    provider_authority = (
+        source_type == "provider_export"
+        and provider_name is not None
+        and capture_id is not None
+        and claim_scope in PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES
+    )
+    return {
+        "source_type": source_type,
+        "provider_name": provider_name,
+        "capture_command_or_export_id": capture_id,
+        "claim_scope": claim_scope,
+        "provider_public_claim_authority": provider_authority,
+    }
+def parse_evidence_tokens(raw: dict[str, Any], *, owner: str) -> tuple[dict[str, int], set[str]]:
+    token_block = raw.get("tokens")
+    if token_block is not None and not isinstance(token_block, dict):
+        raise SystemExit(f"{owner} tokens must be a JSON object")
+    tokens: dict[str, int] = {}
+    observed: set[str] = set()
+    source = token_block if isinstance(token_block, dict) else {}
+    for bucket, _keys in USAGE_KEY_GROUPS:
+        value = source.get(bucket) if bucket in source else raw.get(bucket)
+        if value is not None:
+            observed.add(bucket)
+        tokens[bucket] = evidence_nonnegative_int(value, field=bucket, owner=owner)
+    return tokens, observed
+def parse_evidence_row(raw_value: Any, *, owner: str, line_number: int) -> EvidenceReplayRow:
+    raw = require_evidence_object(raw_value, owner=owner)
+    schema = evidence_non_empty_string(raw.get("schema_version"), field="schema_version", owner=owner)
+    if schema != BENCH_RUN_EVIDENCE_SCHEMA_VERSION:
+        raise SystemExit(
+            f"{owner} schema_version must be {BENCH_RUN_EVIDENCE_SCHEMA_VERSION}"
+        )
+    task_id = evidence_non_empty_string(raw.get("task_id"), field="task_id", owner=owner)
+    variant = evidence_non_empty_string(raw.get("variant"), field="variant", owner=owner)
+    assert task_id is not None and variant is not None
+    provenance = parse_evidence_provenance(raw, owner=owner)
+    provider_authority = bool(provenance["provider_public_claim_authority"])
+    raw_primary_tokens_measured = evidence_bool(
+        raw.get("primary_tokens_measured"),
+        field="primary_tokens_measured",
+        owner=owner,
+    )
+    raw_cost_measured = evidence_bool(
+        evidence_first(raw, "cost_measured", "primary_cost_measured"),
+        field="cost_measured",
+        owner=owner,
+    )
+    if provenance["source_type"] in {"synthetic_fixture", "manual_audit"}:
+        primary_tokens_measured = False
+        cost_measured = False
+    elif provider_authority:
+        primary_tokens_measured = raw_primary_tokens_measured
+        cost_measured = raw_cost_measured
+    else:
+        if raw_primary_tokens_measured or raw_cost_measured:
+            raise SystemExit(
+                f"{owner} provider_export measured flags require provider_name, "
+                "capture_command_or_export_id, and a provider-measured matched-task claim_scope"
+            )
+        primary_tokens_measured = False
+        cost_measured = False
+    tokens, observed_token_buckets = parse_evidence_tokens(raw, owner=owner)
+    if primary_tokens_measured and not {"input_tokens", "output_tokens"}.issubset(observed_token_buckets):
+        raise SystemExit(
+            f"{owner} primary_tokens_measured=true requires input_tokens and output_tokens evidence"
+        )
+    cost_usd = evidence_nonnegative_float(
+        evidence_first(raw, "cost_usd", "primary_cost_usd"),
+        field="cost_usd",
+        owner=owner,
+    )
+    if cost_measured and "cost_usd" not in raw and "primary_cost_usd" not in raw:
+        raise SystemExit(f"{owner} cost_measured=true requires cost_usd evidence")
+    if "success" not in raw:
+        raise SystemExit(f"{owner} success must be a boolean")
+    success = evidence_bool(raw.get("success"), field="success", owner=owner)
+    notes = evidence_non_empty_string(raw.get("notes"), field="notes", owner=owner, required=False)
+    explicit_notes = notes is not None
+    model = evidence_non_empty_string(raw.get("model"), field="model", owner=owner, required=False) or "evidence-replay"
+    effort = evidence_non_empty_string(raw.get("effort"), field="effort", owner=owner, required=False) or ""
+    self_hosted_metrics = None
+    if SELF_HOSTED_METRICS_KEY in raw:
+        self_hosted_metrics = normalize_self_hosted_metrics(
+            raw.get(SELF_HOSTED_METRICS_KEY),
+            source="evidence_jsonl.self_hosted_metrics",
+        )
+        if self_hosted_metrics is None:
+            raise SystemExit(f"{owner} self_hosted_metrics must be normalized explicit metrics")
+    result = RunResult(
+        task_id=task_id,
+        variant=variant,
+        model=model,
+        effort=effort,
+        tokens=tokens,
+        cost_usd=cost_usd,
+        success=success,
+        notes=notes or f"evidence replay ({provenance['source_type']})",
+        corrections=evidence_nonnegative_int(raw.get("corrections"), field="corrections", owner=owner),
+        cost_measured=cost_measured,
+        wall_time_seconds=evidence_nonnegative_float(
+            raw.get("wall_time_seconds"),
+            field="wall_time_seconds",
+            owner=owner,
+            maximum=MAX_SELF_HOSTED_LATENCY_MS / 1000,
+        ),
+        turns=evidence_nonnegative_int(raw.get("turns"), field="turns", owner=owner),
+        hook_triggers=evidence_nonnegative_int(raw.get("hook_triggers"), field="hook_triggers", owner=owner),
+        bytes_before=evidence_nonnegative_int(raw.get("bytes_before"), field="bytes_before", owner=owner),
+        bytes_after=evidence_nonnegative_int(raw.get("bytes_after"), field="bytes_after", owner=owner),
+        artifacts_used=evidence_nonnegative_int(raw.get("artifacts_used"), field="artifacts_used", owner=owner),
+        external_tokens=evidence_nonnegative_int(raw.get("external_tokens"), field="external_tokens", owner=owner),
+        external_tokens_measured=evidence_bool(
+            raw.get("external_tokens_measured"),
+            field="external_tokens_measured",
+            owner=owner,
+        ),
+        external_cost_usd=evidence_nonnegative_float(
+            raw.get("external_cost_usd"),
+            field="external_cost_usd",
+            owner=owner,
+        ),
+        external_cost_measured=evidence_bool(
+            raw.get("external_cost_measured"),
+            field="external_cost_measured",
+            owner=owner,
+        ),
+        provider_cached_tokens=evidence_nonnegative_int(
+            raw.get("provider_cached_tokens"),
+            field="provider_cached_tokens",
+            owner=owner,
+        ),
+        provider_cached_tokens_measured=evidence_bool(
+            raw.get("provider_cached_tokens_measured"),
+            field="provider_cached_tokens_measured",
+            owner=owner,
+        ),
+        primary_tokens_measured=primary_tokens_measured,
+        self_hosted_metrics=self_hosted_metrics,
+    )
+    return EvidenceReplayRow(
+        result=result,
+        source_type=str(provenance["source_type"]),
+        provider_name=provenance["provider_name"],
+        capture_command_or_export_id=provenance["capture_command_or_export_id"],
+        claim_scope=str(provenance["claim_scope"]),
+        provider_export_provenance_complete=provider_authority,
+        public_claim_eligible=False,
+        explicit_notes=explicit_notes,
+        line_number=line_number,
+    )
+def read_evidence_jsonl(path: Path) -> list[EvidenceReplayRow]:
+    fd = _open_regular_no_symlink(path)
+    try:
+        size = os.fstat(fd).st_size
+        if size > MAX_EVIDENCE_JSONL_BYTES:
+            raise SystemExit(
+                f"evidence JSONL exceeds {MAX_EVIDENCE_JSONL_BYTES} bytes: {path}"
+            )
+        rows: list[EvidenceReplayRow] = []
+        with os.fdopen(fd, "r", encoding="utf-8") as handle:
+            fd = -1
+            for line_number, line in enumerate(handle, start=1):
+                if line_number > MAX_EVIDENCE_JSONL_LINES:
+                    raise SystemExit(
+                        f"evidence JSONL line limit exceeded for {path}: > {MAX_EVIDENCE_JSONL_LINES}"
+                    )
+                if not line.strip():
+                    continue
+                try:
+                    payload = json.loads(line)
+                except json.JSONDecodeError as exc:
+                    raise SystemExit(
+                        f"{path}:{line_number} evidence row must be JSON: {exc.msg}"
+                    ) from None
+                rows.append(parse_evidence_row(payload, owner=f"{path}:{line_number}", line_number=line_number))
+    finally:
+        if fd != -1:
+            os.close(fd)
+    if not rows:
+        raise SystemExit(f"evidence JSONL contains no rows: {path}")
+    return rows
+def validate_evidence_coverage(
+    evidence_rows: list[EvidenceReplayRow],
+    runnable_targets: list[tuple[TaskFixture, Variant]],
+) -> dict[tuple[str, str], EvidenceReplayRow]:
+    by_key: dict[tuple[str, str], EvidenceReplayRow] = {}
+    for row in evidence_rows:
+        if row.key in by_key:
+            raise SystemExit(
+                f"duplicate evidence row for {row.key[0]}/{row.key[1]} "
+                f"(lines {by_key[row.key].line_number} and {row.line_number})"
+            )
+        by_key[row.key] = row
+    missing = [
+        f"{task.id}/{variant.name}"
+        for task, variant in runnable_targets
+        if (task.id, variant.name) not in by_key
+    ]
+    if missing:
+        raise SystemExit(f"missing evidence row(s) for selected targets: {', '.join(missing)}")
+    return {
+        (task.id, variant.name): by_key[(task.id, variant.name)]
+        for task, variant in runnable_targets
+    }
+def run_evidence_fixture(task: TaskFixture, variant: Variant, evidence: EvidenceReplayRow) -> RunResult:
+    result = evidence.result
+    if result.task_id != task.id or result.variant != variant.name:
+        raise SystemExit(
+            f"evidence target mismatch: expected {task.id}/{variant.name}, "
+            f"got {result.task_id}/{result.variant}"
+        )
+    if result.model == "evidence-replay":
+        result.model = task.model
+    if not result.effort:
+        result.effort = task.effort or ""
+    return result
 def row_int(row: dict[str, str], key: str) -> int:
     try:
         return int(float(row.get(key) or 0))
@@ -1546,6 +2081,77 @@ def row_cost_shift_measured(row: dict[str, str]) -> bool:
     )
+def measurement_baseline_contract() -> dict[str, Any]:
+    """Return the descriptive measurement-baseline block embedded in benchmark reports.
+    The mapping lists which CSV fields are captured, which conditions gate
+    token and shifted-cost savings claims, which fields are proxies only, and
+    which run-identity fields are not recorded yet. It is purely descriptive:
+    the CSV schema is unchanged and the block does not enable any savings
+    claim by itself.
+    """
+    captured_fields = {
+        "task_identity": ["task_id", "variant"],
+        "run_configuration": ["model", "effort", "claude_version"],
+        "primary_token_buckets": [
+            "input_tokens",
+            "output_tokens",
+            "cache_read",
+            "cache_creation",
+            "total_tokens",
+            "primary_tokens_measured",
+        ],
+        "primary_cost": ["cost_usd", "cost_measured"],
+        "provider_cache_telemetry": ["provider_cached_tokens", "provider_cached_tokens_measured"],
+        "latency": ["wall_time_seconds"],
+        "quality_and_result": ["success", "corrections", "notes"],
+        "tooling_and_proxy_metrics": ["turns", "hook_triggers", "bytes_before", "bytes_after", "artifacts_used"],
+        "shifted_cost_accounting": [
+            "external_tokens",
+            "external_tokens_measured",
+            "external_cost_usd",
+            "external_cost_measured",
+            "total_cost_with_shift_usd",
+        ],
+    }
+    token_savings_requirements = [
+        "matched successful baseline and variant tasks",
+        "primary_tokens_measured=true on both sides",
+        "quality_gate=pass",
+    ]
+    shifted_cost_requirements = [
+        "matched successful baseline and variant tasks",
+        "cost_measured=true on both sides",
+        "external_cost_measured=true when external_tokens are present",
+        "quality_gate=pass",
+    ]
+    proxy_only_fields = {
+        "byte_metrics": ["bytes_before", "bytes_after"],
+        "token_proxy": "chars_div_4_proxy_only",
+        "provider_cache": "diagnostic_telemetry_not_contextguard_token_reduction",
+    }
+    missing_identity_fields = [
+        "repo_revision",
+        "agent_harness",
+        "feature_flags",
+        "provider_name",
+        "success_command_identity",
+    ]
+    claim_boundary = {
+        "descriptive_contract_only": True,
+        "enables_savings_claims_by_itself": False,
+        "requires_matched_successful_tasks": True,
+        "requires_shifted_cost_accounting_for_cost_claims": True,
+        "raw_proxy_estimates_are_not_hosted_api_token_savings": True,
+    }
+    # Key order is kept stable because the mapping is emitted as part of the report.
+    return {
+        "schema_version": MEASUREMENT_BASELINE_SCHEMA_VERSION,
+        "csv_schema_unchanged": True,
+        "csv_columns": list(CSV_COLUMNS),
+        "captured_fields": captured_fields,
+        "claim_eligible_fields": {
+            "token_savings": token_savings_requirements,
+            "shifted_cost_savings": shifted_cost_requirements,
+        },
+        "proxy_only_fields": proxy_only_fields,
+        "missing_future_run_identity_fields": missing_identity_fields,
+        "claim_boundary": claim_boundary,
+    }
 def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str) -> dict[str, Any]:
     by_variant: dict[str, dict[str, Any]] = {}
     successful_rows_by_variant_task: dict[str, dict[str, list[dict[str, str]]]] = {}
@@ -2187,10 +2793,11 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
                 claim_status = "token_savings_observed_cost_unmeasured"
             elif token_savings_observed:
                 claim_status = "token_savings_observed_cost_shift_watch"
-    return {
+    report = {
         "schema": "context-guard-bench-report-v1",
         "baseline_variant": baseline_variant,
         "row_count": len(rows),
+        "measurement_baseline": measurement_baseline_contract(),
         "summary_by_variant": by_variant,
         "comparisons": comparisons,
         "matched_pair_evidence": matched_pair_evidence,
@@ -2200,22 +2807,854 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
             "shifted cost savings require measured primary cost and measured external cost when "
             "external tokens are present. Wall time and provider cached-token fields are diagnostic "
             "telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache "
-            "discounts must stay separate from token-reduction claims."
+            "discounts must stay separate from token-reduction claims. Public hosted savings "
+            "claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden."
         ),
     }
+    report["public_claim_readiness"] = build_public_claim_readiness(report)
+    report["default_matrix"] = build_default_matrix(report)
+    return report
+def annotate_replay_report(
+    report: dict[str, Any],
+    replay_rows: list[EvidenceReplayRow],
+    *,
+    mixed_csv: bool,
+) -> dict[str, Any]:
+    """Annotate a benchmark report with evidence-replay provenance and public-claim status.
+    Mutates ``report`` in place and returns the same dict. The report gains
+    ``raw_metric_claim_status``, ``public_claim_status``, ``public_claim_eligible``
+    and ``replay_evidence``; ``public_claim_readiness`` and ``default_matrix`` are
+    rebuilt afterwards. When the report is not eligible for a public claim,
+    ``claim_status`` is overwritten with the public claim status.
+    """
+    # Sorted, de-duplicated provenance facets of the replayed rows.
+    source_types = sorted({row.source_type for row in replay_rows})
+    provider_names = sorted({row.provider_name for row in replay_rows if row.provider_name})
+    claim_scopes = sorted({row.claim_scope for row in replay_rows})
+    # Complete same-run evidence: no mixed CSV and one replay row per report row.
+    same_run_complete = (not mixed_csv) and len(replay_rows) == int(report.get("row_count") or 0)
+    all_provider_claim_authority = bool(replay_rows) and all(
+        row.provider_export_provenance_complete for row in replay_rows
+    )
+    # Captured before claim_status may be overwritten further down.
+    raw_claim_status = str(report.get("claim_status") or "")
+    matched_pair_evidence = report.get("matched_pair_evidence")
+    # Every matched pair must allow both the token and the shifted-cost claim.
+    matched_claim_gates_allow_public_claim = (
+        isinstance(matched_pair_evidence, list)
+        and bool(matched_pair_evidence)
+        and all(
+            isinstance(item, dict)
+            and isinstance(item.get("claim_boundary"), dict)
+            and bool(item["claim_boundary"].get("token_savings_claim_allowed"))
+            and bool(item["claim_boundary"].get("shifted_cost_claim_allowed"))
+            for item in matched_pair_evidence
+        )
+    )
+    report_claim_gates_allow_public_claim = (
+        raw_claim_status in REPLAY_PUBLIC_CLAIM_ELIGIBLE_RAW_STATUSES
+        and matched_claim_gates_allow_public_claim
+    )
+    # Incomplete same-run evidence blocks first; then provenance and report gates decide.
+    if not same_run_complete:
+        public_claim_status = REPLAY_UNKNOWN_MIXED_CSV_STATUS
+        public_claim_eligible = False
+    elif all_provider_claim_authority and report_claim_gates_allow_public_claim:
+        public_claim_status = REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
+        public_claim_eligible = True
+    elif all_provider_claim_authority:
+        public_claim_status = REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS
+        public_claim_eligible = False
+    else:
+        public_claim_status = REPLAY_NOT_PUBLIC_CLAIM_STATUS
+        public_claim_eligible = False
+    report["raw_metric_claim_status"] = raw_claim_status
+    report["public_claim_status"] = public_claim_status
+    report["public_claim_eligible"] = public_claim_eligible
+    # An ineligible report exposes the public status as claim_status; the raw
+    # value stays available under raw_metric_claim_status.
+    if not public_claim_eligible:
+        report["claim_status"] = public_claim_status
+    report["replay_evidence"] = {
+        "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
+        "mode": "evidence_jsonl_replay",
+        "row_count": len(replay_rows),
+        "source_types": source_types,
+        "provider_names": provider_names,
+        "claim_scopes": claim_scopes,
+        "same_run_complete": same_run_complete,
+        "mixed_csv": mixed_csv,
+        "provider_export_provenance_complete": all_provider_claim_authority,
+        "report_claim_gates_allow_public_claim": report_claim_gates_allow_public_claim,
+        "public_claim_status": public_claim_status,
+        "public_claim_eligible": public_claim_eligible,
+        "target_keys": [f"{row.result.task_id}/{row.result.variant}" for row in replay_rows],
+        "claim_boundary": REPLAY_CLAIM_BOUNDARY,
+    }
+    # Rebuilt last so both derived blocks see the fields written above.
+    report["public_claim_readiness"] = build_public_claim_readiness(
+        report,
+        replay_rows=replay_rows,
+        mixed_csv=mixed_csv,
+    )
+    report["default_matrix"] = build_default_matrix(report)
+    return report
+def report_public_claim_status(report: dict[str, Any]) -> tuple[str, bool | None]:
+    """Return the report's public claim status and eligibility.
+    A report without a ``public_claim_status`` key yields the
+    unknown-provenance status with eligibility ``None``.
+    """
+    if "public_claim_status" not in report:
+        return (
+            "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
+            None,
+        )
+    status_text = str(report.get("public_claim_status"))
+    eligible = bool(report.get("public_claim_eligible"))
+    return status_text, eligible
+def public_claim_readiness_gate(
+    gate_id: str,
+    label: str,
+    passed: bool,
+    reason: str,
+    evidence: dict[str, Any] | None = None,
+    *,
+    unknown: bool = False,
+) -> dict[str, Any]:
+    """Build one required readiness-gate record.
+    ``status`` is "unknown" when ``unknown`` is set, otherwise "pass" or
+    "fail" according to ``passed``. An unknown gate never counts as passed.
+    """
+    if unknown:
+        status = "unknown"
+    elif passed:
+        status = "pass"
+    else:
+        status = "fail"
+    gate: dict[str, Any] = {"id": gate_id, "label": label, "required": True}
+    gate["status"] = status
+    gate["passed"] = passed and not unknown
+    gate["reason"] = reason
+    gate["evidence"] = evidence or {}
+    return gate
+def public_claim_pair_side_measured(pair: dict[str, Any], side: str, metric: str) -> bool:
+    """Report whether ``pair["measurements"][side][metric]["measured"]`` is truthy.
+    Any level that is missing or is not a dict makes the answer False.
+    """
+    node: Any = pair
+    for step in ("measurements", side, metric):
+        node = node.get(step)
+        if not isinstance(node, dict):
+            return False
+    return bool(node.get("measured"))
+def public_claim_numeric_values(items: list[Any]) -> list[float]:
+    """Return the finite int/float entries of ``items`` as floats, in order.
+    Booleans, non-numeric entries, NaN and infinities are dropped.
+    """
+    return [
+        float(entry)
+        for entry in items
+        if not isinstance(entry, bool)
+        and isinstance(entry, (int, float))
+        and math.isfinite(float(entry))
+    ]
+def public_claim_readiness_evidence_text(evidence: dict[str, Any]) -> str:
+    """Render gate evidence as a compact ``key=value; key=value`` string.
+    List and dict values show at most their first five entries, followed by
+    an ellipsis marker when more entries exist.
+    """
+    preview_limit = 5
+    rendered: list[str] = []
+    for name, payload in evidence.items():
+        if isinstance(payload, list):
+            shown = [str(entry) for entry in payload[:preview_limit]]
+        elif isinstance(payload, dict):
+            shown = [f"{k}={v}" for k, v in list(payload.items())[:preview_limit]]
+        else:
+            rendered.append(f"{name}={str(payload)}")
+            continue
+        text = ",".join(shown)
+        if len(payload) > preview_limit:
+            text += ",…"
+        rendered.append(f"{name}={text}")
+    return "; ".join(rendered)
+def build_public_claim_readiness(
+    report: dict[str, Any],
+    *,
+    replay_rows: list[EvidenceReplayRow] | None = None,
+    mixed_csv: bool = False,
+) -> dict[str, Any]:
+    """Evaluate the required public-claim readiness gates for a benchmark report.
+    Reads ``comparisons``, ``matched_pair_evidence``, ``row_count`` and
+    ``replay_evidence`` from ``report`` plus the optional replay rows, builds
+    one gate record per requirement, and returns a summary dict whose
+    ``claim_allowed`` is true only when every gate passed and the report's
+    public claim status is the eligible candidate status. ``report`` is only
+    read here, not modified.
+    """
+    # Tolerate missing or malformed report sections by falling back to empty containers.
+    comparisons = report.get("comparisons") if isinstance(report.get("comparisons"), list) else []
+    comparisons = [item for item in comparisons if isinstance(item, dict)]
+    pairs = report.get("matched_pair_evidence") if isinstance(report.get("matched_pair_evidence"), list) else []
+    pairs = [item for item in pairs if isinstance(item, dict)]
+    row_count = int(report.get("row_count") or 0)
+    replay_evidence = report.get("replay_evidence") if isinstance(report.get("replay_evidence"), dict) else {}
+    replay_count = len(replay_rows or [])
+    public_claim_status, public_claim_eligible = report_public_claim_status(report)
+    raw_metric_claim_status = report.get("raw_metric_claim_status", report.get("claim_status"))
+    comparison_variants = [str(item.get("variant")) for item in comparisons if item.get("variant")]
+    # Gate 1: every comparison needs a positive matched-success count, a positive
+    # baseline-success count, and no baseline successes missing from the variant.
+    matched_counts = public_claim_numeric_values([
+        item.get("matched_successful_task_count") for item in comparisons
+    ])
+    missing_baseline_successes = [
+        task
+        for item in comparisons
+        for task in (item.get("missing_baseline_success_tasks") or [])
+    ]
+    baseline_success_counts = public_claim_numeric_values([
+        item.get("baseline_successful_task_count") for item in comparisons
+    ])
+    matched_tasks_pass = (
+        bool(comparisons)
+        and bool(pairs)
+        and len(matched_counts) == len(comparisons)
+        and all(value > 0 for value in matched_counts)
+        and len(baseline_success_counts) == len(comparisons)
+        and all(value > 0 for value in baseline_success_counts)
+        and not missing_baseline_successes
+    )
+    gates = [
+        public_claim_readiness_gate(
+            "matched_successful_tasks",
+            "Matched successful tasks",
+            matched_tasks_pass,
+            "matched_successful_tasks_present" if matched_tasks_pass else "missing_or_regressed_matched_successful_tasks",
+            {
+                "comparison_count": len(comparisons),
+                "matched_pair_count": len(pairs),
+                "variants": comparison_variants[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
+                "min_matched_successful_task_count": min(matched_counts) if matched_counts else None,
+                "missing_baseline_success_task_count": len(missing_baseline_successes),
+            },
+        )
+    ]
+    # Gate 2: primary tokens and primary cost must be marked measured on both sides of every pair.
+    provider_measured_token_cost_pass = bool(pairs) and all(
+        public_claim_pair_side_measured(pair, "baseline", "primary_tokens")
+        and public_claim_pair_side_measured(pair, "variant", "primary_tokens")
+        and public_claim_pair_side_measured(pair, "baseline", "primary_cost_usd")
+        and public_claim_pair_side_measured(pair, "variant", "primary_cost_usd")
+        for pair in pairs
+    )
+    gates.append(public_claim_readiness_gate(
+        "provider_measured_token_cost",
+        "Provider-measured token and primary cost",
+        provider_measured_token_cost_pass,
+        "provider_measured_primary_tokens_and_cost" if provider_measured_token_cost_pass else "missing_provider_measured_primary_tokens_or_cost",
+        {
+            "matched_pair_count": len(pairs),
+            "required_fields": [
+                "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
+                "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
+                "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
+                "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured",
+            ],
+        },
+    ))
+    # Gate 3: every comparison's quality gate must be exactly "pass"; the deltas are evidence only.
+    quality_gates = sorted({str(item.get("quality_gate") or "unknown") for item in comparisons})
+    failure_deltas = public_claim_numeric_values([
+        item.get("failure_rate_delta_pp") for item in comparisons
+    ])
+    correction_deltas = public_claim_numeric_values([
+        item.get("corrections_delta_per_successful_task") for item in comparisons
+    ])
+    quality_pass = bool(comparisons) and all(item.get("quality_gate") == "pass" for item in comparisons)
+    gates.append(public_claim_readiness_gate(
+        "quality_non_inferiority",
+        "Quality non-inferiority",
+        quality_pass,
+        "all_quality_gates_pass" if quality_pass else "quality_gate_not_pass",
+        {
+            "quality_gates": quality_gates,
+            "max_failure_rate_delta_pp": max(failure_deltas) if failure_deltas else None,
+            "max_corrections_delta_per_successful_task": max(correction_deltas) if correction_deltas else None,
+        },
+    ))
+    # Gate 4: each pair must allow the shifted-cost claim and have measured shifted totals on both sides.
+    shifted_cost_pass = bool(pairs) and all(
+        isinstance(pair.get("claim_boundary"), dict)
+        and bool((pair.get("claim_boundary") or {}).get("shifted_cost_claim_allowed"))
+        and public_claim_pair_side_measured(pair, "baseline", "total_cost_with_shift_usd")
+        and public_claim_pair_side_measured(pair, "variant", "total_cost_with_shift_usd")
+        for pair in pairs
+    )
+    gates.append(public_claim_readiness_gate(
+        "shifted_cost_accounting",
+        "Shifted-cost accounting",
+        shifted_cost_pass,
+        "shifted_cost_claim_gates_pass" if shifted_cost_pass else "missing_shifted_cost_claim_accounting",
+        {
+            "matched_pair_count": len(pairs),
+            "required_fields": [
+                "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
+                "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
+                "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured",
+            ],
+        },
+    ))
+    # Gate 5: every replay row (failed rows included) needs explicit notes, and every
+    # comparison must carry the failure-rate fields. Unknown when no replay rows exist.
+    has_replay = replay_rows is not None and bool(replay_rows)
+    explicit_note_count = sum(1 for row in (replay_rows or []) if row.explicit_notes)
+    failed_rows = [row for row in (replay_rows or []) if not row.result.success]
+    failed_rows_with_notes = sum(1 for row in failed_rows if row.explicit_notes)
+    comparison_failure_fields_present = bool(comparisons) and all(
+        "baseline_failure_rate" in item
+        and "variant_failure_rate" in item
+        and "failure_rate_delta_pp" in item
+        and "paired_corrections_task_count" in item
+        for item in comparisons
+    )
+    confidence_notes_pass = (
+        has_replay
+        and explicit_note_count == replay_count
+        and failed_rows_with_notes == len(failed_rows)
+        and comparison_failure_fields_present
+    )
+    gates.append(public_claim_readiness_gate(
+        "confidence_failure_notes",
+        "Confidence and failure notes",
+        confidence_notes_pass,
+        "explicit_replay_notes_and_failure_rate_evidence_present" if confidence_notes_pass else "missing_explicit_replay_notes_or_failure_evidence",
+        {
+            "replay_row_count": replay_count,
+            "explicit_note_count": explicit_note_count,
+            "failed_row_count": len(failed_rows),
+            "failed_rows_with_notes": failed_rows_with_notes,
+            "comparison_failure_fields_present": comparison_failure_fields_present,
+        },
+        unknown=not has_replay,
+    ))
+    # Gate 6: prefer the same_run_complete flag recorded in replay_evidence; otherwise derive it here.
+    same_run_complete = bool(replay_evidence.get("same_run_complete")) if replay_evidence else (
+        has_replay and not mixed_csv and replay_count == row_count
+    )
+    source_types = sorted({row.source_type for row in (replay_rows or [])})
+    provider_names = sorted({row.provider_name for row in (replay_rows or []) if row.provider_name})
+    provider_export_pass = (
+        has_replay
+        and not mixed_csv
+        and same_run_complete
+        and replay_count == row_count
+        and all(row.provider_export_provenance_complete for row in (replay_rows or []))
+    )
+    gates.append(public_claim_readiness_gate(
+        "provider_export_provenance",
+        "Provider-export provenance",
+        provider_export_pass,
+        "complete_provider_export_same_run_provenance" if provider_export_pass else "missing_or_mixed_provider_export_provenance",
+        {
+            "replay_row_count": replay_count,
+            "report_row_count": row_count,
+            "mixed_csv": mixed_csv,
+            "same_run_complete": same_run_complete,
+            "source_types": source_types,
+            "provider_names": provider_names[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
+        },
+        unknown=not has_replay,
+    ))
+    # Unknown gates have passed=False, so they block the claim like failed gates.
+    passed_required_gate_count = sum(1 for gate in gates if gate["passed"])
+    blocking_gate_ids = [str(gate["id"]) for gate in gates if not gate["passed"]]
+    required_gates_pass = passed_required_gate_count == len(gates)
+    # The gates alone are not enough: the report must also carry the eligible candidate status.
+    claim_allowed = (
+        required_gates_pass
+        and public_claim_status == REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
+        and bool(public_claim_eligible)
+    )
+    if claim_allowed:
+        readiness_status = REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
+        reason = "all_required_public_claim_gates_pass"
+    elif not has_replay:
+        readiness_status = "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
+        reason = "replay_evidence_required_for_public_claim"
+    elif provider_export_pass:
+        readiness_status = REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS
+        reason = "provider_export_present_but_readiness_gates_failed"
+    else:
+        readiness_status = "public_claim_blocked"
+        reason = "unsupported_public_savings_claim_forbidden"
+    return {
+        "schema_version": PUBLIC_CLAIM_READINESS_SCHEMA_VERSION,
+        "generated_from": "matched_pair_evidence_and_replay_provenance",
+        "status": readiness_status,
+        "reason": reason,
+        "claim_allowed": claim_allowed,
+        "public_claim_status_observed": public_claim_status,
+        "public_claim_eligible_observed": public_claim_eligible,
+        "raw_metric_claim_status_observed": raw_metric_claim_status,
+        "required_gate_ids": list(PUBLIC_CLAIM_READINESS_GATE_IDS),
+        "required_gate_count": len(gates),
+        "passed_required_gate_count": passed_required_gate_count,
+        "blocking_gate_ids": blocking_gate_ids,
+        "gates": gates,
+        "claim_boundary": PUBLIC_CLAIM_READINESS_CLAIM_BOUNDARY,
+    }
+def default_matrix_normalized_key(value: Any) -> str:
+    """Lower-case ``value`` and collapse every non-alphanumeric run into one underscore.
+    Falsy values normalize to an empty string; leading and trailing
+    underscores are removed.
+    """
+    lowered = (str(value) if value else "").lower()
+    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
+    return collapsed.strip("_")
+def default_matrix_contains_key(haystack: str, needle: str) -> bool:
+    """Report whether the normalized ``needle`` occurs as a substring of ``haystack``.
+    ``haystack`` is used as given; a needle that normalizes to an empty
+    string never matches.
+    """
+    normalized = default_matrix_normalized_key(needle)
+    return bool(normalized) and normalized in haystack
+def infer_default_matrix_lanes(pair: dict[str, Any]) -> list[tuple[str, str]]:
+    """Return ``(lane_id, match_method)`` for every lane the pair's names point to.
+    A lane matches with method "exact_key" when one of its task keywords is
+    contained in the normalized task id; otherwise it matches with
+    "name_heuristic" when one of its variant keywords is contained in the
+    normalized variant name. Each lane contributes at most one entry.
+    """
+    normalized_task = default_matrix_normalized_key(pair.get("task_id"))
+    normalized_variant = default_matrix_normalized_key(pair.get("variant"))
+    found: list[tuple[str, str]] = []
+    for lane in DEFAULT_MATRIX_LANES:
+        lane_id = str(lane["id"])
+        task_words = tuple(str(word) for word in lane.get("task_keywords", ()))
+        variant_words = tuple(str(word) for word in lane.get("variant_keywords", ()))
+        # Task keywords are tried first, so they take precedence over variant keywords.
+        attempts = (
+            (normalized_task, task_words, "exact_key"),
+            (normalized_variant, variant_words, "name_heuristic"),
+        )
+        for haystack, words, method in attempts:
+            if any(default_matrix_contains_key(haystack, word) for word in words):
+                found.append((lane_id, method))
+                break
+    return found
+def default_matrix_number(value: Any) -> float | None:
+    """Return ``value`` as a float when it is a finite int/float, otherwise None.
+    Booleans are rejected even though they are ints.
+    """
+    if isinstance(value, (int, float)) and not isinstance(value, bool):
+        as_float = float(value)
+        if math.isfinite(as_float):
+            return as_float
+    return None
+def default_matrix_unique(values: list[Any]) -> list[Any]:
+    """Return ``values`` without repeats, keeping the first occurrence of each.
+    Membership is tested against the result list, so unhashable entries are
+    accepted.
+    """
+    kept: list[Any] = []
+    for candidate in values:
+        if candidate in kept:
+            continue
+        kept.append(candidate)
+    return kept
+def default_matrix_cap(values: list[Any]) -> list[Any]:
+    """De-duplicate ``values`` and keep at most MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS of them."""
+    deduplicated = default_matrix_unique(values)
+    return deduplicated[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS]
+def default_matrix_lane_match_method(methods: set[str]) -> str:
+    """Pick the strongest match method present: "exact_key", then "name_heuristic", else "absent"."""
+    for method in ("exact_key", "name_heuristic"):
+        if method in methods:
+            return method
+    return "absent"
+def default_matrix_clamp_classification(classification: str, ceiling: str) -> tuple[str, bool]:
+    """Limit ``classification`` to the lane's policy ``ceiling``.
+    Returns the resulting classification and whether clamping happened.
+    "reject/rework" and unrecognized ceilings are returned untouched; a
+    classification missing from the strength table ranks as strength 0.
+    """
+    unchanged = (classification, False)
+    if classification == "reject/rework" or ceiling not in DEFAULT_MATRIX_CLASSIFICATION_STRENGTH:
+        return unchanged
+    strength_now = DEFAULT_MATRIX_CLASSIFICATION_STRENGTH.get(classification, 0)
+    strength_cap = DEFAULT_MATRIX_CLASSIFICATION_STRENGTH[ceiling]
+    return (ceiling, True) if strength_now > strength_cap else unchanged
+def default_matrix_token_evidence(token_values: list[float], pair_count: int, byte_proxy_positive: bool) -> str:
+    """Label the token evidence available for a lane.
+    Without measured token values the label is "byte_proxy_only" or
+    "unavailable". With them it is "measured_positive" when every pair has a
+    positive value, "measured_regression" when any value is negative, and
+    "measured_incomplete_or_mixed" otherwise.
+    """
+    if not token_values:
+        return "byte_proxy_only" if byte_proxy_positive else "unavailable"
+    covers_every_pair = bool(pair_count) and len(token_values) == pair_count
+    if covers_every_pair and all(value > 0 for value in token_values):
+        return "measured_positive"
+    if any(value < 0 for value in token_values):
+        return "measured_regression"
+    return "measured_incomplete_or_mixed"
+def _default_matrix_delta_values(pairs: list[dict[str, Any]], key: str) -> list[float]:
+    """Collect the finite numeric ``pair["delta"][key]`` values, skipping pairs without a delta dict."""
+    values: list[float] = []
+    for pair in pairs:
+        delta = pair.get("delta")
+        if not isinstance(delta, dict):
+            continue
+        numeric = default_matrix_number(delta.get(key))
+        if numeric is not None:
+            values.append(numeric)
+    return values
+def _default_matrix_all_pairs_allow(pairs: list[dict[str, Any]], gate_key: str) -> bool:
+    """Return True only when ``pairs`` is non-empty and every pair's claim_boundary dict enables ``gate_key``."""
+    return bool(pairs) and all(
+        isinstance(pair.get("claim_boundary"), dict)
+        and bool(pair["claim_boundary"].get(gate_key))
+        for pair in pairs
+    )
+def _default_matrix_lane_claim_boundary() -> dict[str, bool]:
+    """Return a fresh copy of the fixed reporting-only claim boundary attached to each lane entry."""
+    return {
+        "classification_is_reporting_only": True,
+        "hosted_api_savings_claim_allowed": False,
+        "requires_report_claim_status_and_matched_pair_evidence": True,
+    }
+def classify_default_matrix_lane(
+    lane_id: str,
+    pairs: list[dict[str, Any]],
+    methods: set[str],
+) -> dict[str, Any]:
+    """Classify one default-matrix lane from the matched pairs assigned to it.
+    Args:
+        lane_id: Key into DEFAULT_MATRIX_LANE_BY_ID (a KeyError propagates for unknown ids).
+        pairs: Matched-pair evidence dicts bucketed into this lane.
+        methods: Match methods that placed the pairs in this lane.
+    Returns:
+        A lane entry with the classification ("default-on", "advisory",
+        "experimental" or "reject/rework", clamped to the lane's policy
+        ceiling), evidence labels and reason codes. ``public_claim_allowed``
+        is always False: the entry is reporting-only.
+    """
+    lane = DEFAULT_MATRIX_LANE_BY_ID[lane_id]
+    policy_ceiling = str(lane["policy_ceiling"])
+    if not pairs:
+        # No evidence at all: report the lane as experimental with empty details.
+        return {
+            "lane": lane_id,
+            "label": lane["label"],
+            "classification": "experimental",
+            "policy_ceiling": policy_ceiling,
+            "policy_clamped": False,
+            "lane_match_method": "absent",
+            "matched_task_count": 0,
+            "matched_tasks": [],
+            "matched_variants": [],
+            "quality_gate": "insufficient_evidence",
+            "quality_gates": [],
+            "token_evidence": "unavailable",
+            "shifted_cost_evidence": "unavailable",
+            "byte_proxy_evidence": "unavailable",
+            "matched_pair_claim_gates": {
+                "token_savings_claim_allowed": False,
+                "shifted_cost_claim_allowed": False,
+            },
+            "public_claim_allowed": False,
+            "reason_codes": ["no_matched_lane_evidence"],
+            "claim_boundary": _default_matrix_lane_claim_boundary(),
+        }
+    quality_gates = sorted({str(pair.get("quality_gate") or "unknown") for pair in pairs})
+    quality_gate = quality_gates[0] if len(quality_gates) == 1 else "mixed"
+    token_values = _default_matrix_delta_values(pairs, "token_savings_pct")
+    cost_values = _default_matrix_delta_values(pairs, "cost_savings_pct_with_shift")
+    byte_after_deltas = _default_matrix_delta_values(pairs, "bytes_after_total")
+    # A negative bytes_after delta in any pair counts as a positive byte-proxy signal.
+    byte_proxy_positive = bool(byte_after_deltas) and any(value < 0 for value in byte_after_deltas)
+    token_claim_gate = _default_matrix_all_pairs_allow(pairs, "token_savings_claim_allowed")
+    shifted_cost_claim_gate = _default_matrix_all_pairs_allow(pairs, "shifted_cost_claim_allowed")
+    tokens_positive_everywhere = len(token_values) == len(pairs) and all(value > 0 for value in token_values)
+    costs_non_negative_everywhere = len(cost_values) == len(pairs) and all(value >= 0 for value in cost_values)
+    reason_codes: list[str] = []
+    # Rejections come first; then the strongest evidence tier that holds for every pair wins.
+    if any(gate != "pass" for gate in quality_gates):
+        classification = "reject/rework"
+        reason_codes.extend(f"quality_gate_{gate}" for gate in quality_gates if gate != "pass")
+    elif any(value < 0 for value in token_values):
+        classification = "reject/rework"
+        reason_codes.append("measured_token_regression")
+    elif any(value < 0 for value in cost_values):
+        classification = "reject/rework"
+        reason_codes.append("measured_shifted_cost_regression")
+    elif (
+        tokens_positive_everywhere
+        and costs_non_negative_everywhere
+        and token_claim_gate
+        and shifted_cost_claim_gate
+    ):
+        classification = "default-on"
+        reason_codes.append("quality_pass_measured_token_and_shifted_cost_non_regression")
+    elif tokens_positive_everywhere and token_claim_gate:
+        classification = "advisory"
+        reason_codes.append("quality_pass_measured_token_savings_shifted_cost_unproven")
+    elif byte_proxy_positive:
+        classification = "advisory"
+        reason_codes.append("quality_pass_byte_proxy_only")
+    else:
+        classification = "experimental"
+        reason_codes.append("quality_pass_but_no_positive_measured_or_proxy_savings")
+    # Optional compression cannot reach advisory on byte-proxy evidence alone.
+    if lane_id == "optional_compression" and classification == "advisory" and not token_values:
+        classification = "experimental"
+        reason_codes.append("optional_compression_requires_provider_token_evidence_for_advisory")
+    classification, policy_clamped = default_matrix_clamp_classification(classification, policy_ceiling)
+    if policy_clamped:
+        reason_codes.append(f"policy_ceiling_{policy_ceiling}")
+    # Pairs without a task id are left out of both the count and the listing, so
+    # the two fields cannot disagree (a missing id used to be counted as "None").
+    task_ids = [pair.get("task_id") for pair in pairs if pair.get("task_id")]
+    if cost_values and costs_non_negative_everywhere:
+        shifted_cost_evidence = "measured_non_regression"
+    elif any(value < 0 for value in cost_values):
+        shifted_cost_evidence = "measured_regression"
+    else:
+        shifted_cost_evidence = "unavailable"
+    if byte_proxy_positive:
+        byte_proxy_evidence = "observed_positive"
+    elif byte_after_deltas:
+        byte_proxy_evidence = "observed_non_positive"
+    else:
+        byte_proxy_evidence = "unavailable"
+    return {
+        "lane": lane_id,
+        "label": lane["label"],
+        "classification": classification,
+        "policy_ceiling": policy_ceiling,
+        "policy_clamped": policy_clamped,
+        "lane_match_method": default_matrix_lane_match_method(methods),
+        "matched_task_count": len({str(task_id) for task_id in task_ids}),
+        "matched_tasks": default_matrix_cap(task_ids),
+        "matched_variants": default_matrix_cap([pair.get("variant") for pair in pairs if pair.get("variant")]),
+        "quality_gate": quality_gate,
+        "quality_gates": quality_gates,
+        "token_evidence": default_matrix_token_evidence(token_values, len(pairs), byte_proxy_positive),
+        "shifted_cost_evidence": shifted_cost_evidence,
+        "byte_proxy_evidence": byte_proxy_evidence,
+        "matched_pair_claim_gates": {
+            "token_savings_claim_allowed": token_claim_gate,
+            "shifted_cost_claim_allowed": shifted_cost_claim_gate,
+        },
+        "public_claim_allowed": False,
+        "reason_codes": default_matrix_unique(reason_codes),
+        "claim_boundary": _default_matrix_lane_claim_boundary(),
+    }
+def build_default_matrix(report: dict[str, Any]) -> dict[str, Any]:
+    """Build the reporting-only default matrix from the report's matched-pair evidence.
+    Each matched pair is assigned to the lanes inferred from its task id or
+    variant name, every known lane is classified, and variants of pairs that
+    matched no lane are listed in the summary. ``public_claim_allowed`` is
+    always False.
+    """
+    lane_pairs: dict[str, list[dict[str, Any]]] = {}
+    lane_methods: dict[str, set[str]] = {}
+    for known_lane in DEFAULT_MATRIX_LANE_IDS:
+        lane_pairs[known_lane] = []
+        lane_methods[known_lane] = set()
+    unmatched_variants: set[str] = set()
+    evidence = report.get("matched_pair_evidence")
+    if not isinstance(evidence, list):
+        evidence = []
+    for pair in evidence:
+        if not isinstance(pair, dict):
+            continue
+        assignments = infer_default_matrix_lanes(pair)
+        for lane_id, method in assignments:
+            lane_pairs[lane_id].append(pair)
+            lane_methods[lane_id].add(method)
+        # Only pairs that matched no lane contribute to the unmatched list.
+        if not assignments and pair.get("variant"):
+            unmatched_variants.add(str(pair.get("variant")))
+    lanes = []
+    for lane_id in DEFAULT_MATRIX_LANE_IDS:
+        lanes.append(classify_default_matrix_lane(lane_id, lane_pairs[lane_id], lane_methods[lane_id]))
+    observed = [lane.get("classification") for lane in lanes]
+    classification_counts = {
+        classification: observed.count(classification)
+        for classification in DEFAULT_MATRIX_CLASSIFICATIONS
+    }
+    summary = {
+        "lane_count": len(lanes),
+        "classification_counts": classification_counts,
+        "unmatched_variants": sorted(unmatched_variants)[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
+    }
+    return {
+        "schema_version": DEFAULT_MATRIX_SCHEMA_VERSION,
+        "classification_set": list(DEFAULT_MATRIX_CLASSIFICATIONS),
+        "generated_from": "matched_pair_evidence",
+        "reporting_only": True,
+        "claim_status_observed": report.get("claim_status"),
+        "public_claim_allowed": False,
+        "claim_boundary": DEFAULT_MATRIX_CLAIM_BOUNDARY,
+        "lanes": lanes,
+        "summary": summary,
+    }
+def markdown_value(value: Any) -> str:
+    """Format one value as Markdown table-cell text.
+    ``None`` and empty text render as ``n/a``, booleans as ``true``/``false``,
+    floats with six significant digits, and everything else is passed through
+    ``sanitize_note_text`` with ``|`` escaped so it cannot split a table cell.
+    """
+    if value is None:
+        return "n/a"
+    # bool is checked before the generic path so it never renders as "True"/"False".
+    if isinstance(value, bool):
+        return ("false", "true")[value]
+    if isinstance(value, float):
+        return format(value, ".6g")
+    escaped = sanitize_note_text(value).replace("|", "\\|")
+    if not escaped:
+        return "n/a"
+    return escaped
+def render_dashboard_markdown(report: dict[str, Any]) -> str:
+    """Render a benchmark report dict as a Markdown dashboard string.
+    The output always contains the header bullets, the claim-boundary note, the
+    variant summary table, the comparisons table and the re-run command. The
+    "Public claim readiness" and "Default matrix" sections appear only when the
+    report carries a dict under that key; "Replay evidence provenance" is
+    replaced by a generic provenance note when ``replay_evidence`` is not a dict.
+    Malformed entries (non-dict rows, non-list containers) are skipped instead
+    of raising, and every table cell is passed through ``markdown_value``.
+    Args:
+        report: Report mapping, e.g. the one ``write_report_outputs`` builds.
+    Returns:
+        The dashboard text, newline-terminated.
+    """
+    def table_row(cells: list[Any]) -> str:
+        # Every cell goes through markdown_value so pipes are escaped and None becomes "n/a".
+        return "| " + " | ".join(markdown_value(cell) for cell in cells) + " |"
+    def joined(values: Any) -> str:
+        # Stringify each item so a non-string entry cannot make str.join raise, and
+        # ignore non-sequence values (a bare string would otherwise be split per character).
+        if isinstance(values, (list, tuple)):
+            return ", ".join(str(item) for item in values)
+        return ""
+    public_claim_status, public_claim_eligible = report_public_claim_status(report)
+    metric_claim_status = report.get("raw_metric_claim_status", report.get("claim_status"))
+    lines = [
+        "# ContextGuard Benchmark Dashboard",
+        "",
+        f"- Schema: `{markdown_value(report.get('schema'))}`",
+        f"- Baseline variant: `{markdown_value(report.get('baseline_variant'))}`",
+        f"- Rows: {markdown_value(report.get('row_count'))}",
+        f"- Metric claim status: `{markdown_value(metric_claim_status)}`",
+        f"- Public claim status: `{markdown_value(public_claim_status)}`",
+        f"- Public claim eligible: `{markdown_value(public_claim_eligible)}`",
+        "",
+        "> Claim boundary: this dashboard is not a hosted savings claim unless report claim gates "
+        "allow it and public-claim provenance is complete. Proxy byte reductions are diagnostic "
+        "and are not hosted API token savings.",
+        "",
+        "## Variant summary",
+        "",
+        "| Variant | Runs | Successes | Failure rate | Tokens/success | Bytes saved | Token proxy saved | Quality notes |",
+        "| --- | ---: | ---: | ---: | ---: | ---: | ---: | --- |",
+    ]
+    summaries = report.get("summary_by_variant") if isinstance(report.get("summary_by_variant"), dict) else {}
+    # Validate "comparisons" once and reuse it for both the per-variant lookup and
+    # the comparisons table, so a present-but-non-list value cannot raise here.
+    comparisons = report.get("comparisons") if isinstance(report.get("comparisons"), list) else []
+    comparison_rows = [item for item in comparisons if isinstance(item, dict)]
+    comparison_by_variant = {item.get("variant"): item for item in comparison_rows}
+    for variant, summary in sorted(summaries.items()):
+        if not isinstance(summary, dict):
+            continue
+        quality = comparison_by_variant.get(variant, {}).get("quality_gate")
+        if quality is None and summary.get("is_baseline_strategy"):
+            quality = "baseline"
+        lines.append(table_row([
+            variant,
+            summary.get("runs"),
+            summary.get("successful_runs"),
+            summary.get("failure_rate"),
+            summary.get("tokens_per_successful_task"),
+            summary.get("bytes_saved_successful"),
+            summary.get("token_proxy_saved_successful"),
+            quality,
+        ]))
+    lines.extend([
+        "",
+        "## Comparisons",
+        "",
+        "| Variant | Quality gate | Matched tasks | Token paired tasks | Token savings % | Shifted cost savings % |",
+        "| --- | --- | ---: | ---: | ---: | ---: |",
+    ])
+    if comparison_rows:
+        for item in comparison_rows:
+            lines.append(table_row([
+                item.get("variant"),
+                item.get("quality_gate"),
+                item.get("matched_successful_task_count"),
+                item.get("paired_token_task_count"),
+                item.get("token_savings_pct"),
+                item.get("cost_savings_pct_with_shift"),
+            ]))
+    else:
+        # Also reached when the list held only malformed entries, so the table never ends up header-only.
+        lines.append("| n/a | n/a | 0 | 0 | n/a | n/a |")
+    readiness = report.get("public_claim_readiness") if isinstance(report.get("public_claim_readiness"), dict) else None
+    if readiness is not None:
+        lines.extend([
+            "",
+            "## Public claim readiness",
+            "",
+            f"- Status: `{markdown_value(readiness.get('status'))}`",
+            f"- Claim allowed: `{markdown_value(readiness.get('claim_allowed'))}`",
+            "",
+            "| Gate | Status | Reason | Evidence |",
+            "| --- | --- | --- | --- |",
+        ])
+        gates = readiness.get("gates") if isinstance(readiness.get("gates"), list) else []
+        for gate in gates:
+            if not isinstance(gate, dict):
+                continue
+            evidence = gate.get("evidence") if isinstance(gate.get("evidence"), dict) else {}
+            lines.append(table_row([
+                gate.get("id"),
+                gate.get("status"),
+                gate.get("reason"),
+                public_claim_readiness_evidence_text(evidence),
+            ]))
+        boundary = readiness.get("claim_boundary")
+        if isinstance(boundary, dict):
+            lines.extend([
+                "",
+                f"- Public claim boundary: {markdown_value(boundary.get('reason'))}",
+            ])
+    default_matrix = report.get("default_matrix") if isinstance(report.get("default_matrix"), dict) else None
+    if default_matrix is not None:
+        lines.extend([
+            "",
+            "## Default matrix",
+            "",
+            "| Lane | Classification | Matched Tasks | Quality Gate | Token Evidence | Public Claim | Reason |",
+            "| --- | --- | ---: | --- | --- | --- | --- |",
+        ])
+        lanes = default_matrix.get("lanes") if isinstance(default_matrix.get("lanes"), list) else []
+        for lane in lanes:
+            if not isinstance(lane, dict):
+                continue
+            reasons = lane.get("reason_codes") if isinstance(lane.get("reason_codes"), list) else []
+            lines.append(table_row([
+                lane.get("lane"),
+                lane.get("classification"),
+                lane.get("matched_task_count"),
+                lane.get("quality_gate"),
+                lane.get("token_evidence"),
+                lane.get("public_claim_allowed"),
+                # Only the first three reason codes are shown to keep the cell short.
+                joined(reasons[:3]),
+            ]))
+        boundary = default_matrix.get("claim_boundary")
+        if isinstance(boundary, dict):
+            lines.extend([
+                "",
+                f"- Matrix boundary: {markdown_value(boundary.get('reason'))}",
+            ])
+    replay = report.get("replay_evidence") if isinstance(report.get("replay_evidence"), dict) else None
+    if replay is not None:
+        lines.extend([
+            "",
+            "## Replay evidence provenance",
+            "",
+            f"- Source types: `{markdown_value(joined(replay.get('source_types')))}`",
+            f"- Claim scopes: `{markdown_value(joined(replay.get('claim_scopes')))}`",
+            f"- Same-run complete: `{markdown_value(replay.get('same_run_complete'))}`",
+            f"- Mixed/pre-existing CSV: `{markdown_value(replay.get('mixed_csv'))}`",
+            f"- Boundary: {markdown_value(replay.get('claim_boundary'))}",
+        ])
+    else:
+        lines.extend([
+            "",
+            "## Provenance note",
+            "",
+            "- CSV-only dashboards have unknown public-claim provenance unless regenerated from "
+            "the original evidence JSONL or a future trusted provenance ledger.",
+        ])
+    lines.extend([
+        "",
+        "## Re-run context",
+        "",
+        "- Evidence replay: `context-guard-bench --tasks <tasks.json> --variants <variants.json> "
+        "--evidence-jsonl <evidence.jsonl> --csv <results.csv> --report-json <report.json> "
+        "--dashboard-md <dashboard.md>`",
+    ])
+    return "\n".join(lines) + "\n"
+def write_report_outputs(
+    csv_path: Path,
+    report_path: Path | None,
+    dashboard_path: Path | None,
+    baseline_variant: str,
+    *,
+    replay_rows: list[EvidenceReplayRow] | None = None,
+    mixed_csv: bool = False,
+) -> dict[str, Any]:
+    """Summarize the benchmark CSV and write the optional JSON report and Markdown dashboard.
+    The report is built from the rows read from ``csv_path`` while the CSV lock is
+    held. When ``replay_rows`` is given (even as an empty list) the report is also
+    passed through ``annotate_replay_report`` together with ``mixed_csv``. Each
+    derived output is written under its own lock, nested inside the CSV lock.
+    Args:
+        csv_path: Source benchmark CSV to summarize.
+        report_path: JSON report destination, or ``None`` to skip it.
+        dashboard_path: Markdown dashboard destination, or ``None`` to skip it.
+        baseline_variant: Variant name forwarded to ``summarize_benchmark_rows``.
+        replay_rows: Evidence rows forwarded to ``annotate_replay_report``.
+        mixed_csv: Flag forwarded to ``annotate_replay_report``.
+    Returns:
+        The report dict, also when neither output path is provided.
+    """
+    # Keep lock order stable across all derived writes: source CSV first, then
+    # report, then dashboard. Do not introduce a derived-output -> CSV path.
+    with csv_file_lock(csv_path, create_parent=True):
+        report = summarize_benchmark_rows(read_csv_rows(csv_path), baseline_variant)
+        # "is not None" (not truthiness): an empty replay list still gets annotated.
+        if replay_rows is not None:
+            report = annotate_replay_report(report, replay_rows, mixed_csv=mixed_csv)
+        if report_path is not None:
+            with csv_file_lock(report_path, create_parent=True):
+                write_text_no_follow(
+                    report_path,
+                    json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
+                )
+        if dashboard_path is not None:
+            # The dashboard is rendered from the same in-memory report that was serialized above.
+            with csv_file_lock(dashboard_path, create_parent=True):
+                write_text_no_follow(dashboard_path, render_dashboard_markdown(report))
+    return report
 def write_report_json(csv_path: Path, report_path: Path, baseline_variant: str) -> dict[str, Any]:
+    """Write the JSON report for ``csv_path`` by delegating to ``write_report_outputs`` with no dashboard path."""
     # Keep lock order stable across all report writes: source CSV first, derived
     # report second. Do not introduce a report -> CSV path; that can deadlock
     # concurrent report generation.
-    with csv_file_lock(csv_path, create_parent=True):
-        report = summarize_benchmark_rows(read_csv_rows(csv_path), baseline_variant)
-        with csv_file_lock(report_path, create_parent=True):
-            write_text_no_follow(
-                report_path,
-                json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
-            )
-    return report
+    return write_report_outputs(csv_path, report_path, None, baseline_variant)
 def sanitize_note_text(value: Any) -> str:
@@ -2278,8 +3717,18 @@ def existing_file_identity(path: Path) -> tuple[int, int] | None:
         os.close(fd)
-def validate_distinct_output_paths(csv_path: Path, ledger_path: Path | None, report_path: Path | None) -> None:
-    outputs = [("csv", csv_path), ("ledger-jsonl", ledger_path), ("report-json", report_path)]
+def validate_distinct_output_paths(
+    csv_path: Path,
+    ledger_path: Path | None,
+    report_path: Path | None,
+    dashboard_path: Path | None = None,
+) -> None:
+    outputs = [
+        ("csv", csv_path),
+        ("ledger-jsonl", ledger_path),
+        ("report-json", report_path),
+        ("dashboard-md", dashboard_path),
+    ]
     seen: dict[Path, str] = {}
     seen_identity: dict[tuple[int, int], str] = {}
     for label, path in outputs:
@@ -2318,12 +3767,16 @@ def main() -> int:
                         help="optional JSONL ledger path for cost-shift accounting per run")
     parser.add_argument("--report-json", default=None, type=Path,
                         help="optional A/B summary report JSON path generated from --csv after real runs")
+    parser.add_argument("--dashboard-md", default=None, type=Path,
+                        help="optional Markdown dashboard path generated from the benchmark report")
+    parser.add_argument("--evidence-jsonl", default=None, type=Path,
+                        help="optional validated run-evidence JSONL replay input; skips provider invocation")
     parser.add_argument("--baseline-variant", default="baseline",
                         help="variant name used as the report baseline (default: baseline)")
     args = parser.parse_args()
     require_no_follow_file_ops_supported()
-    validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
+    validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json, args.dashboard_md)
     variants = parse_variants(args.variants)
     tasks = parse_tasks(args.tasks, variants=variants)
@@ -2338,6 +3791,61 @@ def main() -> int:
         for task, variant in targets
         if (task.id, variant.name) not in skip_keys
     ]
+    if args.evidence_jsonl is not None:
+        if args.dry_run:
+            for task, variant in targets:
+                if (task.id, variant.name) in skip_keys:
+                    print(f"skip {task.id}/{variant.name} (already in {args.csv})")
+                    continue
+                print(f"evidence replay dry-run: {task.id}/{variant.name} <- {args.evidence_jsonl}")
+            print("completed 0 run(s); results in (dry-run; no CSV writes)")
+            return 0
+        csv_had_preexisting_content = file_has_content_no_follow(args.csv)
+        evidence_rows = read_evidence_jsonl(args.evidence_jsonl)
+        evidence_by_key = validate_evidence_coverage(evidence_rows, runnable_targets)
+        claude_ver = "evidence-replay"
+        completed = 0
+        replay_rows_written: list[EvidenceReplayRow] = []
+        for task, variant in targets:
+            if (task.id, variant.name) in skip_keys:
+                print(f"skip {task.id}/{variant.name} (already in {args.csv})")
+                continue
+            evidence = evidence_by_key[(task.id, variant.name)]
+            print(f"replay {task.id}/{variant.name} ...", flush=True)
+            result = run_evidence_fixture(task, variant, evidence)
+            wrote = append_csv(args.csv, claude_ver, result, skip_existing=args.resume)
+            if wrote:
+                replay_rows_written.append(evidence)
+                if args.ledger_jsonl is not None:
+                    append_cost_shift_ledger(
+                        args.ledger_jsonl,
+                        claude_ver,
+                        result,
+                        replay_provenance=evidence.provenance_payload(),
+                    )
+            completed += 1
+            status = "ok" if result.success else "FAIL"
+            suffix = "" if wrote else " (CSV not updated; row already present)"
+            print(
+                f"  {status} tokens={sum(result.tokens.values())} cost=${result.cost_usd:.4f} "
+                f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
+            )
+        if args.report_json is not None or args.dashboard_md is not None:
+            report = write_report_outputs(
+                args.csv,
+                args.report_json,
+                args.dashboard_md,
+                args.baseline_variant,
+                replay_rows=replay_rows_written,
+                mixed_csv=csv_had_preexisting_content or bool(skip_keys) or len(replay_rows_written) != int(completed),
+            )
+            if args.report_json is not None:
+                print(f"report {args.report_json}: {report['claim_status']}")
+            if args.dashboard_md is not None:
+                print(f"dashboard {args.dashboard_md}: {report_public_claim_status(report)[0]}")
+        print(f"completed {completed} run(s); results in {args.csv}")
+        return 0
     placeholder_targets = [
         f"{task.id}/{variant.name}"
         for task, variant in runnable_targets
@@ -2390,9 +3898,12 @@ def main() -> int:
             f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
         )
     target = args.csv if not args.dry_run else "(dry-run; no CSV writes)"
-    if args.report_json is not None and not args.dry_run:
-        report = write_report_json(args.csv, args.report_json, args.baseline_variant)
-        print(f"report {args.report_json}: {report['claim_status']}")
+    if (args.report_json is not None or args.dashboard_md is not None) and not args.dry_run:
+        report = write_report_outputs(args.csv, args.report_json, args.dashboard_md, args.baseline_variant)
+        if args.report_json is not None:
+            print(f"report {args.report_json}: {report['claim_status']}")
+        if args.dashboard_md is not None:
+            print(f"dashboard {args.dashboard_md}: {report_public_claim_status(report)[0]}")
     print(f"completed {completed} run(s); results in {target}")
     return 0