npm - @ictechgy/context-guard - Versions diffs - 0.4.9 → 0.4.11 - Mend

@ictechgy/context-guard 0.4.9 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64)

package/CHANGELOG.md +28 -0
package/README.ko.md +59 -31
package/README.md +85 -36
package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
package/docs/benchmark-workflow-examples.md +3 -0
package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
package/docs/distribution.md +10 -7
package/docs/experimental-benchmark-fixtures.md +30 -6
package/package.json +4 -6
package/packaging/homebrew/context-guard.rb.template +1 -1
package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
package/plugins/context-guard/README.ko.md +20 -14
package/plugins/context-guard/README.md +26 -17
package/plugins/context-guard/bin/context-guard +147 -25
package/plugins/context-guard/bin/context-guard-artifact +884 -79
package/plugins/context-guard/bin/context-guard-audit +33 -2
package/plugins/context-guard/bin/context-guard-bench +1542 -31
package/plugins/context-guard/bin/context-guard-cache-score +665 -0
package/plugins/context-guard/bin/context-guard-compress +146 -1
package/plugins/context-guard/bin/context-guard-cost +790 -6
package/plugins/context-guard/bin/context-guard-experiments +463 -26
package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
package/plugins/context-guard/bin/context-guard-filter +163 -7
package/plugins/context-guard/bin/context-guard-guard-read +3 -0
package/plugins/context-guard/bin/context-guard-pack +892 -49
package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
package/plugins/context-guard/bin/context-guard-setup +165 -31
package/plugins/context-guard/bin/context-guard-statusline +490 -283
package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
package/plugins/context-guard/bin/context-guard-trim-output +288 -41
package/plugins/context-guard/brief/README.md +5 -5
package/plugins/context-guard/lib/context_guard_commands.py +230 -0
package/plugins/context-guard/skills/setup/SKILL.md +1 -0
package/context-guard-kit/README.md +0 -91
package/context-guard-kit/benchmark_runner.py +0 -2401
package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
package/context-guard-kit/context_compress.py +0 -695
package/context-guard-kit/context_escrow.py +0 -935
package/context-guard-kit/context_filter.py +0 -637
package/context-guard-kit/context_guard_cli.py +0 -325
package/context-guard-kit/context_guard_diet.py +0 -1711
package/context-guard-kit/context_pack.py +0 -2713
package/context-guard-kit/cost_guard.py +0 -2349
package/context-guard-kit/experimental_registry.py +0 -4348
package/context-guard-kit/failed_attempt_nudge.py +0 -567
package/context-guard-kit/guard_large_read.py +0 -690
package/context-guard-kit/hook_secret_patterns.py +0 -43
package/context-guard-kit/read_symbol.py +0 -483
package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
package/context-guard-kit/sanitize_output.py +0 -725
package/context-guard-kit/settings.example.json +0 -67
package/context-guard-kit/setup_wizard.py +0 -2515
package/context-guard-kit/statusline.sh +0 -362
package/context-guard-kit/statusline_merged.sh +0 -157
package/context-guard-kit/tool_schema_pruner.py +0 -837
package/context-guard-kit/trim_command_output.py +0 -1449

package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json ADDED

@@ -0,0 +1,182 @@
+[
+  {
+    "id": "token_savings_01_bugfix",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (bugfix). Fix a null-check regression in a sanitized request parser while preserving exact stack-frame evidence. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_02_exploration",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (exploration). Explore a small sanitized repository and identify the next file to inspect without loading unrelated logs. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_03_code_review",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (code_review). Review a focused diff and identify one correctness risk plus one test gap. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_04_long_log_analysis",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (long_log_analysis). Analyze a long sanitized CI log and cite the failing command, preserving artifact receipt fallback. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_05_migration",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (migration). Plan a safe migration of a deprecated CLI flag to a new option while keeping backwards compatibility. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_06_docs",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (docs). Update user-facing docs to clarify provider-measured matched successful task requirements. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_07_refactor",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (refactor). Refactor duplicated helper parsing into a shared function without changing public output schema. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_08_performance",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (performance). Find a deterministic hot path in a local-only helper and propose a bounded optimization. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_09_telemetry",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (telemetry). Add claim-safe telemetry fields for shifted local work without hosted cost-savings claims. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_10_cache_layout",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (cache_layout). Inspect a prompt layout and identify stable prefix versus dynamic suffix placement. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_11_tool_schema",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (tool_schema). Select a small core tool set from a sanitized MCP catalog and defer the rest by receipt. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  },
+  {
+    "id": "token_savings_12_artifact_receipt",
+    "prompt": "Fixture-only synthetic token-savings roadmap task (artifact_receipt). Verify that a digest plus receipt can re-expand omitted sanitized output exactly when needed. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
+    "model": "sonnet",
+    "effort": "medium",
+    "max_turns": 3,
+    "max_budget_usd": 1.0,
+    "allowed_tools": [],
+    "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
+    "success_cwd": ".",
+    "variant_prompt_files": {
+      "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
+      "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
+    }
+  }
+]

package/docs/benchmark-fixtures/token-savings-12task.variants.example.json ADDED

@@ -0,0 +1,10 @@
+[
+  {
+    "name": "baseline_full_context_fixture",
+    "extra_args": []
+  },
+  {
+    "name": "fixture_only_contextguard_advisory_foundations",
+    "extra_args": []
+  }
+]

package/docs/benchmark-workflow-examples.md CHANGED

@@ -26,6 +26,7 @@ Use them to decide what evidence a workflow has and what it does **not** prove:
 3. Treat `comparisons[].quality_gate != "pass"` as a warning to inspect failures, correction burden, and unmatched tasks before discussing savings.
 4. Keep byte-proxy, provider-cache, wall-time, and shifted-cost evidence in separate language from provider-measured token/cost claims. Provider-cache telemetry is not independent savings proof.
 5. Keep self-hosted local/model-server latency, memory, and quality metrics in the run-evidence ledger sidecar; do not fold them into hosted API token/cost savings claims unless provider-measured matched-task evidence separately supports that claim.
+6. For deterministic local replay, add `--evidence-jsonl ... --dashboard-md ...`. Synthetic/manual replay evidence regenerates CSV/report/dashboard artifacts, but the report is marked `replay_only_not_public_claim` or `unknown_mixed_csv` unless every report row has complete provider-export provenance. Public hosted savings claims must additionally have `public_claim_readiness.claim_allowed=true`, which requires matched successful tasks, provider-measured token/cost, quality non-inferiority, shifted-cost accounting, explicit confidence/failure notes, and complete provider-export provenance.
 ## Safe wording
@@ -42,3 +43,5 @@ The `.example.json` fixtures intentionally use full `context-guard-bench-report-
 The self-hosted metrics example is a JSONL run-evidence sidecar, not a full report shape. Its fields are additive ledger evidence only: `latency_ms`, `peak_memory_mb`, and normalized `quality_score` describe local/model-server behavior and leave hosted API report calculations unchanged. Use `context-guard experiments plan self-hosted-metrics-ledger --json ...` only as a dry-run ledger-preview checker for explicit metrics; it does not write the benchmark ledger.
 For task/variant starter fixtures rather than full report-shape examples, see [`experimental-benchmark-fixtures.md`](experimental-benchmark-fixtures.md). Those files are fixture-only and synthetic dry-run-only starters until users replace the placeholder prompts and success checks; they are not shipped OCR, visual-token, learned-compression, or output-transform benchmark results, and real claims still require provider-measured matched successful tasks plus failure-rate, correction, and shifted-cost guardrails.
+The token-savings 12-task starter also includes [`benchmark-fixtures/token-savings-12task.evidence.example.jsonl`](benchmark-fixtures/token-savings-12task.evidence.example.jsonl) for `context-guard-bench --evidence-jsonl` replay. That file is synthetic local replay evidence, not provider-measured savings proof; use it to validate dashboards and claim-boundary handling before collecting real provider exports.

package/docs/benchmark-workflows/context-pack-byte-proxy.example.json CHANGED

@@ -1,169 +1,310 @@
 {
-  "schema": "context-guard-bench-report-v1",
   "baseline_variant": "baseline",
+  "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
+  "claim_status": "insufficient_paired_data",
+  "comparisons": [
+    {
+      "baseline_corrections_per_successful_task": 0.0,
+      "baseline_failure_rate": 0.0,
+      "baseline_successful_task_count": 1,
+      "baseline_variant": "baseline",
+      "corrections_delta_per_successful_task": 0.0,
+      "cost_savings_pct_with_shift": null,
+      "failure_rate_delta_pp": 0.0,
+      "matched_successful_task_count": 1,
+      "missing_baseline_success_tasks": [],
+      "paired_corrections_task_count": 1,
+      "paired_cost_task_count": 0,
+      "paired_token_task_count": 0,
+      "paired_wall_time_task_count": 1,
+      "quality_gate": "pass",
+      "token_savings_pct": null,
+      "variant": "context_pack_auto",
+      "variant_corrections_per_successful_task": 0.0,
+      "variant_failure_rate": 0.0,
+      "wall_time_change_pct": -8.333333333333332,
+      "wall_time_delta_seconds_per_successful_task": -1.0
+    }
+  ],
+  "public_claim_readiness": {
+    "blocking_gate_ids": [
+      "matched_successful_tasks",
+      "provider_measured_token_cost",
+      "shifted_cost_accounting",
+      "confidence_failure_notes",
+      "provider_export_provenance"
+    ],
+    "claim_allowed": false,
+    "claim_boundary": {
+      "claim_allowed_field": "public_claim_readiness.claim_allowed",
+      "fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
+      "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
+      "hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
+      "id": "public_claim_readiness_authoritative_release_gate",
+      "reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
+      "reporting_only": true,
+      "requires_confidence_and_failure_notes": true,
+      "requires_matched_successful_tasks": true,
+      "requires_provider_export_provenance": true,
+      "requires_provider_measured_tokens_and_cost": true,
+      "requires_quality_non_inferiority": true,
+      "requires_shifted_cost_accounting": true,
+      "unsupported_claims_forbidden": true
+    },
+    "gates": [
+      {
+        "evidence": {
+          "comparison_count": 1,
+          "matched_pair_count": 0,
+          "min_matched_successful_task_count": 1.0,
+          "missing_baseline_success_task_count": 0,
+          "variants": [
+            "context_pack_auto"
+          ]
+        },
+        "id": "matched_successful_tasks",
+        "label": "Matched successful tasks",
+        "passed": false,
+        "reason": "missing_or_regressed_matched_successful_tasks",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "matched_pair_count": 0,
+          "required_fields": [
+            "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
+            "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
+            "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
+            "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
+          ]
+        },
+        "id": "provider_measured_token_cost",
+        "label": "Provider-measured token and primary cost",
+        "passed": false,
+        "reason": "missing_provider_measured_primary_tokens_or_cost",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "max_corrections_delta_per_successful_task": 0.0,
+          "max_failure_rate_delta_pp": 0.0,
+          "quality_gates": [
+            "pass"
+          ]
+        },
+        "id": "quality_non_inferiority",
+        "label": "Quality non-inferiority",
+        "passed": true,
+        "reason": "all_quality_gates_pass",
+        "required": true,
+        "status": "pass"
+      },
+      {
+        "evidence": {
+          "matched_pair_count": 0,
+          "required_fields": [
+            "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
+            "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
+            "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
+          ]
+        },
+        "id": "shifted_cost_accounting",
+        "label": "Shifted-cost accounting",
+        "passed": false,
+        "reason": "missing_shifted_cost_claim_accounting",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "comparison_failure_fields_present": true,
+          "explicit_note_count": 0,
+          "failed_row_count": 0,
+          "failed_rows_with_notes": 0,
+          "replay_row_count": 0
+        },
+        "id": "confidence_failure_notes",
+        "label": "Confidence and failure notes",
+        "passed": false,
+        "reason": "missing_explicit_replay_notes_or_failure_evidence",
+        "required": true,
+        "status": "unknown"
+      },
+      {
+        "evidence": {
+          "mixed_csv": false,
+          "provider_names": [],
+          "replay_row_count": 0,
+          "report_row_count": 2,
+          "same_run_complete": false,
+          "source_types": []
+        },
+        "id": "provider_export_provenance",
+        "label": "Provider-export provenance",
+        "passed": false,
+        "reason": "missing_or_mixed_provider_export_provenance",
+        "required": true,
+        "status": "unknown"
+      }
+    ],
+    "generated_from": "matched_pair_evidence_and_replay_provenance",
+    "passed_required_gate_count": 1,
+    "public_claim_eligible_observed": null,
+    "public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
+    "raw_metric_claim_status_observed": "insufficient_paired_data",
+    "reason": "replay_evidence_required_for_public_claim",
+    "required_gate_count": 6,
+    "required_gate_ids": [
+      "matched_successful_tasks",
+      "provider_measured_token_cost",
+      "quality_non_inferiority",
+      "shifted_cost_accounting",
+      "confidence_failure_notes",
+      "provider_export_provenance"
+    ],
+    "schema_version": "contextguard.bench.public-claim-readiness.v1",
+    "status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
+  },
   "row_count": 2,
+  "schema": "context-guard-bench-report-v1",
   "summary_by_variant": {
     "baseline": {
-      "runs": 1,
-      "successful_runs": 1,
+      "artifacts_used_per_successful_task": 0.0,
+      "artifacts_used_successful": 0,
+      "byte_reduction_ratio": 1.0,
+      "byte_savings_pct": 0.0,
+      "bytes_after_successful": 24000,
+      "bytes_before_successful": 24000,
+      "bytes_saved_per_successful_task": 0.0,
+      "bytes_saved_successful": 0,
+      "compression_strategy": "baseline",
+      "corrections_per_successful_task": 0.0,
+      "corrections_successful": 0,
+      "external_cost_successful_usd": 0.0,
+      "external_cost_unknown_successful": 1,
+      "external_tokens_measured_successful": 0,
+      "external_tokens_per_successful_task": null,
+      "external_tokens_successful": 0,
       "failed_runs": 0,
-      "total_tokens_all_runs": 0,
-      "primary_tokens_measured_runs": 0,
+      "failure_rate": 0.0,
+      "hook_triggers_successful": 0,
+      "is_baseline_strategy": true,
+      "observed_telemetry": {
+        "byte_savings": "observed",
+        "external_tokens": "unavailable",
+        "primary_cost": "unavailable",
+        "provider_cache": "unavailable",
+        "token_proxy": "inferred",
+        "tokens": "unavailable",
+        "wall_time": "observed"
+      },
       "primary_cost_all_runs_usd": 0.0,
       "primary_cost_measured_runs": 0,
-      "wall_time_seconds_all_runs": 12.0,
-      "wall_time_seconds_measured_runs": 1,
+      "primary_cost_measured_successful": 0,
+      "primary_cost_per_successful_task_usd": null,
+      "primary_cost_per_task_including_failures_usd": null,
+      "primary_cost_successful_usd": 0.0,
+      "primary_tokens_measured_runs": 0,
+      "primary_tokens_measured_successful": 0,
       "provider_cached_tokens_all_runs": 0,
       "provider_cached_tokens_measured_runs": 0,
+      "provider_cached_tokens_measured_successful": 0,
+      "provider_cached_tokens_per_successful_task": 0.0,
+      "provider_cached_tokens_per_task_including_failures": 0.0,
+      "provider_cached_tokens_successful": 0,
+      "runs": 1,
+      "successful_runs": 1,
+      "successful_task_count": 1,
+      "task_count": 1,
+      "token_proxy_saved_per_successful_task": 0.0,
+      "token_proxy_saved_successful": 0,
+      "tokens_per_successful_task": null,
+      "tokens_per_task_including_failures": null,
       "total_cost_with_shift_all_runs_usd": 0.0,
       "total_cost_with_shift_measured_runs": 0,
+      "total_cost_with_shift_measured_successful": 0,
+      "total_cost_with_shift_per_successful_task_usd": null,
+      "total_cost_with_shift_per_task_including_failures_usd": null,
+      "total_cost_with_shift_successful_usd": 0.0,
+      "total_tokens_all_runs": 0,
       "total_tokens_successful": 0,
-      "primary_tokens_measured_successful": 0,
-      "primary_cost_successful_usd": 0.0,
-      "primary_cost_measured_successful": 0,
-      "wall_time_seconds_successful": 12.0,
+      "turns_successful": 0,
+      "wall_time_seconds_all_runs": 12.0,
+      "wall_time_seconds_measured_runs": 1,
       "wall_time_seconds_measured_successful": 1,
-      "provider_cached_tokens_successful": 0,
-      "provider_cached_tokens_measured_successful": 0,
+      "wall_time_seconds_per_successful_task": 12.0,
+      "wall_time_seconds_per_task_including_failures": 12.0,
+      "wall_time_seconds_successful": 12.0
+    },
+    "context_pack_auto": {
+      "artifacts_used_per_successful_task": 0.0,
+      "artifacts_used_successful": 0,
+      "byte_reduction_ratio": 0.25,
+      "byte_savings_pct": 75.0,
+      "bytes_after_successful": 6000,
+      "bytes_before_successful": 24000,
+      "bytes_saved_per_successful_task": 18000.0,
+      "bytes_saved_successful": 18000,
+      "compression_strategy": "context_pack_auto",
+      "corrections_per_successful_task": 0.0,
+      "corrections_successful": 0,
       "external_cost_successful_usd": 0.0,
       "external_cost_unknown_successful": 1,
-      "total_cost_with_shift_successful_usd": 0.0,
-      "total_cost_with_shift_measured_successful": 0,
-      "external_tokens_successful": 0,
       "external_tokens_measured_successful": 0,
-      "artifacts_used_successful": 0,
-      "corrections_successful": 0,
-      "bytes_before_successful": 24000,
-      "bytes_after_successful": 24000,
-      "turns_successful": 0,
-      "hook_triggers_successful": 0,
-      "failure_rate": 0.0,
-      "task_count": 1,
-      "successful_task_count": 1,
-      "tokens_per_task_including_failures": null,
-      "wall_time_seconds_per_task_including_failures": 12.0,
-      "provider_cached_tokens_per_task_including_failures": 0.0,
-      "primary_cost_per_task_including_failures_usd": null,
-      "total_cost_with_shift_per_task_including_failures_usd": null,
-      "tokens_per_successful_task": null,
-      "wall_time_seconds_per_successful_task": 12.0,
-      "provider_cached_tokens_per_successful_task": 0.0,
-      "primary_cost_per_successful_task_usd": null,
-      "total_cost_with_shift_per_successful_task_usd": null,
       "external_tokens_per_successful_task": null,
-      "artifacts_used_per_successful_task": 0.0,
-      "corrections_per_successful_task": 0.0,
-      "byte_reduction_ratio": 1.0,
-      "compression_strategy": "baseline",
-      "is_baseline_strategy": true,
-      "bytes_saved_successful": 0,
-      "bytes_saved_per_successful_task": 0.0,
-      "byte_savings_pct": 0.0,
-      "token_proxy_saved_successful": 0,
-      "token_proxy_saved_per_successful_task": 0.0,
+      "external_tokens_successful": 0,
+      "failed_runs": 0,
+      "failure_rate": 0.0,
+      "hook_triggers_successful": 0,
+      "is_baseline_strategy": false,
       "observed_telemetry": {
-        "tokens": "unavailable",
-        "primary_cost": "unavailable",
-        "external_tokens": "unavailable",
         "byte_savings": "observed",
+        "external_tokens": "unavailable",
+        "primary_cost": "unavailable",
+        "provider_cache": "unavailable",
         "token_proxy": "inferred",
-        "wall_time": "observed",
-        "provider_cache": "unavailable"
-      }
-    },
-    "context_pack_auto": {
-      "runs": 1,
-      "successful_runs": 1,
-      "failed_runs": 0,
-      "total_tokens_all_runs": 0,
-      "primary_tokens_measured_runs": 0,
+        "tokens": "unavailable",
+        "wall_time": "observed"
+      },
       "primary_cost_all_runs_usd": 0.0,
       "primary_cost_measured_runs": 0,
-      "wall_time_seconds_all_runs": 11.0,
-      "wall_time_seconds_measured_runs": 1,
+      "primary_cost_measured_successful": 0,
+      "primary_cost_per_successful_task_usd": null,
+      "primary_cost_per_task_including_failures_usd": null,
+      "primary_cost_successful_usd": 0.0,
+      "primary_tokens_measured_runs": 0,
+      "primary_tokens_measured_successful": 0,
       "provider_cached_tokens_all_runs": 0,
       "provider_cached_tokens_measured_runs": 0,
-      "total_cost_with_shift_all_runs_usd": 0.0,
-      "total_cost_with_shift_measured_runs": 0,
-      "total_tokens_successful": 0,
-      "primary_tokens_measured_successful": 0,
-      "primary_cost_successful_usd": 0.0,
-      "primary_cost_measured_successful": 0,
-      "wall_time_seconds_successful": 11.0,
-      "wall_time_seconds_measured_successful": 1,
-      "provider_cached_tokens_successful": 0,
       "provider_cached_tokens_measured_successful": 0,
-      "external_cost_successful_usd": 0.0,
-      "external_cost_unknown_successful": 1,
-      "total_cost_with_shift_successful_usd": 0.0,
-      "total_cost_with_shift_measured_successful": 0,
-      "external_tokens_successful": 0,
-      "external_tokens_measured_successful": 0,
-      "artifacts_used_successful": 0,
-      "corrections_successful": 0,
-      "bytes_before_successful": 24000,
-      "bytes_after_successful": 6000,
-      "turns_successful": 0,
-      "hook_triggers_successful": 0,
-      "failure_rate": 0.0,
-      "task_count": 1,
+      "provider_cached_tokens_per_successful_task": 0.0,
+      "provider_cached_tokens_per_task_including_failures": 0.0,
+      "provider_cached_tokens_successful": 0,
+      "runs": 1,
+      "successful_runs": 1,
       "successful_task_count": 1,
+      "task_count": 1,
+      "token_proxy_saved_per_successful_task": 4500.0,
+      "token_proxy_saved_successful": 4500,
+      "tokens_per_successful_task": null,
       "tokens_per_task_including_failures": null,
-      "wall_time_seconds_per_task_including_failures": 11.0,
-      "provider_cached_tokens_per_task_including_failures": 0.0,
-      "primary_cost_per_task_including_failures_usd": null,
+      "total_cost_with_shift_all_runs_usd": 0.0,
+      "total_cost_with_shift_measured_runs": 0,
+      "total_cost_with_shift_measured_successful": 0,
+      "total_cost_with_shift_per_successful_task_usd": null,
       "total_cost_with_shift_per_task_including_failures_usd": null,
-      "tokens_per_successful_task": null,
+      "total_cost_with_shift_successful_usd": 0.0,
+      "total_tokens_all_runs": 0,
+      "total_tokens_successful": 0,
+      "turns_successful": 0,
+      "wall_time_seconds_all_runs": 11.0,
+      "wall_time_seconds_measured_runs": 1,
+      "wall_time_seconds_measured_successful": 1,
       "wall_time_seconds_per_successful_task": 11.0,
-      "provider_cached_tokens_per_successful_task": 0.0,
-      "primary_cost_per_successful_task_usd": null,
-      "total_cost_with_shift_per_successful_task_usd": null,
-      "external_tokens_per_successful_task": null,
-      "artifacts_used_per_successful_task": 0.0,
-      "corrections_per_successful_task": 0.0,
-      "byte_reduction_ratio": 0.25,
-      "compression_strategy": "context_pack_auto",
-      "is_baseline_strategy": false,
-      "bytes_saved_successful": 18000,
-      "bytes_saved_per_successful_task": 18000.0,
-      "byte_savings_pct": 75.0,
-      "token_proxy_saved_successful": 4500,
-      "token_proxy_saved_per_successful_task": 4500.0,
-      "observed_telemetry": {
-        "tokens": "unavailable",
-        "primary_cost": "unavailable",
-        "external_tokens": "unavailable",
-        "byte_savings": "observed",
-        "token_proxy": "inferred",
-        "wall_time": "observed",
-        "provider_cache": "unavailable"
-      }
-    }
-  },
-  "comparisons": [
-    {
-      "variant": "context_pack_auto",
-      "baseline_variant": "baseline",
-      "quality_gate": "pass",
-      "baseline_failure_rate": 0.0,
-      "variant_failure_rate": 0.0,
-      "failure_rate_delta_pp": 0.0,
-      "matched_successful_task_count": 1,
-      "baseline_successful_task_count": 1,
-      "missing_baseline_success_tasks": [],
-      "baseline_corrections_per_successful_task": 0.0,
-      "variant_corrections_per_successful_task": 0.0,
-      "paired_corrections_task_count": 1,
-      "corrections_delta_per_successful_task": 0.0,
-      "token_savings_pct": null,
-      "paired_token_task_count": 0,
-      "wall_time_delta_seconds_per_successful_task": -1.0,
-      "wall_time_change_pct": -8.333333333333332,
-      "paired_wall_time_task_count": 1,
-      "cost_savings_pct_with_shift": null,
-      "paired_cost_task_count": 0
+      "wall_time_seconds_per_task_including_failures": 11.0,
+      "wall_time_seconds_successful": 11.0
     }
-  ],
-  "claim_status": "insufficient_paired_data",
-  "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
+  }
 }