npm - @ictechgy/context-guard - Versions diffs - 0.4.9 → 0.4.11 - Mend

@ictechgy/context-guard 0.4.9 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (64) hide show

package/CHANGELOG.md +28 -0
package/README.ko.md +59 -31
package/README.md +85 -36
package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
package/docs/benchmark-workflow-examples.md +3 -0
package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
package/docs/distribution.md +10 -7
package/docs/experimental-benchmark-fixtures.md +30 -6
package/package.json +4 -6
package/packaging/homebrew/context-guard.rb.template +1 -1
package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
package/plugins/context-guard/README.ko.md +20 -14
package/plugins/context-guard/README.md +26 -17
package/plugins/context-guard/bin/context-guard +147 -25
package/plugins/context-guard/bin/context-guard-artifact +884 -79
package/plugins/context-guard/bin/context-guard-audit +33 -2
package/plugins/context-guard/bin/context-guard-bench +1542 -31
package/plugins/context-guard/bin/context-guard-cache-score +665 -0
package/plugins/context-guard/bin/context-guard-compress +146 -1
package/plugins/context-guard/bin/context-guard-cost +790 -6
package/plugins/context-guard/bin/context-guard-experiments +463 -26
package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
package/plugins/context-guard/bin/context-guard-filter +163 -7
package/plugins/context-guard/bin/context-guard-guard-read +3 -0
package/plugins/context-guard/bin/context-guard-pack +892 -49
package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
package/plugins/context-guard/bin/context-guard-setup +165 -31
package/plugins/context-guard/bin/context-guard-statusline +490 -283
package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
package/plugins/context-guard/bin/context-guard-trim-output +288 -41
package/plugins/context-guard/brief/README.md +5 -5
package/plugins/context-guard/lib/context_guard_commands.py +230 -0
package/plugins/context-guard/skills/setup/SKILL.md +1 -0
package/context-guard-kit/README.md +0 -91
package/context-guard-kit/benchmark_runner.py +0 -2401
package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
package/context-guard-kit/context_compress.py +0 -695
package/context-guard-kit/context_escrow.py +0 -935
package/context-guard-kit/context_filter.py +0 -637
package/context-guard-kit/context_guard_cli.py +0 -325
package/context-guard-kit/context_guard_diet.py +0 -1711
package/context-guard-kit/context_pack.py +0 -2713
package/context-guard-kit/cost_guard.py +0 -2349
package/context-guard-kit/experimental_registry.py +0 -4348
package/context-guard-kit/failed_attempt_nudge.py +0 -567
package/context-guard-kit/guard_large_read.py +0 -690
package/context-guard-kit/hook_secret_patterns.py +0 -43
package/context-guard-kit/read_symbol.py +0 -483
package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
package/context-guard-kit/sanitize_output.py +0 -725
package/context-guard-kit/settings.example.json +0 -67
package/context-guard-kit/setup_wizard.py +0 -2515
package/context-guard-kit/statusline.sh +0 -362
package/context-guard-kit/statusline_merged.sh +0 -157
package/context-guard-kit/tool_schema_pruner.py +0 -837
package/context-guard-kit/trim_command_output.py +0 -1449

package/docs/benchmark-workflows/provider-cache-telemetry.example.json CHANGED Viewed

@@ -1,170 +1,311 @@
 {
-  "schema": "context-guard-bench-report-v1",
   "baseline_variant": "baseline",
+  "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
+  "claim_status": "compare_variants",
+  "comparisons": [
+    {
+      "baseline_corrections_per_successful_task": 0.0,
+      "baseline_failure_rate": 0.0,
+      "baseline_successful_task_count": 1,
+      "baseline_variant": "baseline",
+      "corrections_delta_per_successful_task": 0.0,
+      "cost_savings_pct_with_shift": null,
+      "failure_rate_delta_pp": 0.0,
+      "matched_successful_task_count": 1,
+      "missing_baseline_success_tasks": [],
+      "paired_corrections_task_count": 1,
+      "paired_cost_task_count": 0,
+      "paired_token_task_count": 1,
+      "paired_wall_time_task_count": 1,
+      "quality_gate": "pass",
+      "token_delta_per_successful_task": 0.0,
+      "token_savings_pct": 0.0,
+      "variant": "cache_layout_check",
+      "variant_corrections_per_successful_task": 0.0,
+      "variant_failure_rate": 0.0,
+      "wall_time_change_pct": 0.0,
+      "wall_time_delta_seconds_per_successful_task": 0.0
+    }
+  ],
+  "public_claim_readiness": {
+    "blocking_gate_ids": [
+      "matched_successful_tasks",
+      "provider_measured_token_cost",
+      "shifted_cost_accounting",
+      "confidence_failure_notes",
+      "provider_export_provenance"
+    ],
+    "claim_allowed": false,
+    "claim_boundary": {
+      "claim_allowed_field": "public_claim_readiness.claim_allowed",
+      "fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
+      "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
+      "hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
+      "id": "public_claim_readiness_authoritative_release_gate",
+      "reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
+      "reporting_only": true,
+      "requires_confidence_and_failure_notes": true,
+      "requires_matched_successful_tasks": true,
+      "requires_provider_export_provenance": true,
+      "requires_provider_measured_tokens_and_cost": true,
+      "requires_quality_non_inferiority": true,
+      "requires_shifted_cost_accounting": true,
+      "unsupported_claims_forbidden": true
+    },
+    "gates": [
+      {
+        "evidence": {
+          "comparison_count": 1,
+          "matched_pair_count": 0,
+          "min_matched_successful_task_count": 1.0,
+          "missing_baseline_success_task_count": 0,
+          "variants": [
+            "cache_layout_check"
+          ]
+        },
+        "id": "matched_successful_tasks",
+        "label": "Matched successful tasks",
+        "passed": false,
+        "reason": "missing_or_regressed_matched_successful_tasks",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "matched_pair_count": 0,
+          "required_fields": [
+            "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
+            "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
+            "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
+            "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
+          ]
+        },
+        "id": "provider_measured_token_cost",
+        "label": "Provider-measured token and primary cost",
+        "passed": false,
+        "reason": "missing_provider_measured_primary_tokens_or_cost",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "max_corrections_delta_per_successful_task": 0.0,
+          "max_failure_rate_delta_pp": 0.0,
+          "quality_gates": [
+            "pass"
+          ]
+        },
+        "id": "quality_non_inferiority",
+        "label": "Quality non-inferiority",
+        "passed": true,
+        "reason": "all_quality_gates_pass",
+        "required": true,
+        "status": "pass"
+      },
+      {
+        "evidence": {
+          "matched_pair_count": 0,
+          "required_fields": [
+            "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
+            "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
+            "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
+          ]
+        },
+        "id": "shifted_cost_accounting",
+        "label": "Shifted-cost accounting",
+        "passed": false,
+        "reason": "missing_shifted_cost_claim_accounting",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "comparison_failure_fields_present": true,
+          "explicit_note_count": 0,
+          "failed_row_count": 0,
+          "failed_rows_with_notes": 0,
+          "replay_row_count": 0
+        },
+        "id": "confidence_failure_notes",
+        "label": "Confidence and failure notes",
+        "passed": false,
+        "reason": "missing_explicit_replay_notes_or_failure_evidence",
+        "required": true,
+        "status": "unknown"
+      },
+      {
+        "evidence": {
+          "mixed_csv": false,
+          "provider_names": [],
+          "replay_row_count": 0,
+          "report_row_count": 2,
+          "same_run_complete": false,
+          "source_types": []
+        },
+        "id": "provider_export_provenance",
+        "label": "Provider-export provenance",
+        "passed": false,
+        "reason": "missing_or_mixed_provider_export_provenance",
+        "required": true,
+        "status": "unknown"
+      }
+    ],
+    "generated_from": "matched_pair_evidence_and_replay_provenance",
+    "passed_required_gate_count": 1,
+    "public_claim_eligible_observed": null,
+    "public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
+    "raw_metric_claim_status_observed": "compare_variants",
+    "reason": "replay_evidence_required_for_public_claim",
+    "required_gate_count": 6,
+    "required_gate_ids": [
+      "matched_successful_tasks",
+      "provider_measured_token_cost",
+      "quality_non_inferiority",
+      "shifted_cost_accounting",
+      "confidence_failure_notes",
+      "provider_export_provenance"
+    ],
+    "schema_version": "contextguard.bench.public-claim-readiness.v1",
+    "status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
+  },
   "row_count": 2,
+  "schema": "context-guard-bench-report-v1",
   "summary_by_variant": {
     "baseline": {
-      "runs": 1,
-      "successful_runs": 1,
+      "artifacts_used_per_successful_task": 0.0,
+      "artifacts_used_successful": 0,
+      "byte_reduction_ratio": null,
+      "byte_savings_pct": null,
+      "bytes_after_successful": 0,
+      "bytes_before_successful": 0,
+      "bytes_saved_per_successful_task": null,
+      "bytes_saved_successful": null,
+      "compression_strategy": "baseline",
+      "corrections_per_successful_task": 0.0,
+      "corrections_successful": 0,
+      "external_cost_successful_usd": 0.0,
+      "external_cost_unknown_successful": 1,
+      "external_tokens_measured_successful": 0,
+      "external_tokens_per_successful_task": null,
+      "external_tokens_successful": 0,
       "failed_runs": 0,
-      "total_tokens_all_runs": 1200,
-      "primary_tokens_measured_runs": 1,
+      "failure_rate": 0.0,
+      "hook_triggers_successful": 0,
+      "is_baseline_strategy": true,
+      "observed_telemetry": {
+        "byte_savings": "unavailable",
+        "external_tokens": "unavailable",
+        "primary_cost": "unavailable",
+        "provider_cache": "observed",
+        "token_proxy": "unavailable",
+        "tokens": "observed",
+        "wall_time": "observed"
+      },
       "primary_cost_all_runs_usd": 0.0,
       "primary_cost_measured_runs": 0,
-      "wall_time_seconds_all_runs": 10.0,
-      "wall_time_seconds_measured_runs": 1,
+      "primary_cost_measured_successful": 0,
+      "primary_cost_per_successful_task_usd": null,
+      "primary_cost_per_task_including_failures_usd": null,
+      "primary_cost_successful_usd": 0.0,
+      "primary_tokens_measured_runs": 1,
+      "primary_tokens_measured_successful": 1,
       "provider_cached_tokens_all_runs": 0,
       "provider_cached_tokens_measured_runs": 1,
-      "total_cost_with_shift_all_runs_usd": 0.0,
-      "total_cost_with_shift_measured_runs": 0,
-      "total_tokens_successful": 1200,
-      "primary_tokens_measured_successful": 1,
-      "primary_cost_successful_usd": 0.0,
-      "primary_cost_measured_successful": 0,
-      "wall_time_seconds_successful": 10.0,
-      "wall_time_seconds_measured_successful": 1,
-      "provider_cached_tokens_successful": 0,
       "provider_cached_tokens_measured_successful": 1,
-      "external_cost_successful_usd": 0.0,
-      "external_cost_unknown_successful": 1,
-      "total_cost_with_shift_successful_usd": 0.0,
-      "total_cost_with_shift_measured_successful": 0,
-      "external_tokens_successful": 0,
-      "external_tokens_measured_successful": 0,
-      "artifacts_used_successful": 0,
-      "corrections_successful": 0,
-      "bytes_before_successful": 0,
-      "bytes_after_successful": 0,
-      "turns_successful": 0,
-      "hook_triggers_successful": 0,
-      "failure_rate": 0.0,
-      "task_count": 1,
+      "provider_cached_tokens_per_successful_task": 0.0,
+      "provider_cached_tokens_per_task_including_failures": 0.0,
+      "provider_cached_tokens_successful": 0,
+      "runs": 1,
+      "successful_runs": 1,
       "successful_task_count": 1,
+      "task_count": 1,
+      "token_proxy_saved_per_successful_task": null,
+      "token_proxy_saved_successful": null,
+      "tokens_per_successful_task": 1200.0,
       "tokens_per_task_including_failures": 1200.0,
-      "wall_time_seconds_per_task_including_failures": 10.0,
-      "provider_cached_tokens_per_task_including_failures": 0.0,
-      "primary_cost_per_task_including_failures_usd": null,
+      "total_cost_with_shift_all_runs_usd": 0.0,
+      "total_cost_with_shift_measured_runs": 0,
+      "total_cost_with_shift_measured_successful": 0,
+      "total_cost_with_shift_per_successful_task_usd": null,
       "total_cost_with_shift_per_task_including_failures_usd": null,
-      "tokens_per_successful_task": 1200.0,
+      "total_cost_with_shift_successful_usd": 0.0,
+      "total_tokens_all_runs": 1200,
+      "total_tokens_successful": 1200,
+      "turns_successful": 0,
+      "wall_time_seconds_all_runs": 10.0,
+      "wall_time_seconds_measured_runs": 1,
+      "wall_time_seconds_measured_successful": 1,
       "wall_time_seconds_per_successful_task": 10.0,
-      "provider_cached_tokens_per_successful_task": 0.0,
-      "primary_cost_per_successful_task_usd": null,
-      "total_cost_with_shift_per_successful_task_usd": null,
-      "external_tokens_per_successful_task": null,
+      "wall_time_seconds_per_task_including_failures": 10.0,
+      "wall_time_seconds_successful": 10.0
+    },
+    "cache_layout_check": {
       "artifacts_used_per_successful_task": 0.0,
-      "corrections_per_successful_task": 0.0,
+      "artifacts_used_successful": 0,
       "byte_reduction_ratio": null,
-      "compression_strategy": "baseline",
-      "is_baseline_strategy": true,
-      "bytes_saved_successful": null,
-      "bytes_saved_per_successful_task": null,
       "byte_savings_pct": null,
-      "token_proxy_saved_successful": null,
-      "token_proxy_saved_per_successful_task": null,
+      "bytes_after_successful": 0,
+      "bytes_before_successful": 0,
+      "bytes_saved_per_successful_task": null,
+      "bytes_saved_successful": null,
+      "compression_strategy": "cache_layout_check",
+      "corrections_per_successful_task": 0.0,
+      "corrections_successful": 0,
+      "external_cost_successful_usd": 0.0,
+      "external_cost_unknown_successful": 1,
+      "external_tokens_measured_successful": 0,
+      "external_tokens_per_successful_task": null,
+      "external_tokens_successful": 0,
+      "failed_runs": 0,
+      "failure_rate": 0.0,
+      "hook_triggers_successful": 0,
+      "is_baseline_strategy": false,
       "observed_telemetry": {
-        "tokens": "observed",
-        "primary_cost": "unavailable",
-        "external_tokens": "unavailable",
         "byte_savings": "unavailable",
+        "external_tokens": "unavailable",
+        "primary_cost": "unavailable",
+        "provider_cache": "observed",
         "token_proxy": "unavailable",
-        "wall_time": "observed",
-        "provider_cache": "observed"
-      }
-    },
-    "cache_layout_check": {
-      "runs": 1,
-      "successful_runs": 1,
-      "failed_runs": 0,
-      "total_tokens_all_runs": 1200,
-      "primary_tokens_measured_runs": 1,
+        "tokens": "observed",
+        "wall_time": "observed"
+      },
       "primary_cost_all_runs_usd": 0.0,
       "primary_cost_measured_runs": 0,
-      "wall_time_seconds_all_runs": 10.0,
-      "wall_time_seconds_measured_runs": 1,
+      "primary_cost_measured_successful": 0,
+      "primary_cost_per_successful_task_usd": null,
+      "primary_cost_per_task_including_failures_usd": null,
+      "primary_cost_successful_usd": 0.0,
+      "primary_tokens_measured_runs": 1,
+      "primary_tokens_measured_successful": 1,
       "provider_cached_tokens_all_runs": 900,
       "provider_cached_tokens_measured_runs": 1,
-      "total_cost_with_shift_all_runs_usd": 0.0,
-      "total_cost_with_shift_measured_runs": 0,
-      "total_tokens_successful": 1200,
-      "primary_tokens_measured_successful": 1,
-      "primary_cost_successful_usd": 0.0,
-      "primary_cost_measured_successful": 0,
-      "wall_time_seconds_successful": 10.0,
-      "wall_time_seconds_measured_successful": 1,
-      "provider_cached_tokens_successful": 900,
       "provider_cached_tokens_measured_successful": 1,
-      "external_cost_successful_usd": 0.0,
-      "external_cost_unknown_successful": 1,
-      "total_cost_with_shift_successful_usd": 0.0,
-      "total_cost_with_shift_measured_successful": 0,
-      "external_tokens_successful": 0,
-      "external_tokens_measured_successful": 0,
-      "artifacts_used_successful": 0,
-      "corrections_successful": 0,
-      "bytes_before_successful": 0,
-      "bytes_after_successful": 0,
-      "turns_successful": 0,
-      "hook_triggers_successful": 0,
-      "failure_rate": 0.0,
-      "task_count": 1,
+      "provider_cached_tokens_per_successful_task": 900.0,
+      "provider_cached_tokens_per_task_including_failures": 900.0,
+      "provider_cached_tokens_successful": 900,
+      "runs": 1,
+      "successful_runs": 1,
       "successful_task_count": 1,
+      "task_count": 1,
+      "token_proxy_saved_per_successful_task": null,
+      "token_proxy_saved_successful": null,
+      "tokens_per_successful_task": 1200.0,
       "tokens_per_task_including_failures": 1200.0,
-      "wall_time_seconds_per_task_including_failures": 10.0,
-      "provider_cached_tokens_per_task_including_failures": 900.0,
-      "primary_cost_per_task_including_failures_usd": null,
+      "total_cost_with_shift_all_runs_usd": 0.0,
+      "total_cost_with_shift_measured_runs": 0,
+      "total_cost_with_shift_measured_successful": 0,
+      "total_cost_with_shift_per_successful_task_usd": null,
       "total_cost_with_shift_per_task_including_failures_usd": null,
-      "tokens_per_successful_task": 1200.0,
+      "total_cost_with_shift_successful_usd": 0.0,
+      "total_tokens_all_runs": 1200,
+      "total_tokens_successful": 1200,
+      "turns_successful": 0,
+      "wall_time_seconds_all_runs": 10.0,
+      "wall_time_seconds_measured_runs": 1,
+      "wall_time_seconds_measured_successful": 1,
       "wall_time_seconds_per_successful_task": 10.0,
-      "provider_cached_tokens_per_successful_task": 900.0,
-      "primary_cost_per_successful_task_usd": null,
-      "total_cost_with_shift_per_successful_task_usd": null,
-      "external_tokens_per_successful_task": null,
-      "artifacts_used_per_successful_task": 0.0,
-      "corrections_per_successful_task": 0.0,
-      "byte_reduction_ratio": null,
-      "compression_strategy": "cache_layout_check",
-      "is_baseline_strategy": false,
-      "bytes_saved_successful": null,
-      "bytes_saved_per_successful_task": null,
-      "byte_savings_pct": null,
-      "token_proxy_saved_successful": null,
-      "token_proxy_saved_per_successful_task": null,
-      "observed_telemetry": {
-        "tokens": "observed",
-        "primary_cost": "unavailable",
-        "external_tokens": "unavailable",
-        "byte_savings": "unavailable",
-        "token_proxy": "unavailable",
-        "wall_time": "observed",
-        "provider_cache": "observed"
-      }
-    }
-  },
-  "comparisons": [
-    {
-      "variant": "cache_layout_check",
-      "baseline_variant": "baseline",
-      "quality_gate": "pass",
-      "baseline_failure_rate": 0.0,
-      "variant_failure_rate": 0.0,
-      "failure_rate_delta_pp": 0.0,
-      "matched_successful_task_count": 1,
-      "baseline_successful_task_count": 1,
-      "missing_baseline_success_tasks": [],
-      "baseline_corrections_per_successful_task": 0.0,
-      "variant_corrections_per_successful_task": 0.0,
-      "paired_corrections_task_count": 1,
-      "corrections_delta_per_successful_task": 0.0,
-      "token_delta_per_successful_task": 0.0,
-      "token_savings_pct": 0.0,
-      "paired_token_task_count": 1,
-      "wall_time_delta_seconds_per_successful_task": 0.0,
-      "wall_time_change_pct": 0.0,
-      "paired_wall_time_task_count": 1,
-      "cost_savings_pct_with_shift": null,
-      "paired_cost_task_count": 0
+      "wall_time_seconds_per_task_including_failures": 10.0,
+      "wall_time_seconds_successful": 10.0
     }
-  ],
-  "claim_status": "compare_variants",
-  "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
+  }
 }

package/docs/distribution.md CHANGED Viewed

@@ -25,11 +25,11 @@ context-guard setup --agent claude --scope user --verify --json
 context-guard setup --agent claude --scope user --plan
 ```
-Project scope is the default. `context-guard doctor` and `context-guard setup --verify` are read-only health checks. User scope is opt-in and requires an explicit agent for writes. Supported user-scope writes record backups and rollback metadata under `.context-guard/rollback` in the user home directory.
+Project scope is the default. `context-guard doctor` and `context-guard setup --verify` are read-only health checks. User scope is opt-in and requires an explicit agent for writes. Supported user-scope writes record backups and rollback metadata under `.context-guard/rollback` in the user home directory. Setup resolves packaged/checkout helpers first; `PATH` helper fallback is off by default and requires `--allow-path-helper-fallback` for a trusted install after canonical executable and identity validation.
 ## Runtime requirements
-The helpers are Python/shell scripts packaged through npm and Homebrew. Supported machines need:
+The helpers are Python/shell scripts packaged through npm and Homebrew as plugin-local `plugins/context-guard/bin` entrypoints plus `plugins/context-guard/lib` helpers; checkout-only `context-guard-kit` sources are not duplicated in the npm tarball. Supported machines need:
 - macOS or Linux
 - Python 3 available as `python3`
@@ -47,11 +47,14 @@ The helpers are Python/shell scripts packaged through npm and Homebrew. Supporte
 Before publishing the Homebrew tap, run the formula-specific checks locally or in CI when Homebrew is available:
+Render or copy `packaging/homebrew/context-guard.rb.template` into a real tap formula first; replace `{{VERSION}}` with the bare semver version (for example `0.4.9`, not `v0.4.9`) and `REPLACE_WITH_RELEASE_TARBALL_SHA256` with the verified tarball SHA. Do not run Homebrew audit/install directly against the placeholder template.
 ```bash
-brew style packaging/homebrew/context-guard.rb
-brew audit --strict --new packaging/homebrew/context-guard.rb
-brew install --build-from-source packaging/homebrew/context-guard.rb
-brew test context-guard
+# Example once Formula/context-guard.rb has been rendered in the tap checkout:
+brew style Formula/context-guard.rb
+brew audit --strict --new ictechgy/tap/context-guard
+brew install --build-from-source ictechgy/tap/context-guard
+brew test ictechgy/tap/context-guard
 ```
-The formula should rewrite Python shebangs to the declared Homebrew Python dependency and expose both `context-guard` and legacy compatibility wrappers from `plugins/context-guard/bin`.
+The rendered formula should rewrite Python shebangs to the declared Homebrew Python dependency and expose both `context-guard` and legacy compatibility wrappers from `plugins/context-guard/bin`.

package/docs/experimental-benchmark-fixtures.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # Experimental benchmark fixtures
-These fixtures are **fixture-only** starter scaffolds for future visual/OCR, learned-compression, and reversible output-transform experiments. They are **synthetic**, package-visible examples for `context-guard-bench` task and variant shapes; they are **not shipped benchmark results**, not OCR/compression implementations, and not hosted API savings claims.
+These fixtures are **fixture-only** starter scaffolds for future visual/OCR, learned-compression, reversible output-transform, and token-savings roadmap experiments. They are **synthetic**, package-visible examples for `context-guard-bench` task and variant shapes; they are **not shipped benchmark results**, not OCR/compression implementations, not cache/tool-deferral implementations, and not hosted API savings claims.
 Use them when designing an experiment that starts from ContextGuard's existing benchmark discipline:
@@ -12,6 +12,23 @@ Use them when designing an experiment that starts from ContextGuard's existing b
 5. Treat byte counts, image dimensions, OCR confidence, and local compressor ratios as proxy evidence. Real token/cost claims require **provider-measured** primary token/cost fields on both sides.
 6. Keep private screenshots, raw secrets, and external service endpoints out of fixture files.
+## Local replay evidence
+`context-guard-bench --evidence-jsonl <path>` can replay pre-recorded run evidence into the normal CSV/report pipeline without invoking `claude` or any task `success_command`. Pair it with `--report-json` and `--dashboard-md` to regenerate a deterministic local dashboard:
+```bash
+context-guard-bench \
+  --tasks docs/benchmark-fixtures/token-savings-12task.tasks.example.json \
+  --variants docs/benchmark-fixtures/token-savings-12task.variants.example.json \
+  --evidence-jsonl docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl \
+  --csv /tmp/contextguard-token-savings.csv \
+  --report-json /tmp/contextguard-token-savings.report.json \
+  --dashboard-md /tmp/contextguard-token-savings.dashboard.md \
+  --baseline-variant baseline_full_context_fixture
+```
+The included token-savings evidence file deliberately carries `synthetic_fixture` provenance. It validates replay/dashboard mechanics and byte-proxy reporting only: replay forces synthetic/manual rows to `primary_tokens_measured=false` and `cost_measured=false`, so it is not public hosted API token/cost savings evidence even when token-looking numbers are present. A public claim still requires matched successful tasks, provider-export provenance, provider-measured primary tokens/cost, quality non-inferiority, and shifted-cost accounting.
 ## Runner-native variant prompt files
 `context-guard-bench` supports optional file-backed `variant_prompt_files` in task fixtures. The map is keyed by variant name and lets a single logical task swap sanitized prompt evidence per variant, for example a baseline raw-output prompt versus a digest plus artifact receipt prompt. Prompt files are resolved relative to the task JSON, must be relative paths, and are read with the same no-follow/symlink-safe posture as task and variant fixtures.
@@ -20,11 +37,12 @@ This runner-native swap only proves command shape and prompt selection until the
 ## Included fixture sets
-| Fixture set | Task file | Variant file | Intended future experiment |
-| --- | --- | --- | --- |
-| Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
-| Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
-| Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
+| Fixture set | Task file | Variant file | Evidence replay file | Intended future experiment |
+| --- | --- | --- | --- | --- |
+| Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | n/a | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
+| Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | n/a | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
+| Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | n/a | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
+| Token-savings 12-task roadmap | [`benchmark-fixtures/token-savings-12task.tasks.example.json`](benchmark-fixtures/token-savings-12task.tasks.example.json) | [`benchmark-fixtures/token-savings-12task.variants.example.json`](benchmark-fixtures/token-savings-12task.variants.example.json) | [`benchmark-fixtures/token-savings-12task.evidence.example.jsonl`](benchmark-fixtures/token-savings-12task.evidence.example.jsonl) | Exercise a canonical 12-task spread for bugfix, exploration, review, log analysis, migration, docs, refactor, performance, telemetry, cache layout, tool-schema deferral, and artifact receipt experiments after real success commands and provider telemetry are supplied. |
 ## Visual/OCR fixture notes
@@ -38,6 +56,12 @@ The learned-compression fixtures describe already-sanitized context-pack or arti
 The output-transform fixtures describe already-sanitized command output comparisons and now demonstrate `variant_prompt_files` for raw sanitized output versus digest plus artifact receipt prompt evidence. They do not execute `context-guard-trim-output`, store artifacts, call `context-guard-artifact`, or invoke a provider. Future experiments should compare raw sanitized output against `--digest` output plus an `--artifact-receipt`, verify the receipt's exact re-expand command retrieves the omitted sanitized lines, and record bytes before/after, primary provider tokens, cost, success, corrections, artifact-store usage, and any external/local processing cost.
+## Token-savings 12-task roadmap fixture notes
+The token-savings 12-task fixtures are a canonical **fixture-only** spread for roadmap-level A/B design. They demonstrate `variant_prompt_files` for a baseline full-context prompt versus a ContextGuard advisory-foundations prompt that may later include cache layout lint, core-vs-deferred tool schemas, artifact receipts, and claim-safe telemetry. They do not execute `context-guard-cache-score`, `context-guard-tool-prune`, or any provider call. The companion `token-savings-12task.evidence.example.jsonl` lets users replay deterministic synthetic rows into CSV/report/dashboard outputs while preserving the same non-claim boundary.
+For real non-dry-run experiments, replace every placeholder `success_command`, keep task IDs matched across baseline and candidate variants, and require provider-measured primary token/cost data before interpreting `tokens_per_successful_task`, `total_cost_with_shift_usd`, or `external_cost_usd`. Cache predictions, char/4 token proxies, local latency, and byte reductions remain diagnostic proxy evidence unless the generated report contains matched successful task evidence and stays within the 10-percentage-point failure-rate guardrail.
 ## Safe wording
 Use language like:

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@ictechgy/context-guard",
-  "version": "0.4.9",
+  "version": "0.4.11",
   "description": "ContextGuard CLI helpers for keeping AI coding agent context focused and local-first.",
   "license": "Apache-2.0",
   "homepage": "https://github.com/ictechgy/context-guard#readme",
@@ -32,7 +32,8 @@
     "context-guard-failed-nudge": "plugins/context-guard/bin/context-guard-failed-nudge",
     "context-guard-statusline": "plugins/context-guard/bin/context-guard-statusline",
     "context-guard-statusline-merged": "plugins/context-guard/bin/context-guard-statusline-merged",
-    "context-guard-cost": "plugins/context-guard/bin/context-guard-cost"
+    "context-guard-cost": "plugins/context-guard/bin/context-guard-cost",
+    "context-guard-cache-score": "plugins/context-guard/bin/context-guard-cache-score"
   },
   "files": [
     "CHANGELOG.md",
@@ -40,10 +41,6 @@
     "NOTICE",
     "README.md",
     "README.ko.md",
-    "context-guard-kit/*.py",
-    "context-guard-kit/*.sh",
-    "context-guard-kit/README.md",
-    "context-guard-kit/settings.example.json",
     "plugins/context-guard/.claude-plugin/plugin.json",
     "plugins/context-guard/README.md",
     "plugins/context-guard/README.ko.md",
@@ -62,6 +59,7 @@
     "docs/benchmark-workflows/*.example.jsonl",
     "docs/benchmark-workflow-examples.md",
     "docs/benchmark-fixtures/*.example.json",
+    "docs/benchmark-fixtures/*.example.jsonl",
     "docs/benchmark-fixtures/*.prompt.example.md",
     "docs/experimental-benchmark-fixtures.md",
     "packaging/homebrew/context-guard.rb.template"

package/packaging/homebrew/context-guard.rb.template CHANGED Viewed

@@ -5,7 +5,7 @@ class ContextGuard < Formula
   desc "Local-first context guardrails for AI coding agents"
   homepage "https://github.com/ictechgy/context-guard"
-  url "https://github.com/ictechgy/context-guard/archive/refs/tags/v0.4.8.tar.gz"
+  url "https://github.com/ictechgy/context-guard/archive/refs/tags/v{{VERSION}}.tar.gz"
   sha256 "REPLACE_WITH_RELEASE_TARBALL_SHA256"
   license "Apache-2.0"

package/plugins/context-guard/.claude-plugin/plugin.json CHANGED Viewed

@@ -37,5 +37,5 @@
     "gated-experiments",
     "future-roadmap"
   ],
-  "version": "0.4.9"
+  "version": "0.4.11"
 }