npm - @ictechgy/context-guard - Versions diffs - 0.4.10 → 0.4.12 - Mend

@ictechgy/context-guard 0.4.10 → 0.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32)

package/CHANGELOG.md +17 -1
package/README.ko.md +46 -28
package/README.md +42 -33
package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
package/docs/benchmark-workflow-examples.md +3 -0
package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
package/docs/experimental-benchmark-fixtures.md +24 -7
package/package.json +2 -1
package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
package/plugins/context-guard/README.ko.md +14 -11
package/plugins/context-guard/README.md +15 -14
package/plugins/context-guard/bin/context-guard +48 -17
package/plugins/context-guard/bin/context-guard-artifact +342 -33
package/plugins/context-guard/bin/context-guard-audit +36 -5
package/plugins/context-guard/bin/context-guard-bench +1675 -44
package/plugins/context-guard/bin/context-guard-cache-score +347 -35
package/plugins/context-guard/bin/context-guard-compress +89 -27
package/plugins/context-guard/bin/context-guard-cost +7 -2
package/plugins/context-guard/bin/context-guard-experiments +364 -8
package/plugins/context-guard/bin/context-guard-failed-nudge +6 -2
package/plugins/context-guard/bin/context-guard-filter +88 -18
package/plugins/context-guard/bin/context-guard-pack +329 -19
package/plugins/context-guard/bin/context-guard-read-symbol +27 -0
package/plugins/context-guard/bin/context-guard-sanitize-output +245 -18
package/plugins/context-guard/bin/context-guard-setup +21 -5
package/plugins/context-guard/bin/context-guard-tool-prune +287 -62
package/plugins/context-guard/bin/context-guard-trim-output +394 -90
package/plugins/context-guard/brief/README.md +5 -5
package/plugins/context-guard/lib/context_guard_command_manifest_loader.py +123 -0
package/plugins/context-guard/lib/context_guard_commands.py +217 -190

package/docs/benchmark-workflows/provider-cache-telemetry.example.json CHANGED

@@ -1,170 +1,311 @@
 {
-  "schema": "context-guard-bench-report-v1",
   "baseline_variant": "baseline",
+  "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
+  "claim_status": "compare_variants",
+  "comparisons": [
+    {
+      "baseline_corrections_per_successful_task": 0.0,
+      "baseline_failure_rate": 0.0,
+      "baseline_successful_task_count": 1,
+      "baseline_variant": "baseline",
+      "corrections_delta_per_successful_task": 0.0,
+      "cost_savings_pct_with_shift": null,
+      "failure_rate_delta_pp": 0.0,
+      "matched_successful_task_count": 1,
+      "missing_baseline_success_tasks": [],
+      "paired_corrections_task_count": 1,
+      "paired_cost_task_count": 0,
+      "paired_token_task_count": 1,
+      "paired_wall_time_task_count": 1,
+      "quality_gate": "pass",
+      "token_delta_per_successful_task": 0.0,
+      "token_savings_pct": 0.0,
+      "variant": "cache_layout_check",
+      "variant_corrections_per_successful_task": 0.0,
+      "variant_failure_rate": 0.0,
+      "wall_time_change_pct": 0.0,
+      "wall_time_delta_seconds_per_successful_task": 0.0
+    }
+  ],
+  "public_claim_readiness": {
+    "blocking_gate_ids": [
+      "matched_successful_tasks",
+      "provider_measured_token_cost",
+      "shifted_cost_accounting",
+      "confidence_failure_notes",
+      "provider_export_provenance"
+    ],
+    "claim_allowed": false,
+    "claim_boundary": {
+      "claim_allowed_field": "public_claim_readiness.claim_allowed",
+      "fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
+      "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
+      "hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
+      "id": "public_claim_readiness_authoritative_release_gate",
+      "reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
+      "reporting_only": true,
+      "requires_confidence_and_failure_notes": true,
+      "requires_matched_successful_tasks": true,
+      "requires_provider_export_provenance": true,
+      "requires_provider_measured_tokens_and_cost": true,
+      "requires_quality_non_inferiority": true,
+      "requires_shifted_cost_accounting": true,
+      "unsupported_claims_forbidden": true
+    },
+    "gates": [
+      {
+        "evidence": {
+          "comparison_count": 1,
+          "matched_pair_count": 0,
+          "min_matched_successful_task_count": 1.0,
+          "missing_baseline_success_task_count": 0,
+          "variants": [
+            "cache_layout_check"
+          ]
+        },
+        "id": "matched_successful_tasks",
+        "label": "Matched successful tasks",
+        "passed": false,
+        "reason": "missing_or_regressed_matched_successful_tasks",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "matched_pair_count": 0,
+          "required_fields": [
+            "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
+            "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
+            "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
+            "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
+          ]
+        },
+        "id": "provider_measured_token_cost",
+        "label": "Provider-measured token and primary cost",
+        "passed": false,
+        "reason": "missing_provider_measured_primary_tokens_or_cost",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "max_corrections_delta_per_successful_task": 0.0,
+          "max_failure_rate_delta_pp": 0.0,
+          "quality_gates": [
+            "pass"
+          ]
+        },
+        "id": "quality_non_inferiority",
+        "label": "Quality non-inferiority",
+        "passed": true,
+        "reason": "all_quality_gates_pass",
+        "required": true,
+        "status": "pass"
+      },
+      {
+        "evidence": {
+          "matched_pair_count": 0,
+          "required_fields": [
+            "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
+            "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
+            "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
+          ]
+        },
+        "id": "shifted_cost_accounting",
+        "label": "Shifted-cost accounting",
+        "passed": false,
+        "reason": "missing_shifted_cost_claim_accounting",
+        "required": true,
+        "status": "fail"
+      },
+      {
+        "evidence": {
+          "comparison_failure_fields_present": true,
+          "explicit_note_count": 0,
+          "failed_row_count": 0,
+          "failed_rows_with_notes": 0,
+          "replay_row_count": 0
+        },
+        "id": "confidence_failure_notes",
+        "label": "Confidence and failure notes",
+        "passed": false,
+        "reason": "missing_explicit_replay_notes_or_failure_evidence",
+        "required": true,
+        "status": "unknown"
+      },
+      {
+        "evidence": {
+          "mixed_csv": false,
+          "provider_names": [],
+          "replay_row_count": 0,
+          "report_row_count": 2,
+          "same_run_complete": false,
+          "source_types": []
+        },
+        "id": "provider_export_provenance",
+        "label": "Provider-export provenance",
+        "passed": false,
+        "reason": "missing_or_mixed_provider_export_provenance",
+        "required": true,
+        "status": "unknown"
+      }
+    ],
+    "generated_from": "matched_pair_evidence_and_replay_provenance",
+    "passed_required_gate_count": 1,
+    "public_claim_eligible_observed": null,
+    "public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
+    "raw_metric_claim_status_observed": "compare_variants",
+    "reason": "replay_evidence_required_for_public_claim",
+    "required_gate_count": 6,
+    "required_gate_ids": [
+      "matched_successful_tasks",
+      "provider_measured_token_cost",
+      "quality_non_inferiority",
+      "shifted_cost_accounting",
+      "confidence_failure_notes",
+      "provider_export_provenance"
+    ],
+    "schema_version": "contextguard.bench.public-claim-readiness.v1",
+    "status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
+  },
   "row_count": 2,
+  "schema": "context-guard-bench-report-v1",
   "summary_by_variant": {
     "baseline": {
-      "runs": 1,
-      "successful_runs": 1,
+      "artifacts_used_per_successful_task": 0.0,
+      "artifacts_used_successful": 0,
+      "byte_reduction_ratio": null,
+      "byte_savings_pct": null,
+      "bytes_after_successful": 0,
+      "bytes_before_successful": 0,
+      "bytes_saved_per_successful_task": null,
+      "bytes_saved_successful": null,
+      "compression_strategy": "baseline",
+      "corrections_per_successful_task": 0.0,
+      "corrections_successful": 0,
+      "external_cost_successful_usd": 0.0,
+      "external_cost_unknown_successful": 1,
+      "external_tokens_measured_successful": 0,
+      "external_tokens_per_successful_task": null,
+      "external_tokens_successful": 0,
       "failed_runs": 0,
-      "total_tokens_all_runs": 1200,
-      "primary_tokens_measured_runs": 1,
+      "failure_rate": 0.0,
+      "hook_triggers_successful": 0,
+      "is_baseline_strategy": true,
+      "observed_telemetry": {
+        "byte_savings": "unavailable",
+        "external_tokens": "unavailable",
+        "primary_cost": "unavailable",
+        "provider_cache": "observed",
+        "token_proxy": "unavailable",
+        "tokens": "observed",
+        "wall_time": "observed"
+      },
       "primary_cost_all_runs_usd": 0.0,
       "primary_cost_measured_runs": 0,
-      "wall_time_seconds_all_runs": 10.0,
-      "wall_time_seconds_measured_runs": 1,
+      "primary_cost_measured_successful": 0,
+      "primary_cost_per_successful_task_usd": null,
+      "primary_cost_per_task_including_failures_usd": null,
+      "primary_cost_successful_usd": 0.0,
+      "primary_tokens_measured_runs": 1,
+      "primary_tokens_measured_successful": 1,
       "provider_cached_tokens_all_runs": 0,
       "provider_cached_tokens_measured_runs": 1,
-      "total_cost_with_shift_all_runs_usd": 0.0,
-      "total_cost_with_shift_measured_runs": 0,
-      "total_tokens_successful": 1200,
-      "primary_tokens_measured_successful": 1,
-      "primary_cost_successful_usd": 0.0,
-      "primary_cost_measured_successful": 0,
-      "wall_time_seconds_successful": 10.0,
-      "wall_time_seconds_measured_successful": 1,
-      "provider_cached_tokens_successful": 0,
       "provider_cached_tokens_measured_successful": 1,
-      "external_cost_successful_usd": 0.0,
-      "external_cost_unknown_successful": 1,
-      "total_cost_with_shift_successful_usd": 0.0,
-      "total_cost_with_shift_measured_successful": 0,
-      "external_tokens_successful": 0,
-      "external_tokens_measured_successful": 0,
-      "artifacts_used_successful": 0,
-      "corrections_successful": 0,
-      "bytes_before_successful": 0,
-      "bytes_after_successful": 0,
-      "turns_successful": 0,
-      "hook_triggers_successful": 0,
-      "failure_rate": 0.0,
-      "task_count": 1,
+      "provider_cached_tokens_per_successful_task": 0.0,
+      "provider_cached_tokens_per_task_including_failures": 0.0,
+      "provider_cached_tokens_successful": 0,
+      "runs": 1,
+      "successful_runs": 1,
       "successful_task_count": 1,
+      "task_count": 1,
+      "token_proxy_saved_per_successful_task": null,
+      "token_proxy_saved_successful": null,
+      "tokens_per_successful_task": 1200.0,
       "tokens_per_task_including_failures": 1200.0,
-      "wall_time_seconds_per_task_including_failures": 10.0,
-      "provider_cached_tokens_per_task_including_failures": 0.0,
-      "primary_cost_per_task_including_failures_usd": null,
+      "total_cost_with_shift_all_runs_usd": 0.0,
+      "total_cost_with_shift_measured_runs": 0,
+      "total_cost_with_shift_measured_successful": 0,
+      "total_cost_with_shift_per_successful_task_usd": null,
       "total_cost_with_shift_per_task_including_failures_usd": null,
-      "tokens_per_successful_task": 1200.0,
+      "total_cost_with_shift_successful_usd": 0.0,
+      "total_tokens_all_runs": 1200,
+      "total_tokens_successful": 1200,
+      "turns_successful": 0,
+      "wall_time_seconds_all_runs": 10.0,
+      "wall_time_seconds_measured_runs": 1,
+      "wall_time_seconds_measured_successful": 1,
       "wall_time_seconds_per_successful_task": 10.0,
-      "provider_cached_tokens_per_successful_task": 0.0,
-      "primary_cost_per_successful_task_usd": null,
-      "total_cost_with_shift_per_successful_task_usd": null,
-      "external_tokens_per_successful_task": null,
+      "wall_time_seconds_per_task_including_failures": 10.0,
+      "wall_time_seconds_successful": 10.0
+    },
+    "cache_layout_check": {
       "artifacts_used_per_successful_task": 0.0,
-      "corrections_per_successful_task": 0.0,
+      "artifacts_used_successful": 0,
       "byte_reduction_ratio": null,
-      "compression_strategy": "baseline",
-      "is_baseline_strategy": true,
-      "bytes_saved_successful": null,
-      "bytes_saved_per_successful_task": null,
       "byte_savings_pct": null,
-      "token_proxy_saved_successful": null,
-      "token_proxy_saved_per_successful_task": null,
+      "bytes_after_successful": 0,
+      "bytes_before_successful": 0,
+      "bytes_saved_per_successful_task": null,
+      "bytes_saved_successful": null,
+      "compression_strategy": "cache_layout_check",
+      "corrections_per_successful_task": 0.0,
+      "corrections_successful": 0,
+      "external_cost_successful_usd": 0.0,
+      "external_cost_unknown_successful": 1,
+      "external_tokens_measured_successful": 0,
+      "external_tokens_per_successful_task": null,
+      "external_tokens_successful": 0,
+      "failed_runs": 0,
+      "failure_rate": 0.0,
+      "hook_triggers_successful": 0,
+      "is_baseline_strategy": false,
       "observed_telemetry": {
-        "tokens": "observed",
-        "primary_cost": "unavailable",
-        "external_tokens": "unavailable",
         "byte_savings": "unavailable",
+        "external_tokens": "unavailable",
+        "primary_cost": "unavailable",
+        "provider_cache": "observed",
         "token_proxy": "unavailable",
-        "wall_time": "observed",
-        "provider_cache": "observed"
-      }
-    },
-    "cache_layout_check": {
-      "runs": 1,
-      "successful_runs": 1,
-      "failed_runs": 0,
-      "total_tokens_all_runs": 1200,
-      "primary_tokens_measured_runs": 1,
+        "tokens": "observed",
+        "wall_time": "observed"
+      },
       "primary_cost_all_runs_usd": 0.0,
       "primary_cost_measured_runs": 0,
-      "wall_time_seconds_all_runs": 10.0,
-      "wall_time_seconds_measured_runs": 1,
+      "primary_cost_measured_successful": 0,
+      "primary_cost_per_successful_task_usd": null,
+      "primary_cost_per_task_including_failures_usd": null,
+      "primary_cost_successful_usd": 0.0,
+      "primary_tokens_measured_runs": 1,
+      "primary_tokens_measured_successful": 1,
       "provider_cached_tokens_all_runs": 900,
       "provider_cached_tokens_measured_runs": 1,
-      "total_cost_with_shift_all_runs_usd": 0.0,
-      "total_cost_with_shift_measured_runs": 0,
-      "total_tokens_successful": 1200,
-      "primary_tokens_measured_successful": 1,
-      "primary_cost_successful_usd": 0.0,
-      "primary_cost_measured_successful": 0,
-      "wall_time_seconds_successful": 10.0,
-      "wall_time_seconds_measured_successful": 1,
-      "provider_cached_tokens_successful": 900,
       "provider_cached_tokens_measured_successful": 1,
-      "external_cost_successful_usd": 0.0,
-      "external_cost_unknown_successful": 1,
-      "total_cost_with_shift_successful_usd": 0.0,
-      "total_cost_with_shift_measured_successful": 0,
-      "external_tokens_successful": 0,
-      "external_tokens_measured_successful": 0,
-      "artifacts_used_successful": 0,
-      "corrections_successful": 0,
-      "bytes_before_successful": 0,
-      "bytes_after_successful": 0,
-      "turns_successful": 0,
-      "hook_triggers_successful": 0,
-      "failure_rate": 0.0,
-      "task_count": 1,
+      "provider_cached_tokens_per_successful_task": 900.0,
+      "provider_cached_tokens_per_task_including_failures": 900.0,
+      "provider_cached_tokens_successful": 900,
+      "runs": 1,
+      "successful_runs": 1,
       "successful_task_count": 1,
+      "task_count": 1,
+      "token_proxy_saved_per_successful_task": null,
+      "token_proxy_saved_successful": null,
+      "tokens_per_successful_task": 1200.0,
       "tokens_per_task_including_failures": 1200.0,
-      "wall_time_seconds_per_task_including_failures": 10.0,
-      "provider_cached_tokens_per_task_including_failures": 900.0,
-      "primary_cost_per_task_including_failures_usd": null,
+      "total_cost_with_shift_all_runs_usd": 0.0,
+      "total_cost_with_shift_measured_runs": 0,
+      "total_cost_with_shift_measured_successful": 0,
+      "total_cost_with_shift_per_successful_task_usd": null,
       "total_cost_with_shift_per_task_including_failures_usd": null,
-      "tokens_per_successful_task": 1200.0,
+      "total_cost_with_shift_successful_usd": 0.0,
+      "total_tokens_all_runs": 1200,
+      "total_tokens_successful": 1200,
+      "turns_successful": 0,
+      "wall_time_seconds_all_runs": 10.0,
+      "wall_time_seconds_measured_runs": 1,
+      "wall_time_seconds_measured_successful": 1,
       "wall_time_seconds_per_successful_task": 10.0,
-      "provider_cached_tokens_per_successful_task": 900.0,
-      "primary_cost_per_successful_task_usd": null,
-      "total_cost_with_shift_per_successful_task_usd": null,
-      "external_tokens_per_successful_task": null,
-      "artifacts_used_per_successful_task": 0.0,
-      "corrections_per_successful_task": 0.0,
-      "byte_reduction_ratio": null,
-      "compression_strategy": "cache_layout_check",
-      "is_baseline_strategy": false,
-      "bytes_saved_successful": null,
-      "bytes_saved_per_successful_task": null,
-      "byte_savings_pct": null,
-      "token_proxy_saved_successful": null,
-      "token_proxy_saved_per_successful_task": null,
-      "observed_telemetry": {
-        "tokens": "observed",
-        "primary_cost": "unavailable",
-        "external_tokens": "unavailable",
-        "byte_savings": "unavailable",
-        "token_proxy": "unavailable",
-        "wall_time": "observed",
-        "provider_cache": "observed"
-      }
-    }
-  },
-  "comparisons": [
-    {
-      "variant": "cache_layout_check",
-      "baseline_variant": "baseline",
-      "quality_gate": "pass",
-      "baseline_failure_rate": 0.0,
-      "variant_failure_rate": 0.0,
-      "failure_rate_delta_pp": 0.0,
-      "matched_successful_task_count": 1,
-      "baseline_successful_task_count": 1,
-      "missing_baseline_success_tasks": [],
-      "baseline_corrections_per_successful_task": 0.0,
-      "variant_corrections_per_successful_task": 0.0,
-      "paired_corrections_task_count": 1,
-      "corrections_delta_per_successful_task": 0.0,
-      "token_delta_per_successful_task": 0.0,
-      "token_savings_pct": 0.0,
-      "paired_token_task_count": 1,
-      "wall_time_delta_seconds_per_successful_task": 0.0,
-      "wall_time_change_pct": 0.0,
-      "paired_wall_time_task_count": 1,
-      "cost_savings_pct_with_shift": null,
-      "paired_cost_task_count": 0
+      "wall_time_seconds_per_task_including_failures": 10.0,
+      "wall_time_seconds_successful": 10.0
     }
-  ],
-  "claim_status": "compare_variants",
-  "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
+  }
 }

package/docs/experimental-benchmark-fixtures.md CHANGED

@@ -12,6 +12,23 @@ Use them when designing an experiment that starts from ContextGuard's existing b
 5. Treat byte counts, image dimensions, OCR confidence, and local compressor ratios as proxy evidence. Real token/cost claims require **provider-measured** primary token/cost fields on both sides.
 6. Keep private screenshots, raw secrets, and external service endpoints out of fixture files.
+## Local replay evidence
+`context-guard-bench --evidence-jsonl <path>` can replay pre-recorded run evidence into the normal CSV/report pipeline without invoking `claude` or any task `success_command`. Pair it with `--report-json` and `--dashboard-md` to regenerate a deterministic local dashboard:
+```bash
+context-guard-bench \
+  --tasks docs/benchmark-fixtures/token-savings-12task.tasks.example.json \
+  --variants docs/benchmark-fixtures/token-savings-12task.variants.example.json \
+  --evidence-jsonl docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl \
+  --csv /tmp/contextguard-token-savings.csv \
+  --report-json /tmp/contextguard-token-savings.report.json \
+  --dashboard-md /tmp/contextguard-token-savings.dashboard.md \
+  --baseline-variant baseline_full_context_fixture
+```
+The included token-savings evidence file is deliberately `synthetic_fixture` provenance. It validates replay/dashboard mechanics and byte-proxy reporting only: replay forces synthetic/manual rows to `primary_tokens_measured=false` and `cost_measured=false`, so it is not public hosted API token/cost savings evidence even when token-looking numbers are present. A public claim still requires matched successful tasks, provider-export provenance, provider-measured primary tokens/cost, quality non-inferiority, and shifted-cost accounting.
 ## Runner-native variant prompt files
 `context-guard-bench` supports optional file-backed `variant_prompt_files` in task fixtures. The map is keyed by variant name and lets a single logical task swap sanitized prompt evidence per variant, for example a baseline raw-output prompt versus a digest plus artifact receipt prompt. Prompt files are resolved relative to the task JSON, must be relative paths, and are read with the same no-follow/symlink-safe posture as task and variant fixtures.
@@ -20,12 +37,12 @@ This runner-native swap only proves command shape and prompt selection until the
 ## Included fixture sets
-| Fixture set | Task file | Variant file | Intended future experiment |
-| --- | --- | --- | --- |
-| Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
-| Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
-| Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
-| Token-savings 12-task roadmap | [`benchmark-fixtures/token-savings-12task.tasks.example.json`](benchmark-fixtures/token-savings-12task.tasks.example.json) | [`benchmark-fixtures/token-savings-12task.variants.example.json`](benchmark-fixtures/token-savings-12task.variants.example.json) | Exercise a canonical 12-task spread for bugfix, exploration, review, log analysis, migration, docs, refactor, performance, telemetry, cache layout, tool-schema deferral, and artifact receipt experiments after real success commands and provider telemetry are supplied. |
+| Fixture set | Task file | Variant file | Evidence replay file | Intended future experiment |
+| --- | --- | --- | --- | --- |
+| Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | n/a | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
+| Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | n/a | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
+| Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | n/a | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
+| Token-savings 12-task roadmap | [`benchmark-fixtures/token-savings-12task.tasks.example.json`](benchmark-fixtures/token-savings-12task.tasks.example.json) | [`benchmark-fixtures/token-savings-12task.variants.example.json`](benchmark-fixtures/token-savings-12task.variants.example.json) | [`benchmark-fixtures/token-savings-12task.evidence.example.jsonl`](benchmark-fixtures/token-savings-12task.evidence.example.jsonl) | Exercise a canonical 12-task spread for bugfix, exploration, review, log analysis, migration, docs, refactor, performance, telemetry, cache layout, tool-schema deferral, and artifact receipt experiments after real success commands and provider telemetry are supplied. |
 ## Visual/OCR fixture notes
@@ -41,7 +58,7 @@ The output-transform fixtures describe already-sanitized command output comparis
 ## Token-savings 12-task roadmap fixture notes
-The token-savings 12-task fixtures are a canonical **fixture-only** spread for roadmap-level A/B design. They demonstrate `variant_prompt_files` for a baseline full-context prompt versus a ContextGuard advisory-foundations prompt that may later include cache layout lint, core-vs-deferred tool schemas, artifact receipts, and claim-safe telemetry. They do not execute `context-guard-cache-score`, `context-guard-tool-prune`, or any provider call.
+The token-savings 12-task fixtures are a canonical **fixture-only** spread for roadmap-level A/B design. They demonstrate `variant_prompt_files` for a baseline full-context prompt versus a ContextGuard advisory-foundations prompt that may later include cache layout lint, core-vs-deferred tool schemas, artifact receipts, and claim-safe telemetry. They do not execute `context-guard-cache-score`, `context-guard-tool-prune`, or any provider call. The companion `token-savings-12task.evidence.example.jsonl` lets users replay deterministic synthetic rows into CSV/report/dashboard outputs while preserving the same non-claim boundary.
 For real non-dry-run experiments, replace every placeholder `success_command`, keep task IDs matched across baseline and candidate variants, and require provider-measured primary token/cost data before interpreting `tokens_per_successful_task`, `total_cost_with_shift_usd`, or `external_cost_usd`. Cache predictions, char/4 token proxies, local latency, and byte reductions remain diagnostic proxy evidence unless the generated report contains matched successful task evidence and stays within the 10%p failure-rate guardrail.

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "@ictechgy/context-guard",
-  "version": "0.4.10",
+  "version": "0.4.12",
   "description": "ContextGuard CLI helpers for keeping AI coding agent context focused and local-first.",
   "license": "Apache-2.0",
   "homepage": "https://github.com/ictechgy/context-guard#readme",
@@ -59,6 +59,7 @@
     "docs/benchmark-workflows/*.example.jsonl",
     "docs/benchmark-workflow-examples.md",
     "docs/benchmark-fixtures/*.example.json",
+    "docs/benchmark-fixtures/*.example.jsonl",
     "docs/benchmark-fixtures/*.prompt.example.md",
     "docs/experimental-benchmark-fixtures.md",
     "packaging/homebrew/context-guard.rb.template"

package/plugins/context-guard/.claude-plugin/plugin.json CHANGED

@@ -37,5 +37,5 @@
     "gated-experiments",
     "future-roadmap"
   ],
-  "version": "0.4.10"
+  "version": "0.4.12"
 }