@ictechgy/context-guard 0.4.9 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +28 -0
  2. package/README.ko.md +59 -31
  3. package/README.md +85 -36
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  8. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  9. package/docs/benchmark-workflow-examples.md +3 -0
  10. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  11. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  12. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  13. package/docs/distribution.md +10 -7
  14. package/docs/experimental-benchmark-fixtures.md +30 -6
  15. package/package.json +4 -6
  16. package/packaging/homebrew/context-guard.rb.template +1 -1
  17. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  18. package/plugins/context-guard/README.ko.md +20 -14
  19. package/plugins/context-guard/README.md +26 -17
  20. package/plugins/context-guard/bin/context-guard +147 -25
  21. package/plugins/context-guard/bin/context-guard-artifact +884 -79
  22. package/plugins/context-guard/bin/context-guard-audit +33 -2
  23. package/plugins/context-guard/bin/context-guard-bench +1542 -31
  24. package/plugins/context-guard/bin/context-guard-cache-score +665 -0
  25. package/plugins/context-guard/bin/context-guard-compress +146 -1
  26. package/plugins/context-guard/bin/context-guard-cost +790 -6
  27. package/plugins/context-guard/bin/context-guard-experiments +463 -26
  28. package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
  29. package/plugins/context-guard/bin/context-guard-filter +163 -7
  30. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  31. package/plugins/context-guard/bin/context-guard-pack +892 -49
  32. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  33. package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
  34. package/plugins/context-guard/bin/context-guard-setup +165 -31
  35. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  36. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  37. package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
  38. package/plugins/context-guard/bin/context-guard-trim-output +288 -41
  39. package/plugins/context-guard/brief/README.md +5 -5
  40. package/plugins/context-guard/lib/context_guard_commands.py +230 -0
  41. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  42. package/context-guard-kit/README.md +0 -91
  43. package/context-guard-kit/benchmark_runner.py +0 -2401
  44. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  45. package/context-guard-kit/context_compress.py +0 -695
  46. package/context-guard-kit/context_escrow.py +0 -935
  47. package/context-guard-kit/context_filter.py +0 -637
  48. package/context-guard-kit/context_guard_cli.py +0 -325
  49. package/context-guard-kit/context_guard_diet.py +0 -1711
  50. package/context-guard-kit/context_pack.py +0 -2713
  51. package/context-guard-kit/cost_guard.py +0 -2349
  52. package/context-guard-kit/experimental_registry.py +0 -4348
  53. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  54. package/context-guard-kit/guard_large_read.py +0 -690
  55. package/context-guard-kit/hook_secret_patterns.py +0 -43
  56. package/context-guard-kit/read_symbol.py +0 -483
  57. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  58. package/context-guard-kit/sanitize_output.py +0 -725
  59. package/context-guard-kit/settings.example.json +0 -67
  60. package/context-guard-kit/setup_wizard.py +0 -2515
  61. package/context-guard-kit/statusline.sh +0 -362
  62. package/context-guard-kit/statusline_merged.sh +0 -157
  63. package/context-guard-kit/tool_schema_pruner.py +0 -837
  64. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -0,0 +1,182 @@
1
+ [
2
+ {
3
+ "id": "token_savings_01_bugfix",
4
+ "prompt": "Fixture-only synthetic token-savings roadmap task (bugfix). Fix a null-check regression in a sanitized request parser while preserving exact stack-frame evidence. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
5
+ "model": "sonnet",
6
+ "effort": "medium",
7
+ "max_turns": 3,
8
+ "max_budget_usd": 1.0,
9
+ "allowed_tools": [],
10
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
11
+ "success_cwd": ".",
12
+ "variant_prompt_files": {
13
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
14
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
15
+ }
16
+ },
17
+ {
18
+ "id": "token_savings_02_exploration",
19
+ "prompt": "Fixture-only synthetic token-savings roadmap task (exploration). Explore a small sanitized repository and identify the next file to inspect without loading unrelated logs. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
20
+ "model": "sonnet",
21
+ "effort": "medium",
22
+ "max_turns": 3,
23
+ "max_budget_usd": 1.0,
24
+ "allowed_tools": [],
25
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
26
+ "success_cwd": ".",
27
+ "variant_prompt_files": {
28
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
29
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
30
+ }
31
+ },
32
+ {
33
+ "id": "token_savings_03_code_review",
34
+ "prompt": "Fixture-only synthetic token-savings roadmap task (code_review). Review a focused diff and identify one correctness risk plus one test gap. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
35
+ "model": "sonnet",
36
+ "effort": "medium",
37
+ "max_turns": 3,
38
+ "max_budget_usd": 1.0,
39
+ "allowed_tools": [],
40
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
41
+ "success_cwd": ".",
42
+ "variant_prompt_files": {
43
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
44
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
45
+ }
46
+ },
47
+ {
48
+ "id": "token_savings_04_long_log_analysis",
49
+ "prompt": "Fixture-only synthetic token-savings roadmap task (long_log_analysis). Analyze a long sanitized CI log and cite the failing command, preserving artifact receipt fallback. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
50
+ "model": "sonnet",
51
+ "effort": "medium",
52
+ "max_turns": 3,
53
+ "max_budget_usd": 1.0,
54
+ "allowed_tools": [],
55
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
56
+ "success_cwd": ".",
57
+ "variant_prompt_files": {
58
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
59
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
60
+ }
61
+ },
62
+ {
63
+ "id": "token_savings_05_migration",
64
+ "prompt": "Fixture-only synthetic token-savings roadmap task (migration). Plan a safe migration of a deprecated CLI flag to a new option while keeping backwards compatibility. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
65
+ "model": "sonnet",
66
+ "effort": "medium",
67
+ "max_turns": 3,
68
+ "max_budget_usd": 1.0,
69
+ "allowed_tools": [],
70
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
71
+ "success_cwd": ".",
72
+ "variant_prompt_files": {
73
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
74
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
75
+ }
76
+ },
77
+ {
78
+ "id": "token_savings_06_docs",
79
+ "prompt": "Fixture-only synthetic token-savings roadmap task (docs). Update user-facing docs to clarify provider-measured matched successful task requirements. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
80
+ "model": "sonnet",
81
+ "effort": "medium",
82
+ "max_turns": 3,
83
+ "max_budget_usd": 1.0,
84
+ "allowed_tools": [],
85
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
86
+ "success_cwd": ".",
87
+ "variant_prompt_files": {
88
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
89
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
90
+ }
91
+ },
92
+ {
93
+ "id": "token_savings_07_refactor",
94
+ "prompt": "Fixture-only synthetic token-savings roadmap task (refactor). Refactor duplicated helper parsing into a shared function without changing public output schema. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
95
+ "model": "sonnet",
96
+ "effort": "medium",
97
+ "max_turns": 3,
98
+ "max_budget_usd": 1.0,
99
+ "allowed_tools": [],
100
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
101
+ "success_cwd": ".",
102
+ "variant_prompt_files": {
103
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
104
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
105
+ }
106
+ },
107
+ {
108
+ "id": "token_savings_08_performance",
109
+ "prompt": "Fixture-only synthetic token-savings roadmap task (performance). Find a deterministic hot path in a local-only helper and propose a bounded optimization. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
110
+ "model": "sonnet",
111
+ "effort": "medium",
112
+ "max_turns": 3,
113
+ "max_budget_usd": 1.0,
114
+ "allowed_tools": [],
115
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
116
+ "success_cwd": ".",
117
+ "variant_prompt_files": {
118
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
119
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
120
+ }
121
+ },
122
+ {
123
+ "id": "token_savings_09_telemetry",
124
+ "prompt": "Fixture-only synthetic token-savings roadmap task (telemetry). Add claim-safe telemetry fields for shifted local work without hosted cost-savings claims. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
125
+ "model": "sonnet",
126
+ "effort": "medium",
127
+ "max_turns": 3,
128
+ "max_budget_usd": 1.0,
129
+ "allowed_tools": [],
130
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
131
+ "success_cwd": ".",
132
+ "variant_prompt_files": {
133
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
134
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
135
+ }
136
+ },
137
+ {
138
+ "id": "token_savings_10_cache_layout",
139
+ "prompt": "Fixture-only synthetic token-savings roadmap task (cache_layout). Inspect a prompt layout and identify stable prefix versus dynamic suffix placement. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
140
+ "model": "sonnet",
141
+ "effort": "medium",
142
+ "max_turns": 3,
143
+ "max_budget_usd": 1.0,
144
+ "allowed_tools": [],
145
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
146
+ "success_cwd": ".",
147
+ "variant_prompt_files": {
148
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
149
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
150
+ }
151
+ },
152
+ {
153
+ "id": "token_savings_11_tool_schema",
154
+ "prompt": "Fixture-only synthetic token-savings roadmap task (tool_schema). Select a small core tool set from a sanitized MCP catalog and defer the rest by receipt. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
155
+ "model": "sonnet",
156
+ "effort": "medium",
157
+ "max_turns": 3,
158
+ "max_budget_usd": 1.0,
159
+ "allowed_tools": [],
160
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
161
+ "success_cwd": ".",
162
+ "variant_prompt_files": {
163
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
164
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
165
+ }
166
+ },
167
+ {
168
+ "id": "token_savings_12_artifact_receipt",
169
+ "prompt": "Fixture-only synthetic token-savings roadmap task (artifact_receipt). Verify that a digest plus receipt can re-expand omitted sanitized output exactly when needed. This validates benchmark shape only; real claims require provider-measured tokens/costs for matched successful tasks, failure-rate guardrail, human corrections, and shifted-cost accounting.",
170
+ "model": "sonnet",
171
+ "effort": "medium",
172
+ "max_turns": 3,
173
+ "max_budget_usd": 1.0,
174
+ "allowed_tools": [],
175
+ "success_command": "python3 -c \"raise SystemExit('fixture-only placeholder: replace success_command before real benchmark runs')\"",
176
+ "success_cwd": ".",
177
+ "variant_prompt_files": {
178
+ "baseline_full_context_fixture": "token-savings-12task-baseline.prompt.example.md",
179
+ "fixture_only_contextguard_advisory_foundations": "token-savings-12task-contextguard.prompt.example.md"
180
+ }
181
+ }
182
+ ]
@@ -0,0 +1,10 @@
1
+ [
2
+ {
3
+ "name": "baseline_full_context_fixture",
4
+ "extra_args": []
5
+ },
6
+ {
7
+ "name": "fixture_only_contextguard_advisory_foundations",
8
+ "extra_args": []
9
+ }
10
+ ]
@@ -26,6 +26,7 @@ Use them to decide what evidence a workflow has and what it does **not** prove:
26
26
  3. Treat `comparisons[].quality_gate != "pass"` as a warning to inspect failures, correction burden, and unmatched tasks before discussing savings.
27
27
  4. Keep byte-proxy, provider-cache, wall-time, and shifted-cost evidence in separate language from provider-measured token/cost claims. Provider-cache telemetry is not independent savings proof.
28
28
  5. Keep self-hosted local/model-server latency, memory, and quality metrics in the run-evidence ledger sidecar; do not fold them into hosted API token/cost savings claims unless provider-measured matched-task evidence separately supports that claim.
29
+ 6. For deterministic local replay, add `--evidence-jsonl ... --dashboard-md ...`. Synthetic/manual replay evidence regenerates CSV/report/dashboard artifacts, but the report is marked `replay_only_not_public_claim` or `unknown_mixed_csv` unless every report row has complete provider-export provenance. Public hosted savings claims must additionally have `public_claim_readiness.claim_allowed=true`, which requires matched successful tasks, provider-measured token/cost, quality non-inferiority, shifted-cost accounting, explicit confidence/failure notes, and complete provider-export provenance.
29
30
 
30
31
  ## Safe wording
31
32
 
@@ -42,3 +43,5 @@ The `.example.json` fixtures intentionally use full `context-guard-bench-report-
42
43
  The self-hosted metrics example is a JSONL run-evidence sidecar, not a full report shape. Its fields are additive ledger evidence only: `latency_ms`, `peak_memory_mb`, and normalized `quality_score` describe local/model-server behavior and leave hosted API report calculations unchanged. Use `context-guard experiments plan self-hosted-metrics-ledger --json ...` only as a dry-run ledger-preview checker for explicit metrics; it does not write the benchmark ledger.
43
44
 
44
45
  For task/variant starter fixtures rather than full report-shape examples, see [`experimental-benchmark-fixtures.md`](experimental-benchmark-fixtures.md). Those files are fixture-only and synthetic dry-run-only starters until users replace the placeholder prompts and success checks; they are not shipped OCR, visual-token, learned-compression, or output-transform benchmark results, and real claims still require provider-measured matched successful tasks plus failure-rate, correction, and shifted-cost guardrails.
46
+
47
+ The token-savings 12-task starter also includes [`benchmark-fixtures/token-savings-12task.evidence.example.jsonl`](benchmark-fixtures/token-savings-12task.evidence.example.jsonl) for `context-guard-bench --evidence-jsonl` replay. That file is synthetic local replay evidence, not provider-measured savings proof; use it to validate dashboards and claim-boundary handling before collecting real provider exports.
@@ -1,169 +1,310 @@
1
1
  {
2
- "schema": "context-guard-bench-report-v1",
3
2
  "baseline_variant": "baseline",
3
+ "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
4
+ "claim_status": "insufficient_paired_data",
5
+ "comparisons": [
6
+ {
7
+ "baseline_corrections_per_successful_task": 0.0,
8
+ "baseline_failure_rate": 0.0,
9
+ "baseline_successful_task_count": 1,
10
+ "baseline_variant": "baseline",
11
+ "corrections_delta_per_successful_task": 0.0,
12
+ "cost_savings_pct_with_shift": null,
13
+ "failure_rate_delta_pp": 0.0,
14
+ "matched_successful_task_count": 1,
15
+ "missing_baseline_success_tasks": [],
16
+ "paired_corrections_task_count": 1,
17
+ "paired_cost_task_count": 0,
18
+ "paired_token_task_count": 0,
19
+ "paired_wall_time_task_count": 1,
20
+ "quality_gate": "pass",
21
+ "token_savings_pct": null,
22
+ "variant": "context_pack_auto",
23
+ "variant_corrections_per_successful_task": 0.0,
24
+ "variant_failure_rate": 0.0,
25
+ "wall_time_change_pct": -8.333333333333332,
26
+ "wall_time_delta_seconds_per_successful_task": -1.0
27
+ }
28
+ ],
29
+ "public_claim_readiness": {
30
+ "blocking_gate_ids": [
31
+ "matched_successful_tasks",
32
+ "provider_measured_token_cost",
33
+ "shifted_cost_accounting",
34
+ "confidence_failure_notes",
35
+ "provider_export_provenance"
36
+ ],
37
+ "claim_allowed": false,
38
+ "claim_boundary": {
39
+ "claim_allowed_field": "public_claim_readiness.claim_allowed",
40
+ "fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
41
+ "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
42
+ "hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
43
+ "id": "public_claim_readiness_authoritative_release_gate",
44
+ "reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
45
+ "reporting_only": true,
46
+ "requires_confidence_and_failure_notes": true,
47
+ "requires_matched_successful_tasks": true,
48
+ "requires_provider_export_provenance": true,
49
+ "requires_provider_measured_tokens_and_cost": true,
50
+ "requires_quality_non_inferiority": true,
51
+ "requires_shifted_cost_accounting": true,
52
+ "unsupported_claims_forbidden": true
53
+ },
54
+ "gates": [
55
+ {
56
+ "evidence": {
57
+ "comparison_count": 1,
58
+ "matched_pair_count": 0,
59
+ "min_matched_successful_task_count": 1.0,
60
+ "missing_baseline_success_task_count": 0,
61
+ "variants": [
62
+ "context_pack_auto"
63
+ ]
64
+ },
65
+ "id": "matched_successful_tasks",
66
+ "label": "Matched successful tasks",
67
+ "passed": false,
68
+ "reason": "missing_or_regressed_matched_successful_tasks",
69
+ "required": true,
70
+ "status": "fail"
71
+ },
72
+ {
73
+ "evidence": {
74
+ "matched_pair_count": 0,
75
+ "required_fields": [
76
+ "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
77
+ "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
78
+ "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
79
+ "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
80
+ ]
81
+ },
82
+ "id": "provider_measured_token_cost",
83
+ "label": "Provider-measured token and primary cost",
84
+ "passed": false,
85
+ "reason": "missing_provider_measured_primary_tokens_or_cost",
86
+ "required": true,
87
+ "status": "fail"
88
+ },
89
+ {
90
+ "evidence": {
91
+ "max_corrections_delta_per_successful_task": 0.0,
92
+ "max_failure_rate_delta_pp": 0.0,
93
+ "quality_gates": [
94
+ "pass"
95
+ ]
96
+ },
97
+ "id": "quality_non_inferiority",
98
+ "label": "Quality non-inferiority",
99
+ "passed": true,
100
+ "reason": "all_quality_gates_pass",
101
+ "required": true,
102
+ "status": "pass"
103
+ },
104
+ {
105
+ "evidence": {
106
+ "matched_pair_count": 0,
107
+ "required_fields": [
108
+ "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
109
+ "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
110
+ "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
111
+ ]
112
+ },
113
+ "id": "shifted_cost_accounting",
114
+ "label": "Shifted-cost accounting",
115
+ "passed": false,
116
+ "reason": "missing_shifted_cost_claim_accounting",
117
+ "required": true,
118
+ "status": "fail"
119
+ },
120
+ {
121
+ "evidence": {
122
+ "comparison_failure_fields_present": true,
123
+ "explicit_note_count": 0,
124
+ "failed_row_count": 0,
125
+ "failed_rows_with_notes": 0,
126
+ "replay_row_count": 0
127
+ },
128
+ "id": "confidence_failure_notes",
129
+ "label": "Confidence and failure notes",
130
+ "passed": false,
131
+ "reason": "missing_explicit_replay_notes_or_failure_evidence",
132
+ "required": true,
133
+ "status": "unknown"
134
+ },
135
+ {
136
+ "evidence": {
137
+ "mixed_csv": false,
138
+ "provider_names": [],
139
+ "replay_row_count": 0,
140
+ "report_row_count": 2,
141
+ "same_run_complete": false,
142
+ "source_types": []
143
+ },
144
+ "id": "provider_export_provenance",
145
+ "label": "Provider-export provenance",
146
+ "passed": false,
147
+ "reason": "missing_or_mixed_provider_export_provenance",
148
+ "required": true,
149
+ "status": "unknown"
150
+ }
151
+ ],
152
+ "generated_from": "matched_pair_evidence_and_replay_provenance",
153
+ "passed_required_gate_count": 1,
154
+ "public_claim_eligible_observed": null,
155
+ "public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
156
+ "raw_metric_claim_status_observed": "insufficient_paired_data",
157
+ "reason": "replay_evidence_required_for_public_claim",
158
+ "required_gate_count": 6,
159
+ "required_gate_ids": [
160
+ "matched_successful_tasks",
161
+ "provider_measured_token_cost",
162
+ "quality_non_inferiority",
163
+ "shifted_cost_accounting",
164
+ "confidence_failure_notes",
165
+ "provider_export_provenance"
166
+ ],
167
+ "schema_version": "contextguard.bench.public-claim-readiness.v1",
168
+ "status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
169
+ },
4
170
  "row_count": 2,
171
+ "schema": "context-guard-bench-report-v1",
5
172
  "summary_by_variant": {
6
173
  "baseline": {
7
- "runs": 1,
8
- "successful_runs": 1,
174
+ "artifacts_used_per_successful_task": 0.0,
175
+ "artifacts_used_successful": 0,
176
+ "byte_reduction_ratio": 1.0,
177
+ "byte_savings_pct": 0.0,
178
+ "bytes_after_successful": 24000,
179
+ "bytes_before_successful": 24000,
180
+ "bytes_saved_per_successful_task": 0.0,
181
+ "bytes_saved_successful": 0,
182
+ "compression_strategy": "baseline",
183
+ "corrections_per_successful_task": 0.0,
184
+ "corrections_successful": 0,
185
+ "external_cost_successful_usd": 0.0,
186
+ "external_cost_unknown_successful": 1,
187
+ "external_tokens_measured_successful": 0,
188
+ "external_tokens_per_successful_task": null,
189
+ "external_tokens_successful": 0,
9
190
  "failed_runs": 0,
10
- "total_tokens_all_runs": 0,
11
- "primary_tokens_measured_runs": 0,
191
+ "failure_rate": 0.0,
192
+ "hook_triggers_successful": 0,
193
+ "is_baseline_strategy": true,
194
+ "observed_telemetry": {
195
+ "byte_savings": "observed",
196
+ "external_tokens": "unavailable",
197
+ "primary_cost": "unavailable",
198
+ "provider_cache": "unavailable",
199
+ "token_proxy": "inferred",
200
+ "tokens": "unavailable",
201
+ "wall_time": "observed"
202
+ },
12
203
  "primary_cost_all_runs_usd": 0.0,
13
204
  "primary_cost_measured_runs": 0,
14
- "wall_time_seconds_all_runs": 12.0,
15
- "wall_time_seconds_measured_runs": 1,
205
+ "primary_cost_measured_successful": 0,
206
+ "primary_cost_per_successful_task_usd": null,
207
+ "primary_cost_per_task_including_failures_usd": null,
208
+ "primary_cost_successful_usd": 0.0,
209
+ "primary_tokens_measured_runs": 0,
210
+ "primary_tokens_measured_successful": 0,
16
211
  "provider_cached_tokens_all_runs": 0,
17
212
  "provider_cached_tokens_measured_runs": 0,
213
+ "provider_cached_tokens_measured_successful": 0,
214
+ "provider_cached_tokens_per_successful_task": 0.0,
215
+ "provider_cached_tokens_per_task_including_failures": 0.0,
216
+ "provider_cached_tokens_successful": 0,
217
+ "runs": 1,
218
+ "successful_runs": 1,
219
+ "successful_task_count": 1,
220
+ "task_count": 1,
221
+ "token_proxy_saved_per_successful_task": 0.0,
222
+ "token_proxy_saved_successful": 0,
223
+ "tokens_per_successful_task": null,
224
+ "tokens_per_task_including_failures": null,
18
225
  "total_cost_with_shift_all_runs_usd": 0.0,
19
226
  "total_cost_with_shift_measured_runs": 0,
227
+ "total_cost_with_shift_measured_successful": 0,
228
+ "total_cost_with_shift_per_successful_task_usd": null,
229
+ "total_cost_with_shift_per_task_including_failures_usd": null,
230
+ "total_cost_with_shift_successful_usd": 0.0,
231
+ "total_tokens_all_runs": 0,
20
232
  "total_tokens_successful": 0,
21
- "primary_tokens_measured_successful": 0,
22
- "primary_cost_successful_usd": 0.0,
23
- "primary_cost_measured_successful": 0,
24
- "wall_time_seconds_successful": 12.0,
233
+ "turns_successful": 0,
234
+ "wall_time_seconds_all_runs": 12.0,
235
+ "wall_time_seconds_measured_runs": 1,
25
236
  "wall_time_seconds_measured_successful": 1,
26
- "provider_cached_tokens_successful": 0,
27
- "provider_cached_tokens_measured_successful": 0,
237
+ "wall_time_seconds_per_successful_task": 12.0,
238
+ "wall_time_seconds_per_task_including_failures": 12.0,
239
+ "wall_time_seconds_successful": 12.0
240
+ },
241
+ "context_pack_auto": {
242
+ "artifacts_used_per_successful_task": 0.0,
243
+ "artifacts_used_successful": 0,
244
+ "byte_reduction_ratio": 0.25,
245
+ "byte_savings_pct": 75.0,
246
+ "bytes_after_successful": 6000,
247
+ "bytes_before_successful": 24000,
248
+ "bytes_saved_per_successful_task": 18000.0,
249
+ "bytes_saved_successful": 18000,
250
+ "compression_strategy": "context_pack_auto",
251
+ "corrections_per_successful_task": 0.0,
252
+ "corrections_successful": 0,
28
253
  "external_cost_successful_usd": 0.0,
29
254
  "external_cost_unknown_successful": 1,
30
- "total_cost_with_shift_successful_usd": 0.0,
31
- "total_cost_with_shift_measured_successful": 0,
32
- "external_tokens_successful": 0,
33
255
  "external_tokens_measured_successful": 0,
34
- "artifacts_used_successful": 0,
35
- "corrections_successful": 0,
36
- "bytes_before_successful": 24000,
37
- "bytes_after_successful": 24000,
38
- "turns_successful": 0,
39
- "hook_triggers_successful": 0,
40
- "failure_rate": 0.0,
41
- "task_count": 1,
42
- "successful_task_count": 1,
43
- "tokens_per_task_including_failures": null,
44
- "wall_time_seconds_per_task_including_failures": 12.0,
45
- "provider_cached_tokens_per_task_including_failures": 0.0,
46
- "primary_cost_per_task_including_failures_usd": null,
47
- "total_cost_with_shift_per_task_including_failures_usd": null,
48
- "tokens_per_successful_task": null,
49
- "wall_time_seconds_per_successful_task": 12.0,
50
- "provider_cached_tokens_per_successful_task": 0.0,
51
- "primary_cost_per_successful_task_usd": null,
52
- "total_cost_with_shift_per_successful_task_usd": null,
53
256
  "external_tokens_per_successful_task": null,
54
- "artifacts_used_per_successful_task": 0.0,
55
- "corrections_per_successful_task": 0.0,
56
- "byte_reduction_ratio": 1.0,
57
- "compression_strategy": "baseline",
58
- "is_baseline_strategy": true,
59
- "bytes_saved_successful": 0,
60
- "bytes_saved_per_successful_task": 0.0,
61
- "byte_savings_pct": 0.0,
62
- "token_proxy_saved_successful": 0,
63
- "token_proxy_saved_per_successful_task": 0.0,
257
+ "external_tokens_successful": 0,
258
+ "failed_runs": 0,
259
+ "failure_rate": 0.0,
260
+ "hook_triggers_successful": 0,
261
+ "is_baseline_strategy": false,
64
262
  "observed_telemetry": {
65
- "tokens": "unavailable",
66
- "primary_cost": "unavailable",
67
- "external_tokens": "unavailable",
68
263
  "byte_savings": "observed",
264
+ "external_tokens": "unavailable",
265
+ "primary_cost": "unavailable",
266
+ "provider_cache": "unavailable",
69
267
  "token_proxy": "inferred",
70
- "wall_time": "observed",
71
- "provider_cache": "unavailable"
72
- }
73
- },
74
- "context_pack_auto": {
75
- "runs": 1,
76
- "successful_runs": 1,
77
- "failed_runs": 0,
78
- "total_tokens_all_runs": 0,
79
- "primary_tokens_measured_runs": 0,
268
+ "tokens": "unavailable",
269
+ "wall_time": "observed"
270
+ },
80
271
  "primary_cost_all_runs_usd": 0.0,
81
272
  "primary_cost_measured_runs": 0,
82
- "wall_time_seconds_all_runs": 11.0,
83
- "wall_time_seconds_measured_runs": 1,
273
+ "primary_cost_measured_successful": 0,
274
+ "primary_cost_per_successful_task_usd": null,
275
+ "primary_cost_per_task_including_failures_usd": null,
276
+ "primary_cost_successful_usd": 0.0,
277
+ "primary_tokens_measured_runs": 0,
278
+ "primary_tokens_measured_successful": 0,
84
279
  "provider_cached_tokens_all_runs": 0,
85
280
  "provider_cached_tokens_measured_runs": 0,
86
- "total_cost_with_shift_all_runs_usd": 0.0,
87
- "total_cost_with_shift_measured_runs": 0,
88
- "total_tokens_successful": 0,
89
- "primary_tokens_measured_successful": 0,
90
- "primary_cost_successful_usd": 0.0,
91
- "primary_cost_measured_successful": 0,
92
- "wall_time_seconds_successful": 11.0,
93
- "wall_time_seconds_measured_successful": 1,
94
- "provider_cached_tokens_successful": 0,
95
281
  "provider_cached_tokens_measured_successful": 0,
96
- "external_cost_successful_usd": 0.0,
97
- "external_cost_unknown_successful": 1,
98
- "total_cost_with_shift_successful_usd": 0.0,
99
- "total_cost_with_shift_measured_successful": 0,
100
- "external_tokens_successful": 0,
101
- "external_tokens_measured_successful": 0,
102
- "artifacts_used_successful": 0,
103
- "corrections_successful": 0,
104
- "bytes_before_successful": 24000,
105
- "bytes_after_successful": 6000,
106
- "turns_successful": 0,
107
- "hook_triggers_successful": 0,
108
- "failure_rate": 0.0,
109
- "task_count": 1,
282
+ "provider_cached_tokens_per_successful_task": 0.0,
283
+ "provider_cached_tokens_per_task_including_failures": 0.0,
284
+ "provider_cached_tokens_successful": 0,
285
+ "runs": 1,
286
+ "successful_runs": 1,
110
287
  "successful_task_count": 1,
288
+ "task_count": 1,
289
+ "token_proxy_saved_per_successful_task": 4500.0,
290
+ "token_proxy_saved_successful": 4500,
291
+ "tokens_per_successful_task": null,
111
292
  "tokens_per_task_including_failures": null,
112
- "wall_time_seconds_per_task_including_failures": 11.0,
113
- "provider_cached_tokens_per_task_including_failures": 0.0,
114
- "primary_cost_per_task_including_failures_usd": null,
293
+ "total_cost_with_shift_all_runs_usd": 0.0,
294
+ "total_cost_with_shift_measured_runs": 0,
295
+ "total_cost_with_shift_measured_successful": 0,
296
+ "total_cost_with_shift_per_successful_task_usd": null,
115
297
  "total_cost_with_shift_per_task_including_failures_usd": null,
116
- "tokens_per_successful_task": null,
298
+ "total_cost_with_shift_successful_usd": 0.0,
299
+ "total_tokens_all_runs": 0,
300
+ "total_tokens_successful": 0,
301
+ "turns_successful": 0,
302
+ "wall_time_seconds_all_runs": 11.0,
303
+ "wall_time_seconds_measured_runs": 1,
304
+ "wall_time_seconds_measured_successful": 1,
117
305
  "wall_time_seconds_per_successful_task": 11.0,
118
- "provider_cached_tokens_per_successful_task": 0.0,
119
- "primary_cost_per_successful_task_usd": null,
120
- "total_cost_with_shift_per_successful_task_usd": null,
121
- "external_tokens_per_successful_task": null,
122
- "artifacts_used_per_successful_task": 0.0,
123
- "corrections_per_successful_task": 0.0,
124
- "byte_reduction_ratio": 0.25,
125
- "compression_strategy": "context_pack_auto",
126
- "is_baseline_strategy": false,
127
- "bytes_saved_successful": 18000,
128
- "bytes_saved_per_successful_task": 18000.0,
129
- "byte_savings_pct": 75.0,
130
- "token_proxy_saved_successful": 4500,
131
- "token_proxy_saved_per_successful_task": 4500.0,
132
- "observed_telemetry": {
133
- "tokens": "unavailable",
134
- "primary_cost": "unavailable",
135
- "external_tokens": "unavailable",
136
- "byte_savings": "observed",
137
- "token_proxy": "inferred",
138
- "wall_time": "observed",
139
- "provider_cache": "unavailable"
140
- }
141
- }
142
- },
143
- "comparisons": [
144
- {
145
- "variant": "context_pack_auto",
146
- "baseline_variant": "baseline",
147
- "quality_gate": "pass",
148
- "baseline_failure_rate": 0.0,
149
- "variant_failure_rate": 0.0,
150
- "failure_rate_delta_pp": 0.0,
151
- "matched_successful_task_count": 1,
152
- "baseline_successful_task_count": 1,
153
- "missing_baseline_success_tasks": [],
154
- "baseline_corrections_per_successful_task": 0.0,
155
- "variant_corrections_per_successful_task": 0.0,
156
- "paired_corrections_task_count": 1,
157
- "corrections_delta_per_successful_task": 0.0,
158
- "token_savings_pct": null,
159
- "paired_token_task_count": 0,
160
- "wall_time_delta_seconds_per_successful_task": -1.0,
161
- "wall_time_change_pct": -8.333333333333332,
162
- "paired_wall_time_task_count": 1,
163
- "cost_savings_pct_with_shift": null,
164
- "paired_cost_task_count": 0
306
+ "wall_time_seconds_per_task_including_failures": 11.0,
307
+ "wall_time_seconds_successful": 11.0
165
308
  }
166
- ],
167
- "claim_status": "insufficient_paired_data",
168
- "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
309
+ }
169
310
  }