@ictechgy/context-guard 0.4.10 → 0.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/CHANGELOG.md +17 -1
  2. package/README.ko.md +46 -28
  3. package/README.md +42 -33
  4. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  5. package/docs/benchmark-workflow-examples.md +3 -0
  6. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  7. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  8. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  9. package/docs/experimental-benchmark-fixtures.md +24 -7
  10. package/package.json +2 -1
  11. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  12. package/plugins/context-guard/README.ko.md +14 -11
  13. package/plugins/context-guard/README.md +15 -14
  14. package/plugins/context-guard/bin/context-guard +48 -17
  15. package/plugins/context-guard/bin/context-guard-artifact +342 -33
  16. package/plugins/context-guard/bin/context-guard-audit +36 -5
  17. package/plugins/context-guard/bin/context-guard-bench +1675 -44
  18. package/plugins/context-guard/bin/context-guard-cache-score +347 -35
  19. package/plugins/context-guard/bin/context-guard-compress +89 -27
  20. package/plugins/context-guard/bin/context-guard-cost +7 -2
  21. package/plugins/context-guard/bin/context-guard-experiments +364 -8
  22. package/plugins/context-guard/bin/context-guard-failed-nudge +6 -2
  23. package/plugins/context-guard/bin/context-guard-filter +88 -18
  24. package/plugins/context-guard/bin/context-guard-pack +329 -19
  25. package/plugins/context-guard/bin/context-guard-read-symbol +27 -0
  26. package/plugins/context-guard/bin/context-guard-sanitize-output +245 -18
  27. package/plugins/context-guard/bin/context-guard-setup +21 -5
  28. package/plugins/context-guard/bin/context-guard-tool-prune +287 -62
  29. package/plugins/context-guard/bin/context-guard-trim-output +394 -90
  30. package/plugins/context-guard/brief/README.md +5 -5
  31. package/plugins/context-guard/lib/context_guard_command_manifest_loader.py +123 -0
  32. package/plugins/context-guard/lib/context_guard_commands.py +217 -190
@@ -1,170 +1,311 @@
1
1
  {
2
- "schema": "context-guard-bench-report-v1",
3
2
  "baseline_variant": "baseline",
3
+ "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
4
+ "claim_status": "compare_variants",
5
+ "comparisons": [
6
+ {
7
+ "baseline_corrections_per_successful_task": 0.0,
8
+ "baseline_failure_rate": 0.0,
9
+ "baseline_successful_task_count": 1,
10
+ "baseline_variant": "baseline",
11
+ "corrections_delta_per_successful_task": 0.0,
12
+ "cost_savings_pct_with_shift": null,
13
+ "failure_rate_delta_pp": 0.0,
14
+ "matched_successful_task_count": 1,
15
+ "missing_baseline_success_tasks": [],
16
+ "paired_corrections_task_count": 1,
17
+ "paired_cost_task_count": 0,
18
+ "paired_token_task_count": 1,
19
+ "paired_wall_time_task_count": 1,
20
+ "quality_gate": "pass",
21
+ "token_delta_per_successful_task": 0.0,
22
+ "token_savings_pct": 0.0,
23
+ "variant": "cache_layout_check",
24
+ "variant_corrections_per_successful_task": 0.0,
25
+ "variant_failure_rate": 0.0,
26
+ "wall_time_change_pct": 0.0,
27
+ "wall_time_delta_seconds_per_successful_task": 0.0
28
+ }
29
+ ],
30
+ "public_claim_readiness": {
31
+ "blocking_gate_ids": [
32
+ "matched_successful_tasks",
33
+ "provider_measured_token_cost",
34
+ "shifted_cost_accounting",
35
+ "confidence_failure_notes",
36
+ "provider_export_provenance"
37
+ ],
38
+ "claim_allowed": false,
39
+ "claim_boundary": {
40
+ "claim_allowed_field": "public_claim_readiness.claim_allowed",
41
+ "fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
42
+ "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
43
+ "hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
44
+ "id": "public_claim_readiness_authoritative_release_gate",
45
+ "reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
46
+ "reporting_only": true,
47
+ "requires_confidence_and_failure_notes": true,
48
+ "requires_matched_successful_tasks": true,
49
+ "requires_provider_export_provenance": true,
50
+ "requires_provider_measured_tokens_and_cost": true,
51
+ "requires_quality_non_inferiority": true,
52
+ "requires_shifted_cost_accounting": true,
53
+ "unsupported_claims_forbidden": true
54
+ },
55
+ "gates": [
56
+ {
57
+ "evidence": {
58
+ "comparison_count": 1,
59
+ "matched_pair_count": 0,
60
+ "min_matched_successful_task_count": 1.0,
61
+ "missing_baseline_success_task_count": 0,
62
+ "variants": [
63
+ "cache_layout_check"
64
+ ]
65
+ },
66
+ "id": "matched_successful_tasks",
67
+ "label": "Matched successful tasks",
68
+ "passed": false,
69
+ "reason": "missing_or_regressed_matched_successful_tasks",
70
+ "required": true,
71
+ "status": "fail"
72
+ },
73
+ {
74
+ "evidence": {
75
+ "matched_pair_count": 0,
76
+ "required_fields": [
77
+ "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
78
+ "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
79
+ "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
80
+ "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
81
+ ]
82
+ },
83
+ "id": "provider_measured_token_cost",
84
+ "label": "Provider-measured token and primary cost",
85
+ "passed": false,
86
+ "reason": "missing_provider_measured_primary_tokens_or_cost",
87
+ "required": true,
88
+ "status": "fail"
89
+ },
90
+ {
91
+ "evidence": {
92
+ "max_corrections_delta_per_successful_task": 0.0,
93
+ "max_failure_rate_delta_pp": 0.0,
94
+ "quality_gates": [
95
+ "pass"
96
+ ]
97
+ },
98
+ "id": "quality_non_inferiority",
99
+ "label": "Quality non-inferiority",
100
+ "passed": true,
101
+ "reason": "all_quality_gates_pass",
102
+ "required": true,
103
+ "status": "pass"
104
+ },
105
+ {
106
+ "evidence": {
107
+ "matched_pair_count": 0,
108
+ "required_fields": [
109
+ "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
110
+ "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
111
+ "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
112
+ ]
113
+ },
114
+ "id": "shifted_cost_accounting",
115
+ "label": "Shifted-cost accounting",
116
+ "passed": false,
117
+ "reason": "missing_shifted_cost_claim_accounting",
118
+ "required": true,
119
+ "status": "fail"
120
+ },
121
+ {
122
+ "evidence": {
123
+ "comparison_failure_fields_present": true,
124
+ "explicit_note_count": 0,
125
+ "failed_row_count": 0,
126
+ "failed_rows_with_notes": 0,
127
+ "replay_row_count": 0
128
+ },
129
+ "id": "confidence_failure_notes",
130
+ "label": "Confidence and failure notes",
131
+ "passed": false,
132
+ "reason": "missing_explicit_replay_notes_or_failure_evidence",
133
+ "required": true,
134
+ "status": "unknown"
135
+ },
136
+ {
137
+ "evidence": {
138
+ "mixed_csv": false,
139
+ "provider_names": [],
140
+ "replay_row_count": 0,
141
+ "report_row_count": 2,
142
+ "same_run_complete": false,
143
+ "source_types": []
144
+ },
145
+ "id": "provider_export_provenance",
146
+ "label": "Provider-export provenance",
147
+ "passed": false,
148
+ "reason": "missing_or_mixed_provider_export_provenance",
149
+ "required": true,
150
+ "status": "unknown"
151
+ }
152
+ ],
153
+ "generated_from": "matched_pair_evidence_and_replay_provenance",
154
+ "passed_required_gate_count": 1,
155
+ "public_claim_eligible_observed": null,
156
+ "public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
157
+ "raw_metric_claim_status_observed": "compare_variants",
158
+ "reason": "replay_evidence_required_for_public_claim",
159
+ "required_gate_count": 6,
160
+ "required_gate_ids": [
161
+ "matched_successful_tasks",
162
+ "provider_measured_token_cost",
163
+ "quality_non_inferiority",
164
+ "shifted_cost_accounting",
165
+ "confidence_failure_notes",
166
+ "provider_export_provenance"
167
+ ],
168
+ "schema_version": "contextguard.bench.public-claim-readiness.v1",
169
+ "status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
170
+ },
4
171
  "row_count": 2,
172
+ "schema": "context-guard-bench-report-v1",
5
173
  "summary_by_variant": {
6
174
  "baseline": {
7
- "runs": 1,
8
- "successful_runs": 1,
175
+ "artifacts_used_per_successful_task": 0.0,
176
+ "artifacts_used_successful": 0,
177
+ "byte_reduction_ratio": null,
178
+ "byte_savings_pct": null,
179
+ "bytes_after_successful": 0,
180
+ "bytes_before_successful": 0,
181
+ "bytes_saved_per_successful_task": null,
182
+ "bytes_saved_successful": null,
183
+ "compression_strategy": "baseline",
184
+ "corrections_per_successful_task": 0.0,
185
+ "corrections_successful": 0,
186
+ "external_cost_successful_usd": 0.0,
187
+ "external_cost_unknown_successful": 1,
188
+ "external_tokens_measured_successful": 0,
189
+ "external_tokens_per_successful_task": null,
190
+ "external_tokens_successful": 0,
9
191
  "failed_runs": 0,
10
- "total_tokens_all_runs": 1200,
11
- "primary_tokens_measured_runs": 1,
192
+ "failure_rate": 0.0,
193
+ "hook_triggers_successful": 0,
194
+ "is_baseline_strategy": true,
195
+ "observed_telemetry": {
196
+ "byte_savings": "unavailable",
197
+ "external_tokens": "unavailable",
198
+ "primary_cost": "unavailable",
199
+ "provider_cache": "observed",
200
+ "token_proxy": "unavailable",
201
+ "tokens": "observed",
202
+ "wall_time": "observed"
203
+ },
12
204
  "primary_cost_all_runs_usd": 0.0,
13
205
  "primary_cost_measured_runs": 0,
14
- "wall_time_seconds_all_runs": 10.0,
15
- "wall_time_seconds_measured_runs": 1,
206
+ "primary_cost_measured_successful": 0,
207
+ "primary_cost_per_successful_task_usd": null,
208
+ "primary_cost_per_task_including_failures_usd": null,
209
+ "primary_cost_successful_usd": 0.0,
210
+ "primary_tokens_measured_runs": 1,
211
+ "primary_tokens_measured_successful": 1,
16
212
  "provider_cached_tokens_all_runs": 0,
17
213
  "provider_cached_tokens_measured_runs": 1,
18
- "total_cost_with_shift_all_runs_usd": 0.0,
19
- "total_cost_with_shift_measured_runs": 0,
20
- "total_tokens_successful": 1200,
21
- "primary_tokens_measured_successful": 1,
22
- "primary_cost_successful_usd": 0.0,
23
- "primary_cost_measured_successful": 0,
24
- "wall_time_seconds_successful": 10.0,
25
- "wall_time_seconds_measured_successful": 1,
26
- "provider_cached_tokens_successful": 0,
27
214
  "provider_cached_tokens_measured_successful": 1,
28
- "external_cost_successful_usd": 0.0,
29
- "external_cost_unknown_successful": 1,
30
- "total_cost_with_shift_successful_usd": 0.0,
31
- "total_cost_with_shift_measured_successful": 0,
32
- "external_tokens_successful": 0,
33
- "external_tokens_measured_successful": 0,
34
- "artifacts_used_successful": 0,
35
- "corrections_successful": 0,
36
- "bytes_before_successful": 0,
37
- "bytes_after_successful": 0,
38
- "turns_successful": 0,
39
- "hook_triggers_successful": 0,
40
- "failure_rate": 0.0,
41
- "task_count": 1,
215
+ "provider_cached_tokens_per_successful_task": 0.0,
216
+ "provider_cached_tokens_per_task_including_failures": 0.0,
217
+ "provider_cached_tokens_successful": 0,
218
+ "runs": 1,
219
+ "successful_runs": 1,
42
220
  "successful_task_count": 1,
221
+ "task_count": 1,
222
+ "token_proxy_saved_per_successful_task": null,
223
+ "token_proxy_saved_successful": null,
224
+ "tokens_per_successful_task": 1200.0,
43
225
  "tokens_per_task_including_failures": 1200.0,
44
- "wall_time_seconds_per_task_including_failures": 10.0,
45
- "provider_cached_tokens_per_task_including_failures": 0.0,
46
- "primary_cost_per_task_including_failures_usd": null,
226
+ "total_cost_with_shift_all_runs_usd": 0.0,
227
+ "total_cost_with_shift_measured_runs": 0,
228
+ "total_cost_with_shift_measured_successful": 0,
229
+ "total_cost_with_shift_per_successful_task_usd": null,
47
230
  "total_cost_with_shift_per_task_including_failures_usd": null,
48
- "tokens_per_successful_task": 1200.0,
231
+ "total_cost_with_shift_successful_usd": 0.0,
232
+ "total_tokens_all_runs": 1200,
233
+ "total_tokens_successful": 1200,
234
+ "turns_successful": 0,
235
+ "wall_time_seconds_all_runs": 10.0,
236
+ "wall_time_seconds_measured_runs": 1,
237
+ "wall_time_seconds_measured_successful": 1,
49
238
  "wall_time_seconds_per_successful_task": 10.0,
50
- "provider_cached_tokens_per_successful_task": 0.0,
51
- "primary_cost_per_successful_task_usd": null,
52
- "total_cost_with_shift_per_successful_task_usd": null,
53
- "external_tokens_per_successful_task": null,
239
+ "wall_time_seconds_per_task_including_failures": 10.0,
240
+ "wall_time_seconds_successful": 10.0
241
+ },
242
+ "cache_layout_check": {
54
243
  "artifacts_used_per_successful_task": 0.0,
55
- "corrections_per_successful_task": 0.0,
244
+ "artifacts_used_successful": 0,
56
245
  "byte_reduction_ratio": null,
57
- "compression_strategy": "baseline",
58
- "is_baseline_strategy": true,
59
- "bytes_saved_successful": null,
60
- "bytes_saved_per_successful_task": null,
61
246
  "byte_savings_pct": null,
62
- "token_proxy_saved_successful": null,
63
- "token_proxy_saved_per_successful_task": null,
247
+ "bytes_after_successful": 0,
248
+ "bytes_before_successful": 0,
249
+ "bytes_saved_per_successful_task": null,
250
+ "bytes_saved_successful": null,
251
+ "compression_strategy": "cache_layout_check",
252
+ "corrections_per_successful_task": 0.0,
253
+ "corrections_successful": 0,
254
+ "external_cost_successful_usd": 0.0,
255
+ "external_cost_unknown_successful": 1,
256
+ "external_tokens_measured_successful": 0,
257
+ "external_tokens_per_successful_task": null,
258
+ "external_tokens_successful": 0,
259
+ "failed_runs": 0,
260
+ "failure_rate": 0.0,
261
+ "hook_triggers_successful": 0,
262
+ "is_baseline_strategy": false,
64
263
  "observed_telemetry": {
65
- "tokens": "observed",
66
- "primary_cost": "unavailable",
67
- "external_tokens": "unavailable",
68
264
  "byte_savings": "unavailable",
265
+ "external_tokens": "unavailable",
266
+ "primary_cost": "unavailable",
267
+ "provider_cache": "observed",
69
268
  "token_proxy": "unavailable",
70
- "wall_time": "observed",
71
- "provider_cache": "observed"
72
- }
73
- },
74
- "cache_layout_check": {
75
- "runs": 1,
76
- "successful_runs": 1,
77
- "failed_runs": 0,
78
- "total_tokens_all_runs": 1200,
79
- "primary_tokens_measured_runs": 1,
269
+ "tokens": "observed",
270
+ "wall_time": "observed"
271
+ },
80
272
  "primary_cost_all_runs_usd": 0.0,
81
273
  "primary_cost_measured_runs": 0,
82
- "wall_time_seconds_all_runs": 10.0,
83
- "wall_time_seconds_measured_runs": 1,
274
+ "primary_cost_measured_successful": 0,
275
+ "primary_cost_per_successful_task_usd": null,
276
+ "primary_cost_per_task_including_failures_usd": null,
277
+ "primary_cost_successful_usd": 0.0,
278
+ "primary_tokens_measured_runs": 1,
279
+ "primary_tokens_measured_successful": 1,
84
280
  "provider_cached_tokens_all_runs": 900,
85
281
  "provider_cached_tokens_measured_runs": 1,
86
- "total_cost_with_shift_all_runs_usd": 0.0,
87
- "total_cost_with_shift_measured_runs": 0,
88
- "total_tokens_successful": 1200,
89
- "primary_tokens_measured_successful": 1,
90
- "primary_cost_successful_usd": 0.0,
91
- "primary_cost_measured_successful": 0,
92
- "wall_time_seconds_successful": 10.0,
93
- "wall_time_seconds_measured_successful": 1,
94
- "provider_cached_tokens_successful": 900,
95
282
  "provider_cached_tokens_measured_successful": 1,
96
- "external_cost_successful_usd": 0.0,
97
- "external_cost_unknown_successful": 1,
98
- "total_cost_with_shift_successful_usd": 0.0,
99
- "total_cost_with_shift_measured_successful": 0,
100
- "external_tokens_successful": 0,
101
- "external_tokens_measured_successful": 0,
102
- "artifacts_used_successful": 0,
103
- "corrections_successful": 0,
104
- "bytes_before_successful": 0,
105
- "bytes_after_successful": 0,
106
- "turns_successful": 0,
107
- "hook_triggers_successful": 0,
108
- "failure_rate": 0.0,
109
- "task_count": 1,
283
+ "provider_cached_tokens_per_successful_task": 900.0,
284
+ "provider_cached_tokens_per_task_including_failures": 900.0,
285
+ "provider_cached_tokens_successful": 900,
286
+ "runs": 1,
287
+ "successful_runs": 1,
110
288
  "successful_task_count": 1,
289
+ "task_count": 1,
290
+ "token_proxy_saved_per_successful_task": null,
291
+ "token_proxy_saved_successful": null,
292
+ "tokens_per_successful_task": 1200.0,
111
293
  "tokens_per_task_including_failures": 1200.0,
112
- "wall_time_seconds_per_task_including_failures": 10.0,
113
- "provider_cached_tokens_per_task_including_failures": 900.0,
114
- "primary_cost_per_task_including_failures_usd": null,
294
+ "total_cost_with_shift_all_runs_usd": 0.0,
295
+ "total_cost_with_shift_measured_runs": 0,
296
+ "total_cost_with_shift_measured_successful": 0,
297
+ "total_cost_with_shift_per_successful_task_usd": null,
115
298
  "total_cost_with_shift_per_task_including_failures_usd": null,
116
- "tokens_per_successful_task": 1200.0,
299
+ "total_cost_with_shift_successful_usd": 0.0,
300
+ "total_tokens_all_runs": 1200,
301
+ "total_tokens_successful": 1200,
302
+ "turns_successful": 0,
303
+ "wall_time_seconds_all_runs": 10.0,
304
+ "wall_time_seconds_measured_runs": 1,
305
+ "wall_time_seconds_measured_successful": 1,
117
306
  "wall_time_seconds_per_successful_task": 10.0,
118
- "provider_cached_tokens_per_successful_task": 900.0,
119
- "primary_cost_per_successful_task_usd": null,
120
- "total_cost_with_shift_per_successful_task_usd": null,
121
- "external_tokens_per_successful_task": null,
122
- "artifacts_used_per_successful_task": 0.0,
123
- "corrections_per_successful_task": 0.0,
124
- "byte_reduction_ratio": null,
125
- "compression_strategy": "cache_layout_check",
126
- "is_baseline_strategy": false,
127
- "bytes_saved_successful": null,
128
- "bytes_saved_per_successful_task": null,
129
- "byte_savings_pct": null,
130
- "token_proxy_saved_successful": null,
131
- "token_proxy_saved_per_successful_task": null,
132
- "observed_telemetry": {
133
- "tokens": "observed",
134
- "primary_cost": "unavailable",
135
- "external_tokens": "unavailable",
136
- "byte_savings": "unavailable",
137
- "token_proxy": "unavailable",
138
- "wall_time": "observed",
139
- "provider_cache": "observed"
140
- }
141
- }
142
- },
143
- "comparisons": [
144
- {
145
- "variant": "cache_layout_check",
146
- "baseline_variant": "baseline",
147
- "quality_gate": "pass",
148
- "baseline_failure_rate": 0.0,
149
- "variant_failure_rate": 0.0,
150
- "failure_rate_delta_pp": 0.0,
151
- "matched_successful_task_count": 1,
152
- "baseline_successful_task_count": 1,
153
- "missing_baseline_success_tasks": [],
154
- "baseline_corrections_per_successful_task": 0.0,
155
- "variant_corrections_per_successful_task": 0.0,
156
- "paired_corrections_task_count": 1,
157
- "corrections_delta_per_successful_task": 0.0,
158
- "token_delta_per_successful_task": 0.0,
159
- "token_savings_pct": 0.0,
160
- "paired_token_task_count": 1,
161
- "wall_time_delta_seconds_per_successful_task": 0.0,
162
- "wall_time_change_pct": 0.0,
163
- "paired_wall_time_task_count": 1,
164
- "cost_savings_pct_with_shift": null,
165
- "paired_cost_task_count": 0
307
+ "wall_time_seconds_per_task_including_failures": 10.0,
308
+ "wall_time_seconds_successful": 10.0
166
309
  }
167
- ],
168
- "claim_status": "compare_variants",
169
- "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
310
+ }
170
311
  }
@@ -12,6 +12,23 @@ Use them when designing an experiment that starts from ContextGuard's existing b
12
12
  5. Treat byte counts, image dimensions, OCR confidence, and local compressor ratios as proxy evidence. Real token/cost claims require **provider-measured** primary token/cost fields on both sides.
13
13
  6. Keep private screenshots, raw secrets, and external service endpoints out of fixture files.
14
14
 
15
+ ## Local replay evidence
16
+
17
+ `context-guard-bench --evidence-jsonl <path>` can replay pre-recorded run evidence into the normal CSV/report pipeline without invoking `claude` or any task `success_command`. Pair it with `--report-json` and `--dashboard-md` to regenerate a deterministic local dashboard:
18
+
19
+ ```bash
20
+ context-guard-bench \
21
+ --tasks docs/benchmark-fixtures/token-savings-12task.tasks.example.json \
22
+ --variants docs/benchmark-fixtures/token-savings-12task.variants.example.json \
23
+ --evidence-jsonl docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl \
24
+ --csv /tmp/contextguard-token-savings.csv \
25
+ --report-json /tmp/contextguard-token-savings.report.json \
26
+ --dashboard-md /tmp/contextguard-token-savings.dashboard.md \
27
+ --baseline-variant baseline_full_context_fixture
28
+ ```
29
+
30
+ The included token-savings evidence file is deliberately `synthetic_fixture` provenance. It validates replay/dashboard mechanics and byte-proxy reporting only: replay forces synthetic/manual rows to `primary_tokens_measured=false` and `cost_measured=false`, so it is not public hosted API token/cost savings evidence even when token-looking numbers are present. A public claim still requires matched successful tasks, provider-export provenance, provider-measured primary tokens/cost, quality non-inferiority, and shifted-cost accounting.
31
+
15
32
  ## Runner-native variant prompt files
16
33
 
17
34
  `context-guard-bench` supports optional file-backed `variant_prompt_files` in task fixtures. The map is keyed by variant name and lets a single logical task swap sanitized prompt evidence per variant, for example a baseline raw-output prompt versus a digest plus artifact receipt prompt. Prompt files are resolved relative to the task JSON, must be relative paths, and are read with the same no-follow/symlink-safe posture as task and variant fixtures.
@@ -20,12 +37,12 @@ This runner-native swap only proves command shape and prompt selection until the
20
37
 
21
38
  ## Included fixture sets
22
39
 
23
- | Fixture set | Task file | Variant file | Intended future experiment |
24
- | --- | --- | --- | --- |
25
- | Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
26
- | Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
27
- | Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
28
- | Token-savings 12-task roadmap | [`benchmark-fixtures/token-savings-12task.tasks.example.json`](benchmark-fixtures/token-savings-12task.tasks.example.json) | [`benchmark-fixtures/token-savings-12task.variants.example.json`](benchmark-fixtures/token-savings-12task.variants.example.json) | Exercise a canonical 12-task spread for bugfix, exploration, review, log analysis, migration, docs, refactor, performance, telemetry, cache layout, tool-schema deferral, and artifact receipt experiments after real success commands and provider telemetry are supplied. |
40
+ | Fixture set | Task file | Variant file | Evidence replay file | Intended future experiment |
41
+ | --- | --- | --- | --- | --- |
42
+ | Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | n/a | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
43
+ | Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | n/a | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
44
+ | Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | n/a | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
45
+ | Token-savings 12-task roadmap | [`benchmark-fixtures/token-savings-12task.tasks.example.json`](benchmark-fixtures/token-savings-12task.tasks.example.json) | [`benchmark-fixtures/token-savings-12task.variants.example.json`](benchmark-fixtures/token-savings-12task.variants.example.json) | [`benchmark-fixtures/token-savings-12task.evidence.example.jsonl`](benchmark-fixtures/token-savings-12task.evidence.example.jsonl) | Exercise a canonical 12-task spread for bugfix, exploration, review, log analysis, migration, docs, refactor, performance, telemetry, cache layout, tool-schema deferral, and artifact receipt experiments after real success commands and provider telemetry are supplied. |
29
46
 
30
47
  ## Visual/OCR fixture notes
31
48
 
@@ -41,7 +58,7 @@ The output-transform fixtures describe already-sanitized command output comparis
41
58
 
42
59
  ## Token-savings 12-task roadmap fixture notes
43
60
 
44
- The token-savings 12-task fixtures are a canonical **fixture-only** spread for roadmap-level A/B design. They demonstrate `variant_prompt_files` for a baseline full-context prompt versus a ContextGuard advisory-foundations prompt that may later include cache layout lint, core-vs-deferred tool schemas, artifact receipts, and claim-safe telemetry. They do not execute `context-guard-cache-score`, `context-guard-tool-prune`, or any provider call.
61
+ The token-savings 12-task fixtures are a canonical **fixture-only** spread for roadmap-level A/B design. They demonstrate `variant_prompt_files` for a baseline full-context prompt versus a ContextGuard advisory-foundations prompt that may later include cache layout lint, core-vs-deferred tool schemas, artifact receipts, and claim-safe telemetry. They do not execute `context-guard-cache-score`, `context-guard-tool-prune`, or any provider call. The companion `token-savings-12task.evidence.example.jsonl` lets users replay deterministic synthetic rows into CSV/report/dashboard outputs while preserving the same non-claim boundary.
45
62
 
46
63
  For real non-dry-run experiments, replace every placeholder `success_command`, keep task IDs matched across baseline and candidate variants, and require provider-measured primary token/cost data before interpreting `tokens_per_successful_task`, `total_cost_with_shift_usd`, or `external_cost_usd`. Cache predictions, char/4 token proxies, local latency, and byte reductions remain diagnostic proxy evidence unless the generated report contains matched successful task evidence and stays within the 10%p failure-rate guardrail.
47
64
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ictechgy/context-guard",
3
- "version": "0.4.10",
3
+ "version": "0.4.12",
4
4
  "description": "ContextGuard CLI helpers for keeping AI coding agent context focused and local-first.",
5
5
  "license": "Apache-2.0",
6
6
  "homepage": "https://github.com/ictechgy/context-guard#readme",
@@ -59,6 +59,7 @@
59
59
  "docs/benchmark-workflows/*.example.jsonl",
60
60
  "docs/benchmark-workflow-examples.md",
61
61
  "docs/benchmark-fixtures/*.example.json",
62
+ "docs/benchmark-fixtures/*.example.jsonl",
62
63
  "docs/benchmark-fixtures/*.prompt.example.md",
63
64
  "docs/experimental-benchmark-fixtures.md",
64
65
  "packaging/homebrew/context-guard.rb.template"
@@ -37,5 +37,5 @@
37
37
  "gated-experiments",
38
38
  "future-roadmap"
39
39
  ],
40
- "version": "0.4.10"
40
+ "version": "0.4.12"
41
41
  }