@ictechgy/context-guard 0.4.9 → 0.4.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64) hide show
  1. package/CHANGELOG.md +28 -0
  2. package/README.ko.md +59 -31
  3. package/README.md +85 -36
  4. package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
  5. package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
  6. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  7. package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
  8. package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
  9. package/docs/benchmark-workflow-examples.md +3 -0
  10. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  11. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  12. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  13. package/docs/distribution.md +10 -7
  14. package/docs/experimental-benchmark-fixtures.md +30 -6
  15. package/package.json +4 -6
  16. package/packaging/homebrew/context-guard.rb.template +1 -1
  17. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  18. package/plugins/context-guard/README.ko.md +20 -14
  19. package/plugins/context-guard/README.md +26 -17
  20. package/plugins/context-guard/bin/context-guard +147 -25
  21. package/plugins/context-guard/bin/context-guard-artifact +884 -79
  22. package/plugins/context-guard/bin/context-guard-audit +33 -2
  23. package/plugins/context-guard/bin/context-guard-bench +1542 -31
  24. package/plugins/context-guard/bin/context-guard-cache-score +665 -0
  25. package/plugins/context-guard/bin/context-guard-compress +146 -1
  26. package/plugins/context-guard/bin/context-guard-cost +790 -6
  27. package/plugins/context-guard/bin/context-guard-experiments +463 -26
  28. package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
  29. package/plugins/context-guard/bin/context-guard-filter +163 -7
  30. package/plugins/context-guard/bin/context-guard-guard-read +3 -0
  31. package/plugins/context-guard/bin/context-guard-pack +892 -49
  32. package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
  33. package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
  34. package/plugins/context-guard/bin/context-guard-setup +165 -31
  35. package/plugins/context-guard/bin/context-guard-statusline +490 -283
  36. package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
  37. package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
  38. package/plugins/context-guard/bin/context-guard-trim-output +288 -41
  39. package/plugins/context-guard/brief/README.md +5 -5
  40. package/plugins/context-guard/lib/context_guard_commands.py +230 -0
  41. package/plugins/context-guard/skills/setup/SKILL.md +1 -0
  42. package/context-guard-kit/README.md +0 -91
  43. package/context-guard-kit/benchmark_runner.py +0 -2401
  44. package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
  45. package/context-guard-kit/context_compress.py +0 -695
  46. package/context-guard-kit/context_escrow.py +0 -935
  47. package/context-guard-kit/context_filter.py +0 -637
  48. package/context-guard-kit/context_guard_cli.py +0 -325
  49. package/context-guard-kit/context_guard_diet.py +0 -1711
  50. package/context-guard-kit/context_pack.py +0 -2713
  51. package/context-guard-kit/cost_guard.py +0 -2349
  52. package/context-guard-kit/experimental_registry.py +0 -4348
  53. package/context-guard-kit/failed_attempt_nudge.py +0 -567
  54. package/context-guard-kit/guard_large_read.py +0 -690
  55. package/context-guard-kit/hook_secret_patterns.py +0 -43
  56. package/context-guard-kit/read_symbol.py +0 -483
  57. package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
  58. package/context-guard-kit/sanitize_output.py +0 -725
  59. package/context-guard-kit/settings.example.json +0 -67
  60. package/context-guard-kit/setup_wizard.py +0 -2515
  61. package/context-guard-kit/statusline.sh +0 -362
  62. package/context-guard-kit/statusline_merged.sh +0 -157
  63. package/context-guard-kit/tool_schema_pruner.py +0 -837
  64. package/context-guard-kit/trim_command_output.py +0 -1449
@@ -1,170 +1,311 @@
1
1
  {
2
- "schema": "context-guard-bench-report-v1",
3
2
  "baseline_variant": "baseline",
3
+ "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
4
+ "claim_status": "compare_variants",
5
+ "comparisons": [
6
+ {
7
+ "baseline_corrections_per_successful_task": 0.0,
8
+ "baseline_failure_rate": 0.0,
9
+ "baseline_successful_task_count": 1,
10
+ "baseline_variant": "baseline",
11
+ "corrections_delta_per_successful_task": 0.0,
12
+ "cost_savings_pct_with_shift": null,
13
+ "failure_rate_delta_pp": 0.0,
14
+ "matched_successful_task_count": 1,
15
+ "missing_baseline_success_tasks": [],
16
+ "paired_corrections_task_count": 1,
17
+ "paired_cost_task_count": 0,
18
+ "paired_token_task_count": 1,
19
+ "paired_wall_time_task_count": 1,
20
+ "quality_gate": "pass",
21
+ "token_delta_per_successful_task": 0.0,
22
+ "token_savings_pct": 0.0,
23
+ "variant": "cache_layout_check",
24
+ "variant_corrections_per_successful_task": 0.0,
25
+ "variant_failure_rate": 0.0,
26
+ "wall_time_change_pct": 0.0,
27
+ "wall_time_delta_seconds_per_successful_task": 0.0
28
+ }
29
+ ],
30
+ "public_claim_readiness": {
31
+ "blocking_gate_ids": [
32
+ "matched_successful_tasks",
33
+ "provider_measured_token_cost",
34
+ "shifted_cost_accounting",
35
+ "confidence_failure_notes",
36
+ "provider_export_provenance"
37
+ ],
38
+ "claim_allowed": false,
39
+ "claim_boundary": {
40
+ "claim_allowed_field": "public_claim_readiness.claim_allowed",
41
+ "fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
42
+ "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
43
+ "hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
44
+ "id": "public_claim_readiness_authoritative_release_gate",
45
+ "reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
46
+ "reporting_only": true,
47
+ "requires_confidence_and_failure_notes": true,
48
+ "requires_matched_successful_tasks": true,
49
+ "requires_provider_export_provenance": true,
50
+ "requires_provider_measured_tokens_and_cost": true,
51
+ "requires_quality_non_inferiority": true,
52
+ "requires_shifted_cost_accounting": true,
53
+ "unsupported_claims_forbidden": true
54
+ },
55
+ "gates": [
56
+ {
57
+ "evidence": {
58
+ "comparison_count": 1,
59
+ "matched_pair_count": 0,
60
+ "min_matched_successful_task_count": 1.0,
61
+ "missing_baseline_success_task_count": 0,
62
+ "variants": [
63
+ "cache_layout_check"
64
+ ]
65
+ },
66
+ "id": "matched_successful_tasks",
67
+ "label": "Matched successful tasks",
68
+ "passed": false,
69
+ "reason": "missing_or_regressed_matched_successful_tasks",
70
+ "required": true,
71
+ "status": "fail"
72
+ },
73
+ {
74
+ "evidence": {
75
+ "matched_pair_count": 0,
76
+ "required_fields": [
77
+ "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
78
+ "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
79
+ "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
80
+ "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
81
+ ]
82
+ },
83
+ "id": "provider_measured_token_cost",
84
+ "label": "Provider-measured token and primary cost",
85
+ "passed": false,
86
+ "reason": "missing_provider_measured_primary_tokens_or_cost",
87
+ "required": true,
88
+ "status": "fail"
89
+ },
90
+ {
91
+ "evidence": {
92
+ "max_corrections_delta_per_successful_task": 0.0,
93
+ "max_failure_rate_delta_pp": 0.0,
94
+ "quality_gates": [
95
+ "pass"
96
+ ]
97
+ },
98
+ "id": "quality_non_inferiority",
99
+ "label": "Quality non-inferiority",
100
+ "passed": true,
101
+ "reason": "all_quality_gates_pass",
102
+ "required": true,
103
+ "status": "pass"
104
+ },
105
+ {
106
+ "evidence": {
107
+ "matched_pair_count": 0,
108
+ "required_fields": [
109
+ "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
110
+ "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
111
+ "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
112
+ ]
113
+ },
114
+ "id": "shifted_cost_accounting",
115
+ "label": "Shifted-cost accounting",
116
+ "passed": false,
117
+ "reason": "missing_shifted_cost_claim_accounting",
118
+ "required": true,
119
+ "status": "fail"
120
+ },
121
+ {
122
+ "evidence": {
123
+ "comparison_failure_fields_present": true,
124
+ "explicit_note_count": 0,
125
+ "failed_row_count": 0,
126
+ "failed_rows_with_notes": 0,
127
+ "replay_row_count": 0
128
+ },
129
+ "id": "confidence_failure_notes",
130
+ "label": "Confidence and failure notes",
131
+ "passed": false,
132
+ "reason": "missing_explicit_replay_notes_or_failure_evidence",
133
+ "required": true,
134
+ "status": "unknown"
135
+ },
136
+ {
137
+ "evidence": {
138
+ "mixed_csv": false,
139
+ "provider_names": [],
140
+ "replay_row_count": 0,
141
+ "report_row_count": 2,
142
+ "same_run_complete": false,
143
+ "source_types": []
144
+ },
145
+ "id": "provider_export_provenance",
146
+ "label": "Provider-export provenance",
147
+ "passed": false,
148
+ "reason": "missing_or_mixed_provider_export_provenance",
149
+ "required": true,
150
+ "status": "unknown"
151
+ }
152
+ ],
153
+ "generated_from": "matched_pair_evidence_and_replay_provenance",
154
+ "passed_required_gate_count": 1,
155
+ "public_claim_eligible_observed": null,
156
+ "public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
157
+ "raw_metric_claim_status_observed": "compare_variants",
158
+ "reason": "replay_evidence_required_for_public_claim",
159
+ "required_gate_count": 6,
160
+ "required_gate_ids": [
161
+ "matched_successful_tasks",
162
+ "provider_measured_token_cost",
163
+ "quality_non_inferiority",
164
+ "shifted_cost_accounting",
165
+ "confidence_failure_notes",
166
+ "provider_export_provenance"
167
+ ],
168
+ "schema_version": "contextguard.bench.public-claim-readiness.v1",
169
+ "status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
170
+ },
4
171
  "row_count": 2,
172
+ "schema": "context-guard-bench-report-v1",
5
173
  "summary_by_variant": {
6
174
  "baseline": {
7
- "runs": 1,
8
- "successful_runs": 1,
175
+ "artifacts_used_per_successful_task": 0.0,
176
+ "artifacts_used_successful": 0,
177
+ "byte_reduction_ratio": null,
178
+ "byte_savings_pct": null,
179
+ "bytes_after_successful": 0,
180
+ "bytes_before_successful": 0,
181
+ "bytes_saved_per_successful_task": null,
182
+ "bytes_saved_successful": null,
183
+ "compression_strategy": "baseline",
184
+ "corrections_per_successful_task": 0.0,
185
+ "corrections_successful": 0,
186
+ "external_cost_successful_usd": 0.0,
187
+ "external_cost_unknown_successful": 1,
188
+ "external_tokens_measured_successful": 0,
189
+ "external_tokens_per_successful_task": null,
190
+ "external_tokens_successful": 0,
9
191
  "failed_runs": 0,
10
- "total_tokens_all_runs": 1200,
11
- "primary_tokens_measured_runs": 1,
192
+ "failure_rate": 0.0,
193
+ "hook_triggers_successful": 0,
194
+ "is_baseline_strategy": true,
195
+ "observed_telemetry": {
196
+ "byte_savings": "unavailable",
197
+ "external_tokens": "unavailable",
198
+ "primary_cost": "unavailable",
199
+ "provider_cache": "observed",
200
+ "token_proxy": "unavailable",
201
+ "tokens": "observed",
202
+ "wall_time": "observed"
203
+ },
12
204
  "primary_cost_all_runs_usd": 0.0,
13
205
  "primary_cost_measured_runs": 0,
14
- "wall_time_seconds_all_runs": 10.0,
15
- "wall_time_seconds_measured_runs": 1,
206
+ "primary_cost_measured_successful": 0,
207
+ "primary_cost_per_successful_task_usd": null,
208
+ "primary_cost_per_task_including_failures_usd": null,
209
+ "primary_cost_successful_usd": 0.0,
210
+ "primary_tokens_measured_runs": 1,
211
+ "primary_tokens_measured_successful": 1,
16
212
  "provider_cached_tokens_all_runs": 0,
17
213
  "provider_cached_tokens_measured_runs": 1,
18
- "total_cost_with_shift_all_runs_usd": 0.0,
19
- "total_cost_with_shift_measured_runs": 0,
20
- "total_tokens_successful": 1200,
21
- "primary_tokens_measured_successful": 1,
22
- "primary_cost_successful_usd": 0.0,
23
- "primary_cost_measured_successful": 0,
24
- "wall_time_seconds_successful": 10.0,
25
- "wall_time_seconds_measured_successful": 1,
26
- "provider_cached_tokens_successful": 0,
27
214
  "provider_cached_tokens_measured_successful": 1,
28
- "external_cost_successful_usd": 0.0,
29
- "external_cost_unknown_successful": 1,
30
- "total_cost_with_shift_successful_usd": 0.0,
31
- "total_cost_with_shift_measured_successful": 0,
32
- "external_tokens_successful": 0,
33
- "external_tokens_measured_successful": 0,
34
- "artifacts_used_successful": 0,
35
- "corrections_successful": 0,
36
- "bytes_before_successful": 0,
37
- "bytes_after_successful": 0,
38
- "turns_successful": 0,
39
- "hook_triggers_successful": 0,
40
- "failure_rate": 0.0,
41
- "task_count": 1,
215
+ "provider_cached_tokens_per_successful_task": 0.0,
216
+ "provider_cached_tokens_per_task_including_failures": 0.0,
217
+ "provider_cached_tokens_successful": 0,
218
+ "runs": 1,
219
+ "successful_runs": 1,
42
220
  "successful_task_count": 1,
221
+ "task_count": 1,
222
+ "token_proxy_saved_per_successful_task": null,
223
+ "token_proxy_saved_successful": null,
224
+ "tokens_per_successful_task": 1200.0,
43
225
  "tokens_per_task_including_failures": 1200.0,
44
- "wall_time_seconds_per_task_including_failures": 10.0,
45
- "provider_cached_tokens_per_task_including_failures": 0.0,
46
- "primary_cost_per_task_including_failures_usd": null,
226
+ "total_cost_with_shift_all_runs_usd": 0.0,
227
+ "total_cost_with_shift_measured_runs": 0,
228
+ "total_cost_with_shift_measured_successful": 0,
229
+ "total_cost_with_shift_per_successful_task_usd": null,
47
230
  "total_cost_with_shift_per_task_including_failures_usd": null,
48
- "tokens_per_successful_task": 1200.0,
231
+ "total_cost_with_shift_successful_usd": 0.0,
232
+ "total_tokens_all_runs": 1200,
233
+ "total_tokens_successful": 1200,
234
+ "turns_successful": 0,
235
+ "wall_time_seconds_all_runs": 10.0,
236
+ "wall_time_seconds_measured_runs": 1,
237
+ "wall_time_seconds_measured_successful": 1,
49
238
  "wall_time_seconds_per_successful_task": 10.0,
50
- "provider_cached_tokens_per_successful_task": 0.0,
51
- "primary_cost_per_successful_task_usd": null,
52
- "total_cost_with_shift_per_successful_task_usd": null,
53
- "external_tokens_per_successful_task": null,
239
+ "wall_time_seconds_per_task_including_failures": 10.0,
240
+ "wall_time_seconds_successful": 10.0
241
+ },
242
+ "cache_layout_check": {
54
243
  "artifacts_used_per_successful_task": 0.0,
55
- "corrections_per_successful_task": 0.0,
244
+ "artifacts_used_successful": 0,
56
245
  "byte_reduction_ratio": null,
57
- "compression_strategy": "baseline",
58
- "is_baseline_strategy": true,
59
- "bytes_saved_successful": null,
60
- "bytes_saved_per_successful_task": null,
61
246
  "byte_savings_pct": null,
62
- "token_proxy_saved_successful": null,
63
- "token_proxy_saved_per_successful_task": null,
247
+ "bytes_after_successful": 0,
248
+ "bytes_before_successful": 0,
249
+ "bytes_saved_per_successful_task": null,
250
+ "bytes_saved_successful": null,
251
+ "compression_strategy": "cache_layout_check",
252
+ "corrections_per_successful_task": 0.0,
253
+ "corrections_successful": 0,
254
+ "external_cost_successful_usd": 0.0,
255
+ "external_cost_unknown_successful": 1,
256
+ "external_tokens_measured_successful": 0,
257
+ "external_tokens_per_successful_task": null,
258
+ "external_tokens_successful": 0,
259
+ "failed_runs": 0,
260
+ "failure_rate": 0.0,
261
+ "hook_triggers_successful": 0,
262
+ "is_baseline_strategy": false,
64
263
  "observed_telemetry": {
65
- "tokens": "observed",
66
- "primary_cost": "unavailable",
67
- "external_tokens": "unavailable",
68
264
  "byte_savings": "unavailable",
265
+ "external_tokens": "unavailable",
266
+ "primary_cost": "unavailable",
267
+ "provider_cache": "observed",
69
268
  "token_proxy": "unavailable",
70
- "wall_time": "observed",
71
- "provider_cache": "observed"
72
- }
73
- },
74
- "cache_layout_check": {
75
- "runs": 1,
76
- "successful_runs": 1,
77
- "failed_runs": 0,
78
- "total_tokens_all_runs": 1200,
79
- "primary_tokens_measured_runs": 1,
269
+ "tokens": "observed",
270
+ "wall_time": "observed"
271
+ },
80
272
  "primary_cost_all_runs_usd": 0.0,
81
273
  "primary_cost_measured_runs": 0,
82
- "wall_time_seconds_all_runs": 10.0,
83
- "wall_time_seconds_measured_runs": 1,
274
+ "primary_cost_measured_successful": 0,
275
+ "primary_cost_per_successful_task_usd": null,
276
+ "primary_cost_per_task_including_failures_usd": null,
277
+ "primary_cost_successful_usd": 0.0,
278
+ "primary_tokens_measured_runs": 1,
279
+ "primary_tokens_measured_successful": 1,
84
280
  "provider_cached_tokens_all_runs": 900,
85
281
  "provider_cached_tokens_measured_runs": 1,
86
- "total_cost_with_shift_all_runs_usd": 0.0,
87
- "total_cost_with_shift_measured_runs": 0,
88
- "total_tokens_successful": 1200,
89
- "primary_tokens_measured_successful": 1,
90
- "primary_cost_successful_usd": 0.0,
91
- "primary_cost_measured_successful": 0,
92
- "wall_time_seconds_successful": 10.0,
93
- "wall_time_seconds_measured_successful": 1,
94
- "provider_cached_tokens_successful": 900,
95
282
  "provider_cached_tokens_measured_successful": 1,
96
- "external_cost_successful_usd": 0.0,
97
- "external_cost_unknown_successful": 1,
98
- "total_cost_with_shift_successful_usd": 0.0,
99
- "total_cost_with_shift_measured_successful": 0,
100
- "external_tokens_successful": 0,
101
- "external_tokens_measured_successful": 0,
102
- "artifacts_used_successful": 0,
103
- "corrections_successful": 0,
104
- "bytes_before_successful": 0,
105
- "bytes_after_successful": 0,
106
- "turns_successful": 0,
107
- "hook_triggers_successful": 0,
108
- "failure_rate": 0.0,
109
- "task_count": 1,
283
+ "provider_cached_tokens_per_successful_task": 900.0,
284
+ "provider_cached_tokens_per_task_including_failures": 900.0,
285
+ "provider_cached_tokens_successful": 900,
286
+ "runs": 1,
287
+ "successful_runs": 1,
110
288
  "successful_task_count": 1,
289
+ "task_count": 1,
290
+ "token_proxy_saved_per_successful_task": null,
291
+ "token_proxy_saved_successful": null,
292
+ "tokens_per_successful_task": 1200.0,
111
293
  "tokens_per_task_including_failures": 1200.0,
112
- "wall_time_seconds_per_task_including_failures": 10.0,
113
- "provider_cached_tokens_per_task_including_failures": 900.0,
114
- "primary_cost_per_task_including_failures_usd": null,
294
+ "total_cost_with_shift_all_runs_usd": 0.0,
295
+ "total_cost_with_shift_measured_runs": 0,
296
+ "total_cost_with_shift_measured_successful": 0,
297
+ "total_cost_with_shift_per_successful_task_usd": null,
115
298
  "total_cost_with_shift_per_task_including_failures_usd": null,
116
- "tokens_per_successful_task": 1200.0,
299
+ "total_cost_with_shift_successful_usd": 0.0,
300
+ "total_tokens_all_runs": 1200,
301
+ "total_tokens_successful": 1200,
302
+ "turns_successful": 0,
303
+ "wall_time_seconds_all_runs": 10.0,
304
+ "wall_time_seconds_measured_runs": 1,
305
+ "wall_time_seconds_measured_successful": 1,
117
306
  "wall_time_seconds_per_successful_task": 10.0,
118
- "provider_cached_tokens_per_successful_task": 900.0,
119
- "primary_cost_per_successful_task_usd": null,
120
- "total_cost_with_shift_per_successful_task_usd": null,
121
- "external_tokens_per_successful_task": null,
122
- "artifacts_used_per_successful_task": 0.0,
123
- "corrections_per_successful_task": 0.0,
124
- "byte_reduction_ratio": null,
125
- "compression_strategy": "cache_layout_check",
126
- "is_baseline_strategy": false,
127
- "bytes_saved_successful": null,
128
- "bytes_saved_per_successful_task": null,
129
- "byte_savings_pct": null,
130
- "token_proxy_saved_successful": null,
131
- "token_proxy_saved_per_successful_task": null,
132
- "observed_telemetry": {
133
- "tokens": "observed",
134
- "primary_cost": "unavailable",
135
- "external_tokens": "unavailable",
136
- "byte_savings": "unavailable",
137
- "token_proxy": "unavailable",
138
- "wall_time": "observed",
139
- "provider_cache": "observed"
140
- }
141
- }
142
- },
143
- "comparisons": [
144
- {
145
- "variant": "cache_layout_check",
146
- "baseline_variant": "baseline",
147
- "quality_gate": "pass",
148
- "baseline_failure_rate": 0.0,
149
- "variant_failure_rate": 0.0,
150
- "failure_rate_delta_pp": 0.0,
151
- "matched_successful_task_count": 1,
152
- "baseline_successful_task_count": 1,
153
- "missing_baseline_success_tasks": [],
154
- "baseline_corrections_per_successful_task": 0.0,
155
- "variant_corrections_per_successful_task": 0.0,
156
- "paired_corrections_task_count": 1,
157
- "corrections_delta_per_successful_task": 0.0,
158
- "token_delta_per_successful_task": 0.0,
159
- "token_savings_pct": 0.0,
160
- "paired_token_task_count": 1,
161
- "wall_time_delta_seconds_per_successful_task": 0.0,
162
- "wall_time_change_pct": 0.0,
163
- "paired_wall_time_task_count": 1,
164
- "cost_savings_pct_with_shift": null,
165
- "paired_cost_task_count": 0
307
+ "wall_time_seconds_per_task_including_failures": 10.0,
308
+ "wall_time_seconds_successful": 10.0
166
309
  }
167
- ],
168
- "claim_status": "compare_variants",
169
- "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
310
+ }
170
311
  }
@@ -25,11 +25,11 @@ context-guard setup --agent claude --scope user --verify --json
25
25
  context-guard setup --agent claude --scope user --plan
26
26
  ```
27
27
 
28
- Project scope is the default. `context-guard doctor` and `context-guard setup --verify` are read-only health checks. User scope is opt-in and requires an explicit agent for writes. Supported user-scope writes record backups and rollback metadata under `.context-guard/rollback` in the user home directory.
28
+ Project scope is the default. `context-guard doctor` and `context-guard setup --verify` are read-only health checks. User scope is opt-in and requires an explicit agent for writes. Supported user-scope writes record backups and rollback metadata under `.context-guard/rollback` in the user home directory. Setup resolves packaged/checkout helpers first; `PATH` helper fallback is off by default, must be enabled explicitly with `--allow-path-helper-fallback`, is intended only for trusted installs, and is used only after canonical executable and identity validation.
29
29
 
30
30
  ## Runtime requirements
31
31
 
32
- The helpers are Python/shell scripts packaged through npm and Homebrew. Supported machines need:
32
+ The helpers are Python/shell scripts packaged through npm and Homebrew as plugin-local `plugins/context-guard/bin` entrypoints plus `plugins/context-guard/lib` helpers; checkout-only `context-guard-kit` sources are not duplicated in the npm tarball. Supported machines need:
33
33
 
34
34
  - macOS or Linux
35
35
  - Python 3 available as `python3`
@@ -47,11 +47,14 @@ The helpers are Python/shell scripts packaged through npm and Homebrew. Supporte
47
47
 
48
48
  Before publishing the Homebrew tap, run the formula-specific checks locally or in CI when Homebrew is available:
49
49
 
50
+ Render or copy `packaging/homebrew/context-guard.rb.template` into a real tap formula first; replace `{{VERSION}}` with the bare semver version (for example `0.4.9`, not `v0.4.9`) and `REPLACE_WITH_RELEASE_TARBALL_SHA256` with the verified tarball SHA. Do not run Homebrew audit/install directly against the placeholder template.
51
+
50
52
  ```bash
51
- brew style packaging/homebrew/context-guard.rb
52
- brew audit --strict --new packaging/homebrew/context-guard.rb
53
- brew install --build-from-source packaging/homebrew/context-guard.rb
54
- brew test context-guard
53
+ # Example once Formula/context-guard.rb has been rendered in the tap checkout:
54
+ brew style Formula/context-guard.rb
55
+ brew audit --strict --new ictechgy/tap/context-guard
56
+ brew install --build-from-source ictechgy/tap/context-guard
57
+ brew test ictechgy/tap/context-guard
55
58
  ```
56
59
 
57
- The formula should rewrite Python shebangs to the declared Homebrew Python dependency and expose both `context-guard` and legacy compatibility wrappers from `plugins/context-guard/bin`.
60
+ The rendered formula should rewrite Python shebangs to the declared Homebrew Python dependency and expose both `context-guard` and legacy compatibility wrappers from `plugins/context-guard/bin`.
@@ -1,6 +1,6 @@
1
1
  # Experimental benchmark fixtures
2
2
 
3
- These fixtures are **fixture-only** starter scaffolds for future visual/OCR, learned-compression, and reversible output-transform experiments. They are **synthetic**, package-visible examples for `context-guard-bench` task and variant shapes; they are **not shipped benchmark results**, not OCR/compression implementations, and not hosted API savings claims.
3
+ These fixtures are **fixture-only** starter scaffolds for future visual/OCR, learned-compression, reversible output-transform, and token-savings roadmap experiments. They are **synthetic**, package-visible examples for `context-guard-bench` task and variant shapes; they are **not shipped benchmark results**, not OCR/compression implementations, not cache/tool-deferral implementations, and not hosted API savings claims.
4
4
 
5
5
  Use them when designing an experiment that starts from ContextGuard's existing benchmark discipline:
6
6
 
@@ -12,6 +12,23 @@ Use them when designing an experiment that starts from ContextGuard's existing b
12
12
  5. Treat byte counts, image dimensions, OCR confidence, and local compressor ratios as proxy evidence. Real token/cost claims require **provider-measured** primary token/cost fields on both sides.
13
13
  6. Keep private screenshots, raw secrets, and external service endpoints out of fixture files.
14
14
 
15
+ ## Local replay evidence
16
+
17
+ `context-guard-bench --evidence-jsonl <path>` can replay pre-recorded run evidence into the normal CSV/report pipeline without invoking `claude` or any task `success_command`. Pair it with `--report-json` and `--dashboard-md` to regenerate a deterministic local dashboard:
18
+
19
+ ```bash
20
+ context-guard-bench \
21
+ --tasks docs/benchmark-fixtures/token-savings-12task.tasks.example.json \
22
+ --variants docs/benchmark-fixtures/token-savings-12task.variants.example.json \
23
+ --evidence-jsonl docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl \
24
+ --csv /tmp/contextguard-token-savings.csv \
25
+ --report-json /tmp/contextguard-token-savings.report.json \
26
+ --dashboard-md /tmp/contextguard-token-savings.dashboard.md \
27
+ --baseline-variant baseline_full_context_fixture
28
+ ```
29
+
30
+ The included token-savings evidence file deliberately carries `synthetic_fixture` provenance. It validates replay/dashboard mechanics and byte-proxy reporting only: replay forces synthetic/manual rows to `primary_tokens_measured=false` and `cost_measured=false`, so it is not public hosted API token/cost savings evidence even when token-looking numbers are present. A public claim still requires matched successful tasks, provider-export provenance, provider-measured primary tokens/cost, quality non-inferiority, and shifted-cost accounting.
31
+
15
32
  ## Runner-native variant prompt files
16
33
 
17
34
  `context-guard-bench` supports optional file-backed `variant_prompt_files` in task fixtures. The map is keyed by variant name and lets a single logical task swap sanitized prompt evidence per variant, for example a baseline raw-output prompt versus a digest plus artifact receipt prompt. Prompt files are resolved relative to the task JSON, must be relative paths, and are read with the same no-follow/symlink-safe posture as task and variant fixtures.
@@ -20,11 +37,12 @@ This runner-native swap only proves command shape and prompt selection until the
20
37
 
21
38
  ## Included fixture sets
22
39
 
23
- | Fixture set | Task file | Variant file | Intended future experiment |
24
- | --- | --- | --- | --- |
25
- | Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
26
- | Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
27
- | Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
40
+ | Fixture set | Task file | Variant file | Evidence replay file | Intended future experiment |
41
+ | --- | --- | --- | --- | --- |
42
+ | Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | n/a | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
43
+ | Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | n/a | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
44
+ | Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | n/a | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
45
+ | Token-savings 12-task roadmap | [`benchmark-fixtures/token-savings-12task.tasks.example.json`](benchmark-fixtures/token-savings-12task.tasks.example.json) | [`benchmark-fixtures/token-savings-12task.variants.example.json`](benchmark-fixtures/token-savings-12task.variants.example.json) | [`benchmark-fixtures/token-savings-12task.evidence.example.jsonl`](benchmark-fixtures/token-savings-12task.evidence.example.jsonl) | Exercise a canonical 12-task spread for bugfix, exploration, review, log analysis, migration, docs, refactor, performance, telemetry, cache layout, tool-schema deferral, and artifact receipt experiments after real success commands and provider telemetry are supplied. |
28
46
 
29
47
  ## Visual/OCR fixture notes
30
48
 
@@ -38,6 +56,12 @@ The learned-compression fixtures describe already-sanitized context-pack or arti
38
56
 
39
57
  The output-transform fixtures describe already-sanitized command output comparisons and now demonstrate `variant_prompt_files` for raw sanitized output versus digest plus artifact receipt prompt evidence. They do not execute `context-guard-trim-output`, store artifacts, call `context-guard-artifact`, or invoke a provider. Future experiments should compare raw sanitized output against `--digest` output plus an `--artifact-receipt`, verify the receipt's exact re-expand command retrieves the omitted sanitized lines, and record bytes before/after, primary provider tokens, cost, success, corrections, artifact-store usage, and any external/local processing cost.
40
58
 
59
+ ## Token-savings 12-task roadmap fixture notes
60
+
61
+ The token-savings 12-task fixtures are a canonical **fixture-only** spread for roadmap-level A/B design. They demonstrate `variant_prompt_files` for a baseline full-context prompt versus a ContextGuard advisory-foundations prompt that may later include cache layout lint, core-vs-deferred tool schemas, artifact receipts, and claim-safe telemetry. They do not execute `context-guard-cache-score`, `context-guard-tool-prune`, or any provider call. The companion `token-savings-12task.evidence.example.jsonl` lets users replay deterministic synthetic rows into CSV/report/dashboard outputs while preserving the same non-claim boundary.
62
+
63
+ For real non-dry-run experiments, replace every placeholder `success_command`, keep task IDs matched across baseline and candidate variants, and require provider-measured primary token/cost data before interpreting `tokens_per_successful_task`, `total_cost_with_shift_usd`, or `external_cost_usd`. Cache predictions, char/4 token proxies, local latency, and byte reductions remain diagnostic proxy evidence unless the generated report contains matched successful task evidence and stays within the 10-percentage-point failure-rate guardrail.
64
+
41
65
  ## Safe wording
42
66
 
43
67
  Use language like:
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@ictechgy/context-guard",
3
- "version": "0.4.9",
3
+ "version": "0.4.11",
4
4
  "description": "ContextGuard CLI helpers for keeping AI coding agent context focused and local-first.",
5
5
  "license": "Apache-2.0",
6
6
  "homepage": "https://github.com/ictechgy/context-guard#readme",
@@ -32,7 +32,8 @@
32
32
  "context-guard-failed-nudge": "plugins/context-guard/bin/context-guard-failed-nudge",
33
33
  "context-guard-statusline": "plugins/context-guard/bin/context-guard-statusline",
34
34
  "context-guard-statusline-merged": "plugins/context-guard/bin/context-guard-statusline-merged",
35
- "context-guard-cost": "plugins/context-guard/bin/context-guard-cost"
35
+ "context-guard-cost": "plugins/context-guard/bin/context-guard-cost",
36
+ "context-guard-cache-score": "plugins/context-guard/bin/context-guard-cache-score"
36
37
  },
37
38
  "files": [
38
39
  "CHANGELOG.md",
@@ -40,10 +41,6 @@
40
41
  "NOTICE",
41
42
  "README.md",
42
43
  "README.ko.md",
43
- "context-guard-kit/*.py",
44
- "context-guard-kit/*.sh",
45
- "context-guard-kit/README.md",
46
- "context-guard-kit/settings.example.json",
47
44
  "plugins/context-guard/.claude-plugin/plugin.json",
48
45
  "plugins/context-guard/README.md",
49
46
  "plugins/context-guard/README.ko.md",
@@ -62,6 +59,7 @@
62
59
  "docs/benchmark-workflows/*.example.jsonl",
63
60
  "docs/benchmark-workflow-examples.md",
64
61
  "docs/benchmark-fixtures/*.example.json",
62
+ "docs/benchmark-fixtures/*.example.jsonl",
65
63
  "docs/benchmark-fixtures/*.prompt.example.md",
66
64
  "docs/experimental-benchmark-fixtures.md",
67
65
  "packaging/homebrew/context-guard.rb.template"
@@ -5,7 +5,7 @@ class ContextGuard < Formula
5
5
 
6
6
  desc "Local-first context guardrails for AI coding agents"
7
7
  homepage "https://github.com/ictechgy/context-guard"
8
- url "https://github.com/ictechgy/context-guard/archive/refs/tags/v0.4.8.tar.gz"
8
+ url "https://github.com/ictechgy/context-guard/archive/refs/tags/v{{VERSION}}.tar.gz"
9
9
  sha256 "REPLACE_WITH_RELEASE_TARBALL_SHA256"
10
10
  license "Apache-2.0"
11
11
 
@@ -37,5 +37,5 @@
37
37
  "gated-experiments",
38
38
  "future-roadmap"
39
39
  ],
40
- "version": "0.4.9"
40
+ "version": "0.4.11"
41
41
  }