@ictechgy/context-guard 0.4.9 → 0.4.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/README.ko.md +59 -31
- package/README.md +85 -36
- package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
- package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
- package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
- package/docs/benchmark-workflow-examples.md +3 -0
- package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
- package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
- package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
- package/docs/distribution.md +10 -7
- package/docs/experimental-benchmark-fixtures.md +30 -6
- package/package.json +4 -6
- package/packaging/homebrew/context-guard.rb.template +1 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +20 -14
- package/plugins/context-guard/README.md +26 -17
- package/plugins/context-guard/bin/context-guard +147 -25
- package/plugins/context-guard/bin/context-guard-artifact +884 -79
- package/plugins/context-guard/bin/context-guard-audit +33 -2
- package/plugins/context-guard/bin/context-guard-bench +1542 -31
- package/plugins/context-guard/bin/context-guard-cache-score +665 -0
- package/plugins/context-guard/bin/context-guard-compress +146 -1
- package/plugins/context-guard/bin/context-guard-cost +790 -6
- package/plugins/context-guard/bin/context-guard-experiments +463 -26
- package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
- package/plugins/context-guard/bin/context-guard-filter +163 -7
- package/plugins/context-guard/bin/context-guard-guard-read +3 -0
- package/plugins/context-guard/bin/context-guard-pack +892 -49
- package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
- package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
- package/plugins/context-guard/bin/context-guard-setup +165 -31
- package/plugins/context-guard/bin/context-guard-statusline +490 -283
- package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
- package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
- package/plugins/context-guard/bin/context-guard-trim-output +288 -41
- package/plugins/context-guard/brief/README.md +5 -5
- package/plugins/context-guard/lib/context_guard_commands.py +230 -0
- package/plugins/context-guard/skills/setup/SKILL.md +1 -0
- package/context-guard-kit/README.md +0 -91
- package/context-guard-kit/benchmark_runner.py +0 -2401
- package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
- package/context-guard-kit/context_compress.py +0 -695
- package/context-guard-kit/context_escrow.py +0 -935
- package/context-guard-kit/context_filter.py +0 -637
- package/context-guard-kit/context_guard_cli.py +0 -325
- package/context-guard-kit/context_guard_diet.py +0 -1711
- package/context-guard-kit/context_pack.py +0 -2713
- package/context-guard-kit/cost_guard.py +0 -2349
- package/context-guard-kit/experimental_registry.py +0 -4348
- package/context-guard-kit/failed_attempt_nudge.py +0 -567
- package/context-guard-kit/guard_large_read.py +0 -690
- package/context-guard-kit/hook_secret_patterns.py +0 -43
- package/context-guard-kit/read_symbol.py +0 -483
- package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
- package/context-guard-kit/sanitize_output.py +0 -725
- package/context-guard-kit/settings.example.json +0 -67
- package/context-guard-kit/setup_wizard.py +0 -2515
- package/context-guard-kit/statusline.sh +0 -362
- package/context-guard-kit/statusline_merged.sh +0 -157
- package/context-guard-kit/tool_schema_pruner.py +0 -837
- package/context-guard-kit/trim_command_output.py +0 -1449
|
@@ -1,170 +1,311 @@
|
|
|
1
1
|
{
|
|
2
|
-
"schema": "context-guard-bench-report-v1",
|
|
3
2
|
"baseline_variant": "baseline",
|
|
3
|
+
"caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
|
|
4
|
+
"claim_status": "compare_variants",
|
|
5
|
+
"comparisons": [
|
|
6
|
+
{
|
|
7
|
+
"baseline_corrections_per_successful_task": 0.0,
|
|
8
|
+
"baseline_failure_rate": 0.0,
|
|
9
|
+
"baseline_successful_task_count": 1,
|
|
10
|
+
"baseline_variant": "baseline",
|
|
11
|
+
"corrections_delta_per_successful_task": 0.0,
|
|
12
|
+
"cost_savings_pct_with_shift": null,
|
|
13
|
+
"failure_rate_delta_pp": 0.0,
|
|
14
|
+
"matched_successful_task_count": 1,
|
|
15
|
+
"missing_baseline_success_tasks": [],
|
|
16
|
+
"paired_corrections_task_count": 1,
|
|
17
|
+
"paired_cost_task_count": 0,
|
|
18
|
+
"paired_token_task_count": 1,
|
|
19
|
+
"paired_wall_time_task_count": 1,
|
|
20
|
+
"quality_gate": "pass",
|
|
21
|
+
"token_delta_per_successful_task": 0.0,
|
|
22
|
+
"token_savings_pct": 0.0,
|
|
23
|
+
"variant": "cache_layout_check",
|
|
24
|
+
"variant_corrections_per_successful_task": 0.0,
|
|
25
|
+
"variant_failure_rate": 0.0,
|
|
26
|
+
"wall_time_change_pct": 0.0,
|
|
27
|
+
"wall_time_delta_seconds_per_successful_task": 0.0
|
|
28
|
+
}
|
|
29
|
+
],
|
|
30
|
+
"public_claim_readiness": {
|
|
31
|
+
"blocking_gate_ids": [
|
|
32
|
+
"matched_successful_tasks",
|
|
33
|
+
"provider_measured_token_cost",
|
|
34
|
+
"shifted_cost_accounting",
|
|
35
|
+
"confidence_failure_notes",
|
|
36
|
+
"provider_export_provenance"
|
|
37
|
+
],
|
|
38
|
+
"claim_allowed": false,
|
|
39
|
+
"claim_boundary": {
|
|
40
|
+
"claim_allowed_field": "public_claim_readiness.claim_allowed",
|
|
41
|
+
"fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
|
|
42
|
+
"hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
|
|
43
|
+
"hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
|
|
44
|
+
"id": "public_claim_readiness_authoritative_release_gate",
|
|
45
|
+
"reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
|
|
46
|
+
"reporting_only": true,
|
|
47
|
+
"requires_confidence_and_failure_notes": true,
|
|
48
|
+
"requires_matched_successful_tasks": true,
|
|
49
|
+
"requires_provider_export_provenance": true,
|
|
50
|
+
"requires_provider_measured_tokens_and_cost": true,
|
|
51
|
+
"requires_quality_non_inferiority": true,
|
|
52
|
+
"requires_shifted_cost_accounting": true,
|
|
53
|
+
"unsupported_claims_forbidden": true
|
|
54
|
+
},
|
|
55
|
+
"gates": [
|
|
56
|
+
{
|
|
57
|
+
"evidence": {
|
|
58
|
+
"comparison_count": 1,
|
|
59
|
+
"matched_pair_count": 0,
|
|
60
|
+
"min_matched_successful_task_count": 1.0,
|
|
61
|
+
"missing_baseline_success_task_count": 0,
|
|
62
|
+
"variants": [
|
|
63
|
+
"cache_layout_check"
|
|
64
|
+
]
|
|
65
|
+
},
|
|
66
|
+
"id": "matched_successful_tasks",
|
|
67
|
+
"label": "Matched successful tasks",
|
|
68
|
+
"passed": false,
|
|
69
|
+
"reason": "missing_or_regressed_matched_successful_tasks",
|
|
70
|
+
"required": true,
|
|
71
|
+
"status": "fail"
|
|
72
|
+
},
|
|
73
|
+
{
|
|
74
|
+
"evidence": {
|
|
75
|
+
"matched_pair_count": 0,
|
|
76
|
+
"required_fields": [
|
|
77
|
+
"matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
|
|
78
|
+
"matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
|
|
79
|
+
"matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
|
|
80
|
+
"matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
|
|
81
|
+
]
|
|
82
|
+
},
|
|
83
|
+
"id": "provider_measured_token_cost",
|
|
84
|
+
"label": "Provider-measured token and primary cost",
|
|
85
|
+
"passed": false,
|
|
86
|
+
"reason": "missing_provider_measured_primary_tokens_or_cost",
|
|
87
|
+
"required": true,
|
|
88
|
+
"status": "fail"
|
|
89
|
+
},
|
|
90
|
+
{
|
|
91
|
+
"evidence": {
|
|
92
|
+
"max_corrections_delta_per_successful_task": 0.0,
|
|
93
|
+
"max_failure_rate_delta_pp": 0.0,
|
|
94
|
+
"quality_gates": [
|
|
95
|
+
"pass"
|
|
96
|
+
]
|
|
97
|
+
},
|
|
98
|
+
"id": "quality_non_inferiority",
|
|
99
|
+
"label": "Quality non-inferiority",
|
|
100
|
+
"passed": true,
|
|
101
|
+
"reason": "all_quality_gates_pass",
|
|
102
|
+
"required": true,
|
|
103
|
+
"status": "pass"
|
|
104
|
+
},
|
|
105
|
+
{
|
|
106
|
+
"evidence": {
|
|
107
|
+
"matched_pair_count": 0,
|
|
108
|
+
"required_fields": [
|
|
109
|
+
"matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
|
|
110
|
+
"matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
|
|
111
|
+
"matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
|
|
112
|
+
]
|
|
113
|
+
},
|
|
114
|
+
"id": "shifted_cost_accounting",
|
|
115
|
+
"label": "Shifted-cost accounting",
|
|
116
|
+
"passed": false,
|
|
117
|
+
"reason": "missing_shifted_cost_claim_accounting",
|
|
118
|
+
"required": true,
|
|
119
|
+
"status": "fail"
|
|
120
|
+
},
|
|
121
|
+
{
|
|
122
|
+
"evidence": {
|
|
123
|
+
"comparison_failure_fields_present": true,
|
|
124
|
+
"explicit_note_count": 0,
|
|
125
|
+
"failed_row_count": 0,
|
|
126
|
+
"failed_rows_with_notes": 0,
|
|
127
|
+
"replay_row_count": 0
|
|
128
|
+
},
|
|
129
|
+
"id": "confidence_failure_notes",
|
|
130
|
+
"label": "Confidence and failure notes",
|
|
131
|
+
"passed": false,
|
|
132
|
+
"reason": "missing_explicit_replay_notes_or_failure_evidence",
|
|
133
|
+
"required": true,
|
|
134
|
+
"status": "unknown"
|
|
135
|
+
},
|
|
136
|
+
{
|
|
137
|
+
"evidence": {
|
|
138
|
+
"mixed_csv": false,
|
|
139
|
+
"provider_names": [],
|
|
140
|
+
"replay_row_count": 0,
|
|
141
|
+
"report_row_count": 2,
|
|
142
|
+
"same_run_complete": false,
|
|
143
|
+
"source_types": []
|
|
144
|
+
},
|
|
145
|
+
"id": "provider_export_provenance",
|
|
146
|
+
"label": "Provider-export provenance",
|
|
147
|
+
"passed": false,
|
|
148
|
+
"reason": "missing_or_mixed_provider_export_provenance",
|
|
149
|
+
"required": true,
|
|
150
|
+
"status": "unknown"
|
|
151
|
+
}
|
|
152
|
+
],
|
|
153
|
+
"generated_from": "matched_pair_evidence_and_replay_provenance",
|
|
154
|
+
"passed_required_gate_count": 1,
|
|
155
|
+
"public_claim_eligible_observed": null,
|
|
156
|
+
"public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
|
|
157
|
+
"raw_metric_claim_status_observed": "compare_variants",
|
|
158
|
+
"reason": "replay_evidence_required_for_public_claim",
|
|
159
|
+
"required_gate_count": 6,
|
|
160
|
+
"required_gate_ids": [
|
|
161
|
+
"matched_successful_tasks",
|
|
162
|
+
"provider_measured_token_cost",
|
|
163
|
+
"quality_non_inferiority",
|
|
164
|
+
"shifted_cost_accounting",
|
|
165
|
+
"confidence_failure_notes",
|
|
166
|
+
"provider_export_provenance"
|
|
167
|
+
],
|
|
168
|
+
"schema_version": "contextguard.bench.public-claim-readiness.v1",
|
|
169
|
+
"status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
|
|
170
|
+
},
|
|
4
171
|
"row_count": 2,
|
|
172
|
+
"schema": "context-guard-bench-report-v1",
|
|
5
173
|
"summary_by_variant": {
|
|
6
174
|
"baseline": {
|
|
7
|
-
"
|
|
8
|
-
"
|
|
175
|
+
"artifacts_used_per_successful_task": 0.0,
|
|
176
|
+
"artifacts_used_successful": 0,
|
|
177
|
+
"byte_reduction_ratio": null,
|
|
178
|
+
"byte_savings_pct": null,
|
|
179
|
+
"bytes_after_successful": 0,
|
|
180
|
+
"bytes_before_successful": 0,
|
|
181
|
+
"bytes_saved_per_successful_task": null,
|
|
182
|
+
"bytes_saved_successful": null,
|
|
183
|
+
"compression_strategy": "baseline",
|
|
184
|
+
"corrections_per_successful_task": 0.0,
|
|
185
|
+
"corrections_successful": 0,
|
|
186
|
+
"external_cost_successful_usd": 0.0,
|
|
187
|
+
"external_cost_unknown_successful": 1,
|
|
188
|
+
"external_tokens_measured_successful": 0,
|
|
189
|
+
"external_tokens_per_successful_task": null,
|
|
190
|
+
"external_tokens_successful": 0,
|
|
9
191
|
"failed_runs": 0,
|
|
10
|
-
"
|
|
11
|
-
"
|
|
192
|
+
"failure_rate": 0.0,
|
|
193
|
+
"hook_triggers_successful": 0,
|
|
194
|
+
"is_baseline_strategy": true,
|
|
195
|
+
"observed_telemetry": {
|
|
196
|
+
"byte_savings": "unavailable",
|
|
197
|
+
"external_tokens": "unavailable",
|
|
198
|
+
"primary_cost": "unavailable",
|
|
199
|
+
"provider_cache": "observed",
|
|
200
|
+
"token_proxy": "unavailable",
|
|
201
|
+
"tokens": "observed",
|
|
202
|
+
"wall_time": "observed"
|
|
203
|
+
},
|
|
12
204
|
"primary_cost_all_runs_usd": 0.0,
|
|
13
205
|
"primary_cost_measured_runs": 0,
|
|
14
|
-
"
|
|
15
|
-
"
|
|
206
|
+
"primary_cost_measured_successful": 0,
|
|
207
|
+
"primary_cost_per_successful_task_usd": null,
|
|
208
|
+
"primary_cost_per_task_including_failures_usd": null,
|
|
209
|
+
"primary_cost_successful_usd": 0.0,
|
|
210
|
+
"primary_tokens_measured_runs": 1,
|
|
211
|
+
"primary_tokens_measured_successful": 1,
|
|
16
212
|
"provider_cached_tokens_all_runs": 0,
|
|
17
213
|
"provider_cached_tokens_measured_runs": 1,
|
|
18
|
-
"total_cost_with_shift_all_runs_usd": 0.0,
|
|
19
|
-
"total_cost_with_shift_measured_runs": 0,
|
|
20
|
-
"total_tokens_successful": 1200,
|
|
21
|
-
"primary_tokens_measured_successful": 1,
|
|
22
|
-
"primary_cost_successful_usd": 0.0,
|
|
23
|
-
"primary_cost_measured_successful": 0,
|
|
24
|
-
"wall_time_seconds_successful": 10.0,
|
|
25
|
-
"wall_time_seconds_measured_successful": 1,
|
|
26
|
-
"provider_cached_tokens_successful": 0,
|
|
27
214
|
"provider_cached_tokens_measured_successful": 1,
|
|
28
|
-
"
|
|
29
|
-
"
|
|
30
|
-
"
|
|
31
|
-
"
|
|
32
|
-
"
|
|
33
|
-
"external_tokens_measured_successful": 0,
|
|
34
|
-
"artifacts_used_successful": 0,
|
|
35
|
-
"corrections_successful": 0,
|
|
36
|
-
"bytes_before_successful": 0,
|
|
37
|
-
"bytes_after_successful": 0,
|
|
38
|
-
"turns_successful": 0,
|
|
39
|
-
"hook_triggers_successful": 0,
|
|
40
|
-
"failure_rate": 0.0,
|
|
41
|
-
"task_count": 1,
|
|
215
|
+
"provider_cached_tokens_per_successful_task": 0.0,
|
|
216
|
+
"provider_cached_tokens_per_task_including_failures": 0.0,
|
|
217
|
+
"provider_cached_tokens_successful": 0,
|
|
218
|
+
"runs": 1,
|
|
219
|
+
"successful_runs": 1,
|
|
42
220
|
"successful_task_count": 1,
|
|
221
|
+
"task_count": 1,
|
|
222
|
+
"token_proxy_saved_per_successful_task": null,
|
|
223
|
+
"token_proxy_saved_successful": null,
|
|
224
|
+
"tokens_per_successful_task": 1200.0,
|
|
43
225
|
"tokens_per_task_including_failures": 1200.0,
|
|
44
|
-
"
|
|
45
|
-
"
|
|
46
|
-
"
|
|
226
|
+
"total_cost_with_shift_all_runs_usd": 0.0,
|
|
227
|
+
"total_cost_with_shift_measured_runs": 0,
|
|
228
|
+
"total_cost_with_shift_measured_successful": 0,
|
|
229
|
+
"total_cost_with_shift_per_successful_task_usd": null,
|
|
47
230
|
"total_cost_with_shift_per_task_including_failures_usd": null,
|
|
48
|
-
"
|
|
231
|
+
"total_cost_with_shift_successful_usd": 0.0,
|
|
232
|
+
"total_tokens_all_runs": 1200,
|
|
233
|
+
"total_tokens_successful": 1200,
|
|
234
|
+
"turns_successful": 0,
|
|
235
|
+
"wall_time_seconds_all_runs": 10.0,
|
|
236
|
+
"wall_time_seconds_measured_runs": 1,
|
|
237
|
+
"wall_time_seconds_measured_successful": 1,
|
|
49
238
|
"wall_time_seconds_per_successful_task": 10.0,
|
|
50
|
-
"
|
|
51
|
-
"
|
|
52
|
-
|
|
53
|
-
|
|
239
|
+
"wall_time_seconds_per_task_including_failures": 10.0,
|
|
240
|
+
"wall_time_seconds_successful": 10.0
|
|
241
|
+
},
|
|
242
|
+
"cache_layout_check": {
|
|
54
243
|
"artifacts_used_per_successful_task": 0.0,
|
|
55
|
-
"
|
|
244
|
+
"artifacts_used_successful": 0,
|
|
56
245
|
"byte_reduction_ratio": null,
|
|
57
|
-
"compression_strategy": "baseline",
|
|
58
|
-
"is_baseline_strategy": true,
|
|
59
|
-
"bytes_saved_successful": null,
|
|
60
|
-
"bytes_saved_per_successful_task": null,
|
|
61
246
|
"byte_savings_pct": null,
|
|
62
|
-
"
|
|
63
|
-
"
|
|
247
|
+
"bytes_after_successful": 0,
|
|
248
|
+
"bytes_before_successful": 0,
|
|
249
|
+
"bytes_saved_per_successful_task": null,
|
|
250
|
+
"bytes_saved_successful": null,
|
|
251
|
+
"compression_strategy": "cache_layout_check",
|
|
252
|
+
"corrections_per_successful_task": 0.0,
|
|
253
|
+
"corrections_successful": 0,
|
|
254
|
+
"external_cost_successful_usd": 0.0,
|
|
255
|
+
"external_cost_unknown_successful": 1,
|
|
256
|
+
"external_tokens_measured_successful": 0,
|
|
257
|
+
"external_tokens_per_successful_task": null,
|
|
258
|
+
"external_tokens_successful": 0,
|
|
259
|
+
"failed_runs": 0,
|
|
260
|
+
"failure_rate": 0.0,
|
|
261
|
+
"hook_triggers_successful": 0,
|
|
262
|
+
"is_baseline_strategy": false,
|
|
64
263
|
"observed_telemetry": {
|
|
65
|
-
"tokens": "observed",
|
|
66
|
-
"primary_cost": "unavailable",
|
|
67
|
-
"external_tokens": "unavailable",
|
|
68
264
|
"byte_savings": "unavailable",
|
|
265
|
+
"external_tokens": "unavailable",
|
|
266
|
+
"primary_cost": "unavailable",
|
|
267
|
+
"provider_cache": "observed",
|
|
69
268
|
"token_proxy": "unavailable",
|
|
70
|
-
"
|
|
71
|
-
"
|
|
72
|
-
}
|
|
73
|
-
},
|
|
74
|
-
"cache_layout_check": {
|
|
75
|
-
"runs": 1,
|
|
76
|
-
"successful_runs": 1,
|
|
77
|
-
"failed_runs": 0,
|
|
78
|
-
"total_tokens_all_runs": 1200,
|
|
79
|
-
"primary_tokens_measured_runs": 1,
|
|
269
|
+
"tokens": "observed",
|
|
270
|
+
"wall_time": "observed"
|
|
271
|
+
},
|
|
80
272
|
"primary_cost_all_runs_usd": 0.0,
|
|
81
273
|
"primary_cost_measured_runs": 0,
|
|
82
|
-
"
|
|
83
|
-
"
|
|
274
|
+
"primary_cost_measured_successful": 0,
|
|
275
|
+
"primary_cost_per_successful_task_usd": null,
|
|
276
|
+
"primary_cost_per_task_including_failures_usd": null,
|
|
277
|
+
"primary_cost_successful_usd": 0.0,
|
|
278
|
+
"primary_tokens_measured_runs": 1,
|
|
279
|
+
"primary_tokens_measured_successful": 1,
|
|
84
280
|
"provider_cached_tokens_all_runs": 900,
|
|
85
281
|
"provider_cached_tokens_measured_runs": 1,
|
|
86
|
-
"total_cost_with_shift_all_runs_usd": 0.0,
|
|
87
|
-
"total_cost_with_shift_measured_runs": 0,
|
|
88
|
-
"total_tokens_successful": 1200,
|
|
89
|
-
"primary_tokens_measured_successful": 1,
|
|
90
|
-
"primary_cost_successful_usd": 0.0,
|
|
91
|
-
"primary_cost_measured_successful": 0,
|
|
92
|
-
"wall_time_seconds_successful": 10.0,
|
|
93
|
-
"wall_time_seconds_measured_successful": 1,
|
|
94
|
-
"provider_cached_tokens_successful": 900,
|
|
95
282
|
"provider_cached_tokens_measured_successful": 1,
|
|
96
|
-
"
|
|
97
|
-
"
|
|
98
|
-
"
|
|
99
|
-
"
|
|
100
|
-
"
|
|
101
|
-
"external_tokens_measured_successful": 0,
|
|
102
|
-
"artifacts_used_successful": 0,
|
|
103
|
-
"corrections_successful": 0,
|
|
104
|
-
"bytes_before_successful": 0,
|
|
105
|
-
"bytes_after_successful": 0,
|
|
106
|
-
"turns_successful": 0,
|
|
107
|
-
"hook_triggers_successful": 0,
|
|
108
|
-
"failure_rate": 0.0,
|
|
109
|
-
"task_count": 1,
|
|
283
|
+
"provider_cached_tokens_per_successful_task": 900.0,
|
|
284
|
+
"provider_cached_tokens_per_task_including_failures": 900.0,
|
|
285
|
+
"provider_cached_tokens_successful": 900,
|
|
286
|
+
"runs": 1,
|
|
287
|
+
"successful_runs": 1,
|
|
110
288
|
"successful_task_count": 1,
|
|
289
|
+
"task_count": 1,
|
|
290
|
+
"token_proxy_saved_per_successful_task": null,
|
|
291
|
+
"token_proxy_saved_successful": null,
|
|
292
|
+
"tokens_per_successful_task": 1200.0,
|
|
111
293
|
"tokens_per_task_including_failures": 1200.0,
|
|
112
|
-
"
|
|
113
|
-
"
|
|
114
|
-
"
|
|
294
|
+
"total_cost_with_shift_all_runs_usd": 0.0,
|
|
295
|
+
"total_cost_with_shift_measured_runs": 0,
|
|
296
|
+
"total_cost_with_shift_measured_successful": 0,
|
|
297
|
+
"total_cost_with_shift_per_successful_task_usd": null,
|
|
115
298
|
"total_cost_with_shift_per_task_including_failures_usd": null,
|
|
116
|
-
"
|
|
299
|
+
"total_cost_with_shift_successful_usd": 0.0,
|
|
300
|
+
"total_tokens_all_runs": 1200,
|
|
301
|
+
"total_tokens_successful": 1200,
|
|
302
|
+
"turns_successful": 0,
|
|
303
|
+
"wall_time_seconds_all_runs": 10.0,
|
|
304
|
+
"wall_time_seconds_measured_runs": 1,
|
|
305
|
+
"wall_time_seconds_measured_successful": 1,
|
|
117
306
|
"wall_time_seconds_per_successful_task": 10.0,
|
|
118
|
-
"
|
|
119
|
-
"
|
|
120
|
-
"total_cost_with_shift_per_successful_task_usd": null,
|
|
121
|
-
"external_tokens_per_successful_task": null,
|
|
122
|
-
"artifacts_used_per_successful_task": 0.0,
|
|
123
|
-
"corrections_per_successful_task": 0.0,
|
|
124
|
-
"byte_reduction_ratio": null,
|
|
125
|
-
"compression_strategy": "cache_layout_check",
|
|
126
|
-
"is_baseline_strategy": false,
|
|
127
|
-
"bytes_saved_successful": null,
|
|
128
|
-
"bytes_saved_per_successful_task": null,
|
|
129
|
-
"byte_savings_pct": null,
|
|
130
|
-
"token_proxy_saved_successful": null,
|
|
131
|
-
"token_proxy_saved_per_successful_task": null,
|
|
132
|
-
"observed_telemetry": {
|
|
133
|
-
"tokens": "observed",
|
|
134
|
-
"primary_cost": "unavailable",
|
|
135
|
-
"external_tokens": "unavailable",
|
|
136
|
-
"byte_savings": "unavailable",
|
|
137
|
-
"token_proxy": "unavailable",
|
|
138
|
-
"wall_time": "observed",
|
|
139
|
-
"provider_cache": "observed"
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
},
|
|
143
|
-
"comparisons": [
|
|
144
|
-
{
|
|
145
|
-
"variant": "cache_layout_check",
|
|
146
|
-
"baseline_variant": "baseline",
|
|
147
|
-
"quality_gate": "pass",
|
|
148
|
-
"baseline_failure_rate": 0.0,
|
|
149
|
-
"variant_failure_rate": 0.0,
|
|
150
|
-
"failure_rate_delta_pp": 0.0,
|
|
151
|
-
"matched_successful_task_count": 1,
|
|
152
|
-
"baseline_successful_task_count": 1,
|
|
153
|
-
"missing_baseline_success_tasks": [],
|
|
154
|
-
"baseline_corrections_per_successful_task": 0.0,
|
|
155
|
-
"variant_corrections_per_successful_task": 0.0,
|
|
156
|
-
"paired_corrections_task_count": 1,
|
|
157
|
-
"corrections_delta_per_successful_task": 0.0,
|
|
158
|
-
"token_delta_per_successful_task": 0.0,
|
|
159
|
-
"token_savings_pct": 0.0,
|
|
160
|
-
"paired_token_task_count": 1,
|
|
161
|
-
"wall_time_delta_seconds_per_successful_task": 0.0,
|
|
162
|
-
"wall_time_change_pct": 0.0,
|
|
163
|
-
"paired_wall_time_task_count": 1,
|
|
164
|
-
"cost_savings_pct_with_shift": null,
|
|
165
|
-
"paired_cost_task_count": 0
|
|
307
|
+
"wall_time_seconds_per_task_including_failures": 10.0,
|
|
308
|
+
"wall_time_seconds_successful": 10.0
|
|
166
309
|
}
|
|
167
|
-
|
|
168
|
-
"claim_status": "compare_variants",
|
|
169
|
-
"caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
|
|
310
|
+
}
|
|
170
311
|
}
|
package/docs/distribution.md
CHANGED
|
@@ -25,11 +25,11 @@ context-guard setup --agent claude --scope user --verify --json
|
|
|
25
25
|
context-guard setup --agent claude --scope user --plan
|
|
26
26
|
```
|
|
27
27
|
|
|
28
|
-
Project scope is the default. `context-guard doctor` and `context-guard setup --verify` are read-only health checks. User scope is opt-in and requires an explicit agent for writes. Supported user-scope writes record backups and rollback metadata under `.context-guard/rollback` in the user home directory.
|
|
28
|
+
Project scope is the default. `context-guard doctor` and `context-guard setup --verify` are read-only health checks. User scope is opt-in and requires an explicit agent for writes. Supported user-scope writes record backups and rollback metadata under `.context-guard/rollback` in the user home directory. Setup resolves packaged/checkout helpers first; `PATH` helper fallback is default-off and requires `--allow-path-helper-fallback` for a trusted install after canonical executable and identity validation.
|
|
29
29
|
|
|
30
30
|
## Runtime requirements
|
|
31
31
|
|
|
32
|
-
The helpers are Python/shell scripts packaged through npm and Homebrew. Supported machines need:
|
|
32
|
+
The helpers are Python/shell scripts packaged through npm and Homebrew as plugin-local `plugins/context-guard/bin` entrypoints plus `plugins/context-guard/lib` helpers; checkout-only `context-guard-kit` sources are not duplicated in the npm tarball. Supported machines need:
|
|
33
33
|
|
|
34
34
|
- macOS or Linux
|
|
35
35
|
- Python 3 available as `python3`
|
|
@@ -47,11 +47,14 @@ The helpers are Python/shell scripts packaged through npm and Homebrew. Supporte
|
|
|
47
47
|
|
|
48
48
|
Before publishing the Homebrew tap, run the formula-specific checks locally or in CI when Homebrew is available:
|
|
49
49
|
|
|
50
|
+
Render or copy `packaging/homebrew/context-guard.rb.template` into a real tap formula first; replace `{{VERSION}}` with the bare semver version (for example `0.4.9`, not `v0.4.9`) and `REPLACE_WITH_RELEASE_TARBALL_SHA256` with the verified SHA-256 digest of the release tarball. Do not run Homebrew audit/install directly against the placeholder template.
|
|
51
|
+
|
|
50
52
|
```bash
|
|
51
|
-
|
|
52
|
-
brew
|
|
53
|
-
brew
|
|
54
|
-
brew
|
|
53
|
+
# Example once Formula/context-guard.rb has been rendered in the tap checkout:
|
|
54
|
+
brew style Formula/context-guard.rb
|
|
55
|
+
brew audit --strict --new ictechgy/tap/context-guard
|
|
56
|
+
brew install --build-from-source ictechgy/tap/context-guard
|
|
57
|
+
brew test ictechgy/tap/context-guard
|
|
55
58
|
```
|
|
56
59
|
|
|
57
|
-
The formula should rewrite Python shebangs to the declared Homebrew Python dependency and expose both `context-guard` and legacy compatibility wrappers from `plugins/context-guard/bin`.
|
|
60
|
+
The rendered formula should rewrite Python shebangs to the declared Homebrew Python dependency and expose both `context-guard` and legacy compatibility wrappers from `plugins/context-guard/bin`.
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
# Experimental benchmark fixtures
|
|
2
2
|
|
|
3
|
-
These fixtures are **fixture-only** starter scaffolds for future visual/OCR, learned-compression,
|
|
3
|
+
These fixtures are **fixture-only** starter scaffolds for future visual/OCR, learned-compression, reversible output-transform, and token-savings roadmap experiments. They are **synthetic**, package-visible examples for `context-guard-bench` task and variant shapes; they are **not shipped benchmark results**, not OCR/compression implementations, not cache/tool-deferral implementations, and not hosted API savings claims.
|
|
4
4
|
|
|
5
5
|
Use them when designing an experiment that starts from ContextGuard's existing benchmark discipline:
|
|
6
6
|
|
|
@@ -12,6 +12,23 @@ Use them when designing an experiment that starts from ContextGuard's existing b
|
|
|
12
12
|
5. Treat byte counts, image dimensions, OCR confidence, and local compressor ratios as proxy evidence. Real token/cost claims require **provider-measured** primary token/cost fields on both sides.
|
|
13
13
|
6. Keep private screenshots, raw secrets, and external service endpoints out of fixture files.
|
|
14
14
|
|
|
15
|
+
## Local replay evidence
|
|
16
|
+
|
|
17
|
+
`context-guard-bench --evidence-jsonl <path>` can replay pre-recorded run evidence into the normal CSV/report pipeline without invoking `claude` or any task `success_command`. Pair it with `--report-json` and `--dashboard-md` to regenerate a deterministic local dashboard:
|
|
18
|
+
|
|
19
|
+
```bash
|
|
20
|
+
context-guard-bench \
|
|
21
|
+
--tasks docs/benchmark-fixtures/token-savings-12task.tasks.example.json \
|
|
22
|
+
--variants docs/benchmark-fixtures/token-savings-12task.variants.example.json \
|
|
23
|
+
--evidence-jsonl docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl \
|
|
24
|
+
--csv /tmp/contextguard-token-savings.csv \
|
|
25
|
+
--report-json /tmp/contextguard-token-savings.report.json \
|
|
26
|
+
--dashboard-md /tmp/contextguard-token-savings.dashboard.md \
|
|
27
|
+
--baseline-variant baseline_full_context_fixture
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
The included token-savings evidence file is deliberately `synthetic_fixture` provenance. It validates replay/dashboard mechanics and byte-proxy reporting only: replay forces synthetic/manual rows to `primary_tokens_measured=false` and `cost_measured=false`, so it is not public hosted API token/cost savings evidence even when token-looking numbers are present. A public claim still requires matched successful tasks, provider-export provenance, provider-measured primary tokens/cost, quality non-inferiority, and shifted-cost accounting.
|
|
31
|
+
|
|
15
32
|
## Runner-native variant prompt files
|
|
16
33
|
|
|
17
34
|
`context-guard-bench` supports optional file-backed `variant_prompt_files` in task fixtures. The map is keyed by variant name and lets a single logical task swap sanitized prompt evidence per variant, for example a baseline raw-output prompt versus a digest plus artifact receipt prompt. Prompt files are resolved relative to the task JSON, must be relative paths, and are read with the same no-follow/symlink-safe posture as task and variant fixtures.
|
|
@@ -20,11 +37,12 @@ This runner-native swap only proves command shape and prompt selection until the
|
|
|
20
37
|
|
|
21
38
|
## Included fixture sets
|
|
22
39
|
|
|
23
|
-
| Fixture set | Task file | Variant file | Intended future experiment |
|
|
24
|
-
| --- | --- | --- | --- |
|
|
25
|
-
| Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
|
|
26
|
-
| Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
|
|
27
|
-
| Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
|
|
40
|
+
| Fixture set | Task file | Variant file | Evidence replay file | Intended future experiment |
|
|
41
|
+
| --- | --- | --- | --- | --- |
|
|
42
|
+
| Visual/OCR evidence | [`benchmark-fixtures/visual-ocr.tasks.example.json`](benchmark-fixtures/visual-ocr.tasks.example.json) | [`benchmark-fixtures/visual-ocr.variants.example.json`](benchmark-fixtures/visual-ocr.variants.example.json) | n/a | Compare full visual evidence against cropped or OCR-derived evidence after the user supplies sanitized textual evidence, missed-context notes, crop/OCR telemetry, and provider telemetry. |
|
|
43
|
+
| Learned compression | [`benchmark-fixtures/learned-compression.tasks.example.json`](benchmark-fixtures/learned-compression.tasks.example.json) | [`benchmark-fixtures/learned-compression.variants.example.json`](benchmark-fixtures/learned-compression.variants.example.json) | n/a | Compare sanitized baseline context packs against a fixture-only compressed digest candidate after exact retrieval or receipt fallback, quality gates, and shifted costs are measured. |
|
|
44
|
+
| Reversible output transform | [`benchmark-fixtures/output-transform.tasks.example.json`](benchmark-fixtures/output-transform.tasks.example.json) | [`benchmark-fixtures/output-transform.variants.example.json`](benchmark-fixtures/output-transform.variants.example.json) | n/a | Compare raw sanitized command output against a digest plus artifact receipt after variant prompt files, success checks, and provider telemetry are supplied. |
|
|
45
|
+
| Token-savings 12-task roadmap | [`benchmark-fixtures/token-savings-12task.tasks.example.json`](benchmark-fixtures/token-savings-12task.tasks.example.json) | [`benchmark-fixtures/token-savings-12task.variants.example.json`](benchmark-fixtures/token-savings-12task.variants.example.json) | [`benchmark-fixtures/token-savings-12task.evidence.example.jsonl`](benchmark-fixtures/token-savings-12task.evidence.example.jsonl) | Exercise a canonical 12-task spread for bugfix, exploration, review, log analysis, migration, docs, refactor, performance, telemetry, cache layout, tool-schema deferral, and artifact receipt experiments after real success commands and provider telemetry are supplied. |
|
|
28
46
|
|
|
29
47
|
## Visual/OCR fixture notes
|
|
30
48
|
|
|
@@ -38,6 +56,12 @@ The learned-compression fixtures describe already-sanitized context-pack or arti
|
|
|
38
56
|
|
|
39
57
|
The output-transform fixtures describe already-sanitized command output comparisons and now demonstrate `variant_prompt_files` for raw sanitized output versus digest plus artifact receipt prompt evidence. They do not execute `context-guard-trim-output`, store artifacts, call `context-guard-artifact`, or invoke a provider. Future experiments should compare raw sanitized output against `--digest` output plus an `--artifact-receipt`, verify the receipt's exact re-expand command retrieves the omitted sanitized lines, and record bytes before/after, primary provider tokens, cost, success, corrections, artifact-store usage, and any external/local processing cost.
|
|
40
58
|
|
|
59
|
+
## Token-savings 12-task roadmap fixture notes
|
|
60
|
+
|
|
61
|
+
The token-savings 12-task fixtures are a canonical **fixture-only** spread for roadmap-level A/B design. They demonstrate `variant_prompt_files` for a baseline full-context prompt versus a ContextGuard advisory-foundations prompt that may later include cache layout lint, core-vs-deferred tool schemas, artifact receipts, and claim-safe telemetry. They do not execute `context-guard-cache-score` or `context-guard-tool-prune`, and they do not make any provider call. The companion `token-savings-12task.evidence.example.jsonl` lets users replay deterministic synthetic rows into CSV/report/dashboard outputs while preserving the same non-claim boundary.
|
|
62
|
+
|
|
63
|
+
For real non-dry-run experiments, replace every placeholder `success_command`, keep task IDs matched across baseline and candidate variants, and require provider-measured primary token/cost data before interpreting `tokens_per_successful_task`, `total_cost_with_shift_usd`, or `external_cost_usd`. Cache predictions, char/4 token proxies, local latency, and byte reductions remain diagnostic proxy evidence unless the generated report contains matched successful task evidence and stays within the 10-percentage-point failure-rate guardrail.
|
|
64
|
+
|
|
41
65
|
## Safe wording
|
|
42
66
|
|
|
43
67
|
Use language like:
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ictechgy/context-guard",
|
|
3
|
-
"version": "0.4.
|
|
3
|
+
"version": "0.4.11",
|
|
4
4
|
"description": "ContextGuard CLI helpers for keeping AI coding agent context focused and local-first.",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"homepage": "https://github.com/ictechgy/context-guard#readme",
|
|
@@ -32,7 +32,8 @@
|
|
|
32
32
|
"context-guard-failed-nudge": "plugins/context-guard/bin/context-guard-failed-nudge",
|
|
33
33
|
"context-guard-statusline": "plugins/context-guard/bin/context-guard-statusline",
|
|
34
34
|
"context-guard-statusline-merged": "plugins/context-guard/bin/context-guard-statusline-merged",
|
|
35
|
-
"context-guard-cost": "plugins/context-guard/bin/context-guard-cost"
|
|
35
|
+
"context-guard-cost": "plugins/context-guard/bin/context-guard-cost",
|
|
36
|
+
"context-guard-cache-score": "plugins/context-guard/bin/context-guard-cache-score"
|
|
36
37
|
},
|
|
37
38
|
"files": [
|
|
38
39
|
"CHANGELOG.md",
|
|
@@ -40,10 +41,6 @@
|
|
|
40
41
|
"NOTICE",
|
|
41
42
|
"README.md",
|
|
42
43
|
"README.ko.md",
|
|
43
|
-
"context-guard-kit/*.py",
|
|
44
|
-
"context-guard-kit/*.sh",
|
|
45
|
-
"context-guard-kit/README.md",
|
|
46
|
-
"context-guard-kit/settings.example.json",
|
|
47
44
|
"plugins/context-guard/.claude-plugin/plugin.json",
|
|
48
45
|
"plugins/context-guard/README.md",
|
|
49
46
|
"plugins/context-guard/README.ko.md",
|
|
@@ -62,6 +59,7 @@
|
|
|
62
59
|
"docs/benchmark-workflows/*.example.jsonl",
|
|
63
60
|
"docs/benchmark-workflow-examples.md",
|
|
64
61
|
"docs/benchmark-fixtures/*.example.json",
|
|
62
|
+
"docs/benchmark-fixtures/*.example.jsonl",
|
|
65
63
|
"docs/benchmark-fixtures/*.prompt.example.md",
|
|
66
64
|
"docs/experimental-benchmark-fixtures.md",
|
|
67
65
|
"packaging/homebrew/context-guard.rb.template"
|
|
@@ -5,7 +5,7 @@ class ContextGuard < Formula
|
|
|
5
5
|
|
|
6
6
|
desc "Local-first context guardrails for AI coding agents"
|
|
7
7
|
homepage "https://github.com/ictechgy/context-guard"
|
|
8
|
-
url "https://github.com/ictechgy/context-guard/archive/refs/tags/
|
|
8
|
+
url "https://github.com/ictechgy/context-guard/archive/refs/tags/v{{VERSION}}.tar.gz"
|
|
9
9
|
sha256 "REPLACE_WITH_RELEASE_TARBALL_SHA256"
|
|
10
10
|
license "Apache-2.0"
|
|
11
11
|
|