@ictechgy/context-guard 0.4.10 → 0.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/CHANGELOG.md +17 -1
  2. package/README.ko.md +46 -28
  3. package/README.md +42 -33
  4. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  5. package/docs/benchmark-workflow-examples.md +3 -0
  6. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  7. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  8. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  9. package/docs/experimental-benchmark-fixtures.md +24 -7
  10. package/package.json +2 -1
  11. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  12. package/plugins/context-guard/README.ko.md +14 -11
  13. package/plugins/context-guard/README.md +15 -14
  14. package/plugins/context-guard/bin/context-guard +48 -17
  15. package/plugins/context-guard/bin/context-guard-artifact +342 -33
  16. package/plugins/context-guard/bin/context-guard-audit +36 -5
  17. package/plugins/context-guard/bin/context-guard-bench +1675 -44
  18. package/plugins/context-guard/bin/context-guard-cache-score +347 -35
  19. package/plugins/context-guard/bin/context-guard-compress +89 -27
  20. package/plugins/context-guard/bin/context-guard-cost +7 -2
  21. package/plugins/context-guard/bin/context-guard-experiments +364 -8
  22. package/plugins/context-guard/bin/context-guard-failed-nudge +6 -2
  23. package/plugins/context-guard/bin/context-guard-filter +88 -18
  24. package/plugins/context-guard/bin/context-guard-pack +329 -19
  25. package/plugins/context-guard/bin/context-guard-read-symbol +27 -0
  26. package/plugins/context-guard/bin/context-guard-sanitize-output +245 -18
  27. package/plugins/context-guard/bin/context-guard-setup +21 -5
  28. package/plugins/context-guard/bin/context-guard-tool-prune +287 -62
  29. package/plugins/context-guard/bin/context-guard-trim-output +394 -90
  30. package/plugins/context-guard/brief/README.md +5 -5
  31. package/plugins/context-guard/lib/context_guard_command_manifest_loader.py +123 -0
  32. package/plugins/context-guard/lib/context_guard_commands.py +217 -190
@@ -1,170 +1,311 @@
1
1
  {
2
- "schema": "context-guard-bench-report-v1",
3
2
  "baseline_variant": "baseline",
3
+ "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
4
+ "claim_status": "token_savings_observed_cost_unmeasured",
5
+ "comparisons": [
6
+ {
7
+ "baseline_corrections_per_successful_task": 0.0,
8
+ "baseline_failure_rate": 0.0,
9
+ "baseline_successful_task_count": 1,
10
+ "baseline_variant": "baseline",
11
+ "corrections_delta_per_successful_task": 0.0,
12
+ "cost_savings_pct_with_shift": null,
13
+ "failure_rate_delta_pp": 0.0,
14
+ "matched_successful_task_count": 1,
15
+ "missing_baseline_success_tasks": [],
16
+ "paired_corrections_task_count": 1,
17
+ "paired_cost_task_count": 0,
18
+ "paired_token_task_count": 1,
19
+ "paired_wall_time_task_count": 1,
20
+ "quality_gate": "pass",
21
+ "token_delta_per_successful_task": -240.0,
22
+ "token_savings_pct": 24.0,
23
+ "variant": "brief_mode_standard",
24
+ "variant_corrections_per_successful_task": 0.0,
25
+ "variant_failure_rate": 0.0,
26
+ "wall_time_change_pct": -4.0000000000000036,
27
+ "wall_time_delta_seconds_per_successful_task": -0.40000000000000036
28
+ }
29
+ ],
30
+ "public_claim_readiness": {
31
+ "blocking_gate_ids": [
32
+ "matched_successful_tasks",
33
+ "provider_measured_token_cost",
34
+ "shifted_cost_accounting",
35
+ "confidence_failure_notes",
36
+ "provider_export_provenance"
37
+ ],
38
+ "claim_allowed": false,
39
+ "claim_boundary": {
40
+ "claim_allowed_field": "public_claim_readiness.claim_allowed",
41
+ "fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
42
+ "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
43
+ "hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
44
+ "id": "public_claim_readiness_authoritative_release_gate",
45
+ "reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
46
+ "reporting_only": true,
47
+ "requires_confidence_and_failure_notes": true,
48
+ "requires_matched_successful_tasks": true,
49
+ "requires_provider_export_provenance": true,
50
+ "requires_provider_measured_tokens_and_cost": true,
51
+ "requires_quality_non_inferiority": true,
52
+ "requires_shifted_cost_accounting": true,
53
+ "unsupported_claims_forbidden": true
54
+ },
55
+ "gates": [
56
+ {
57
+ "evidence": {
58
+ "comparison_count": 1,
59
+ "matched_pair_count": 0,
60
+ "min_matched_successful_task_count": 1.0,
61
+ "missing_baseline_success_task_count": 0,
62
+ "variants": [
63
+ "brief_mode_standard"
64
+ ]
65
+ },
66
+ "id": "matched_successful_tasks",
67
+ "label": "Matched successful tasks",
68
+ "passed": false,
69
+ "reason": "missing_or_regressed_matched_successful_tasks",
70
+ "required": true,
71
+ "status": "fail"
72
+ },
73
+ {
74
+ "evidence": {
75
+ "matched_pair_count": 0,
76
+ "required_fields": [
77
+ "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
78
+ "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
79
+ "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
80
+ "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
81
+ ]
82
+ },
83
+ "id": "provider_measured_token_cost",
84
+ "label": "Provider-measured token and primary cost",
85
+ "passed": false,
86
+ "reason": "missing_provider_measured_primary_tokens_or_cost",
87
+ "required": true,
88
+ "status": "fail"
89
+ },
90
+ {
91
+ "evidence": {
92
+ "max_corrections_delta_per_successful_task": 0.0,
93
+ "max_failure_rate_delta_pp": 0.0,
94
+ "quality_gates": [
95
+ "pass"
96
+ ]
97
+ },
98
+ "id": "quality_non_inferiority",
99
+ "label": "Quality non-inferiority",
100
+ "passed": true,
101
+ "reason": "all_quality_gates_pass",
102
+ "required": true,
103
+ "status": "pass"
104
+ },
105
+ {
106
+ "evidence": {
107
+ "matched_pair_count": 0,
108
+ "required_fields": [
109
+ "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
110
+ "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
111
+ "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
112
+ ]
113
+ },
114
+ "id": "shifted_cost_accounting",
115
+ "label": "Shifted-cost accounting",
116
+ "passed": false,
117
+ "reason": "missing_shifted_cost_claim_accounting",
118
+ "required": true,
119
+ "status": "fail"
120
+ },
121
+ {
122
+ "evidence": {
123
+ "comparison_failure_fields_present": true,
124
+ "explicit_note_count": 0,
125
+ "failed_row_count": 0,
126
+ "failed_rows_with_notes": 0,
127
+ "replay_row_count": 0
128
+ },
129
+ "id": "confidence_failure_notes",
130
+ "label": "Confidence and failure notes",
131
+ "passed": false,
132
+ "reason": "missing_explicit_replay_notes_or_failure_evidence",
133
+ "required": true,
134
+ "status": "unknown"
135
+ },
136
+ {
137
+ "evidence": {
138
+ "mixed_csv": false,
139
+ "provider_names": [],
140
+ "replay_row_count": 0,
141
+ "report_row_count": 2,
142
+ "same_run_complete": false,
143
+ "source_types": []
144
+ },
145
+ "id": "provider_export_provenance",
146
+ "label": "Provider-export provenance",
147
+ "passed": false,
148
+ "reason": "missing_or_mixed_provider_export_provenance",
149
+ "required": true,
150
+ "status": "unknown"
151
+ }
152
+ ],
153
+ "generated_from": "matched_pair_evidence_and_replay_provenance",
154
+ "passed_required_gate_count": 1,
155
+ "public_claim_eligible_observed": null,
156
+ "public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
157
+ "raw_metric_claim_status_observed": "token_savings_observed_cost_unmeasured",
158
+ "reason": "replay_evidence_required_for_public_claim",
159
+ "required_gate_count": 6,
160
+ "required_gate_ids": [
161
+ "matched_successful_tasks",
162
+ "provider_measured_token_cost",
163
+ "quality_non_inferiority",
164
+ "shifted_cost_accounting",
165
+ "confidence_failure_notes",
166
+ "provider_export_provenance"
167
+ ],
168
+ "schema_version": "contextguard.bench.public-claim-readiness.v1",
169
+ "status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
170
+ },
4
171
  "row_count": 2,
172
+ "schema": "context-guard-bench-report-v1",
5
173
  "summary_by_variant": {
6
174
  "baseline": {
7
- "runs": 1,
8
- "successful_runs": 1,
175
+ "artifacts_used_per_successful_task": 0.0,
176
+ "artifacts_used_successful": 0,
177
+ "byte_reduction_ratio": 1.0,
178
+ "byte_savings_pct": 0.0,
179
+ "bytes_after_successful": 12000,
180
+ "bytes_before_successful": 12000,
181
+ "bytes_saved_per_successful_task": 0.0,
182
+ "bytes_saved_successful": 0,
183
+ "compression_strategy": "baseline",
184
+ "corrections_per_successful_task": 0.0,
185
+ "corrections_successful": 0,
186
+ "external_cost_successful_usd": 0.0,
187
+ "external_cost_unknown_successful": 1,
188
+ "external_tokens_measured_successful": 0,
189
+ "external_tokens_per_successful_task": null,
190
+ "external_tokens_successful": 0,
9
191
  "failed_runs": 0,
10
- "total_tokens_all_runs": 1000,
11
- "primary_tokens_measured_runs": 1,
192
+ "failure_rate": 0.0,
193
+ "hook_triggers_successful": 0,
194
+ "is_baseline_strategy": true,
195
+ "observed_telemetry": {
196
+ "byte_savings": "observed",
197
+ "external_tokens": "unavailable",
198
+ "primary_cost": "unavailable",
199
+ "provider_cache": "unavailable",
200
+ "token_proxy": "inferred",
201
+ "tokens": "observed",
202
+ "wall_time": "observed"
203
+ },
12
204
  "primary_cost_all_runs_usd": 0.0,
13
205
  "primary_cost_measured_runs": 0,
14
- "wall_time_seconds_all_runs": 10.0,
15
- "wall_time_seconds_measured_runs": 1,
206
+ "primary_cost_measured_successful": 0,
207
+ "primary_cost_per_successful_task_usd": null,
208
+ "primary_cost_per_task_including_failures_usd": null,
209
+ "primary_cost_successful_usd": 0.0,
210
+ "primary_tokens_measured_runs": 1,
211
+ "primary_tokens_measured_successful": 1,
16
212
  "provider_cached_tokens_all_runs": 0,
17
213
  "provider_cached_tokens_measured_runs": 0,
214
+ "provider_cached_tokens_measured_successful": 0,
215
+ "provider_cached_tokens_per_successful_task": 0.0,
216
+ "provider_cached_tokens_per_task_including_failures": 0.0,
217
+ "provider_cached_tokens_successful": 0,
218
+ "runs": 1,
219
+ "successful_runs": 1,
220
+ "successful_task_count": 1,
221
+ "task_count": 1,
222
+ "token_proxy_saved_per_successful_task": 0.0,
223
+ "token_proxy_saved_successful": 0,
224
+ "tokens_per_successful_task": 1000.0,
225
+ "tokens_per_task_including_failures": 1000.0,
18
226
  "total_cost_with_shift_all_runs_usd": 0.0,
19
227
  "total_cost_with_shift_measured_runs": 0,
228
+ "total_cost_with_shift_measured_successful": 0,
229
+ "total_cost_with_shift_per_successful_task_usd": null,
230
+ "total_cost_with_shift_per_task_including_failures_usd": null,
231
+ "total_cost_with_shift_successful_usd": 0.0,
232
+ "total_tokens_all_runs": 1000,
20
233
  "total_tokens_successful": 1000,
21
- "primary_tokens_measured_successful": 1,
22
- "primary_cost_successful_usd": 0.0,
23
- "primary_cost_measured_successful": 0,
24
- "wall_time_seconds_successful": 10.0,
234
+ "turns_successful": 0,
235
+ "wall_time_seconds_all_runs": 10.0,
236
+ "wall_time_seconds_measured_runs": 1,
25
237
  "wall_time_seconds_measured_successful": 1,
26
- "provider_cached_tokens_successful": 0,
27
- "provider_cached_tokens_measured_successful": 0,
238
+ "wall_time_seconds_per_successful_task": 10.0,
239
+ "wall_time_seconds_per_task_including_failures": 10.0,
240
+ "wall_time_seconds_successful": 10.0
241
+ },
242
+ "brief_mode_standard": {
243
+ "artifacts_used_per_successful_task": 0.0,
244
+ "artifacts_used_successful": 0,
245
+ "byte_reduction_ratio": 0.75,
246
+ "byte_savings_pct": 25.0,
247
+ "bytes_after_successful": 9000,
248
+ "bytes_before_successful": 12000,
249
+ "bytes_saved_per_successful_task": 3000.0,
250
+ "bytes_saved_successful": 3000,
251
+ "compression_strategy": "brief_mode_standard",
252
+ "corrections_per_successful_task": 0.0,
253
+ "corrections_successful": 0,
28
254
  "external_cost_successful_usd": 0.0,
29
255
  "external_cost_unknown_successful": 1,
30
- "total_cost_with_shift_successful_usd": 0.0,
31
- "total_cost_with_shift_measured_successful": 0,
32
- "external_tokens_successful": 0,
33
256
  "external_tokens_measured_successful": 0,
34
- "artifacts_used_successful": 0,
35
- "corrections_successful": 0,
36
- "bytes_before_successful": 12000,
37
- "bytes_after_successful": 12000,
38
- "turns_successful": 0,
39
- "hook_triggers_successful": 0,
40
- "failure_rate": 0.0,
41
- "task_count": 1,
42
- "successful_task_count": 1,
43
- "tokens_per_task_including_failures": 1000.0,
44
- "wall_time_seconds_per_task_including_failures": 10.0,
45
- "provider_cached_tokens_per_task_including_failures": 0.0,
46
- "primary_cost_per_task_including_failures_usd": null,
47
- "total_cost_with_shift_per_task_including_failures_usd": null,
48
- "tokens_per_successful_task": 1000.0,
49
- "wall_time_seconds_per_successful_task": 10.0,
50
- "provider_cached_tokens_per_successful_task": 0.0,
51
- "primary_cost_per_successful_task_usd": null,
52
- "total_cost_with_shift_per_successful_task_usd": null,
53
257
  "external_tokens_per_successful_task": null,
54
- "artifacts_used_per_successful_task": 0.0,
55
- "corrections_per_successful_task": 0.0,
56
- "byte_reduction_ratio": 1.0,
57
- "compression_strategy": "baseline",
58
- "is_baseline_strategy": true,
59
- "bytes_saved_successful": 0,
60
- "bytes_saved_per_successful_task": 0.0,
61
- "byte_savings_pct": 0.0,
62
- "token_proxy_saved_successful": 0,
63
- "token_proxy_saved_per_successful_task": 0.0,
258
+ "external_tokens_successful": 0,
259
+ "failed_runs": 0,
260
+ "failure_rate": 0.0,
261
+ "hook_triggers_successful": 0,
262
+ "is_baseline_strategy": false,
64
263
  "observed_telemetry": {
65
- "tokens": "observed",
66
- "primary_cost": "unavailable",
67
- "external_tokens": "unavailable",
68
264
  "byte_savings": "observed",
265
+ "external_tokens": "unavailable",
266
+ "primary_cost": "unavailable",
267
+ "provider_cache": "unavailable",
69
268
  "token_proxy": "inferred",
70
- "wall_time": "observed",
71
- "provider_cache": "unavailable"
72
- }
73
- },
74
- "brief_mode_standard": {
75
- "runs": 1,
76
- "successful_runs": 1,
77
- "failed_runs": 0,
78
- "total_tokens_all_runs": 760,
79
- "primary_tokens_measured_runs": 1,
269
+ "tokens": "observed",
270
+ "wall_time": "observed"
271
+ },
80
272
  "primary_cost_all_runs_usd": 0.0,
81
273
  "primary_cost_measured_runs": 0,
82
- "wall_time_seconds_all_runs": 9.6,
83
- "wall_time_seconds_measured_runs": 1,
274
+ "primary_cost_measured_successful": 0,
275
+ "primary_cost_per_successful_task_usd": null,
276
+ "primary_cost_per_task_including_failures_usd": null,
277
+ "primary_cost_successful_usd": 0.0,
278
+ "primary_tokens_measured_runs": 1,
279
+ "primary_tokens_measured_successful": 1,
84
280
  "provider_cached_tokens_all_runs": 0,
85
281
  "provider_cached_tokens_measured_runs": 0,
86
- "total_cost_with_shift_all_runs_usd": 0.0,
87
- "total_cost_with_shift_measured_runs": 0,
88
- "total_tokens_successful": 760,
89
- "primary_tokens_measured_successful": 1,
90
- "primary_cost_successful_usd": 0.0,
91
- "primary_cost_measured_successful": 0,
92
- "wall_time_seconds_successful": 9.6,
93
- "wall_time_seconds_measured_successful": 1,
94
- "provider_cached_tokens_successful": 0,
95
282
  "provider_cached_tokens_measured_successful": 0,
96
- "external_cost_successful_usd": 0.0,
97
- "external_cost_unknown_successful": 1,
98
- "total_cost_with_shift_successful_usd": 0.0,
99
- "total_cost_with_shift_measured_successful": 0,
100
- "external_tokens_successful": 0,
101
- "external_tokens_measured_successful": 0,
102
- "artifacts_used_successful": 0,
103
- "corrections_successful": 0,
104
- "bytes_before_successful": 12000,
105
- "bytes_after_successful": 9000,
106
- "turns_successful": 0,
107
- "hook_triggers_successful": 0,
108
- "failure_rate": 0.0,
109
- "task_count": 1,
283
+ "provider_cached_tokens_per_successful_task": 0.0,
284
+ "provider_cached_tokens_per_task_including_failures": 0.0,
285
+ "provider_cached_tokens_successful": 0,
286
+ "runs": 1,
287
+ "successful_runs": 1,
110
288
  "successful_task_count": 1,
289
+ "task_count": 1,
290
+ "token_proxy_saved_per_successful_task": 750.0,
291
+ "token_proxy_saved_successful": 750,
292
+ "tokens_per_successful_task": 760.0,
111
293
  "tokens_per_task_including_failures": 760.0,
112
- "wall_time_seconds_per_task_including_failures": 9.6,
113
- "provider_cached_tokens_per_task_including_failures": 0.0,
114
- "primary_cost_per_task_including_failures_usd": null,
294
+ "total_cost_with_shift_all_runs_usd": 0.0,
295
+ "total_cost_with_shift_measured_runs": 0,
296
+ "total_cost_with_shift_measured_successful": 0,
297
+ "total_cost_with_shift_per_successful_task_usd": null,
115
298
  "total_cost_with_shift_per_task_including_failures_usd": null,
116
- "tokens_per_successful_task": 760.0,
299
+ "total_cost_with_shift_successful_usd": 0.0,
300
+ "total_tokens_all_runs": 760,
301
+ "total_tokens_successful": 760,
302
+ "turns_successful": 0,
303
+ "wall_time_seconds_all_runs": 9.6,
304
+ "wall_time_seconds_measured_runs": 1,
305
+ "wall_time_seconds_measured_successful": 1,
117
306
  "wall_time_seconds_per_successful_task": 9.6,
118
- "provider_cached_tokens_per_successful_task": 0.0,
119
- "primary_cost_per_successful_task_usd": null,
120
- "total_cost_with_shift_per_successful_task_usd": null,
121
- "external_tokens_per_successful_task": null,
122
- "artifacts_used_per_successful_task": 0.0,
123
- "corrections_per_successful_task": 0.0,
124
- "byte_reduction_ratio": 0.75,
125
- "compression_strategy": "brief_mode_standard",
126
- "is_baseline_strategy": false,
127
- "bytes_saved_successful": 3000,
128
- "bytes_saved_per_successful_task": 3000.0,
129
- "byte_savings_pct": 25.0,
130
- "token_proxy_saved_successful": 750,
131
- "token_proxy_saved_per_successful_task": 750.0,
132
- "observed_telemetry": {
133
- "tokens": "observed",
134
- "primary_cost": "unavailable",
135
- "external_tokens": "unavailable",
136
- "byte_savings": "observed",
137
- "token_proxy": "inferred",
138
- "wall_time": "observed",
139
- "provider_cache": "unavailable"
140
- }
141
- }
142
- },
143
- "comparisons": [
144
- {
145
- "variant": "brief_mode_standard",
146
- "baseline_variant": "baseline",
147
- "quality_gate": "pass",
148
- "baseline_failure_rate": 0.0,
149
- "variant_failure_rate": 0.0,
150
- "failure_rate_delta_pp": 0.0,
151
- "matched_successful_task_count": 1,
152
- "baseline_successful_task_count": 1,
153
- "missing_baseline_success_tasks": [],
154
- "baseline_corrections_per_successful_task": 0.0,
155
- "variant_corrections_per_successful_task": 0.0,
156
- "paired_corrections_task_count": 1,
157
- "corrections_delta_per_successful_task": 0.0,
158
- "token_delta_per_successful_task": -240.0,
159
- "token_savings_pct": 24.0,
160
- "paired_token_task_count": 1,
161
- "wall_time_delta_seconds_per_successful_task": -0.40000000000000036,
162
- "wall_time_change_pct": -4.0000000000000036,
163
- "paired_wall_time_task_count": 1,
164
- "cost_savings_pct_with_shift": null,
165
- "paired_cost_task_count": 0
307
+ "wall_time_seconds_per_task_including_failures": 9.6,
308
+ "wall_time_seconds_successful": 9.6
166
309
  }
167
- ],
168
- "claim_status": "token_savings_observed_cost_unmeasured",
169
- "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
310
+ }
170
311
  }