@ictechgy/context-guard 0.4.10 → 0.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/CHANGELOG.md +17 -1
  2. package/README.ko.md +46 -28
  3. package/README.md +42 -33
  4. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  5. package/docs/benchmark-workflow-examples.md +3 -0
  6. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  7. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  8. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  9. package/docs/experimental-benchmark-fixtures.md +24 -7
  10. package/package.json +2 -1
  11. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  12. package/plugins/context-guard/README.ko.md +14 -11
  13. package/plugins/context-guard/README.md +15 -14
  14. package/plugins/context-guard/bin/context-guard +48 -17
  15. package/plugins/context-guard/bin/context-guard-artifact +342 -33
  16. package/plugins/context-guard/bin/context-guard-audit +36 -5
  17. package/plugins/context-guard/bin/context-guard-bench +1675 -44
  18. package/plugins/context-guard/bin/context-guard-cache-score +347 -35
  19. package/plugins/context-guard/bin/context-guard-compress +89 -27
  20. package/plugins/context-guard/bin/context-guard-cost +7 -2
  21. package/plugins/context-guard/bin/context-guard-experiments +364 -8
  22. package/plugins/context-guard/bin/context-guard-failed-nudge +6 -2
  23. package/plugins/context-guard/bin/context-guard-filter +88 -18
  24. package/plugins/context-guard/bin/context-guard-pack +329 -19
  25. package/plugins/context-guard/bin/context-guard-read-symbol +27 -0
  26. package/plugins/context-guard/bin/context-guard-sanitize-output +245 -18
  27. package/plugins/context-guard/bin/context-guard-setup +21 -5
  28. package/plugins/context-guard/bin/context-guard-tool-prune +287 -62
  29. package/plugins/context-guard/bin/context-guard-trim-output +394 -90
  30. package/plugins/context-guard/brief/README.md +5 -5
  31. package/plugins/context-guard/lib/context_guard_command_manifest_loader.py +123 -0
  32. package/plugins/context-guard/lib/context_guard_commands.py +217 -190
@@ -1,169 +1,310 @@
1
1
  {
2
- "schema": "context-guard-bench-report-v1",
3
2
  "baseline_variant": "baseline",
3
+ "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
4
+ "claim_status": "insufficient_paired_data",
5
+ "comparisons": [
6
+ {
7
+ "baseline_corrections_per_successful_task": 0.0,
8
+ "baseline_failure_rate": 0.0,
9
+ "baseline_successful_task_count": 1,
10
+ "baseline_variant": "baseline",
11
+ "corrections_delta_per_successful_task": 0.0,
12
+ "cost_savings_pct_with_shift": null,
13
+ "failure_rate_delta_pp": 0.0,
14
+ "matched_successful_task_count": 1,
15
+ "missing_baseline_success_tasks": [],
16
+ "paired_corrections_task_count": 1,
17
+ "paired_cost_task_count": 0,
18
+ "paired_token_task_count": 0,
19
+ "paired_wall_time_task_count": 1,
20
+ "quality_gate": "pass",
21
+ "token_savings_pct": null,
22
+ "variant": "context_pack_auto",
23
+ "variant_corrections_per_successful_task": 0.0,
24
+ "variant_failure_rate": 0.0,
25
+ "wall_time_change_pct": -8.333333333333332,
26
+ "wall_time_delta_seconds_per_successful_task": -1.0
27
+ }
28
+ ],
29
+ "public_claim_readiness": {
30
+ "blocking_gate_ids": [
31
+ "matched_successful_tasks",
32
+ "provider_measured_token_cost",
33
+ "shifted_cost_accounting",
34
+ "confidence_failure_notes",
35
+ "provider_export_provenance"
36
+ ],
37
+ "claim_allowed": false,
38
+ "claim_boundary": {
39
+ "claim_allowed_field": "public_claim_readiness.claim_allowed",
40
+ "fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
41
+ "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
42
+ "hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
43
+ "id": "public_claim_readiness_authoritative_release_gate",
44
+ "reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
45
+ "reporting_only": true,
46
+ "requires_confidence_and_failure_notes": true,
47
+ "requires_matched_successful_tasks": true,
48
+ "requires_provider_export_provenance": true,
49
+ "requires_provider_measured_tokens_and_cost": true,
50
+ "requires_quality_non_inferiority": true,
51
+ "requires_shifted_cost_accounting": true,
52
+ "unsupported_claims_forbidden": true
53
+ },
54
+ "gates": [
55
+ {
56
+ "evidence": {
57
+ "comparison_count": 1,
58
+ "matched_pair_count": 0,
59
+ "min_matched_successful_task_count": 1.0,
60
+ "missing_baseline_success_task_count": 0,
61
+ "variants": [
62
+ "context_pack_auto"
63
+ ]
64
+ },
65
+ "id": "matched_successful_tasks",
66
+ "label": "Matched successful tasks",
67
+ "passed": false,
68
+ "reason": "missing_or_regressed_matched_successful_tasks",
69
+ "required": true,
70
+ "status": "fail"
71
+ },
72
+ {
73
+ "evidence": {
74
+ "matched_pair_count": 0,
75
+ "required_fields": [
76
+ "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
77
+ "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
78
+ "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
79
+ "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
80
+ ]
81
+ },
82
+ "id": "provider_measured_token_cost",
83
+ "label": "Provider-measured token and primary cost",
84
+ "passed": false,
85
+ "reason": "missing_provider_measured_primary_tokens_or_cost",
86
+ "required": true,
87
+ "status": "fail"
88
+ },
89
+ {
90
+ "evidence": {
91
+ "max_corrections_delta_per_successful_task": 0.0,
92
+ "max_failure_rate_delta_pp": 0.0,
93
+ "quality_gates": [
94
+ "pass"
95
+ ]
96
+ },
97
+ "id": "quality_non_inferiority",
98
+ "label": "Quality non-inferiority",
99
+ "passed": true,
100
+ "reason": "all_quality_gates_pass",
101
+ "required": true,
102
+ "status": "pass"
103
+ },
104
+ {
105
+ "evidence": {
106
+ "matched_pair_count": 0,
107
+ "required_fields": [
108
+ "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
109
+ "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
110
+ "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
111
+ ]
112
+ },
113
+ "id": "shifted_cost_accounting",
114
+ "label": "Shifted-cost accounting",
115
+ "passed": false,
116
+ "reason": "missing_shifted_cost_claim_accounting",
117
+ "required": true,
118
+ "status": "fail"
119
+ },
120
+ {
121
+ "evidence": {
122
+ "comparison_failure_fields_present": true,
123
+ "explicit_note_count": 0,
124
+ "failed_row_count": 0,
125
+ "failed_rows_with_notes": 0,
126
+ "replay_row_count": 0
127
+ },
128
+ "id": "confidence_failure_notes",
129
+ "label": "Confidence and failure notes",
130
+ "passed": false,
131
+ "reason": "missing_explicit_replay_notes_or_failure_evidence",
132
+ "required": true,
133
+ "status": "unknown"
134
+ },
135
+ {
136
+ "evidence": {
137
+ "mixed_csv": false,
138
+ "provider_names": [],
139
+ "replay_row_count": 0,
140
+ "report_row_count": 2,
141
+ "same_run_complete": false,
142
+ "source_types": []
143
+ },
144
+ "id": "provider_export_provenance",
145
+ "label": "Provider-export provenance",
146
+ "passed": false,
147
+ "reason": "missing_or_mixed_provider_export_provenance",
148
+ "required": true,
149
+ "status": "unknown"
150
+ }
151
+ ],
152
+ "generated_from": "matched_pair_evidence_and_replay_provenance",
153
+ "passed_required_gate_count": 1,
154
+ "public_claim_eligible_observed": null,
155
+ "public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
156
+ "raw_metric_claim_status_observed": "insufficient_paired_data",
157
+ "reason": "replay_evidence_required_for_public_claim",
158
+ "required_gate_count": 6,
159
+ "required_gate_ids": [
160
+ "matched_successful_tasks",
161
+ "provider_measured_token_cost",
162
+ "quality_non_inferiority",
163
+ "shifted_cost_accounting",
164
+ "confidence_failure_notes",
165
+ "provider_export_provenance"
166
+ ],
167
+ "schema_version": "contextguard.bench.public-claim-readiness.v1",
168
+ "status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
169
+ },
4
170
  "row_count": 2,
171
+ "schema": "context-guard-bench-report-v1",
5
172
  "summary_by_variant": {
6
173
  "baseline": {
7
- "runs": 1,
8
- "successful_runs": 1,
174
+ "artifacts_used_per_successful_task": 0.0,
175
+ "artifacts_used_successful": 0,
176
+ "byte_reduction_ratio": 1.0,
177
+ "byte_savings_pct": 0.0,
178
+ "bytes_after_successful": 24000,
179
+ "bytes_before_successful": 24000,
180
+ "bytes_saved_per_successful_task": 0.0,
181
+ "bytes_saved_successful": 0,
182
+ "compression_strategy": "baseline",
183
+ "corrections_per_successful_task": 0.0,
184
+ "corrections_successful": 0,
185
+ "external_cost_successful_usd": 0.0,
186
+ "external_cost_unknown_successful": 1,
187
+ "external_tokens_measured_successful": 0,
188
+ "external_tokens_per_successful_task": null,
189
+ "external_tokens_successful": 0,
9
190
  "failed_runs": 0,
10
- "total_tokens_all_runs": 0,
11
- "primary_tokens_measured_runs": 0,
191
+ "failure_rate": 0.0,
192
+ "hook_triggers_successful": 0,
193
+ "is_baseline_strategy": true,
194
+ "observed_telemetry": {
195
+ "byte_savings": "observed",
196
+ "external_tokens": "unavailable",
197
+ "primary_cost": "unavailable",
198
+ "provider_cache": "unavailable",
199
+ "token_proxy": "inferred",
200
+ "tokens": "unavailable",
201
+ "wall_time": "observed"
202
+ },
12
203
  "primary_cost_all_runs_usd": 0.0,
13
204
  "primary_cost_measured_runs": 0,
14
- "wall_time_seconds_all_runs": 12.0,
15
- "wall_time_seconds_measured_runs": 1,
205
+ "primary_cost_measured_successful": 0,
206
+ "primary_cost_per_successful_task_usd": null,
207
+ "primary_cost_per_task_including_failures_usd": null,
208
+ "primary_cost_successful_usd": 0.0,
209
+ "primary_tokens_measured_runs": 0,
210
+ "primary_tokens_measured_successful": 0,
16
211
  "provider_cached_tokens_all_runs": 0,
17
212
  "provider_cached_tokens_measured_runs": 0,
213
+ "provider_cached_tokens_measured_successful": 0,
214
+ "provider_cached_tokens_per_successful_task": 0.0,
215
+ "provider_cached_tokens_per_task_including_failures": 0.0,
216
+ "provider_cached_tokens_successful": 0,
217
+ "runs": 1,
218
+ "successful_runs": 1,
219
+ "successful_task_count": 1,
220
+ "task_count": 1,
221
+ "token_proxy_saved_per_successful_task": 0.0,
222
+ "token_proxy_saved_successful": 0,
223
+ "tokens_per_successful_task": null,
224
+ "tokens_per_task_including_failures": null,
18
225
  "total_cost_with_shift_all_runs_usd": 0.0,
19
226
  "total_cost_with_shift_measured_runs": 0,
227
+ "total_cost_with_shift_measured_successful": 0,
228
+ "total_cost_with_shift_per_successful_task_usd": null,
229
+ "total_cost_with_shift_per_task_including_failures_usd": null,
230
+ "total_cost_with_shift_successful_usd": 0.0,
231
+ "total_tokens_all_runs": 0,
20
232
  "total_tokens_successful": 0,
21
- "primary_tokens_measured_successful": 0,
22
- "primary_cost_successful_usd": 0.0,
23
- "primary_cost_measured_successful": 0,
24
- "wall_time_seconds_successful": 12.0,
233
+ "turns_successful": 0,
234
+ "wall_time_seconds_all_runs": 12.0,
235
+ "wall_time_seconds_measured_runs": 1,
25
236
  "wall_time_seconds_measured_successful": 1,
26
- "provider_cached_tokens_successful": 0,
27
- "provider_cached_tokens_measured_successful": 0,
237
+ "wall_time_seconds_per_successful_task": 12.0,
238
+ "wall_time_seconds_per_task_including_failures": 12.0,
239
+ "wall_time_seconds_successful": 12.0
240
+ },
241
+ "context_pack_auto": {
242
+ "artifacts_used_per_successful_task": 0.0,
243
+ "artifacts_used_successful": 0,
244
+ "byte_reduction_ratio": 0.25,
245
+ "byte_savings_pct": 75.0,
246
+ "bytes_after_successful": 6000,
247
+ "bytes_before_successful": 24000,
248
+ "bytes_saved_per_successful_task": 18000.0,
249
+ "bytes_saved_successful": 18000,
250
+ "compression_strategy": "context_pack_auto",
251
+ "corrections_per_successful_task": 0.0,
252
+ "corrections_successful": 0,
28
253
  "external_cost_successful_usd": 0.0,
29
254
  "external_cost_unknown_successful": 1,
30
- "total_cost_with_shift_successful_usd": 0.0,
31
- "total_cost_with_shift_measured_successful": 0,
32
- "external_tokens_successful": 0,
33
255
  "external_tokens_measured_successful": 0,
34
- "artifacts_used_successful": 0,
35
- "corrections_successful": 0,
36
- "bytes_before_successful": 24000,
37
- "bytes_after_successful": 24000,
38
- "turns_successful": 0,
39
- "hook_triggers_successful": 0,
40
- "failure_rate": 0.0,
41
- "task_count": 1,
42
- "successful_task_count": 1,
43
- "tokens_per_task_including_failures": null,
44
- "wall_time_seconds_per_task_including_failures": 12.0,
45
- "provider_cached_tokens_per_task_including_failures": 0.0,
46
- "primary_cost_per_task_including_failures_usd": null,
47
- "total_cost_with_shift_per_task_including_failures_usd": null,
48
- "tokens_per_successful_task": null,
49
- "wall_time_seconds_per_successful_task": 12.0,
50
- "provider_cached_tokens_per_successful_task": 0.0,
51
- "primary_cost_per_successful_task_usd": null,
52
- "total_cost_with_shift_per_successful_task_usd": null,
53
256
  "external_tokens_per_successful_task": null,
54
- "artifacts_used_per_successful_task": 0.0,
55
- "corrections_per_successful_task": 0.0,
56
- "byte_reduction_ratio": 1.0,
57
- "compression_strategy": "baseline",
58
- "is_baseline_strategy": true,
59
- "bytes_saved_successful": 0,
60
- "bytes_saved_per_successful_task": 0.0,
61
- "byte_savings_pct": 0.0,
62
- "token_proxy_saved_successful": 0,
63
- "token_proxy_saved_per_successful_task": 0.0,
257
+ "external_tokens_successful": 0,
258
+ "failed_runs": 0,
259
+ "failure_rate": 0.0,
260
+ "hook_triggers_successful": 0,
261
+ "is_baseline_strategy": false,
64
262
  "observed_telemetry": {
65
- "tokens": "unavailable",
66
- "primary_cost": "unavailable",
67
- "external_tokens": "unavailable",
68
263
  "byte_savings": "observed",
264
+ "external_tokens": "unavailable",
265
+ "primary_cost": "unavailable",
266
+ "provider_cache": "unavailable",
69
267
  "token_proxy": "inferred",
70
- "wall_time": "observed",
71
- "provider_cache": "unavailable"
72
- }
73
- },
74
- "context_pack_auto": {
75
- "runs": 1,
76
- "successful_runs": 1,
77
- "failed_runs": 0,
78
- "total_tokens_all_runs": 0,
79
- "primary_tokens_measured_runs": 0,
268
+ "tokens": "unavailable",
269
+ "wall_time": "observed"
270
+ },
80
271
  "primary_cost_all_runs_usd": 0.0,
81
272
  "primary_cost_measured_runs": 0,
82
- "wall_time_seconds_all_runs": 11.0,
83
- "wall_time_seconds_measured_runs": 1,
273
+ "primary_cost_measured_successful": 0,
274
+ "primary_cost_per_successful_task_usd": null,
275
+ "primary_cost_per_task_including_failures_usd": null,
276
+ "primary_cost_successful_usd": 0.0,
277
+ "primary_tokens_measured_runs": 0,
278
+ "primary_tokens_measured_successful": 0,
84
279
  "provider_cached_tokens_all_runs": 0,
85
280
  "provider_cached_tokens_measured_runs": 0,
86
- "total_cost_with_shift_all_runs_usd": 0.0,
87
- "total_cost_with_shift_measured_runs": 0,
88
- "total_tokens_successful": 0,
89
- "primary_tokens_measured_successful": 0,
90
- "primary_cost_successful_usd": 0.0,
91
- "primary_cost_measured_successful": 0,
92
- "wall_time_seconds_successful": 11.0,
93
- "wall_time_seconds_measured_successful": 1,
94
- "provider_cached_tokens_successful": 0,
95
281
  "provider_cached_tokens_measured_successful": 0,
96
- "external_cost_successful_usd": 0.0,
97
- "external_cost_unknown_successful": 1,
98
- "total_cost_with_shift_successful_usd": 0.0,
99
- "total_cost_with_shift_measured_successful": 0,
100
- "external_tokens_successful": 0,
101
- "external_tokens_measured_successful": 0,
102
- "artifacts_used_successful": 0,
103
- "corrections_successful": 0,
104
- "bytes_before_successful": 24000,
105
- "bytes_after_successful": 6000,
106
- "turns_successful": 0,
107
- "hook_triggers_successful": 0,
108
- "failure_rate": 0.0,
109
- "task_count": 1,
282
+ "provider_cached_tokens_per_successful_task": 0.0,
283
+ "provider_cached_tokens_per_task_including_failures": 0.0,
284
+ "provider_cached_tokens_successful": 0,
285
+ "runs": 1,
286
+ "successful_runs": 1,
110
287
  "successful_task_count": 1,
288
+ "task_count": 1,
289
+ "token_proxy_saved_per_successful_task": 4500.0,
290
+ "token_proxy_saved_successful": 4500,
291
+ "tokens_per_successful_task": null,
111
292
  "tokens_per_task_including_failures": null,
112
- "wall_time_seconds_per_task_including_failures": 11.0,
113
- "provider_cached_tokens_per_task_including_failures": 0.0,
114
- "primary_cost_per_task_including_failures_usd": null,
293
+ "total_cost_with_shift_all_runs_usd": 0.0,
294
+ "total_cost_with_shift_measured_runs": 0,
295
+ "total_cost_with_shift_measured_successful": 0,
296
+ "total_cost_with_shift_per_successful_task_usd": null,
115
297
  "total_cost_with_shift_per_task_including_failures_usd": null,
116
- "tokens_per_successful_task": null,
298
+ "total_cost_with_shift_successful_usd": 0.0,
299
+ "total_tokens_all_runs": 0,
300
+ "total_tokens_successful": 0,
301
+ "turns_successful": 0,
302
+ "wall_time_seconds_all_runs": 11.0,
303
+ "wall_time_seconds_measured_runs": 1,
304
+ "wall_time_seconds_measured_successful": 1,
117
305
  "wall_time_seconds_per_successful_task": 11.0,
118
- "provider_cached_tokens_per_successful_task": 0.0,
119
- "primary_cost_per_successful_task_usd": null,
120
- "total_cost_with_shift_per_successful_task_usd": null,
121
- "external_tokens_per_successful_task": null,
122
- "artifacts_used_per_successful_task": 0.0,
123
- "corrections_per_successful_task": 0.0,
124
- "byte_reduction_ratio": 0.25,
125
- "compression_strategy": "context_pack_auto",
126
- "is_baseline_strategy": false,
127
- "bytes_saved_successful": 18000,
128
- "bytes_saved_per_successful_task": 18000.0,
129
- "byte_savings_pct": 75.0,
130
- "token_proxy_saved_successful": 4500,
131
- "token_proxy_saved_per_successful_task": 4500.0,
132
- "observed_telemetry": {
133
- "tokens": "unavailable",
134
- "primary_cost": "unavailable",
135
- "external_tokens": "unavailable",
136
- "byte_savings": "observed",
137
- "token_proxy": "inferred",
138
- "wall_time": "observed",
139
- "provider_cache": "unavailable"
140
- }
141
- }
142
- },
143
- "comparisons": [
144
- {
145
- "variant": "context_pack_auto",
146
- "baseline_variant": "baseline",
147
- "quality_gate": "pass",
148
- "baseline_failure_rate": 0.0,
149
- "variant_failure_rate": 0.0,
150
- "failure_rate_delta_pp": 0.0,
151
- "matched_successful_task_count": 1,
152
- "baseline_successful_task_count": 1,
153
- "missing_baseline_success_tasks": [],
154
- "baseline_corrections_per_successful_task": 0.0,
155
- "variant_corrections_per_successful_task": 0.0,
156
- "paired_corrections_task_count": 1,
157
- "corrections_delta_per_successful_task": 0.0,
158
- "token_savings_pct": null,
159
- "paired_token_task_count": 0,
160
- "wall_time_delta_seconds_per_successful_task": -1.0,
161
- "wall_time_change_pct": -8.333333333333332,
162
- "paired_wall_time_task_count": 1,
163
- "cost_savings_pct_with_shift": null,
164
- "paired_cost_task_count": 0
306
+ "wall_time_seconds_per_task_including_failures": 11.0,
307
+ "wall_time_seconds_successful": 11.0
165
308
  }
166
- ],
167
- "claim_status": "insufficient_paired_data",
168
- "caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
309
+ }
169
310
  }