@ictechgy/context-guard 0.4.10 → 0.4.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -1
- package/README.ko.md +32 -21
- package/README.md +38 -29
- package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
- package/docs/benchmark-workflow-examples.md +3 -0
- package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
- package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
- package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
- package/docs/experimental-benchmark-fixtures.md +24 -7
- package/package.json +2 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +14 -11
- package/plugins/context-guard/README.md +15 -14
- package/plugins/context-guard/bin/context-guard +46 -11
- package/plugins/context-guard/bin/context-guard-artifact +342 -33
- package/plugins/context-guard/bin/context-guard-audit +33 -2
- package/plugins/context-guard/bin/context-guard-bench +1542 -31
- package/plugins/context-guard/bin/context-guard-cache-score +318 -33
- package/plugins/context-guard/bin/context-guard-cost +7 -2
- package/plugins/context-guard/bin/context-guard-experiments +364 -8
- package/plugins/context-guard/bin/context-guard-failed-nudge +6 -2
- package/plugins/context-guard/bin/context-guard-pack +301 -17
- package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
- package/plugins/context-guard/bin/context-guard-tool-prune +241 -54
- package/plugins/context-guard/bin/context-guard-trim-output +288 -41
- package/plugins/context-guard/brief/README.md +5 -5
- package/plugins/context-guard/lib/context_guard_commands.py +214 -190
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
{"artifacts_used": 0, "bytes_after": 9450, "bytes_before": 9450, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_01_bugfix", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1715, "output_tokens": 229}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 11.17}
|
|
2
|
+
{"artifacts_used": 1, "bytes_after": 5481, "bytes_before": 9450, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_01_bugfix", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1131, "output_tokens": 210}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 11.13}
|
|
3
|
+
{"artifacts_used": 0, "bytes_after": 9900, "bytes_before": 9900, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_02_exploration", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1830, "output_tokens": 238}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 11.34}
|
|
4
|
+
{"artifacts_used": 1, "bytes_after": 5742, "bytes_before": 9900, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_02_exploration", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1207, "output_tokens": 218}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 11.26}
|
|
5
|
+
{"artifacts_used": 0, "bytes_after": 10350, "bytes_before": 10350, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_03_code_review", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1945, "output_tokens": 247}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 11.51}
|
|
6
|
+
{"artifacts_used": 1, "bytes_after": 6003, "bytes_before": 10350, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_03_code_review", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1283, "output_tokens": 227}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 11.39}
|
|
7
|
+
{"artifacts_used": 0, "bytes_after": 10800, "bytes_before": 10800, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_04_long_log_analysis", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 2060, "output_tokens": 256}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 11.68}
|
|
8
|
+
{"artifacts_used": 1, "bytes_after": 6264, "bytes_before": 10800, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_04_long_log_analysis", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1359, "output_tokens": 235}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 11.52}
|
|
9
|
+
{"artifacts_used": 0, "bytes_after": 11250, "bytes_before": 11250, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_05_migration", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 2175, "output_tokens": 265}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 11.85}
|
|
10
|
+
{"artifacts_used": 1, "bytes_after": 6525, "bytes_before": 11250, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_05_migration", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1435, "output_tokens": 243}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 11.65}
|
|
11
|
+
{"artifacts_used": 0, "bytes_after": 11700, "bytes_before": 11700, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_06_docs", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 2290, "output_tokens": 274}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 12.02}
|
|
12
|
+
{"artifacts_used": 1, "bytes_after": 6785, "bytes_before": 11700, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_06_docs", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1511, "output_tokens": 252}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 11.78}
|
|
13
|
+
{"artifacts_used": 0, "bytes_after": 12150, "bytes_before": 12150, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_07_refactor", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 2405, "output_tokens": 283}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 12.19}
|
|
14
|
+
{"artifacts_used": 1, "bytes_after": 7046, "bytes_before": 12150, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_07_refactor", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1587, "output_tokens": 260}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 11.91}
|
|
15
|
+
{"artifacts_used": 0, "bytes_after": 12600, "bytes_before": 12600, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_08_performance", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 2520, "output_tokens": 292}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 12.36}
|
|
16
|
+
{"artifacts_used": 1, "bytes_after": 7307, "bytes_before": 12600, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_08_performance", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1663, "output_tokens": 268}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 12.04}
|
|
17
|
+
{"artifacts_used": 0, "bytes_after": 13050, "bytes_before": 13050, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_09_telemetry", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 2635, "output_tokens": 301}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 12.53}
|
|
18
|
+
{"artifacts_used": 1, "bytes_after": 7568, "bytes_before": 13050, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_09_telemetry", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1739, "output_tokens": 276}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 12.17}
|
|
19
|
+
{"artifacts_used": 0, "bytes_after": 13500, "bytes_before": 13500, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_10_cache_layout", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 2750, "output_tokens": 310}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 12.7}
|
|
20
|
+
{"artifacts_used": 1, "bytes_after": 7829, "bytes_before": 13500, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_10_cache_layout", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1815, "output_tokens": 285}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 12.3}
|
|
21
|
+
{"artifacts_used": 0, "bytes_after": 13950, "bytes_before": 13950, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_11_tool_schema", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 2865, "output_tokens": 319}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 12.87}
|
|
22
|
+
{"artifacts_used": 1, "bytes_after": 8090, "bytes_before": 13950, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_11_tool_schema", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1890, "output_tokens": 293}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 12.43}
|
|
23
|
+
{"artifacts_used": 0, "bytes_after": 14400, "bytes_before": 14400, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 0, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_12_artifact_receipt", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 2980, "output_tokens": 328}, "turns": 3, "variant": "baseline_full_context_fixture", "wall_time_seconds": 13.04}
|
|
24
|
+
{"artifacts_used": 1, "bytes_after": 8352, "bytes_before": 14400, "corrections": 0, "cost_measured": false, "cost_usd": 0.0, "effort": "medium", "external_cost_measured": true, "external_cost_usd": 0.0, "external_tokens": 0, "external_tokens_measured": true, "hook_triggers": 1, "model": "sonnet", "notes": "synthetic fixture-only replay row; not provider measured and not public-claim eligible", "primary_tokens_measured": false, "provenance": {"capture_command_or_export_id": "docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl", "claim_scope": "local_replay_fixture_not_public_claim", "evidence_source_type": "synthetic_fixture"}, "provider_cached_tokens": 0, "provider_cached_tokens_measured": false, "schema_version": "contextguard.bench.run-evidence.v1", "success": true, "task_id": "token_savings_12_artifact_receipt", "tokens": {"cache_creation": 0, "cache_read": 0, "input_tokens": 1966, "output_tokens": 301}, "turns": 2, "variant": "fixture_only_contextguard_advisory_foundations", "wall_time_seconds": 12.56}
|
|
@@ -26,6 +26,7 @@ Use them to decide what evidence a workflow has and what it does **not** prove:
|
|
|
26
26
|
3. Treat `comparisons[].quality_gate != "pass"` as a warning to inspect failures, correction burden, and unmatched tasks before discussing savings.
|
|
27
27
|
4. Keep byte-proxy, provider-cache, wall-time, and shifted-cost evidence in separate language from provider-measured token/cost claims. Provider-cache telemetry is not independent savings proof.
|
|
28
28
|
5. Keep self-hosted local/model-server latency, memory, and quality metrics in the run-evidence ledger sidecar; do not fold them into hosted API token/cost savings claims unless provider-measured matched-task evidence separately supports that claim.
|
|
29
|
+
6. For deterministic local replay, add `--evidence-jsonl ... --dashboard-md ...`. Replaying synthetic or manual evidence regenerates the CSV, report, and dashboard artifacts, but the report is marked `replay_only_not_public_claim` or `unknown_mixed_csv` unless every report row has complete provider-export provenance. Public hosted savings claims must additionally have `public_claim_readiness.claim_allowed=true`, which requires matched successful tasks, provider-measured token/cost, quality non-inferiority, shifted-cost accounting, explicit confidence/failure notes, and complete provider-export provenance.
|
|
29
30
|
|
|
30
31
|
## Safe wording
|
|
31
32
|
|
|
@@ -42,3 +43,5 @@ The `.example.json` fixtures intentionally use full `context-guard-bench-report-
|
|
|
42
43
|
The self-hosted metrics example is a JSONL run-evidence sidecar, not a full report shape. Its fields are additive ledger evidence only: `latency_ms`, `peak_memory_mb`, and normalized `quality_score` describe local/model-server behavior and leave hosted API report calculations unchanged. Use `context-guard experiments plan self-hosted-metrics-ledger --json ...` only as a dry-run ledger-preview checker for explicit metrics; it does not write the benchmark ledger.
|
|
43
44
|
|
|
44
45
|
For task/variant starter fixtures rather than full report-shape examples, see [`experimental-benchmark-fixtures.md`](experimental-benchmark-fixtures.md). Those files are fixture-only, synthetic, dry-run-only starters until users replace the placeholder prompts and success checks. They are not shipped OCR, visual-token, learned-compression, or output-transform benchmark results. Real claims still require provider-measured matched successful tasks plus failure-rate, correction, and shifted-cost guardrails.
|
|
46
|
+
|
|
47
|
+
The token-savings 12-task starter also includes [`benchmark-fixtures/token-savings-12task.evidence.example.jsonl`](benchmark-fixtures/token-savings-12task.evidence.example.jsonl) for `context-guard-bench --evidence-jsonl` replay. That file is synthetic local replay evidence, not provider-measured savings proof; use it to validate dashboards and claim-boundary handling before collecting real provider exports.
|
|
@@ -1,169 +1,310 @@
|
|
|
1
1
|
{
|
|
2
|
-
"schema": "context-guard-bench-report-v1",
|
|
3
2
|
"baseline_variant": "baseline",
|
|
3
|
+
"caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims. Public hosted savings claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden.",
|
|
4
|
+
"claim_status": "insufficient_paired_data",
|
|
5
|
+
"comparisons": [
|
|
6
|
+
{
|
|
7
|
+
"baseline_corrections_per_successful_task": 0.0,
|
|
8
|
+
"baseline_failure_rate": 0.0,
|
|
9
|
+
"baseline_successful_task_count": 1,
|
|
10
|
+
"baseline_variant": "baseline",
|
|
11
|
+
"corrections_delta_per_successful_task": 0.0,
|
|
12
|
+
"cost_savings_pct_with_shift": null,
|
|
13
|
+
"failure_rate_delta_pp": 0.0,
|
|
14
|
+
"matched_successful_task_count": 1,
|
|
15
|
+
"missing_baseline_success_tasks": [],
|
|
16
|
+
"paired_corrections_task_count": 1,
|
|
17
|
+
"paired_cost_task_count": 0,
|
|
18
|
+
"paired_token_task_count": 0,
|
|
19
|
+
"paired_wall_time_task_count": 1,
|
|
20
|
+
"quality_gate": "pass",
|
|
21
|
+
"token_savings_pct": null,
|
|
22
|
+
"variant": "context_pack_auto",
|
|
23
|
+
"variant_corrections_per_successful_task": 0.0,
|
|
24
|
+
"variant_failure_rate": 0.0,
|
|
25
|
+
"wall_time_change_pct": -8.333333333333332,
|
|
26
|
+
"wall_time_delta_seconds_per_successful_task": -1.0
|
|
27
|
+
}
|
|
28
|
+
],
|
|
29
|
+
"public_claim_readiness": {
|
|
30
|
+
"blocking_gate_ids": [
|
|
31
|
+
"matched_successful_tasks",
|
|
32
|
+
"provider_measured_token_cost",
|
|
33
|
+
"shifted_cost_accounting",
|
|
34
|
+
"confidence_failure_notes",
|
|
35
|
+
"provider_export_provenance"
|
|
36
|
+
],
|
|
37
|
+
"claim_allowed": false,
|
|
38
|
+
"claim_boundary": {
|
|
39
|
+
"claim_allowed_field": "public_claim_readiness.claim_allowed",
|
|
40
|
+
"fixed_percent_savings_claim_without_matched_provider_report_forbidden": true,
|
|
41
|
+
"hosted_api_cost_savings_claim_without_claim_allowed_forbidden": true,
|
|
42
|
+
"hosted_api_token_savings_claim_without_claim_allowed_forbidden": true,
|
|
43
|
+
"id": "public_claim_readiness_authoritative_release_gate",
|
|
44
|
+
"reason": "Public hosted token/cost savings claims are forbidden unless every readiness gate passes and public_claim_readiness.claim_allowed is true.",
|
|
45
|
+
"reporting_only": true,
|
|
46
|
+
"requires_confidence_and_failure_notes": true,
|
|
47
|
+
"requires_matched_successful_tasks": true,
|
|
48
|
+
"requires_provider_export_provenance": true,
|
|
49
|
+
"requires_provider_measured_tokens_and_cost": true,
|
|
50
|
+
"requires_quality_non_inferiority": true,
|
|
51
|
+
"requires_shifted_cost_accounting": true,
|
|
52
|
+
"unsupported_claims_forbidden": true
|
|
53
|
+
},
|
|
54
|
+
"gates": [
|
|
55
|
+
{
|
|
56
|
+
"evidence": {
|
|
57
|
+
"comparison_count": 1,
|
|
58
|
+
"matched_pair_count": 0,
|
|
59
|
+
"min_matched_successful_task_count": 1.0,
|
|
60
|
+
"missing_baseline_success_task_count": 0,
|
|
61
|
+
"variants": [
|
|
62
|
+
"context_pack_auto"
|
|
63
|
+
]
|
|
64
|
+
},
|
|
65
|
+
"id": "matched_successful_tasks",
|
|
66
|
+
"label": "Matched successful tasks",
|
|
67
|
+
"passed": false,
|
|
68
|
+
"reason": "missing_or_regressed_matched_successful_tasks",
|
|
69
|
+
"required": true,
|
|
70
|
+
"status": "fail"
|
|
71
|
+
},
|
|
72
|
+
{
|
|
73
|
+
"evidence": {
|
|
74
|
+
"matched_pair_count": 0,
|
|
75
|
+
"required_fields": [
|
|
76
|
+
"matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
|
|
77
|
+
"matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
|
|
78
|
+
"matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
|
|
79
|
+
"matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured"
|
|
80
|
+
]
|
|
81
|
+
},
|
|
82
|
+
"id": "provider_measured_token_cost",
|
|
83
|
+
"label": "Provider-measured token and primary cost",
|
|
84
|
+
"passed": false,
|
|
85
|
+
"reason": "missing_provider_measured_primary_tokens_or_cost",
|
|
86
|
+
"required": true,
|
|
87
|
+
"status": "fail"
|
|
88
|
+
},
|
|
89
|
+
{
|
|
90
|
+
"evidence": {
|
|
91
|
+
"max_corrections_delta_per_successful_task": 0.0,
|
|
92
|
+
"max_failure_rate_delta_pp": 0.0,
|
|
93
|
+
"quality_gates": [
|
|
94
|
+
"pass"
|
|
95
|
+
]
|
|
96
|
+
},
|
|
97
|
+
"id": "quality_non_inferiority",
|
|
98
|
+
"label": "Quality non-inferiority",
|
|
99
|
+
"passed": true,
|
|
100
|
+
"reason": "all_quality_gates_pass",
|
|
101
|
+
"required": true,
|
|
102
|
+
"status": "pass"
|
|
103
|
+
},
|
|
104
|
+
{
|
|
105
|
+
"evidence": {
|
|
106
|
+
"matched_pair_count": 0,
|
|
107
|
+
"required_fields": [
|
|
108
|
+
"matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
|
|
109
|
+
"matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
|
|
110
|
+
"matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured"
|
|
111
|
+
]
|
|
112
|
+
},
|
|
113
|
+
"id": "shifted_cost_accounting",
|
|
114
|
+
"label": "Shifted-cost accounting",
|
|
115
|
+
"passed": false,
|
|
116
|
+
"reason": "missing_shifted_cost_claim_accounting",
|
|
117
|
+
"required": true,
|
|
118
|
+
"status": "fail"
|
|
119
|
+
},
|
|
120
|
+
{
|
|
121
|
+
"evidence": {
|
|
122
|
+
"comparison_failure_fields_present": true,
|
|
123
|
+
"explicit_note_count": 0,
|
|
124
|
+
"failed_row_count": 0,
|
|
125
|
+
"failed_rows_with_notes": 0,
|
|
126
|
+
"replay_row_count": 0
|
|
127
|
+
},
|
|
128
|
+
"id": "confidence_failure_notes",
|
|
129
|
+
"label": "Confidence and failure notes",
|
|
130
|
+
"passed": false,
|
|
131
|
+
"reason": "missing_explicit_replay_notes_or_failure_evidence",
|
|
132
|
+
"required": true,
|
|
133
|
+
"status": "unknown"
|
|
134
|
+
},
|
|
135
|
+
{
|
|
136
|
+
"evidence": {
|
|
137
|
+
"mixed_csv": false,
|
|
138
|
+
"provider_names": [],
|
|
139
|
+
"replay_row_count": 0,
|
|
140
|
+
"report_row_count": 2,
|
|
141
|
+
"same_run_complete": false,
|
|
142
|
+
"source_types": []
|
|
143
|
+
},
|
|
144
|
+
"id": "provider_export_provenance",
|
|
145
|
+
"label": "Provider-export provenance",
|
|
146
|
+
"passed": false,
|
|
147
|
+
"reason": "missing_or_mixed_provider_export_provenance",
|
|
148
|
+
"required": true,
|
|
149
|
+
"status": "unknown"
|
|
150
|
+
}
|
|
151
|
+
],
|
|
152
|
+
"generated_from": "matched_pair_evidence_and_replay_provenance",
|
|
153
|
+
"passed_required_gate_count": 1,
|
|
154
|
+
"public_claim_eligible_observed": null,
|
|
155
|
+
"public_claim_status_observed": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
|
|
156
|
+
"raw_metric_claim_status_observed": "insufficient_paired_data",
|
|
157
|
+
"reason": "replay_evidence_required_for_public_claim",
|
|
158
|
+
"required_gate_count": 6,
|
|
159
|
+
"required_gate_ids": [
|
|
160
|
+
"matched_successful_tasks",
|
|
161
|
+
"provider_measured_token_cost",
|
|
162
|
+
"quality_non_inferiority",
|
|
163
|
+
"shifted_cost_accounting",
|
|
164
|
+
"confidence_failure_notes",
|
|
165
|
+
"provider_export_provenance"
|
|
166
|
+
],
|
|
167
|
+
"schema_version": "contextguard.bench.public-claim-readiness.v1",
|
|
168
|
+
"status": "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
|
|
169
|
+
},
|
|
4
170
|
"row_count": 2,
|
|
171
|
+
"schema": "context-guard-bench-report-v1",
|
|
5
172
|
"summary_by_variant": {
|
|
6
173
|
"baseline": {
|
|
7
|
-
"
|
|
8
|
-
"
|
|
174
|
+
"artifacts_used_per_successful_task": 0.0,
|
|
175
|
+
"artifacts_used_successful": 0,
|
|
176
|
+
"byte_reduction_ratio": 1.0,
|
|
177
|
+
"byte_savings_pct": 0.0,
|
|
178
|
+
"bytes_after_successful": 24000,
|
|
179
|
+
"bytes_before_successful": 24000,
|
|
180
|
+
"bytes_saved_per_successful_task": 0.0,
|
|
181
|
+
"bytes_saved_successful": 0,
|
|
182
|
+
"compression_strategy": "baseline",
|
|
183
|
+
"corrections_per_successful_task": 0.0,
|
|
184
|
+
"corrections_successful": 0,
|
|
185
|
+
"external_cost_successful_usd": 0.0,
|
|
186
|
+
"external_cost_unknown_successful": 1,
|
|
187
|
+
"external_tokens_measured_successful": 0,
|
|
188
|
+
"external_tokens_per_successful_task": null,
|
|
189
|
+
"external_tokens_successful": 0,
|
|
9
190
|
"failed_runs": 0,
|
|
10
|
-
"
|
|
11
|
-
"
|
|
191
|
+
"failure_rate": 0.0,
|
|
192
|
+
"hook_triggers_successful": 0,
|
|
193
|
+
"is_baseline_strategy": true,
|
|
194
|
+
"observed_telemetry": {
|
|
195
|
+
"byte_savings": "observed",
|
|
196
|
+
"external_tokens": "unavailable",
|
|
197
|
+
"primary_cost": "unavailable",
|
|
198
|
+
"provider_cache": "unavailable",
|
|
199
|
+
"token_proxy": "inferred",
|
|
200
|
+
"tokens": "unavailable",
|
|
201
|
+
"wall_time": "observed"
|
|
202
|
+
},
|
|
12
203
|
"primary_cost_all_runs_usd": 0.0,
|
|
13
204
|
"primary_cost_measured_runs": 0,
|
|
14
|
-
"
|
|
15
|
-
"
|
|
205
|
+
"primary_cost_measured_successful": 0,
|
|
206
|
+
"primary_cost_per_successful_task_usd": null,
|
|
207
|
+
"primary_cost_per_task_including_failures_usd": null,
|
|
208
|
+
"primary_cost_successful_usd": 0.0,
|
|
209
|
+
"primary_tokens_measured_runs": 0,
|
|
210
|
+
"primary_tokens_measured_successful": 0,
|
|
16
211
|
"provider_cached_tokens_all_runs": 0,
|
|
17
212
|
"provider_cached_tokens_measured_runs": 0,
|
|
213
|
+
"provider_cached_tokens_measured_successful": 0,
|
|
214
|
+
"provider_cached_tokens_per_successful_task": 0.0,
|
|
215
|
+
"provider_cached_tokens_per_task_including_failures": 0.0,
|
|
216
|
+
"provider_cached_tokens_successful": 0,
|
|
217
|
+
"runs": 1,
|
|
218
|
+
"successful_runs": 1,
|
|
219
|
+
"successful_task_count": 1,
|
|
220
|
+
"task_count": 1,
|
|
221
|
+
"token_proxy_saved_per_successful_task": 0.0,
|
|
222
|
+
"token_proxy_saved_successful": 0,
|
|
223
|
+
"tokens_per_successful_task": null,
|
|
224
|
+
"tokens_per_task_including_failures": null,
|
|
18
225
|
"total_cost_with_shift_all_runs_usd": 0.0,
|
|
19
226
|
"total_cost_with_shift_measured_runs": 0,
|
|
227
|
+
"total_cost_with_shift_measured_successful": 0,
|
|
228
|
+
"total_cost_with_shift_per_successful_task_usd": null,
|
|
229
|
+
"total_cost_with_shift_per_task_including_failures_usd": null,
|
|
230
|
+
"total_cost_with_shift_successful_usd": 0.0,
|
|
231
|
+
"total_tokens_all_runs": 0,
|
|
20
232
|
"total_tokens_successful": 0,
|
|
21
|
-
"
|
|
22
|
-
"
|
|
23
|
-
"
|
|
24
|
-
"wall_time_seconds_successful": 12.0,
|
|
233
|
+
"turns_successful": 0,
|
|
234
|
+
"wall_time_seconds_all_runs": 12.0,
|
|
235
|
+
"wall_time_seconds_measured_runs": 1,
|
|
25
236
|
"wall_time_seconds_measured_successful": 1,
|
|
26
|
-
"
|
|
27
|
-
"
|
|
237
|
+
"wall_time_seconds_per_successful_task": 12.0,
|
|
238
|
+
"wall_time_seconds_per_task_including_failures": 12.0,
|
|
239
|
+
"wall_time_seconds_successful": 12.0
|
|
240
|
+
},
|
|
241
|
+
"context_pack_auto": {
|
|
242
|
+
"artifacts_used_per_successful_task": 0.0,
|
|
243
|
+
"artifacts_used_successful": 0,
|
|
244
|
+
"byte_reduction_ratio": 0.25,
|
|
245
|
+
"byte_savings_pct": 75.0,
|
|
246
|
+
"bytes_after_successful": 6000,
|
|
247
|
+
"bytes_before_successful": 24000,
|
|
248
|
+
"bytes_saved_per_successful_task": 18000.0,
|
|
249
|
+
"bytes_saved_successful": 18000,
|
|
250
|
+
"compression_strategy": "context_pack_auto",
|
|
251
|
+
"corrections_per_successful_task": 0.0,
|
|
252
|
+
"corrections_successful": 0,
|
|
28
253
|
"external_cost_successful_usd": 0.0,
|
|
29
254
|
"external_cost_unknown_successful": 1,
|
|
30
|
-
"total_cost_with_shift_successful_usd": 0.0,
|
|
31
|
-
"total_cost_with_shift_measured_successful": 0,
|
|
32
|
-
"external_tokens_successful": 0,
|
|
33
255
|
"external_tokens_measured_successful": 0,
|
|
34
|
-
"artifacts_used_successful": 0,
|
|
35
|
-
"corrections_successful": 0,
|
|
36
|
-
"bytes_before_successful": 24000,
|
|
37
|
-
"bytes_after_successful": 24000,
|
|
38
|
-
"turns_successful": 0,
|
|
39
|
-
"hook_triggers_successful": 0,
|
|
40
|
-
"failure_rate": 0.0,
|
|
41
|
-
"task_count": 1,
|
|
42
|
-
"successful_task_count": 1,
|
|
43
|
-
"tokens_per_task_including_failures": null,
|
|
44
|
-
"wall_time_seconds_per_task_including_failures": 12.0,
|
|
45
|
-
"provider_cached_tokens_per_task_including_failures": 0.0,
|
|
46
|
-
"primary_cost_per_task_including_failures_usd": null,
|
|
47
|
-
"total_cost_with_shift_per_task_including_failures_usd": null,
|
|
48
|
-
"tokens_per_successful_task": null,
|
|
49
|
-
"wall_time_seconds_per_successful_task": 12.0,
|
|
50
|
-
"provider_cached_tokens_per_successful_task": 0.0,
|
|
51
|
-
"primary_cost_per_successful_task_usd": null,
|
|
52
|
-
"total_cost_with_shift_per_successful_task_usd": null,
|
|
53
256
|
"external_tokens_per_successful_task": null,
|
|
54
|
-
"
|
|
55
|
-
"
|
|
56
|
-
"
|
|
57
|
-
"
|
|
58
|
-
"is_baseline_strategy":
|
|
59
|
-
"bytes_saved_successful": 0,
|
|
60
|
-
"bytes_saved_per_successful_task": 0.0,
|
|
61
|
-
"byte_savings_pct": 0.0,
|
|
62
|
-
"token_proxy_saved_successful": 0,
|
|
63
|
-
"token_proxy_saved_per_successful_task": 0.0,
|
|
257
|
+
"external_tokens_successful": 0,
|
|
258
|
+
"failed_runs": 0,
|
|
259
|
+
"failure_rate": 0.0,
|
|
260
|
+
"hook_triggers_successful": 0,
|
|
261
|
+
"is_baseline_strategy": false,
|
|
64
262
|
"observed_telemetry": {
|
|
65
|
-
"tokens": "unavailable",
|
|
66
|
-
"primary_cost": "unavailable",
|
|
67
|
-
"external_tokens": "unavailable",
|
|
68
263
|
"byte_savings": "observed",
|
|
264
|
+
"external_tokens": "unavailable",
|
|
265
|
+
"primary_cost": "unavailable",
|
|
266
|
+
"provider_cache": "unavailable",
|
|
69
267
|
"token_proxy": "inferred",
|
|
70
|
-
"
|
|
71
|
-
"
|
|
72
|
-
}
|
|
73
|
-
},
|
|
74
|
-
"context_pack_auto": {
|
|
75
|
-
"runs": 1,
|
|
76
|
-
"successful_runs": 1,
|
|
77
|
-
"failed_runs": 0,
|
|
78
|
-
"total_tokens_all_runs": 0,
|
|
79
|
-
"primary_tokens_measured_runs": 0,
|
|
268
|
+
"tokens": "unavailable",
|
|
269
|
+
"wall_time": "observed"
|
|
270
|
+
},
|
|
80
271
|
"primary_cost_all_runs_usd": 0.0,
|
|
81
272
|
"primary_cost_measured_runs": 0,
|
|
82
|
-
"
|
|
83
|
-
"
|
|
273
|
+
"primary_cost_measured_successful": 0,
|
|
274
|
+
"primary_cost_per_successful_task_usd": null,
|
|
275
|
+
"primary_cost_per_task_including_failures_usd": null,
|
|
276
|
+
"primary_cost_successful_usd": 0.0,
|
|
277
|
+
"primary_tokens_measured_runs": 0,
|
|
278
|
+
"primary_tokens_measured_successful": 0,
|
|
84
279
|
"provider_cached_tokens_all_runs": 0,
|
|
85
280
|
"provider_cached_tokens_measured_runs": 0,
|
|
86
|
-
"total_cost_with_shift_all_runs_usd": 0.0,
|
|
87
|
-
"total_cost_with_shift_measured_runs": 0,
|
|
88
|
-
"total_tokens_successful": 0,
|
|
89
|
-
"primary_tokens_measured_successful": 0,
|
|
90
|
-
"primary_cost_successful_usd": 0.0,
|
|
91
|
-
"primary_cost_measured_successful": 0,
|
|
92
|
-
"wall_time_seconds_successful": 11.0,
|
|
93
|
-
"wall_time_seconds_measured_successful": 1,
|
|
94
|
-
"provider_cached_tokens_successful": 0,
|
|
95
281
|
"provider_cached_tokens_measured_successful": 0,
|
|
96
|
-
"
|
|
97
|
-
"
|
|
98
|
-
"
|
|
99
|
-
"
|
|
100
|
-
"
|
|
101
|
-
"external_tokens_measured_successful": 0,
|
|
102
|
-
"artifacts_used_successful": 0,
|
|
103
|
-
"corrections_successful": 0,
|
|
104
|
-
"bytes_before_successful": 24000,
|
|
105
|
-
"bytes_after_successful": 6000,
|
|
106
|
-
"turns_successful": 0,
|
|
107
|
-
"hook_triggers_successful": 0,
|
|
108
|
-
"failure_rate": 0.0,
|
|
109
|
-
"task_count": 1,
|
|
282
|
+
"provider_cached_tokens_per_successful_task": 0.0,
|
|
283
|
+
"provider_cached_tokens_per_task_including_failures": 0.0,
|
|
284
|
+
"provider_cached_tokens_successful": 0,
|
|
285
|
+
"runs": 1,
|
|
286
|
+
"successful_runs": 1,
|
|
110
287
|
"successful_task_count": 1,
|
|
288
|
+
"task_count": 1,
|
|
289
|
+
"token_proxy_saved_per_successful_task": 4500.0,
|
|
290
|
+
"token_proxy_saved_successful": 4500,
|
|
291
|
+
"tokens_per_successful_task": null,
|
|
111
292
|
"tokens_per_task_including_failures": null,
|
|
112
|
-
"
|
|
113
|
-
"
|
|
114
|
-
"
|
|
293
|
+
"total_cost_with_shift_all_runs_usd": 0.0,
|
|
294
|
+
"total_cost_with_shift_measured_runs": 0,
|
|
295
|
+
"total_cost_with_shift_measured_successful": 0,
|
|
296
|
+
"total_cost_with_shift_per_successful_task_usd": null,
|
|
115
297
|
"total_cost_with_shift_per_task_including_failures_usd": null,
|
|
116
|
-
"
|
|
298
|
+
"total_cost_with_shift_successful_usd": 0.0,
|
|
299
|
+
"total_tokens_all_runs": 0,
|
|
300
|
+
"total_tokens_successful": 0,
|
|
301
|
+
"turns_successful": 0,
|
|
302
|
+
"wall_time_seconds_all_runs": 11.0,
|
|
303
|
+
"wall_time_seconds_measured_runs": 1,
|
|
304
|
+
"wall_time_seconds_measured_successful": 1,
|
|
117
305
|
"wall_time_seconds_per_successful_task": 11.0,
|
|
118
|
-
"
|
|
119
|
-
"
|
|
120
|
-
"total_cost_with_shift_per_successful_task_usd": null,
|
|
121
|
-
"external_tokens_per_successful_task": null,
|
|
122
|
-
"artifacts_used_per_successful_task": 0.0,
|
|
123
|
-
"corrections_per_successful_task": 0.0,
|
|
124
|
-
"byte_reduction_ratio": 0.25,
|
|
125
|
-
"compression_strategy": "context_pack_auto",
|
|
126
|
-
"is_baseline_strategy": false,
|
|
127
|
-
"bytes_saved_successful": 18000,
|
|
128
|
-
"bytes_saved_per_successful_task": 18000.0,
|
|
129
|
-
"byte_savings_pct": 75.0,
|
|
130
|
-
"token_proxy_saved_successful": 4500,
|
|
131
|
-
"token_proxy_saved_per_successful_task": 4500.0,
|
|
132
|
-
"observed_telemetry": {
|
|
133
|
-
"tokens": "unavailable",
|
|
134
|
-
"primary_cost": "unavailable",
|
|
135
|
-
"external_tokens": "unavailable",
|
|
136
|
-
"byte_savings": "observed",
|
|
137
|
-
"token_proxy": "inferred",
|
|
138
|
-
"wall_time": "observed",
|
|
139
|
-
"provider_cache": "unavailable"
|
|
140
|
-
}
|
|
141
|
-
}
|
|
142
|
-
},
|
|
143
|
-
"comparisons": [
|
|
144
|
-
{
|
|
145
|
-
"variant": "context_pack_auto",
|
|
146
|
-
"baseline_variant": "baseline",
|
|
147
|
-
"quality_gate": "pass",
|
|
148
|
-
"baseline_failure_rate": 0.0,
|
|
149
|
-
"variant_failure_rate": 0.0,
|
|
150
|
-
"failure_rate_delta_pp": 0.0,
|
|
151
|
-
"matched_successful_task_count": 1,
|
|
152
|
-
"baseline_successful_task_count": 1,
|
|
153
|
-
"missing_baseline_success_tasks": [],
|
|
154
|
-
"baseline_corrections_per_successful_task": 0.0,
|
|
155
|
-
"variant_corrections_per_successful_task": 0.0,
|
|
156
|
-
"paired_corrections_task_count": 1,
|
|
157
|
-
"corrections_delta_per_successful_task": 0.0,
|
|
158
|
-
"token_savings_pct": null,
|
|
159
|
-
"paired_token_task_count": 0,
|
|
160
|
-
"wall_time_delta_seconds_per_successful_task": -1.0,
|
|
161
|
-
"wall_time_change_pct": -8.333333333333332,
|
|
162
|
-
"paired_wall_time_task_count": 1,
|
|
163
|
-
"cost_savings_pct_with_shift": null,
|
|
164
|
-
"paired_cost_task_count": 0
|
|
306
|
+
"wall_time_seconds_per_task_including_failures": 11.0,
|
|
307
|
+
"wall_time_seconds_successful": 11.0
|
|
165
308
|
}
|
|
166
|
-
|
|
167
|
-
"claim_status": "insufficient_paired_data",
|
|
168
|
-
"caveat": "Proxy byte reductions are reported separately from matched-task token/cost metrics; shifted cost savings require measured primary cost and measured external cost when external tokens are present. Wall time and provider cached-token fields are diagnostic telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache discounts must stay separate from token-reduction claims."
|
|
309
|
+
}
|
|
169
310
|
}
|