@ictechgy/context-guard 0.4.9 → 0.4.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +28 -0
- package/README.ko.md +59 -31
- package/README.md +85 -36
- package/docs/benchmark-fixtures/token-savings-12task-baseline.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task-contextguard.prompt.example.md +7 -0
- package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
- package/docs/benchmark-fixtures/token-savings-12task.tasks.example.json +182 -0
- package/docs/benchmark-fixtures/token-savings-12task.variants.example.json +10 -0
- package/docs/benchmark-workflow-examples.md +3 -0
- package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
- package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
- package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
- package/docs/distribution.md +10 -7
- package/docs/experimental-benchmark-fixtures.md +30 -6
- package/package.json +4 -6
- package/packaging/homebrew/context-guard.rb.template +1 -1
- package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
- package/plugins/context-guard/README.ko.md +20 -14
- package/plugins/context-guard/README.md +26 -17
- package/plugins/context-guard/bin/context-guard +147 -25
- package/plugins/context-guard/bin/context-guard-artifact +884 -79
- package/plugins/context-guard/bin/context-guard-audit +33 -2
- package/plugins/context-guard/bin/context-guard-bench +1542 -31
- package/plugins/context-guard/bin/context-guard-cache-score +665 -0
- package/plugins/context-guard/bin/context-guard-compress +146 -1
- package/plugins/context-guard/bin/context-guard-cost +790 -6
- package/plugins/context-guard/bin/context-guard-experiments +463 -26
- package/plugins/context-guard/bin/context-guard-failed-nudge +9 -2
- package/plugins/context-guard/bin/context-guard-filter +163 -7
- package/plugins/context-guard/bin/context-guard-guard-read +3 -0
- package/plugins/context-guard/bin/context-guard-pack +892 -49
- package/plugins/context-guard/bin/context-guard-rewrite-bash +3 -0
- package/plugins/context-guard/bin/context-guard-sanitize-output +76 -12
- package/plugins/context-guard/bin/context-guard-setup +165 -31
- package/plugins/context-guard/bin/context-guard-statusline +490 -283
- package/plugins/context-guard/bin/context-guard-statusline-merged +5 -0
- package/plugins/context-guard/bin/context-guard-tool-prune +480 -53
- package/plugins/context-guard/bin/context-guard-trim-output +288 -41
- package/plugins/context-guard/brief/README.md +5 -5
- package/plugins/context-guard/lib/context_guard_commands.py +230 -0
- package/plugins/context-guard/skills/setup/SKILL.md +1 -0
- package/context-guard-kit/README.md +0 -91
- package/context-guard-kit/benchmark_runner.py +0 -2401
- package/context-guard-kit/claude_transcript_cost_audit.py +0 -2346
- package/context-guard-kit/context_compress.py +0 -695
- package/context-guard-kit/context_escrow.py +0 -935
- package/context-guard-kit/context_filter.py +0 -637
- package/context-guard-kit/context_guard_cli.py +0 -325
- package/context-guard-kit/context_guard_diet.py +0 -1711
- package/context-guard-kit/context_pack.py +0 -2713
- package/context-guard-kit/cost_guard.py +0 -2349
- package/context-guard-kit/experimental_registry.py +0 -4348
- package/context-guard-kit/failed_attempt_nudge.py +0 -567
- package/context-guard-kit/guard_large_read.py +0 -690
- package/context-guard-kit/hook_secret_patterns.py +0 -43
- package/context-guard-kit/read_symbol.py +0 -483
- package/context-guard-kit/rewrite_bash_for_token_budget.py +0 -501
- package/context-guard-kit/sanitize_output.py +0 -725
- package/context-guard-kit/settings.example.json +0 -67
- package/context-guard-kit/setup_wizard.py +0 -2515
- package/context-guard-kit/statusline.sh +0 -362
- package/context-guard-kit/statusline_merged.sh +0 -157
- package/context-guard-kit/tool_schema_pruner.py +0 -837
- package/context-guard-kit/trim_command_output.py +0 -1449
|
@@ -178,19 +178,137 @@ EXTERNAL_SOURCE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...], tuple[str, ...]],
|
|
|
178
178
|
)
|
|
179
179
|
# Upper bounds used when validating usage numbers; evidence_nonnegative_int and
# evidence_nonnegative_float take these as their default ``maximum``.
MAX_USAGE_TOKEN_COUNT = 10**12
MAX_USAGE_COST_USD = 10**9
# Byte and line caps enforced by read_evidence_jsonl before/while parsing.
MAX_EVIDENCE_JSONL_BYTES = 5_000_000
MAX_EVIDENCE_JSONL_LINES = 100_000
# Byte -> token proxy conversion factor. This is a conservative estimate derived
# from byte deltas, not measured model tokens, and reports label it explicitly
# as evidence="inferred". It uses the commonly quoted approximation of
# ~4 bytes/token for English text.
TOKEN_PROXY_BYTES_PER_TOKEN = 4
# Schema version identifiers for the benchmark payload families.
BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
MEASUREMENT_BASELINE_SCHEMA_VERSION = "contextguard.bench.measurement-baseline.v1"
DEFAULT_MATRIX_SCHEMA_VERSION = "contextguard.bench.default-matrix.v1"
PUBLIC_CLAIM_READINESS_SCHEMA_VERSION = "contextguard.bench.public-claim-readiness.v1"
SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
# Accepted values for an evidence row's ``evidence_source_type``.
EVIDENCE_REPLAY_SOURCE_TYPES = frozenset({"synthetic_fixture", "provider_export", "manual_audit"})
# ``claim_scope`` values that, together with provider_name and a capture/export
# id, give a provider_export row public-claim authority (see
# parse_evidence_provenance).
PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES = frozenset({
    "provider_measured_matched_task",
    "provider_measured_matched_task_public_claim",
    "hosted_api_provider_measured_matched_task",
})
# Status labels for replayed evidence.
REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS = "provider_export_public_claim_candidate"
REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS = "provider_export_claim_gates_not_met"
REPLAY_NOT_PUBLIC_CLAIM_STATUS = "replay_only_not_public_claim"
REPLAY_UNKNOWN_MIXED_CSV_STATUS = "unknown_mixed_csv"
REPLAY_PUBLIC_CLAIM_ELIGIBLE_RAW_STATUSES = frozenset({
    "token_and_shifted_cost_savings_observed",
})
# Human-readable boundary text embedded in every replay provenance payload.
REPLAY_CLAIM_BOUNDARY = (
    "Evidence replay is an import/replay mode. Synthetic fixtures and manual audits are never "
    "hosted API token/cost savings evidence; public claims require complete provider_export "
    "provenance for every report row plus the normal matched-task quality, token, cost, and "
    "shifted-cost gates."
)
# Default-matrix classification labels and their relative strength.
# Note: "reject/rework" is a classification but has no strength entry.
DEFAULT_MATRIX_CLASSIFICATIONS = ("default-on", "advisory", "experimental", "reject/rework")
DEFAULT_MATRIX_CLASSIFICATION_STRENGTH = {
    "experimental": 0,
    "advisory": 1,
    "default-on": 2,
}
# One entry per default-matrix lane: identifier, display label, policy ceiling,
# and the keyword tuples associated with tasks and variants for that lane.
DEFAULT_MATRIX_LANES: tuple[dict[str, Any], ...] = (
    {
        "id": "trimming",
        "label": "Trimming / digest output",
        "policy_ceiling": "default-on",
        "task_keywords": ("long_log_analysis", "output_transform", "trim", "trimming", "sanitize_output", "digest"),
        "variant_keywords": ("trim", "trimming", "sanitize", "digest", "brief"),
    },
    {
        "id": "artifact_escrow",
        "label": "Artifact escrow / receipt handles",
        "policy_ceiling": "default-on",
        "task_keywords": ("artifact_receipt", "artifact", "receipt", "escrow", "output_sandbox", "response_sandbox"),
        "variant_keywords": ("artifact", "receipt", "escrow", "output_sandbox", "response_sandbox"),
    },
    {
        "id": "tool_pruning",
        "label": "Tool/MCP schema pruning",
        "policy_ceiling": "default-on",
        "task_keywords": ("tool_schema", "tool_prune", "tool_pruning", "mcp_schema", "defer_report"),
        "variant_keywords": ("tool_prune", "tool_pruning", "tool_schema", "mcp", "defer"),
    },
    {
        "id": "cache_advice",
        "label": "Cache layout advice",
        "policy_ceiling": "advisory",
        "task_keywords": ("cache_layout", "cache_advice", "cache_score", "provider_cache"),
        "variant_keywords": ("cache_layout", "cache_advice", "cache_score", "provider_cache", "cache"),
    },
    {
        "id": "adaptive_k",
        "label": "Adaptive-k context packing",
        "policy_ceiling": "advisory",
        "task_keywords": ("adaptive_k", "adaptive", "top_k", "context_pack"),
        "variant_keywords": ("adaptive_k", "adaptive", "top_k", "pack_adaptive"),
    },
    {
        "id": "optional_compression",
        "label": "Optional compression",
        "policy_ceiling": "advisory",
        "task_keywords": ("learned_compression", "compression", "compress", "context_diff"),
        "variant_keywords": ("learned_compression", "compression", "compress", "context_diff"),
    },
)
# Derived lookups over DEFAULT_MATRIX_LANES (ids in declaration order, id -> lane).
DEFAULT_MATRIX_LANE_IDS = tuple(str(item["id"]) for item in DEFAULT_MATRIX_LANES)
DEFAULT_MATRIX_LANE_BY_ID = {str(item["id"]): item for item in DEFAULT_MATRIX_LANES}
MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS = 20
# Claim-boundary metadata for the default matrix: reporting only, no runtime
# default changes, no hosted API savings claims.
DEFAULT_MATRIX_CLAIM_BOUNDARY = {
    "id": "default_matrix_reporting_only_not_runtime_default_or_savings_claim",
    "reporting_only": True,
    "changes_runtime_defaults": False,
    "hosted_api_token_savings_claim_allowed": False,
    "hosted_api_cost_savings_claim_allowed": False,
    "public_claims_must_use_report_claim_status_and_matched_pair_evidence": True,
    "reason": (
        "The default matrix classifies local benchmark lanes for review only; it does not "
        "turn features on by default and does not authorize hosted API savings claims."
    ),
}
# Identifiers of the public-claim readiness gates.
PUBLIC_CLAIM_READINESS_GATE_IDS = (
    "matched_successful_tasks",
    "provider_measured_token_cost",
    "quality_non_inferiority",
    "shifted_cost_accounting",
    "confidence_failure_notes",
    "provider_export_provenance",
)
# Claim-boundary metadata for public-claim readiness; the ``requires_*`` flags
# mirror the gate ids listed above.
PUBLIC_CLAIM_READINESS_CLAIM_BOUNDARY = {
    "id": "public_claim_readiness_authoritative_release_gate",
    "reporting_only": True,
    "claim_allowed_field": "public_claim_readiness.claim_allowed",
    "unsupported_claims_forbidden": True,
    "hosted_api_token_savings_claim_without_claim_allowed_forbidden": True,
    "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": True,
    "fixed_percent_savings_claim_without_matched_provider_report_forbidden": True,
    "requires_matched_successful_tasks": True,
    "requires_provider_measured_tokens_and_cost": True,
    "requires_quality_non_inferiority": True,
    "requires_shifted_cost_accounting": True,
    "requires_confidence_and_failure_notes": True,
    "requires_provider_export_provenance": True,
    "reason": (
        "Public hosted token/cost savings claims are forbidden unless every readiness gate passes "
        "and public_claim_readiness.claim_allowed is true."
    ),
}
# Bounds for self-hosted metric fields; the latency cap is 7 days in milliseconds.
MAX_SELF_HOSTED_LABEL_CHARS = 120
MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000
MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
# File and argv size caps; the argv prompt budget reuses the prompt-file cap.
MAX_VARIANT_PROMPT_FILE_BYTES = 128_000
MAX_FIXTURE_FILE_BYTES = 1_000_000
MAX_CLAUDE_PROMPT_ARG_BYTES = MAX_VARIANT_PROMPT_FILE_BYTES
# Byte caps for captured subprocess output.
CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
VERSION_OUTPUT_MAX_BYTES = 16_000
|
|
@@ -317,12 +435,18 @@ def _open_regular_no_symlink(
|
|
|
317
435
|
os.close(parent_fd)
|
|
318
436
|
|
|
319
437
|
|
|
320
|
-
def _read_text_no_follow(path: Path) -> str:
|
|
438
|
+
def _read_text_no_follow(path: Path, *, max_bytes: int = MAX_FIXTURE_FILE_BYTES) -> str:
    """Read a regular file as UTF-8 text, refusing oversized or non-UTF-8 content.

    The file is opened through ``_open_regular_no_symlink`` and at most
    ``max_bytes + 1`` bytes are read, so an over-limit file is detected without
    loading it entirely.

    Raises:
        SystemExit: if the content exceeds ``max_bytes`` or is not valid UTF-8.
    """
    fd = _open_regular_no_symlink(path)
    try:
        with os.fdopen(fd, "rb") as stream:
            # The file object owns the descriptor from here on; mark it so the
            # ``finally`` clause does not close it a second time.
            fd = -1
            payload = stream.read(max_bytes + 1)
    finally:
        if fd != -1:
            os.close(fd)

    if len(payload) > max_bytes:
        raise SystemExit(f"fixture file exceeds {max_bytes} bytes: {path}")
    try:
        decoded = payload.decode("utf-8")
    except UnicodeDecodeError as exc:
        raise SystemExit(f"fixture file must be UTF-8 text: {path}: {exc.reason}") from None
    return decoded
|
|
@@ -400,6 +524,38 @@ class RunResult:
|
|
|
400
524
|
self_hosted_metrics: dict[str, Any] | None = None
|
|
401
525
|
|
|
402
526
|
|
|
527
|
+
@dataclass
class EvidenceReplayRow:
    """A parsed evidence JSONL row: the run result plus its replay provenance."""

    result: RunResult
    source_type: str
    provider_name: str | None
    capture_command_or_export_id: str | None
    claim_scope: str
    provider_export_provenance_complete: bool
    public_claim_eligible: bool
    explicit_notes: bool
    line_number: int

    @property
    def key(self) -> tuple[str, str]:
        """Return the ``(task_id, variant)`` pair that identifies this row."""
        run = self.result
        return run.task_id, run.variant

    def provenance_payload(self) -> dict[str, Any]:
        """Build the provenance dictionary recorded alongside a replayed result."""
        payload: dict[str, Any] = {
            "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
            "mode": "evidence_jsonl_replay",
        }
        payload["evidence_source_type"] = self.source_type
        # These payload keys are named exactly like the attributes they mirror.
        mirrored_fields = (
            "provider_name",
            "capture_command_or_export_id",
            "claim_scope",
            "provider_export_provenance_complete",
            "public_claim_eligible",
            "explicit_notes",
            "line_number",
        )
        for attribute in mirrored_fields:
            payload[attribute] = getattr(self, attribute)
        payload["claim_boundary"] = REPLAY_CLAIM_BOUNDARY
        return payload
|
|
557
|
+
|
|
558
|
+
|
|
403
559
|
@dataclass
|
|
404
560
|
class BoundedProcessResult:
|
|
405
561
|
returncode: int
|
|
@@ -470,6 +626,17 @@ def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[st
|
|
|
470
626
|
return extra_args
|
|
471
627
|
|
|
472
628
|
|
|
629
|
+
def require_argv_safe_prompt(text: str, *, owner: str) -> str:
    """Return *text* unchanged when its UTF-8 size fits the argv prompt budget.

    Prompts are passed on the command line, so they are capped at
    ``MAX_CLAUDE_PROMPT_ARG_BYTES`` to avoid E2BIG failures.

    Raises:
        SystemExit: if the encoded prompt is larger than the budget; *owner*
            prefixes the message.
    """
    encoded_length = len(text.encode("utf-8", errors="replace"))
    if encoded_length <= MAX_CLAUDE_PROMPT_ARG_BYTES:
        return text
    raise SystemExit(
        f"{owner} prompt exceeds argv-safe limit "
        f"({encoded_length} bytes > {MAX_CLAUDE_PROMPT_ARG_BYTES}); use a smaller fixture prompt"
    )
|
|
638
|
+
|
|
639
|
+
|
|
473
640
|
def validate_variant_prompt_file_path(raw_path: str, *, owner: str) -> Path:
|
|
474
641
|
"""Return a safe relative prompt-file path, or fail before any file read."""
|
|
475
642
|
rel_path = Path(raw_path)
|
|
@@ -522,26 +689,28 @@ def read_variant_prompt_file(path: Path, *, owner: str, display_path: str | None
|
|
|
522
689
|
f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes: {label}"
|
|
523
690
|
)
|
|
524
691
|
try:
|
|
525
|
-
with os.fdopen(fd, "
|
|
692
|
+
with os.fdopen(fd, "rb") as handle:
|
|
526
693
|
fd = -1
|
|
527
|
-
|
|
528
|
-
except UnicodeDecodeError as exc:
|
|
529
|
-
raise SystemExit(
|
|
530
|
-
f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
|
|
531
|
-
f"{label}: {exc.reason}"
|
|
532
|
-
) from None
|
|
694
|
+
raw = handle.read(MAX_VARIANT_PROMPT_FILE_BYTES + 1)
|
|
533
695
|
except OSError as exc:
|
|
534
696
|
detail = exc.strerror or exc.__class__.__name__
|
|
535
697
|
raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
|
|
536
698
|
finally:
|
|
537
699
|
if fd != -1:
|
|
538
700
|
os.close(fd)
|
|
539
|
-
if len(
|
|
701
|
+
if len(raw) > MAX_VARIANT_PROMPT_FILE_BYTES:
|
|
540
702
|
raise SystemExit(
|
|
541
703
|
f"{owner} variant_prompt_files prompt text exceeds "
|
|
542
704
|
f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes after decoding: {label}"
|
|
543
705
|
)
|
|
544
|
-
|
|
706
|
+
try:
|
|
707
|
+
text = raw.decode("utf-8")
|
|
708
|
+
except UnicodeDecodeError as exc:
|
|
709
|
+
raise SystemExit(
|
|
710
|
+
f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
|
|
711
|
+
f"{label}: {exc.reason}"
|
|
712
|
+
) from None
|
|
713
|
+
return require_argv_safe_prompt(text, owner=f"{owner} variant_prompt_files")
|
|
545
714
|
|
|
546
715
|
|
|
547
716
|
def load_variant_prompt_files_for_targets(
|
|
@@ -977,7 +1146,11 @@ def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> l
|
|
|
977
1146
|
argv.extend(["--allowedTools", ",".join(task.allowed_tools)])
|
|
978
1147
|
argv.extend(variant.extra_args)
|
|
979
1148
|
argv.append("--")
|
|
980
|
-
|
|
1149
|
+
prompt = require_argv_safe_prompt(
|
|
1150
|
+
task.variant_prompt_texts.get(variant.name, task.prompt),
|
|
1151
|
+
owner=f"task {task.id} variant {variant.name}",
|
|
1152
|
+
)
|
|
1153
|
+
argv.append(prompt)
|
|
981
1154
|
return argv
|
|
982
1155
|
|
|
983
1156
|
|
|
@@ -1361,7 +1534,13 @@ def write_text_no_follow(path: Path, text: str) -> None:
|
|
|
1361
1534
|
os.close(fd)
|
|
1362
1535
|
|
|
1363
1536
|
|
|
1364
|
-
def append_cost_shift_ledger(
|
|
1537
|
+
def append_cost_shift_ledger(
|
|
1538
|
+
path: Path,
|
|
1539
|
+
claude_ver: str,
|
|
1540
|
+
result: RunResult,
|
|
1541
|
+
*,
|
|
1542
|
+
replay_provenance: dict[str, Any] | None = None,
|
|
1543
|
+
) -> None:
|
|
1365
1544
|
shifted_cost_known = cost_shift_measured(result)
|
|
1366
1545
|
byte_metrics_observed = bool(result.bytes_before or result.bytes_after)
|
|
1367
1546
|
payload = {
|
|
@@ -1412,6 +1591,10 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
|
|
|
1412
1591
|
}
|
|
1413
1592
|
if result.self_hosted_metrics is not None:
|
|
1414
1593
|
payload["self_hosted_metrics"] = result.self_hosted_metrics
|
|
1594
|
+
if replay_provenance is not None:
|
|
1595
|
+
payload["replay_provenance"] = replay_provenance
|
|
1596
|
+
payload["evidence_source_type"] = replay_provenance.get("evidence_source_type")
|
|
1597
|
+
payload["public_claim_eligible"] = bool(replay_provenance.get("public_claim_eligible"))
|
|
1415
1598
|
with csv_file_lock(path, create_parent=True):
|
|
1416
1599
|
fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
|
|
1417
1600
|
try:
|
|
@@ -1435,7 +1618,9 @@ def _read_existing_keys_unlocked(csv_path: Path) -> set[tuple[str, str]]:
|
|
|
1435
1618
|
reader = csv.DictReader(f)
|
|
1436
1619
|
fieldnames = list(reader.fieldnames) if reader.fieldnames is not None else None
|
|
1437
1620
|
validate_csv_schema(csv_path, fieldnames)
|
|
1438
|
-
for row in reader:
|
|
1621
|
+
for index, row in enumerate(reader, start=1):
|
|
1622
|
+
if index > MAX_CSV_ROWS:
|
|
1623
|
+
raise SystemExit(f"CSV row limit exceeded for {csv_path}: > {MAX_CSV_ROWS}")
|
|
1439
1624
|
tid = row.get("task_id") or ""
|
|
1440
1625
|
var = row.get("variant") or ""
|
|
1441
1626
|
if tid and var:
|
|
@@ -1487,6 +1672,356 @@ def read_csv_rows(csv_path: Path) -> list[dict[str, str]]:
|
|
|
1487
1672
|
os.close(fd)
|
|
1488
1673
|
|
|
1489
1674
|
|
|
1675
|
+
def file_has_content_no_follow(path: Path) -> bool:
    """Report whether *path* exists and has a non-zero size.

    The file is opened via ``_open_regular_no_symlink``; a missing file counts
    as having no content. Other errors from the open propagate.
    """
    try:
        descriptor = _open_regular_no_symlink(path)
    except FileNotFoundError:
        return False
    try:
        byte_count = os.fstat(descriptor).st_size
    finally:
        os.close(descriptor)
    return byte_count > 0
|
|
1684
|
+
|
|
1685
|
+
|
|
1686
|
+
def require_evidence_object(raw: Any, *, owner: str) -> dict[str, Any]:
    """Return *raw* itself when it is a dict.

    Raises:
        SystemExit: if *raw* is any other type; *owner* prefixes the message.
    """
    if isinstance(raw, dict):
        return raw
    raise SystemExit(f"{owner} evidence row must be a JSON object")
|
|
1690
|
+
|
|
1691
|
+
|
|
1692
|
+
def evidence_non_empty_string(raw: Any, *, field: str, owner: str, required: bool = True) -> str | None:
    """Validate an evidence string field and return its sanitized text.

    The value is passed through ``sanitize_note_text``. A missing (``None``)
    value, or one that is empty after sanitizing, yields ``None`` when the
    field is optional.

    Raises:
        SystemExit: if the value is not a string, or if it is missing/empty
            while ``required`` is true.
    """
    if raw is not None and not isinstance(raw, str):
        raise SystemExit(f"{owner} {field} must be a string")

    # A missing value is treated exactly like a value that sanitizes to nothing.
    cleaned = "" if raw is None else sanitize_note_text(raw)
    if cleaned:
        return cleaned
    if required:
        raise SystemExit(f"{owner} {field} must be a non-empty string")
    return None
|
|
1705
|
+
|
|
1706
|
+
|
|
1707
|
+
def evidence_bool(raw: Any, *, field: str, owner: str, default: bool = False) -> bool:
    """Return a strict boolean evidence field, or *default* when it is ``None``.

    Raises:
        SystemExit: if the value is present but not a ``bool`` (integers and
            strings are rejected).
    """
    if isinstance(raw, bool):
        return raw
    if raw is None:
        return default
    raise SystemExit(f"{owner} {field} must be a boolean")
|
|
1713
|
+
|
|
1714
|
+
|
|
1715
|
+
def evidence_nonnegative_int(
    raw: Any,
    *,
    field: str,
    owner: str,
    default: int = 0,
    maximum: int = MAX_USAGE_TOKEN_COUNT,
) -> int:
    """Validate an integer evidence field, returning *default* for ``None``.

    The value is normalized with ``normalize_usage_token`` and must not exceed
    *maximum*.

    Raises:
        SystemExit: if normalization yields ``None`` or the value is above
            *maximum*.
    """
    if raw is None:
        return default
    normalized = normalize_usage_token(raw)
    within_bounds = normalized is not None and normalized <= maximum
    if not within_bounds:
        raise SystemExit(f"{owner} {field} must be a finite non-negative integer")
    return normalized
|
|
1729
|
+
|
|
1730
|
+
|
|
1731
|
+
def evidence_nonnegative_float(
    raw: Any,
    *,
    field: str,
    owner: str,
    default: float = 0.0,
    maximum: float = MAX_USAGE_COST_USD,
) -> float:
    """Validate a numeric evidence field and return it as a ``float``.

    ``None`` yields *default*. Booleans are rejected even though ``bool`` is a
    subclass of ``int``.

    Raises:
        SystemExit: if the value is not an ``int``/``float``, is NaN or
            infinite, is negative, exceeds *maximum*, or is an integer too
            large to be represented as a float.
    """
    message = f"{owner} {field} must be a finite non-negative number"
    if raw is None:
        return default
    if isinstance(raw, bool) or not isinstance(raw, (int, float)):
        raise SystemExit(message)
    try:
        value = float(raw)
    except OverflowError:
        # JSON permits arbitrarily large integer literals; float() raises for
        # ones beyond the double range. Report that as a validation failure
        # rather than letting a traceback escape.
        raise SystemExit(message) from None
    if not math.isfinite(value) or value < 0 or value > maximum:
        raise SystemExit(message)
    return value
|
|
1747
|
+
|
|
1748
|
+
|
|
1749
|
+
def evidence_first(raw: dict[str, Any], *keys: str) -> Any:
    """Return the value of the first of *keys* present in *raw*, else ``None``.

    Presence is decided by key membership, so a key explicitly mapped to
    ``None`` wins over later keys.
    """
    return next((raw[name] for name in keys if name in raw), None)
|
|
1754
|
+
|
|
1755
|
+
|
|
1756
|
+
def parse_evidence_provenance(raw: dict[str, Any], *, owner: str) -> dict[str, Any]:
    """Extract and validate the provenance fields of an evidence row.

    Fields may sit in a nested ``provenance`` object or at the top level of the
    row. ``evidence_source_type`` and ``claim_scope`` are mandatory;
    ``provider_name`` and ``capture_command_or_export_id`` are optional.

    Returns:
        A dict with ``source_type``, ``provider_name``,
        ``capture_command_or_export_id``, ``claim_scope`` and
        ``provider_public_claim_authority``. The last is true only for a
        ``provider_export`` row that names a provider, carries a capture/export
        id, and has a claim scope in ``PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES``.

    Raises:
        SystemExit: if ``provenance`` is not an object, a field has the wrong
            type, a mandatory field is missing, or the source type is unknown.
    """
    block = raw.get("provenance")
    if not (block is None or isinstance(block, dict)):
        raise SystemExit(f"{owner} provenance must be a JSON object")
    nested = block if isinstance(block, dict) else None

    def with_top_level_fallback(name: str) -> Any:
        # Prefer the nested value; use the row itself when the key is absent.
        if nested is not None and name in nested:
            return nested.get(name)
        return raw.get(name)

    def without_fallback(name: str) -> Any:
        # NOTE(review): when a provenance object exists, these fields are read
        # from it only; a top-level value is ignored. This differs from the
        # fallback fields above - confirm whether that is intentional.
        holder = raw if nested is None else nested
        return holder.get(name)

    source_type = evidence_non_empty_string(
        with_top_level_fallback("evidence_source_type"),
        field="evidence_source_type",
        owner=owner,
    )
    assert source_type is not None
    if source_type not in EVIDENCE_REPLAY_SOURCE_TYPES:
        raise SystemExit(
            f"{owner} evidence_source_type must be one of: {', '.join(sorted(EVIDENCE_REPLAY_SOURCE_TYPES))}"
        )

    provider_name = evidence_non_empty_string(
        without_fallback("provider_name"),
        field="provider_name",
        owner=owner,
        required=False,
    )
    capture_id = evidence_non_empty_string(
        with_top_level_fallback("capture_command_or_export_id"),
        field="capture_command_or_export_id",
        owner=owner,
        required=False,
    )
    claim_scope = evidence_non_empty_string(
        without_fallback("claim_scope"),
        field="claim_scope",
        owner=owner,
    )
    assert claim_scope is not None

    provider_authority = all(
        (
            source_type == "provider_export",
            provider_name is not None,
            capture_id is not None,
            claim_scope in PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES,
        )
    )
    return {
        "source_type": source_type,
        "provider_name": provider_name,
        "capture_command_or_export_id": capture_id,
        "claim_scope": claim_scope,
        "provider_public_claim_authority": provider_authority,
    }
|
|
1806
|
+
|
|
1807
|
+
|
|
1808
|
+
def parse_evidence_tokens(raw: dict[str, Any], *, owner: str) -> tuple[dict[str, int], set[str]]:
    """Collect token counts for every bucket named in ``USAGE_KEY_GROUPS``.

    Each bucket is looked up in the nested ``tokens`` object first and at the
    top level of the row otherwise.

    Returns:
        ``(counts, seen)``: ``counts`` has an entry for every bucket
        (``evidence_nonnegative_int`` supplies its default for absent ones);
        ``seen`` holds only the buckets that carried a non-``None`` value.

    Raises:
        SystemExit: if ``tokens`` is present but not an object, or a count
            fails validation.
    """
    block = raw.get("tokens")
    if block is None:
        nested: dict[str, Any] = {}
    elif isinstance(block, dict):
        nested = block
    else:
        raise SystemExit(f"{owner} tokens must be a JSON object")

    counts: dict[str, int] = {}
    seen: set[str] = set()
    for bucket, _aliases in USAGE_KEY_GROUPS:
        candidate = nested[bucket] if bucket in nested else raw.get(bucket)
        if candidate is not None:
            seen.add(bucket)
        counts[bucket] = evidence_nonnegative_int(candidate, field=bucket, owner=owner)
    return counts, seen
|
|
1821
|
+
|
|
1822
|
+
|
|
1823
|
+
def parse_evidence_row(raw_value: Any, *, owner: str, line_number: int) -> EvidenceReplayRow:
    """Validate one decoded evidence JSONL value and build an EvidenceReplayRow.

    Args:
        raw_value: The decoded JSON value for the line; must be an object.
        owner: Label used as the prefix of every validation error.
        line_number: Line number recorded on the resulting row.

    Returns:
        The parsed row. ``public_claim_eligible`` is always ``False`` here.

    Raises:
        SystemExit: on any schema, type, range, or provenance violation.
    """
    raw = require_evidence_object(raw_value, owner=owner)
    schema = evidence_non_empty_string(raw.get("schema_version"), field="schema_version", owner=owner)
    if schema != BENCH_RUN_EVIDENCE_SCHEMA_VERSION:
        raise SystemExit(
            f"{owner} schema_version must be {BENCH_RUN_EVIDENCE_SCHEMA_VERSION}"
        )
    task_id = evidence_non_empty_string(raw.get("task_id"), field="task_id", owner=owner)
    variant = evidence_non_empty_string(raw.get("variant"), field="variant", owner=owner)
    assert task_id is not None and variant is not None
    provenance = parse_evidence_provenance(raw, owner=owner)
    provider_authority = bool(provenance["provider_public_claim_authority"])
    raw_primary_tokens_measured = evidence_bool(
        raw.get("primary_tokens_measured"),
        field="primary_tokens_measured",
        owner=owner,
    )
    raw_cost_measured = evidence_bool(
        evidence_first(raw, "cost_measured", "primary_cost_measured"),
        field="cost_measured",
        owner=owner,
    )
    if provenance["source_type"] in {"synthetic_fixture", "manual_audit"}:
        # Synthetic and manual rows never count as measured, whatever they claim.
        primary_tokens_measured = False
        cost_measured = False
    elif provider_authority:
        primary_tokens_measured = raw_primary_tokens_measured
        cost_measured = raw_cost_measured
    else:
        # provider_export without complete provenance may not assert measured data.
        if raw_primary_tokens_measured or raw_cost_measured:
            raise SystemExit(
                f"{owner} provider_export measured flags require provider_name, "
                "capture_command_or_export_id, and a provider-measured matched-task claim_scope"
            )
        primary_tokens_measured = False
        cost_measured = False

    tokens, observed_token_buckets = parse_evidence_tokens(raw, owner=owner)
    if primary_tokens_measured and not {"input_tokens", "output_tokens"}.issubset(observed_token_buckets):
        raise SystemExit(
            f"{owner} primary_tokens_measured=true requires input_tokens and output_tokens evidence"
        )
    raw_cost_usd = evidence_first(raw, "cost_usd", "primary_cost_usd")
    cost_usd = evidence_nonnegative_float(
        raw_cost_usd,
        field="cost_usd",
        owner=owner,
    )
    # Require an actual value, not just the key: a null cost would otherwise be
    # recorded as a measured $0. This matches the token check above, which only
    # counts buckets carrying a non-null value.
    if cost_measured and raw_cost_usd is None:
        raise SystemExit(f"{owner} cost_measured=true requires cost_usd evidence")

    # Reject both a missing key and an explicit null; a null must not silently
    # become ``False``.
    if raw.get("success") is None:
        raise SystemExit(f"{owner} success must be a boolean")
    success = evidence_bool(raw.get("success"), field="success", owner=owner)
    notes = evidence_non_empty_string(raw.get("notes"), field="notes", owner=owner, required=False)
    explicit_notes = notes is not None
    model = evidence_non_empty_string(raw.get("model"), field="model", owner=owner, required=False) or "evidence-replay"
    effort = evidence_non_empty_string(raw.get("effort"), field="effort", owner=owner, required=False) or ""
    self_hosted_metrics = None
    if SELF_HOSTED_METRICS_KEY in raw:
        self_hosted_metrics = normalize_self_hosted_metrics(
            raw.get(SELF_HOSTED_METRICS_KEY),
            source="evidence_jsonl.self_hosted_metrics",
        )
        if self_hosted_metrics is None:
            raise SystemExit(f"{owner} self_hosted_metrics must be normalized explicit metrics")

    result = RunResult(
        task_id=task_id,
        variant=variant,
        model=model,
        effort=effort,
        tokens=tokens,
        cost_usd=cost_usd,
        success=success,
        notes=notes or f"evidence replay ({provenance['source_type']})",
        corrections=evidence_nonnegative_int(raw.get("corrections"), field="corrections", owner=owner),
        cost_measured=cost_measured,
        wall_time_seconds=evidence_nonnegative_float(
            raw.get("wall_time_seconds"),
            field="wall_time_seconds",
            owner=owner,
            # The latency cap is defined in milliseconds; convert to seconds.
            maximum=MAX_SELF_HOSTED_LATENCY_MS / 1000,
        ),
        turns=evidence_nonnegative_int(raw.get("turns"), field="turns", owner=owner),
        hook_triggers=evidence_nonnegative_int(raw.get("hook_triggers"), field="hook_triggers", owner=owner),
        bytes_before=evidence_nonnegative_int(raw.get("bytes_before"), field="bytes_before", owner=owner),
        bytes_after=evidence_nonnegative_int(raw.get("bytes_after"), field="bytes_after", owner=owner),
        artifacts_used=evidence_nonnegative_int(raw.get("artifacts_used"), field="artifacts_used", owner=owner),
        external_tokens=evidence_nonnegative_int(raw.get("external_tokens"), field="external_tokens", owner=owner),
        external_tokens_measured=evidence_bool(
            raw.get("external_tokens_measured"),
            field="external_tokens_measured",
            owner=owner,
        ),
        external_cost_usd=evidence_nonnegative_float(
            raw.get("external_cost_usd"),
            field="external_cost_usd",
            owner=owner,
        ),
        external_cost_measured=evidence_bool(
            raw.get("external_cost_measured"),
            field="external_cost_measured",
            owner=owner,
        ),
        provider_cached_tokens=evidence_nonnegative_int(
            raw.get("provider_cached_tokens"),
            field="provider_cached_tokens",
            owner=owner,
        ),
        provider_cached_tokens_measured=evidence_bool(
            raw.get("provider_cached_tokens_measured"),
            field="provider_cached_tokens_measured",
            owner=owner,
        ),
        primary_tokens_measured=primary_tokens_measured,
        self_hosted_metrics=self_hosted_metrics,
    )
    return EvidenceReplayRow(
        result=result,
        source_type=str(provenance["source_type"]),
        provider_name=provenance["provider_name"],
        capture_command_or_export_id=provenance["capture_command_or_export_id"],
        claim_scope=str(provenance["claim_scope"]),
        provider_export_provenance_complete=provider_authority,
        public_claim_eligible=False,
        explicit_notes=explicit_notes,
        line_number=line_number,
    )
|
|
1951
|
+
|
|
1952
|
+
|
|
1953
|
+
def read_evidence_jsonl(path: Path) -> list[EvidenceReplayRow]:
    """Load and validate every evidence row from a JSONL replay file.

    The file is opened via ``_open_regular_no_symlink`` (presumably rejecting
    symlinks and non-regular files -- confirm against that helper). Blank
    lines are skipped; every other line must be a JSON document, which is
    handed to ``parse_evidence_row`` together with its 1-based line number.

    Args:
        path: Location of the evidence JSONL file.

    Returns:
        The parsed rows in file order (never empty).

    Raises:
        SystemExit: if the file is larger than ``MAX_EVIDENCE_JSONL_BYTES``,
            has more than ``MAX_EVIDENCE_JSONL_LINES`` lines, is not valid
            UTF-8, contains a non-JSON line, or yields no rows at all.
    """
    fd = _open_regular_no_symlink(path)
    try:
        size = os.fstat(fd).st_size
        if size > MAX_EVIDENCE_JSONL_BYTES:
            raise SystemExit(
                f"evidence JSONL exceeds {MAX_EVIDENCE_JSONL_BYTES} bytes: {path}"
            )
        rows: list[EvidenceReplayRow] = []
        with os.fdopen(fd, "r", encoding="utf-8") as handle:
            # The file object owns the descriptor from here on; clearing fd
            # keeps the finally clause from closing it a second time.
            fd = -1
            try:
                for line_number, line in enumerate(handle, start=1):
                    if line_number > MAX_EVIDENCE_JSONL_LINES:
                        raise SystemExit(
                            f"evidence JSONL line limit exceeded for {path}: > {MAX_EVIDENCE_JSONL_LINES}"
                        )
                    if not line.strip():
                        continue
                    try:
                        payload = json.loads(line)
                    except json.JSONDecodeError as exc:
                        raise SystemExit(
                            f"{path}:{line_number} evidence row must be JSON: {exc.msg}"
                        ) from None
                    rows.append(parse_evidence_row(payload, owner=f"{path}:{line_number}", line_number=line_number))
            except UnicodeDecodeError as exc:
                # Decoding happens lazily while iterating the handle, so bad
                # bytes surface here; report them like every other input
                # error instead of letting a traceback escape.
                raise SystemExit(
                    f"evidence JSONL must be valid UTF-8: {path}: {exc.reason}"
                ) from None
    finally:
        if fd != -1:
            os.close(fd)
    if not rows:
        raise SystemExit(f"evidence JSONL contains no rows: {path}")
    return rows
|
|
1984
|
+
|
|
1985
|
+
|
|
1986
|
+
def validate_evidence_coverage(
    evidence_rows: list[EvidenceReplayRow],
    runnable_targets: list[tuple[TaskFixture, Variant]],
) -> dict[tuple[str, str], EvidenceReplayRow]:
    """Map every selected (task id, variant name) target to its evidence row.

    Raises:
        SystemExit: if two evidence rows share the same key, or if any
            selected target has no evidence row.
    """
    indexed: dict[tuple[str, str], EvidenceReplayRow] = {}
    for candidate in evidence_rows:
        key = candidate.key
        if key in indexed:
            raise SystemExit(
                f"duplicate evidence row for {key[0]}/{key[1]} "
                f"(lines {indexed[key].line_number} and {candidate.line_number})"
            )
        indexed[key] = candidate

    wanted = [(task.id, variant.name) for task, variant in runnable_targets]
    absent = [
        f"{task_id}/{variant_name}"
        for task_id, variant_name in wanted
        if (task_id, variant_name) not in indexed
    ]
    if absent:
        raise SystemExit(f"missing evidence row(s) for selected targets: {', '.join(absent)}")

    # Rows that match no selected target are dropped from the result.
    return {target: indexed[target] for target in wanted}
|
|
2009
|
+
|
|
2010
|
+
|
|
2011
|
+
def run_evidence_fixture(task: TaskFixture, variant: Variant, evidence: EvidenceReplayRow) -> RunResult:
    """Return the replayed result for ``task``/``variant``, filling defaults.

    The result object carried by ``evidence`` is updated in place: a model of
    ``"evidence-replay"`` is replaced by the task's model, and an empty effort
    is replaced by the task's effort (or ``""``).

    Raises:
        SystemExit: if the evidence row belongs to a different task/variant.
    """
    replayed = evidence.result
    same_target = replayed.task_id == task.id and replayed.variant == variant.name
    if not same_target:
        raise SystemExit(
            f"evidence target mismatch: expected {task.id}/{variant.name}, "
            f"got {replayed.task_id}/{replayed.variant}"
        )
    if replayed.model == "evidence-replay":
        replayed.model = task.model
    if not replayed.effort:
        replayed.effort = task.effort or ""
    return replayed
|
|
2023
|
+
|
|
2024
|
+
|
|
1490
2025
|
def row_int(row: dict[str, str], key: str) -> int:
|
|
1491
2026
|
try:
|
|
1492
2027
|
return int(float(row.get(key) or 0))
|
|
@@ -1546,6 +2081,77 @@ def row_cost_shift_measured(row: dict[str, str]) -> bool:
|
|
|
1546
2081
|
)
|
|
1547
2082
|
|
|
1548
2083
|
|
|
2084
|
+
def measurement_baseline_contract() -> dict[str, Any]:
    """Return the descriptive measurement-baseline block embedded in reports.

    The block only documents which fields are captured, which conditions gate
    savings claims, and which values are proxies. It leaves the CSV schema
    untouched and never authorizes a token or cost savings claim by itself.
    """
    captured_fields = {
        "task_identity": ["task_id", "variant"],
        "run_configuration": ["model", "effort", "claude_version"],
        "primary_token_buckets": [
            "input_tokens",
            "output_tokens",
            "cache_read",
            "cache_creation",
            "total_tokens",
            "primary_tokens_measured",
        ],
        "primary_cost": ["cost_usd", "cost_measured"],
        "provider_cache_telemetry": ["provider_cached_tokens", "provider_cached_tokens_measured"],
        "latency": ["wall_time_seconds"],
        "quality_and_result": ["success", "corrections", "notes"],
        "tooling_and_proxy_metrics": ["turns", "hook_triggers", "bytes_before", "bytes_after", "artifacts_used"],
        "shifted_cost_accounting": [
            "external_tokens",
            "external_tokens_measured",
            "external_cost_usd",
            "external_cost_measured",
            "total_cost_with_shift_usd",
        ],
    }
    claim_eligible_fields = {
        "token_savings": [
            "matched successful baseline and variant tasks",
            "primary_tokens_measured=true on both sides",
            "quality_gate=pass",
        ],
        "shifted_cost_savings": [
            "matched successful baseline and variant tasks",
            "cost_measured=true on both sides",
            "external_cost_measured=true when external_tokens are present",
            "quality_gate=pass",
        ],
    }
    proxy_only_fields = {
        "byte_metrics": ["bytes_before", "bytes_after"],
        "token_proxy": "chars_div_4_proxy_only",
        "provider_cache": "diagnostic_telemetry_not_contextguard_token_reduction",
    }
    missing_future_run_identity_fields = [
        "repo_revision",
        "agent_harness",
        "feature_flags",
        "provider_name",
        "success_command_identity",
    ]
    claim_boundary = {
        "descriptive_contract_only": True,
        "enables_savings_claims_by_itself": False,
        "requires_matched_successful_tasks": True,
        "requires_shifted_cost_accounting_for_cost_claims": True,
        "raw_proxy_estimates_are_not_hosted_api_token_savings": True,
    }
    # Key order is part of the emitted report layout; keep it stable.
    return {
        "schema_version": MEASUREMENT_BASELINE_SCHEMA_VERSION,
        "csv_schema_unchanged": True,
        "csv_columns": list(CSV_COLUMNS),
        "captured_fields": captured_fields,
        "claim_eligible_fields": claim_eligible_fields,
        "proxy_only_fields": proxy_only_fields,
        "missing_future_run_identity_fields": missing_future_run_identity_fields,
        "claim_boundary": claim_boundary,
    }
|
|
2153
|
+
|
|
2154
|
+
|
|
1549
2155
|
def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str) -> dict[str, Any]:
|
|
1550
2156
|
by_variant: dict[str, dict[str, Any]] = {}
|
|
1551
2157
|
successful_rows_by_variant_task: dict[str, dict[str, list[dict[str, str]]]] = {}
|
|
@@ -2187,10 +2793,11 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
2187
2793
|
claim_status = "token_savings_observed_cost_unmeasured"
|
|
2188
2794
|
elif token_savings_observed:
|
|
2189
2795
|
claim_status = "token_savings_observed_cost_shift_watch"
|
|
2190
|
-
|
|
2796
|
+
report = {
|
|
2191
2797
|
"schema": "context-guard-bench-report-v1",
|
|
2192
2798
|
"baseline_variant": baseline_variant,
|
|
2193
2799
|
"row_count": len(rows),
|
|
2800
|
+
"measurement_baseline": measurement_baseline_contract(),
|
|
2194
2801
|
"summary_by_variant": by_variant,
|
|
2195
2802
|
"comparisons": comparisons,
|
|
2196
2803
|
"matched_pair_evidence": matched_pair_evidence,
|
|
@@ -2200,22 +2807,854 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
|
|
|
2200
2807
|
"shifted cost savings require measured primary cost and measured external cost when "
|
|
2201
2808
|
"external tokens are present. Wall time and provider cached-token fields are diagnostic "
|
|
2202
2809
|
"telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache "
|
|
2203
|
-
"discounts must stay separate from token-reduction claims."
|
|
2810
|
+
"discounts must stay separate from token-reduction claims. Public hosted savings "
|
|
2811
|
+
"claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden."
|
|
2204
2812
|
),
|
|
2205
2813
|
}
|
|
2814
|
+
report["public_claim_readiness"] = build_public_claim_readiness(report)
|
|
2815
|
+
report["default_matrix"] = build_default_matrix(report)
|
|
2816
|
+
return report
|
|
2817
|
+
|
|
2818
|
+
def annotate_replay_report(
    report: dict[str, Any],
    replay_rows: list[EvidenceReplayRow],
    *,
    mixed_csv: bool,
) -> dict[str, Any]:
    """Stamp replay provenance and the derived public-claim status on ``report``.

    ``report`` is mutated in place and returned. The public-claim status is a
    candidate only when the replay covers the whole report (not a mixed CSV
    and row counts agree), every replay row has complete provider-export
    provenance, the raw claim status is in
    ``REPLAY_PUBLIC_CLAIM_ELIGIBLE_RAW_STATUSES``, and every matched pair
    allows both token-savings and shifted-cost claims. When the report is not
    eligible, ``claim_status`` is overwritten with the public-claim status;
    the original value stays available as ``raw_metric_claim_status``.
    """

    def pair_allows_both_claims(item: Any) -> bool:
        # A matched pair qualifies only if its claim boundary is a dict that
        # allows both the token-savings and the shifted-cost claim.
        if not isinstance(item, dict):
            return False
        boundary = item.get("claim_boundary")
        if not isinstance(boundary, dict):
            return False
        return bool(boundary.get("token_savings_claim_allowed")) and bool(
            boundary.get("shifted_cost_claim_allowed")
        )

    source_types = sorted({row.source_type for row in replay_rows})
    provider_names = sorted({row.provider_name for row in replay_rows if row.provider_name})
    claim_scopes = sorted({row.claim_scope for row in replay_rows})

    same_run_complete = False
    if not mixed_csv:
        same_run_complete = len(replay_rows) == int(report.get("row_count") or 0)

    all_provider_claim_authority = bool(replay_rows) and all(
        row.provider_export_provenance_complete for row in replay_rows
    )

    raw_claim_status = str(report.get("claim_status") or "")
    matched_pair_evidence = report.get("matched_pair_evidence")
    matched_claim_gates_allow_public_claim = (
        isinstance(matched_pair_evidence, list)
        and bool(matched_pair_evidence)
        and all(pair_allows_both_claims(item) for item in matched_pair_evidence)
    )
    report_claim_gates_allow_public_claim = (
        raw_claim_status in REPLAY_PUBLIC_CLAIM_ELIGIBLE_RAW_STATUSES
        and matched_claim_gates_allow_public_claim
    )

    # Only the "candidate" outcome is eligible; every other branch blocks.
    public_claim_eligible = False
    if not same_run_complete:
        public_claim_status = REPLAY_UNKNOWN_MIXED_CSV_STATUS
    elif not all_provider_claim_authority:
        public_claim_status = REPLAY_NOT_PUBLIC_CLAIM_STATUS
    elif report_claim_gates_allow_public_claim:
        public_claim_status = REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
        public_claim_eligible = True
    else:
        public_claim_status = REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS

    report.update(
        {
            "raw_metric_claim_status": raw_claim_status,
            "public_claim_status": public_claim_status,
            "public_claim_eligible": public_claim_eligible,
        }
    )
    if not public_claim_eligible:
        report["claim_status"] = public_claim_status

    target_keys = [f"{row.result.task_id}/{row.result.variant}" for row in replay_rows]
    report["replay_evidence"] = {
        "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
        "mode": "evidence_jsonl_replay",
        "row_count": len(replay_rows),
        "source_types": source_types,
        "provider_names": provider_names,
        "claim_scopes": claim_scopes,
        "same_run_complete": same_run_complete,
        "mixed_csv": mixed_csv,
        "provider_export_provenance_complete": all_provider_claim_authority,
        "report_claim_gates_allow_public_claim": report_claim_gates_allow_public_claim,
        "public_claim_status": public_claim_status,
        "public_claim_eligible": public_claim_eligible,
        "target_keys": target_keys,
        "claim_boundary": REPLAY_CLAIM_BOUNDARY,
    }
    # Readiness and the default matrix are rebuilt last so they see the
    # replay annotations written above.
    report["public_claim_readiness"] = build_public_claim_readiness(
        report,
        replay_rows=replay_rows,
        mixed_csv=mixed_csv,
    )
    report["default_matrix"] = build_default_matrix(report)
    return report
|
|
2888
|
+
|
|
2889
|
+
|
|
2890
|
+
def report_public_claim_status(report: dict[str, Any]) -> tuple[str, bool | None]:
    """Return the report's public-claim status and eligibility flag.

    A report without a ``public_claim_status`` key yields the
    "provenance unknown" status with eligibility ``None``.
    """
    if "public_claim_status" not in report:
        return (
            "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
            None,
        )
    status = str(report.get("public_claim_status"))
    eligible = bool(report.get("public_claim_eligible"))
    return status, eligible
|
|
2897
|
+
|
|
2898
|
+
|
|
2899
|
+
|
|
2900
|
+
def public_claim_readiness_gate(
    gate_id: str,
    label: str,
    passed: bool,
    reason: str,
    evidence: dict[str, Any] | None = None,
    *,
    unknown: bool = False,
) -> dict[str, Any]:
    """Build one readiness-gate record.

    ``status`` is ``"unknown"`` when ``unknown`` is set, otherwise ``"pass"``
    or ``"fail"`` according to ``passed``. An unknown gate never counts as
    passed. A missing or empty ``evidence`` becomes ``{}``.
    """
    if unknown:
        status = "unknown"
    elif passed:
        status = "pass"
    else:
        status = "fail"

    gate: dict[str, Any] = {"id": gate_id, "label": label, "required": True}
    gate["status"] = status
    gate["passed"] = passed and not unknown
    gate["reason"] = reason
    gate["evidence"] = evidence or {}
    return gate
|
|
2919
|
+
|
|
2920
|
+
|
|
2921
|
+
def public_claim_pair_side_measured(pair: dict[str, Any], side: str, metric: str) -> bool:
    """Report whether ``pair["measurements"][side][metric]["measured"]`` is truthy.

    Any missing or non-dict level along that path counts as not measured.
    """
    node: Any = pair
    for key in ("measurements", side, metric):
        child = node.get(key)
        if not isinstance(child, dict):
            return False
        node = child
    return bool(node.get("measured"))
|
|
2926
|
+
|
|
2927
|
+
|
|
2928
|
+
def public_claim_numeric_values(items: list[Any]) -> list[float]:
    """Return the finite int/float entries of ``items`` as floats, in order.

    Booleans, non-numeric values, NaN and infinities are dropped.
    """
    as_floats = (
        float(item)
        for item in items
        if isinstance(item, (int, float)) and not isinstance(item, bool)
    )
    return [number for number in as_floats if math.isfinite(number)]
|
|
2937
|
+
|
|
2938
|
+
|
|
2939
|
+
def public_claim_readiness_evidence_text(evidence: dict[str, Any]) -> str:
    """Render a gate's evidence dict as a compact ``key=value; ...`` string.

    Lists and dicts show at most their first five entries, comma-separated,
    with a trailing ``,…`` marker when more entries exist.
    """

    def render(value: Any) -> str:
        if isinstance(value, list):
            shown = [str(item) for item in value[:5]]
        elif isinstance(value, dict):
            shown = [f"{k}={v}" for k, v in list(value.items())[:5]]
        else:
            return str(value)
        text = ",".join(shown)
        if len(value) > 5:
            text += ",…"
        return text

    return "; ".join(f"{key}={render(value)}" for key, value in evidence.items())
|
|
2954
|
+
|
|
2955
|
+
|
|
2956
|
+
def build_public_claim_readiness(
    report: dict[str, Any],
    *,
    replay_rows: list[EvidenceReplayRow] | None = None,
    mixed_csv: bool = False,
) -> dict[str, Any]:
    """Evaluate the required public-claim readiness gates for ``report``.

    Six gates are built, in this order: matched successful tasks,
    provider-measured token/primary cost, quality non-inferiority,
    shifted-cost accounting, confidence/failure notes, and provider-export
    provenance. The last two are reported as ``unknown`` when no replay rows
    are supplied.

    Args:
        report: Benchmark report; ``comparisons``, ``matched_pair_evidence``,
            ``row_count``, ``replay_evidence`` and the claim-status keys are
            read from it. The report itself is not modified here.
        replay_rows: Evidence rows replayed for this report, if any.
        mixed_csv: Whether the CSV mixes replayed rows with other rows.

    Returns:
        A readiness block with the overall ``status``/``reason``, the
        ``claim_allowed`` flag, the individual ``gates`` and the ids of the
        gates that block a claim.
    """
    # Tolerate malformed reports: non-list containers become empty and
    # non-dict entries are ignored.
    comparisons = report.get("comparisons") if isinstance(report.get("comparisons"), list) else []
    comparisons = [item for item in comparisons if isinstance(item, dict)]
    pairs = report.get("matched_pair_evidence") if isinstance(report.get("matched_pair_evidence"), list) else []
    pairs = [item for item in pairs if isinstance(item, dict)]
    row_count = int(report.get("row_count") or 0)
    replay_evidence = report.get("replay_evidence") if isinstance(report.get("replay_evidence"), dict) else {}
    replay_count = len(replay_rows or [])
    public_claim_status, public_claim_eligible = report_public_claim_status(report)
    raw_metric_claim_status = report.get("raw_metric_claim_status", report.get("claim_status"))

    # Gate 1: every comparison needs numeric, positive matched and baseline
    # success counts, and no task may lack a successful baseline.
    comparison_variants = [str(item.get("variant")) for item in comparisons if item.get("variant")]
    matched_counts = public_claim_numeric_values([
        item.get("matched_successful_task_count") for item in comparisons
    ])
    missing_baseline_successes = [
        task
        for item in comparisons
        for task in (item.get("missing_baseline_success_tasks") or [])
    ]
    baseline_success_counts = public_claim_numeric_values([
        item.get("baseline_successful_task_count") for item in comparisons
    ])
    matched_tasks_pass = (
        bool(comparisons)
        and bool(pairs)
        # A length mismatch means some comparison had a missing/non-numeric count.
        and len(matched_counts) == len(comparisons)
        and all(value > 0 for value in matched_counts)
        and len(baseline_success_counts) == len(comparisons)
        and all(value > 0 for value in baseline_success_counts)
        and not missing_baseline_successes
    )
    gates = [
        public_claim_readiness_gate(
            "matched_successful_tasks",
            "Matched successful tasks",
            matched_tasks_pass,
            "matched_successful_tasks_present" if matched_tasks_pass else "missing_or_regressed_matched_successful_tasks",
            {
                "comparison_count": len(comparisons),
                "matched_pair_count": len(pairs),
                "variants": comparison_variants[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
                "min_matched_successful_task_count": min(matched_counts) if matched_counts else None,
                "missing_baseline_success_task_count": len(missing_baseline_successes),
            },
        )
    ]

    # Gate 2: primary tokens and primary cost must be flagged as measured on
    # both sides of every matched pair.
    provider_measured_token_cost_pass = bool(pairs) and all(
        public_claim_pair_side_measured(pair, "baseline", "primary_tokens")
        and public_claim_pair_side_measured(pair, "variant", "primary_tokens")
        and public_claim_pair_side_measured(pair, "baseline", "primary_cost_usd")
        and public_claim_pair_side_measured(pair, "variant", "primary_cost_usd")
        for pair in pairs
    )
    gates.append(public_claim_readiness_gate(
        "provider_measured_token_cost",
        "Provider-measured token and primary cost",
        provider_measured_token_cost_pass,
        "provider_measured_primary_tokens_and_cost" if provider_measured_token_cost_pass else "missing_provider_measured_primary_tokens_or_cost",
        {
            "matched_pair_count": len(pairs),
            "required_fields": [
                "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
                "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
                "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
                "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured",
            ],
        },
    ))

    # Gate 3: every comparison's quality_gate must equal "pass"; the deltas
    # are collected only as supporting evidence.
    quality_gates = sorted({str(item.get("quality_gate") or "unknown") for item in comparisons})
    failure_deltas = public_claim_numeric_values([
        item.get("failure_rate_delta_pp") for item in comparisons
    ])
    correction_deltas = public_claim_numeric_values([
        item.get("corrections_delta_per_successful_task") for item in comparisons
    ])
    quality_pass = bool(comparisons) and all(item.get("quality_gate") == "pass" for item in comparisons)
    gates.append(public_claim_readiness_gate(
        "quality_non_inferiority",
        "Quality non-inferiority",
        quality_pass,
        "all_quality_gates_pass" if quality_pass else "quality_gate_not_pass",
        {
            "quality_gates": quality_gates,
            "max_failure_rate_delta_pp": max(failure_deltas) if failure_deltas else None,
            "max_corrections_delta_per_successful_task": max(correction_deltas) if correction_deltas else None,
        },
    ))

    # Gate 4: each pair must allow the shifted-cost claim and have the
    # shift-inclusive total cost measured on both sides.
    shifted_cost_pass = bool(pairs) and all(
        isinstance(pair.get("claim_boundary"), dict)
        and bool((pair.get("claim_boundary") or {}).get("shifted_cost_claim_allowed"))
        and public_claim_pair_side_measured(pair, "baseline", "total_cost_with_shift_usd")
        and public_claim_pair_side_measured(pair, "variant", "total_cost_with_shift_usd")
        for pair in pairs
    )
    gates.append(public_claim_readiness_gate(
        "shifted_cost_accounting",
        "Shifted-cost accounting",
        shifted_cost_pass,
        "shifted_cost_claim_gates_pass" if shifted_cost_pass else "missing_shifted_cost_claim_accounting",
        {
            "matched_pair_count": len(pairs),
            "required_fields": [
                "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
                "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
                "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured",
            ],
        },
    ))

    # Gate 5: every replay row (failed rows included) needs explicit notes,
    # and every comparison must carry the failure-rate fields.
    has_replay = replay_rows is not None and bool(replay_rows)
    explicit_note_count = sum(1 for row in (replay_rows or []) if row.explicit_notes)
    failed_rows = [row for row in (replay_rows or []) if not row.result.success]
    failed_rows_with_notes = sum(1 for row in failed_rows if row.explicit_notes)
    comparison_failure_fields_present = bool(comparisons) and all(
        "baseline_failure_rate" in item
        and "variant_failure_rate" in item
        and "failure_rate_delta_pp" in item
        and "paired_corrections_task_count" in item
        for item in comparisons
    )
    confidence_notes_pass = (
        has_replay
        and explicit_note_count == replay_count
        and failed_rows_with_notes == len(failed_rows)
        and comparison_failure_fields_present
    )
    gates.append(public_claim_readiness_gate(
        "confidence_failure_notes",
        "Confidence and failure notes",
        confidence_notes_pass,
        "explicit_replay_notes_and_failure_rate_evidence_present" if confidence_notes_pass else "missing_explicit_replay_notes_or_failure_evidence",
        {
            "replay_row_count": replay_count,
            "explicit_note_count": explicit_note_count,
            "failed_row_count": len(failed_rows),
            "failed_rows_with_notes": failed_rows_with_notes,
            "comparison_failure_fields_present": comparison_failure_fields_present,
        },
        unknown=not has_replay,
    ))

    # Gate 6: prefer the same_run_complete value already recorded in
    # report["replay_evidence"]; otherwise derive it from the arguments.
    same_run_complete = bool(replay_evidence.get("same_run_complete")) if replay_evidence else (
        has_replay and not mixed_csv and replay_count == row_count
    )
    source_types = sorted({row.source_type for row in (replay_rows or [])})
    provider_names = sorted({row.provider_name for row in (replay_rows or []) if row.provider_name})
    provider_export_pass = (
        has_replay
        and not mixed_csv
        and same_run_complete
        and replay_count == row_count
        and all(row.provider_export_provenance_complete for row in (replay_rows or []))
    )
    gates.append(public_claim_readiness_gate(
        "provider_export_provenance",
        "Provider-export provenance",
        provider_export_pass,
        "complete_provider_export_same_run_provenance" if provider_export_pass else "missing_or_mixed_provider_export_provenance",
        {
            "replay_row_count": replay_count,
            "report_row_count": row_count,
            "mixed_csv": mixed_csv,
            "same_run_complete": same_run_complete,
            "source_types": source_types,
            "provider_names": provider_names[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
        },
        unknown=not has_replay,
    ))

    # A claim needs every gate to pass AND the report's own public-claim
    # status to already be the eligible "candidate" status.
    passed_required_gate_count = sum(1 for gate in gates if gate["passed"])
    blocking_gate_ids = [str(gate["id"]) for gate in gates if not gate["passed"]]
    required_gates_pass = passed_required_gate_count == len(gates)
    claim_allowed = (
        required_gates_pass
        and public_claim_status == REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
        and bool(public_claim_eligible)
    )
    if claim_allowed:
        readiness_status = REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
        reason = "all_required_public_claim_gates_pass"
    elif not has_replay:
        readiness_status = "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
        reason = "replay_evidence_required_for_public_claim"
    elif provider_export_pass:
        readiness_status = REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS
        reason = "provider_export_present_but_readiness_gates_failed"
    else:
        readiness_status = "public_claim_blocked"
        reason = "unsupported_public_savings_claim_forbidden"

    return {
        "schema_version": PUBLIC_CLAIM_READINESS_SCHEMA_VERSION,
        "generated_from": "matched_pair_evidence_and_replay_provenance",
        "status": readiness_status,
        "reason": reason,
        "claim_allowed": claim_allowed,
        "public_claim_status_observed": public_claim_status,
        "public_claim_eligible_observed": public_claim_eligible,
        "raw_metric_claim_status_observed": raw_metric_claim_status,
        "required_gate_ids": list(PUBLIC_CLAIM_READINESS_GATE_IDS),
        "required_gate_count": len(gates),
        "passed_required_gate_count": passed_required_gate_count,
        "blocking_gate_ids": blocking_gate_ids,
        "gates": gates,
        "claim_boundary": PUBLIC_CLAIM_READINESS_CLAIM_BOUNDARY,
    }
|
|
3171
|
+
|
|
3172
|
+
|
|
3173
|
+
def default_matrix_normalized_key(value: Any) -> str:
    """Lower-case ``value`` and collapse non-alphanumeric runs into ``_``.

    Falsy values normalize to ``""``; leading and trailing underscores are
    stripped from the result.
    """
    lowered = str(value or "").lower()
    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
    return collapsed.strip("_")
|
|
3176
|
+
|
|
3177
|
+
|
|
3178
|
+
def default_matrix_contains_key(haystack: str, needle: str) -> bool:
    """Report whether the normalized ``needle`` occurs as a substring of ``haystack``.

    ``haystack`` is used as given (only ``needle`` is normalized here); a
    needle that normalizes to the empty string never matches.
    """
    normalized = default_matrix_normalized_key(needle)
    return bool(normalized) and normalized in haystack
|
|
3183
|
+
|
|
3184
|
+
|
|
3185
|
+
def infer_default_matrix_lanes(pair: dict[str, Any]) -> list[tuple[str, str]]:
    """List the default-matrix lanes that ``pair`` appears to belong to.

    For each lane in ``DEFAULT_MATRIX_LANES`` a task-id keyword hit is
    recorded as ``"exact_key"``; failing that, a variant keyword hit is
    recorded as ``"name_heuristic"``. Lanes with neither hit are omitted.
    """
    task_key = default_matrix_normalized_key(pair.get("task_id"))
    variant_key = default_matrix_normalized_key(pair.get("variant"))
    found: list[tuple[str, str]] = []
    for lane in DEFAULT_MATRIX_LANES:
        lane_id = str(lane["id"])
        task_words = tuple(str(word) for word in lane.get("task_keywords", ()))
        variant_words = tuple(str(word) for word in lane.get("variant_keywords", ()))
        # Task-id keywords take precedence over variant-name keywords.
        if any(default_matrix_contains_key(task_key, word) for word in task_words):
            method = "exact_key"
        elif any(default_matrix_contains_key(variant_key, word) for word in variant_words):
            method = "name_heuristic"
        else:
            continue
        found.append((lane_id, method))
    return found
|
|
3198
|
+
|
|
3199
|
+
|
|
3200
|
+
def default_matrix_number(value: Any) -> float | None:
    """Return ``value`` as a float when it is a finite int/float, else ``None``.

    Booleans are rejected even though they are ints.
    """
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    if is_number:
        as_float = float(value)
        if math.isfinite(as_float):
            return as_float
    return None
|
|
3207
|
+
|
|
3208
|
+
|
|
3209
|
+
def default_matrix_unique(values: list[Any]) -> list[Any]:
    """Return ``values`` without duplicates, keeping first-seen order.

    Equality (not hashing) decides duplicates, so unhashable items are fine.
    """
    kept: list[Any] = []
    for candidate in values:
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept
|
|
3215
|
+
|
|
3216
|
+
|
|
3217
|
+
def default_matrix_cap(values: list[Any]) -> list[Any]:
    """De-duplicate ``values`` and keep at most ``MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS``."""
    deduplicated = default_matrix_unique(values)
    return deduplicated[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS]
|
|
3219
|
+
|
|
3220
|
+
|
|
3221
|
+
def default_matrix_lane_match_method(methods: set[str]) -> str:
    """Pick the strongest match method present in ``methods``.

    ``"exact_key"`` outranks ``"name_heuristic"``; if neither is present the
    result is ``"absent"``.
    """
    for candidate in ("exact_key", "name_heuristic"):
        if candidate in methods:
            return candidate
    return "absent"
|
|
3227
|
+
|
|
3228
|
+
|
|
3229
|
+
def default_matrix_clamp_classification(classification: str, ceiling: str) -> tuple[str, bool]:
    """Cap ``classification`` at the lane's policy ``ceiling``.

    Returns the resulting classification and whether it was clamped.
    ``"reject/rework"`` and ceilings missing from
    ``DEFAULT_MATRIX_CLASSIFICATION_STRENGTH`` are passed through untouched;
    an unrecognized classification is treated as strength 0.
    """
    unchanged = (classification, False)
    if classification == "reject/rework" or ceiling not in DEFAULT_MATRIX_CLASSIFICATION_STRENGTH:
        return unchanged
    strength = DEFAULT_MATRIX_CLASSIFICATION_STRENGTH.get(classification, 0)
    limit = DEFAULT_MATRIX_CLASSIFICATION_STRENGTH[ceiling]
    return (ceiling, True) if strength > limit else unchanged
|
|
3239
|
+
|
|
3240
|
+
|
|
3241
|
+
def default_matrix_token_evidence(token_values: list[float], pair_count: int, byte_proxy_positive: bool) -> str:
    """Label the token-savings evidence available for a lane.

    Returns ``"measured_positive"`` when every pair has a value and all are
    above zero, ``"measured_regression"`` when any measured value is
    negative, ``"measured_incomplete_or_mixed"`` for other measured cases,
    ``"byte_proxy_only"`` when nothing is measured but the byte proxy is
    positive, and ``"unavailable"`` otherwise.
    """
    if not token_values:
        return "byte_proxy_only" if byte_proxy_positive else "unavailable"
    every_pair_measured = bool(pair_count) and len(token_values) == pair_count
    if every_pair_measured and all(value > 0 for value in token_values):
        return "measured_positive"
    if any(value < 0 for value in token_values):
        return "measured_regression"
    return "measured_incomplete_or_mixed"
|
|
3251
|
+
|
|
3252
|
+
|
|
3253
|
+
def classify_default_matrix_lane(
    lane_id: str,
    pairs: list[dict[str, Any]],
    methods: set[str],
) -> dict[str, Any]:
    """Build the default-matrix entry for one lane from its matched pairs.

    Args:
        lane_id: Key into DEFAULT_MATRIX_LANE_BY_ID.
        pairs: Matched-pair evidence dicts assigned to this lane.
        methods: Match methods that placed those pairs in the lane.

    Returns:
        A dict holding the lane's classification, the evidence summaries it
        was derived from, and reason codes. ``public_claim_allowed`` is always
        ``False``; the entry is marked reporting-only.

    Decision order when pairs exist: any non-``pass`` quality gate, any
    negative token saving, or any negative shifted-cost saving gives
    ``reject/rework``. Complete positive token savings give ``default-on``
    (with complete non-negative shifted cost and both claim gates) or
    ``advisory`` (token claim gate only). A positive byte proxy gives
    ``advisory``; otherwise ``experimental``. The result is then capped by the
    lane's policy ceiling.
    """
    lane = DEFAULT_MATRIX_LANE_BY_ID[lane_id]
    policy_ceiling = str(lane["policy_ceiling"])

    if not pairs:
        # No evidence at all: report the lane as experimental with empty summaries.
        return {
            "lane": lane_id,
            "label": lane["label"],
            "classification": "experimental",
            "policy_ceiling": policy_ceiling,
            "policy_clamped": False,
            "lane_match_method": "absent",
            "matched_task_count": 0,
            "matched_tasks": [],
            "matched_variants": [],
            "quality_gate": "insufficient_evidence",
            "quality_gates": [],
            "token_evidence": "unavailable",
            "shifted_cost_evidence": "unavailable",
            "byte_proxy_evidence": "unavailable",
            "matched_pair_claim_gates": {
                "token_savings_claim_allowed": False,
                "shifted_cost_claim_allowed": False,
            },
            "public_claim_allowed": False,
            "reason_codes": ["no_matched_lane_evidence"],
            "claim_boundary": {
                "classification_is_reporting_only": True,
                "hosted_api_savings_claim_allowed": False,
                "requires_report_claim_status_and_matched_pair_evidence": True,
            },
        }

    def delta_metric(key: str) -> list[float]:
        # Numeric values of ``delta[key]`` across pairs whose delta is a dict.
        collected = []
        for entry in pairs:
            delta = entry.get("delta")
            if not isinstance(delta, dict):
                continue
            number = default_matrix_number(delta.get(key))
            if number is not None:
                collected.append(number)
        return collected

    def every_pair_allows(flag: str) -> bool:
        # True only when each pair carries a claim_boundary dict with a truthy flag.
        for entry in pairs:
            boundary = entry.get("claim_boundary")
            if not isinstance(boundary, dict) or not boundary.get(flag):
                return False
        return True

    gate_names = {str(entry.get("quality_gate") or "unknown") for entry in pairs}
    quality_gates = sorted(gate_names)
    quality_gate = "mixed" if len(quality_gates) != 1 else quality_gates[0]
    failing_gates = [gate for gate in quality_gates if gate != "pass"]

    token_values = delta_metric("token_savings_pct")
    cost_values = delta_metric("cost_savings_pct_with_shift")
    byte_after_deltas = delta_metric("bytes_after_total")
    # A negative bytes_after_total delta on any pair counts as a positive byte proxy.
    byte_proxy_positive = any(delta < 0 for delta in byte_after_deltas)

    token_claim_gate = every_pair_allows("token_savings_claim_allowed")
    shifted_cost_claim_gate = every_pair_allows("shifted_cost_claim_allowed")

    pair_total = len(pairs)
    tokens_all_positive = len(token_values) == pair_total and all(v > 0 for v in token_values)
    costs_all_non_negative = len(cost_values) == pair_total and all(v >= 0 for v in cost_values)
    cost_regressed = any(v < 0 for v in cost_values)

    reason_codes: list[str] = []
    if failing_gates:
        classification = "reject/rework"
        reason_codes += [f"quality_gate_{gate}" for gate in failing_gates]
    elif any(v < 0 for v in token_values):
        classification = "reject/rework"
        reason_codes.append("measured_token_regression")
    elif cost_regressed:
        classification = "reject/rework"
        reason_codes.append("measured_shifted_cost_regression")
    elif tokens_all_positive and costs_all_non_negative and token_claim_gate and shifted_cost_claim_gate:
        classification = "default-on"
        reason_codes.append("quality_pass_measured_token_and_shifted_cost_non_regression")
    elif tokens_all_positive and token_claim_gate:
        classification = "advisory"
        reason_codes.append("quality_pass_measured_token_savings_shifted_cost_unproven")
    elif byte_proxy_positive:
        classification = "advisory"
        reason_codes.append("quality_pass_byte_proxy_only")
    else:
        classification = "experimental"
        reason_codes.append("quality_pass_but_no_positive_measured_or_proxy_savings")

    # Optional compression needs measured token values before it may be advisory.
    if lane_id == "optional_compression" and classification == "advisory" and not token_values:
        classification = "experimental"
        reason_codes.append("optional_compression_requires_provider_token_evidence_for_advisory")

    classification, policy_clamped = default_matrix_clamp_classification(classification, policy_ceiling)
    if policy_clamped:
        reason_codes.append(f"policy_ceiling_{policy_ceiling}")

    if costs_all_non_negative:
        shifted_cost_evidence = "measured_non_regression"
    elif cost_regressed:
        shifted_cost_evidence = "measured_regression"
    else:
        shifted_cost_evidence = "unavailable"

    if byte_proxy_positive:
        byte_proxy_evidence = "observed_positive"
    elif byte_after_deltas:
        byte_proxy_evidence = "observed_non_positive"
    else:
        byte_proxy_evidence = "unavailable"

    task_ids = [entry.get("task_id") for entry in pairs if entry.get("task_id")]
    variant_names = [entry.get("variant") for entry in pairs if entry.get("variant")]

    return {
        "lane": lane_id,
        "label": lane["label"],
        "classification": classification,
        "policy_ceiling": policy_ceiling,
        "policy_clamped": policy_clamped,
        "lane_match_method": default_matrix_lane_match_method(methods),
        "matched_task_count": len({str(entry.get("task_id")) for entry in pairs}),
        "matched_tasks": default_matrix_cap(task_ids),
        "matched_variants": default_matrix_cap(variant_names),
        "quality_gate": quality_gate,
        "quality_gates": quality_gates,
        "token_evidence": default_matrix_token_evidence(token_values, pair_total, byte_proxy_positive),
        "shifted_cost_evidence": shifted_cost_evidence,
        "byte_proxy_evidence": byte_proxy_evidence,
        "matched_pair_claim_gates": {
            "token_savings_claim_allowed": token_claim_gate,
            "shifted_cost_claim_allowed": shifted_cost_claim_gate,
        },
        "public_claim_allowed": False,
        "reason_codes": default_matrix_unique(reason_codes),
        "claim_boundary": {
            "classification_is_reporting_only": True,
            "hosted_api_savings_claim_allowed": False,
            "requires_report_claim_status_and_matched_pair_evidence": True,
        },
    }
|
|
3400
|
+
|
|
3401
|
+
|
|
3402
|
+
def build_default_matrix(report: dict[str, Any]) -> dict[str, Any]:
    """Group a report's matched-pair evidence by lane and classify every lane.

    Reads ``report["matched_pair_evidence"]`` (ignored unless it is a list)
    and assigns each dict entry to the lanes returned by
    ``infer_default_matrix_lanes``. Entries that match no lane contribute
    their variant name to ``summary.unmatched_variants``. The returned matrix
    always has ``public_claim_allowed`` set to ``False``.
    """
    buckets: dict[str, list[dict[str, Any]]] = {}
    methods: dict[str, set[str]] = {}
    for lane_id in DEFAULT_MATRIX_LANE_IDS:
        buckets[lane_id] = []
        methods[lane_id] = set()
    unmatched_variants: set[str] = set()

    evidence = report.get("matched_pair_evidence")
    if not isinstance(evidence, list):
        evidence = []

    for pair in evidence:
        if not isinstance(pair, dict):
            continue
        lane_matches = infer_default_matrix_lanes(pair)
        if lane_matches:
            for lane_id, method in lane_matches:
                buckets[lane_id].append(pair)
                methods[lane_id].add(method)
            continue
        # Unmatched pair: remember its variant name, if it has one.
        variant = pair.get("variant")
        if variant:
            unmatched_variants.add(str(variant))

    lanes = []
    for lane_id in DEFAULT_MATRIX_LANE_IDS:
        lanes.append(classify_default_matrix_lane(lane_id, buckets[lane_id], methods[lane_id]))

    # Every known classification gets a count, including zero.
    classification_counts = dict.fromkeys(DEFAULT_MATRIX_CLASSIFICATIONS, 0)
    for lane in lanes:
        label = lane.get("classification")
        if label in classification_counts:
            classification_counts[label] += 1

    capped_unmatched = sorted(unmatched_variants)[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS]
    return {
        "schema_version": DEFAULT_MATRIX_SCHEMA_VERSION,
        "classification_set": list(DEFAULT_MATRIX_CLASSIFICATIONS),
        "generated_from": "matched_pair_evidence",
        "reporting_only": True,
        "claim_status_observed": report.get("claim_status"),
        "public_claim_allowed": False,
        "claim_boundary": DEFAULT_MATRIX_CLAIM_BOUNDARY,
        "lanes": lanes,
        "summary": {
            "lane_count": len(lanes),
            "classification_counts": classification_counts,
            "unmatched_variants": capped_unmatched,
        },
    }
|
|
3441
|
+
|
|
3442
|
+
|
|
3443
|
+
def markdown_value(value: Any) -> str:
    """Format ``value`` for a Markdown table cell or inline span.

    ``None`` becomes ``n/a``, booleans become ``true``/``false``, and floats
    use six significant digits. Anything else is passed through
    ``sanitize_note_text`` and has ``|`` escaped; an empty result becomes
    ``n/a``.
    """
    if value is None:
        return "n/a"
    # bool is checked before float/other types so True/False never print as numbers.
    if isinstance(value, bool):
        return str(value).lower()
    if isinstance(value, float):
        return format(value, ".6g")
    escaped = sanitize_note_text(value).replace("|", "\\|")
    return escaped if escaped else "n/a"
|
|
3452
|
+
|
|
3453
|
+
|
|
3454
|
+
def render_dashboard_markdown(report: dict[str, Any]) -> str:
    """Render a benchmark report dict as a Markdown dashboard.

    Sections, in order: header bullets and claim-boundary note, variant
    summary table, comparisons table, public claim readiness (only when the
    report carries a ``public_claim_readiness`` dict), default matrix (only
    when it carries a ``default_matrix`` dict), replay provenance or a
    CSV-only provenance note, and a re-run hint.

    Args:
        report: Report dict; every section tolerates missing or wrongly
            typed fields by falling back to empty content.

    Returns:
        The Markdown document, terminated by a single newline.
    """

    def table_row(cells: list[Any]) -> str:
        # Each cell goes through markdown_value before being joined into a row.
        return "| " + " | ".join(markdown_value(cell) for cell in cells) + " |"

    def joined(values: Any) -> str:
        # str() each item so a non-string entry cannot make str.join raise.
        return ", ".join(str(item) for item in (values or []))

    public_claim_status, public_claim_eligible = report_public_claim_status(report)
    metric_claim_status = report.get("raw_metric_claim_status", report.get("claim_status"))
    lines = [
        "# ContextGuard Benchmark Dashboard",
        "",
        f"- Schema: `{markdown_value(report.get('schema'))}`",
        f"- Baseline variant: `{markdown_value(report.get('baseline_variant'))}`",
        f"- Rows: {markdown_value(report.get('row_count'))}",
        f"- Metric claim status: `{markdown_value(metric_claim_status)}`",
        f"- Public claim status: `{markdown_value(public_claim_status)}`",
        f"- Public claim eligible: `{markdown_value(public_claim_eligible)}`",
        "",
        "> Claim boundary: this dashboard is not a hosted savings claim unless report claim gates "
        "allow it and public-claim provenance is complete. Proxy byte reductions are diagnostic "
        "and are not hosted API token savings.",
        "",
        "## Variant summary",
        "",
        "| Variant | Runs | Successes | Failure rate | Tokens/success | Bytes saved | Token proxy saved | Quality notes |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: | --- |",
    ]

    raw_summaries = report.get("summary_by_variant")
    summaries = raw_summaries if isinstance(raw_summaries, dict) else {}

    # Guard "comparisons" once and reuse it for both the lookup index and the
    # table below; dict.get's default does not apply when the key is present
    # with a non-list value such as None.
    raw_comparisons = report.get("comparisons")
    comparisons = raw_comparisons if isinstance(raw_comparisons, list) else []
    comparison_by_variant = {
        item.get("variant"): item
        for item in comparisons
        if isinstance(item, dict)
    }

    for variant, summary in sorted(summaries.items()):
        if not isinstance(summary, dict):
            continue
        quality = comparison_by_variant.get(variant, {}).get("quality_gate")
        if quality is None and summary.get("is_baseline_strategy"):
            # The baseline has no comparison entry of its own; label it explicitly.
            quality = "baseline"
        lines.append(table_row([
            variant,
            summary.get("runs"),
            summary.get("successful_runs"),
            summary.get("failure_rate"),
            summary.get("tokens_per_successful_task"),
            summary.get("bytes_saved_successful"),
            summary.get("token_proxy_saved_successful"),
            quality,
        ]))

    lines.extend([
        "",
        "## Comparisons",
        "",
        "| Variant | Quality gate | Matched tasks | Token paired tasks | Token savings % | Shifted cost savings % |",
        "| --- | --- | ---: | ---: | ---: | ---: |",
    ])
    if comparisons:
        for item in comparisons:
            if not isinstance(item, dict):
                continue
            lines.append(table_row([
                item.get("variant"),
                item.get("quality_gate"),
                item.get("matched_successful_task_count"),
                item.get("paired_token_task_count"),
                item.get("token_savings_pct"),
                item.get("cost_savings_pct_with_shift"),
            ]))
    else:
        lines.append("| n/a | n/a | 0 | 0 | n/a | n/a |")

    readiness = report.get("public_claim_readiness")
    if isinstance(readiness, dict):
        lines.extend([
            "",
            "## Public claim readiness",
            "",
            f"- Status: `{markdown_value(readiness.get('status'))}`",
            f"- Claim allowed: `{markdown_value(readiness.get('claim_allowed'))}`",
            "",
            "| Gate | Status | Reason | Evidence |",
            "| --- | --- | --- | --- |",
        ])
        raw_gates = readiness.get("gates")
        gates = raw_gates if isinstance(raw_gates, list) else []
        for gate in gates:
            if not isinstance(gate, dict):
                continue
            raw_evidence = gate.get("evidence")
            evidence = raw_evidence if isinstance(raw_evidence, dict) else {}
            lines.append(table_row([
                gate.get("id"),
                gate.get("status"),
                gate.get("reason"),
                public_claim_readiness_evidence_text(evidence),
            ]))
        boundary = readiness.get("claim_boundary")
        if isinstance(boundary, dict):
            lines.extend([
                "",
                f"- Public claim boundary: {markdown_value(boundary.get('reason'))}",
            ])

    default_matrix = report.get("default_matrix")
    if isinstance(default_matrix, dict):
        lines.extend([
            "",
            "## Default matrix",
            "",
            "| Lane | Classification | Matched Tasks | Quality Gate | Token Evidence | Public Claim | Reason |",
            "| --- | --- | ---: | --- | --- | --- | --- |",
        ])
        raw_lanes = default_matrix.get("lanes")
        lanes = raw_lanes if isinstance(raw_lanes, list) else []
        for lane in lanes:
            if not isinstance(lane, dict):
                continue
            raw_reasons = lane.get("reason_codes")
            reasons = raw_reasons if isinstance(raw_reasons, list) else []
            lines.append(table_row([
                lane.get("lane"),
                lane.get("classification"),
                lane.get("matched_task_count"),
                lane.get("quality_gate"),
                lane.get("token_evidence"),
                lane.get("public_claim_allowed"),
                # Only the first three reason codes are shown per lane.
                joined(reasons[:3]),
            ]))
        boundary = default_matrix.get("claim_boundary")
        if isinstance(boundary, dict):
            lines.extend([
                "",
                f"- Matrix boundary: {markdown_value(boundary.get('reason'))}",
            ])

    replay = report.get("replay_evidence")
    if isinstance(replay, dict):
        lines.extend([
            "",
            "## Replay evidence provenance",
            "",
            f"- Source types: `{markdown_value(joined(replay.get('source_types')))}`",
            f"- Claim scopes: `{markdown_value(joined(replay.get('claim_scopes')))}`",
            f"- Same-run complete: `{markdown_value(replay.get('same_run_complete'))}`",
            f"- Mixed/pre-existing CSV: `{markdown_value(replay.get('mixed_csv'))}`",
            f"- Boundary: {markdown_value(replay.get('claim_boundary'))}",
        ])
    else:
        lines.extend([
            "",
            "## Provenance note",
            "",
            "- CSV-only dashboards have unknown public-claim provenance unless regenerated from "
            "the original evidence JSONL or a future trusted provenance ledger.",
        ])

    lines.extend([
        "",
        "## Re-run context",
        "",
        "- Evidence replay: `context-guard-bench --tasks <tasks.json> --variants <variants.json> "
        "--evidence-jsonl <evidence.jsonl> --csv <results.csv> --report-json <report.json> "
        "--dashboard-md <dashboard.md>`",
    ])
    return "\n".join(lines) + "\n"
|
|
3624
|
+
|
|
3625
|
+
|
|
3626
|
+
def write_report_outputs(
    csv_path: Path,
    report_path: Path | None,
    dashboard_path: Path | None,
    baseline_variant: str,
    *,
    replay_rows: list[EvidenceReplayRow] | None = None,
    mixed_csv: bool = False,
) -> dict[str, Any]:
    """Summarize the benchmark CSV and write the requested derived outputs.

    Args:
        csv_path: Source CSV of benchmark rows; locked for the whole operation.
        report_path: Where to write the JSON report, or ``None`` to skip it.
        dashboard_path: Where to write the Markdown dashboard, or ``None`` to skip it.
        baseline_variant: Variant name passed to ``summarize_benchmark_rows``.
        replay_rows: When not ``None``, the report is passed through
            ``annotate_replay_report`` with these rows.
        mixed_csv: Forwarded to ``annotate_replay_report``.

    Returns:
        The report dict (annotated when ``replay_rows`` was given).
    """
    # Keep lock order stable across all derived writes: source CSV first, then
    # report, then dashboard. Do not introduce a derived-output -> CSV path.
    # NOTE(review): block nesting was reconstructed from a flattened diff view;
    # the derived-output locks are assumed to be taken while the CSV lock is
    # held, as the comment above describes - confirm against the real file.
    with csv_file_lock(csv_path, create_parent=True):
        report = summarize_benchmark_rows(read_csv_rows(csv_path), baseline_variant)
        if replay_rows is not None:
            report = annotate_replay_report(report, replay_rows, mixed_csv=mixed_csv)
        if report_path is not None:
            with csv_file_lock(report_path, create_parent=True):
                write_text_no_follow(
                    report_path,
                    json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
                )
        if dashboard_path is not None:
            with csv_file_lock(dashboard_path, create_parent=True):
                write_text_no_follow(dashboard_path, render_dashboard_markdown(report))
    return report
|
|
3651
|
+
|
|
2206
3652
|
|
|
2207
3653
|
def write_report_json(csv_path: Path, report_path: Path, baseline_variant: str) -> dict[str, Any]:
    """Write the JSON report for ``csv_path`` and return the report dict.

    Delegates to ``write_report_outputs`` with no dashboard path.
    """
    # Lock order must stay the same for every report write: the source CSV
    # first, the derived report second. Never add a report -> CSV path; that
    # can deadlock concurrent report generation.
    return write_report_outputs(
        csv_path,
        report_path,
        dashboard_path=None,
        baseline_variant=baseline_variant,
    )
|
|
2219
3658
|
|
|
2220
3659
|
|
|
2221
3660
|
def sanitize_note_text(value: Any) -> str:
|
|
@@ -2278,8 +3717,18 @@ def existing_file_identity(path: Path) -> tuple[int, int] | None:
|
|
|
2278
3717
|
os.close(fd)
|
|
2279
3718
|
|
|
2280
3719
|
|
|
2281
|
-
def validate_distinct_output_paths(
|
|
2282
|
-
|
|
3720
|
+
def validate_distinct_output_paths(
|
|
3721
|
+
csv_path: Path,
|
|
3722
|
+
ledger_path: Path | None,
|
|
3723
|
+
report_path: Path | None,
|
|
3724
|
+
dashboard_path: Path | None = None,
|
|
3725
|
+
) -> None:
|
|
3726
|
+
outputs = [
|
|
3727
|
+
("csv", csv_path),
|
|
3728
|
+
("ledger-jsonl", ledger_path),
|
|
3729
|
+
("report-json", report_path),
|
|
3730
|
+
("dashboard-md", dashboard_path),
|
|
3731
|
+
]
|
|
2283
3732
|
seen: dict[Path, str] = {}
|
|
2284
3733
|
seen_identity: dict[tuple[int, int], str] = {}
|
|
2285
3734
|
for label, path in outputs:
|
|
@@ -2318,12 +3767,16 @@ def main() -> int:
|
|
|
2318
3767
|
help="optional JSONL ledger path for cost-shift accounting per run")
|
|
2319
3768
|
parser.add_argument("--report-json", default=None, type=Path,
|
|
2320
3769
|
help="optional A/B summary report JSON path generated from --csv after real runs")
|
|
3770
|
+
parser.add_argument("--dashboard-md", default=None, type=Path,
|
|
3771
|
+
help="optional Markdown dashboard path generated from the benchmark report")
|
|
3772
|
+
parser.add_argument("--evidence-jsonl", default=None, type=Path,
|
|
3773
|
+
help="optional validated run-evidence JSONL replay input; skips provider invocation")
|
|
2321
3774
|
parser.add_argument("--baseline-variant", default="baseline",
|
|
2322
3775
|
help="variant name used as the report baseline (default: baseline)")
|
|
2323
3776
|
args = parser.parse_args()
|
|
2324
3777
|
|
|
2325
3778
|
require_no_follow_file_ops_supported()
|
|
2326
|
-
validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
|
|
3779
|
+
validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json, args.dashboard_md)
|
|
2327
3780
|
|
|
2328
3781
|
variants = parse_variants(args.variants)
|
|
2329
3782
|
tasks = parse_tasks(args.tasks, variants=variants)
|
|
@@ -2338,6 +3791,61 @@ def main() -> int:
|
|
|
2338
3791
|
for task, variant in targets
|
|
2339
3792
|
if (task.id, variant.name) not in skip_keys
|
|
2340
3793
|
]
|
|
3794
|
+
if args.evidence_jsonl is not None:
|
|
3795
|
+
if args.dry_run:
|
|
3796
|
+
for task, variant in targets:
|
|
3797
|
+
if (task.id, variant.name) in skip_keys:
|
|
3798
|
+
print(f"skip {task.id}/{variant.name} (already in {args.csv})")
|
|
3799
|
+
continue
|
|
3800
|
+
print(f"evidence replay dry-run: {task.id}/{variant.name} <- {args.evidence_jsonl}")
|
|
3801
|
+
print("completed 0 run(s); results in (dry-run; no CSV writes)")
|
|
3802
|
+
return 0
|
|
3803
|
+
csv_had_preexisting_content = file_has_content_no_follow(args.csv)
|
|
3804
|
+
evidence_rows = read_evidence_jsonl(args.evidence_jsonl)
|
|
3805
|
+
evidence_by_key = validate_evidence_coverage(evidence_rows, runnable_targets)
|
|
3806
|
+
claude_ver = "evidence-replay"
|
|
3807
|
+
completed = 0
|
|
3808
|
+
replay_rows_written: list[EvidenceReplayRow] = []
|
|
3809
|
+
for task, variant in targets:
|
|
3810
|
+
if (task.id, variant.name) in skip_keys:
|
|
3811
|
+
print(f"skip {task.id}/{variant.name} (already in {args.csv})")
|
|
3812
|
+
continue
|
|
3813
|
+
evidence = evidence_by_key[(task.id, variant.name)]
|
|
3814
|
+
print(f"replay {task.id}/{variant.name} ...", flush=True)
|
|
3815
|
+
result = run_evidence_fixture(task, variant, evidence)
|
|
3816
|
+
wrote = append_csv(args.csv, claude_ver, result, skip_existing=args.resume)
|
|
3817
|
+
if wrote:
|
|
3818
|
+
replay_rows_written.append(evidence)
|
|
3819
|
+
if args.ledger_jsonl is not None:
|
|
3820
|
+
append_cost_shift_ledger(
|
|
3821
|
+
args.ledger_jsonl,
|
|
3822
|
+
claude_ver,
|
|
3823
|
+
result,
|
|
3824
|
+
replay_provenance=evidence.provenance_payload(),
|
|
3825
|
+
)
|
|
3826
|
+
completed += 1
|
|
3827
|
+
status = "ok" if result.success else "FAIL"
|
|
3828
|
+
suffix = "" if wrote else " (CSV not updated; row already present)"
|
|
3829
|
+
print(
|
|
3830
|
+
f" {status} tokens={sum(result.tokens.values())} cost=${result.cost_usd:.4f} "
|
|
3831
|
+
f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
|
|
3832
|
+
)
|
|
3833
|
+
if args.report_json is not None or args.dashboard_md is not None:
|
|
3834
|
+
report = write_report_outputs(
|
|
3835
|
+
args.csv,
|
|
3836
|
+
args.report_json,
|
|
3837
|
+
args.dashboard_md,
|
|
3838
|
+
args.baseline_variant,
|
|
3839
|
+
replay_rows=replay_rows_written,
|
|
3840
|
+
mixed_csv=csv_had_preexisting_content or bool(skip_keys) or len(replay_rows_written) != int(completed),
|
|
3841
|
+
)
|
|
3842
|
+
if args.report_json is not None:
|
|
3843
|
+
print(f"report {args.report_json}: {report['claim_status']}")
|
|
3844
|
+
if args.dashboard_md is not None:
|
|
3845
|
+
print(f"dashboard {args.dashboard_md}: {report_public_claim_status(report)[0]}")
|
|
3846
|
+
print(f"completed {completed} run(s); results in {args.csv}")
|
|
3847
|
+
return 0
|
|
3848
|
+
|
|
2341
3849
|
placeholder_targets = [
|
|
2342
3850
|
f"{task.id}/{variant.name}"
|
|
2343
3851
|
for task, variant in runnable_targets
|
|
@@ -2390,9 +3898,12 @@ def main() -> int:
|
|
|
2390
3898
|
f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
|
|
2391
3899
|
)
|
|
2392
3900
|
target = args.csv if not args.dry_run else "(dry-run; no CSV writes)"
|
|
2393
|
-
if args.report_json is not None and not args.dry_run:
|
|
2394
|
-
report =
|
|
2395
|
-
|
|
3901
|
+
if (args.report_json is not None or args.dashboard_md is not None) and not args.dry_run:
|
|
3902
|
+
report = write_report_outputs(args.csv, args.report_json, args.dashboard_md, args.baseline_variant)
|
|
3903
|
+
if args.report_json is not None:
|
|
3904
|
+
print(f"report {args.report_json}: {report['claim_status']}")
|
|
3905
|
+
if args.dashboard_md is not None:
|
|
3906
|
+
print(f"dashboard {args.dashboard_md}: {report_public_claim_status(report)[0]}")
|
|
2396
3907
|
print(f"completed {completed} run(s); results in {target}")
|
|
2397
3908
|
return 0
|
|
2398
3909
|
|