@ictechgy/context-guard 0.4.10 → 0.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/CHANGELOG.md +17 -1
  2. package/README.ko.md +46 -28
  3. package/README.md +42 -33
  4. package/docs/benchmark-fixtures/token-savings-12task.evidence.example.jsonl +24 -0
  5. package/docs/benchmark-workflow-examples.md +3 -0
  6. package/docs/benchmark-workflows/context-pack-byte-proxy.example.json +278 -137
  7. package/docs/benchmark-workflows/measured-token-workflow.example.json +279 -138
  8. package/docs/benchmark-workflows/provider-cache-telemetry.example.json +279 -138
  9. package/docs/experimental-benchmark-fixtures.md +24 -7
  10. package/package.json +2 -1
  11. package/plugins/context-guard/.claude-plugin/plugin.json +1 -1
  12. package/plugins/context-guard/README.ko.md +14 -11
  13. package/plugins/context-guard/README.md +15 -14
  14. package/plugins/context-guard/bin/context-guard +48 -17
  15. package/plugins/context-guard/bin/context-guard-artifact +342 -33
  16. package/plugins/context-guard/bin/context-guard-audit +36 -5
  17. package/plugins/context-guard/bin/context-guard-bench +1675 -44
  18. package/plugins/context-guard/bin/context-guard-cache-score +347 -35
  19. package/plugins/context-guard/bin/context-guard-compress +89 -27
  20. package/plugins/context-guard/bin/context-guard-cost +7 -2
  21. package/plugins/context-guard/bin/context-guard-experiments +364 -8
  22. package/plugins/context-guard/bin/context-guard-failed-nudge +6 -2
  23. package/plugins/context-guard/bin/context-guard-filter +88 -18
  24. package/plugins/context-guard/bin/context-guard-pack +329 -19
  25. package/plugins/context-guard/bin/context-guard-read-symbol +27 -0
  26. package/plugins/context-guard/bin/context-guard-sanitize-output +245 -18
  27. package/plugins/context-guard/bin/context-guard-setup +21 -5
  28. package/plugins/context-guard/bin/context-guard-tool-prune +287 -62
  29. package/plugins/context-guard/bin/context-guard-trim-output +394 -90
  30. package/plugins/context-guard/brief/README.md +5 -5
  31. package/plugins/context-guard/lib/context_guard_command_manifest_loader.py +123 -0
  32. package/plugins/context-guard/lib/context_guard_commands.py +217 -190
@@ -178,19 +178,137 @@ EXTERNAL_SOURCE_KEY_GROUPS: tuple[tuple[str, tuple[str, ...], tuple[str, ...]],
178
178
  )
179
179
  MAX_USAGE_TOKEN_COUNT = 10**12
180
180
  MAX_USAGE_COST_USD = 10**9
181
+ MAX_EVIDENCE_JSONL_BYTES = 5_000_000
182
+ MAX_EVIDENCE_JSONL_LINES = 100_000
181
183
  # Byte -> token proxy 환산 계수. 측정된 모델 토큰이 아니라 byte delta 기반 보수적
182
184
  # 추정치이며, report에서 evidence="inferred"로 분명히 라벨링한다. 영어 텍스트 기준
183
185
  # ~4 bytes/token의 통용 근사값을 사용한다.
184
186
  TOKEN_PROXY_BYTES_PER_TOKEN = 4
185
187
  BENCH_RUN_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.run-evidence.v1"
186
188
  MATCHED_PAIR_EVIDENCE_SCHEMA_VERSION = "contextguard.bench.matched-pair.v1"
189
+ MEASUREMENT_BASELINE_SCHEMA_VERSION = "contextguard.bench.measurement-baseline.v1"
190
+ DEFAULT_MATRIX_SCHEMA_VERSION = "contextguard.bench.default-matrix.v1"
191
+ PUBLIC_CLAIM_READINESS_SCHEMA_VERSION = "contextguard.bench.public-claim-readiness.v1"
187
192
  SELF_HOSTED_METRICS_SCHEMA_VERSION = "contextguard.bench.self-hosted-metrics.v1"
188
193
  SELF_HOSTED_METRICS_KEY = "self_hosted_metrics"
189
194
  SELF_HOSTED_METRICS_CLAIM_BOUNDARY = "self_hosted_metrics_only_not_hosted_api_token_or_cost_savings"
195
+ EVIDENCE_REPLAY_SOURCE_TYPES = frozenset({"synthetic_fixture", "provider_export", "manual_audit"})
196
+ PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES = frozenset({
197
+ "provider_measured_matched_task",
198
+ "provider_measured_matched_task_public_claim",
199
+ "hosted_api_provider_measured_matched_task",
200
+ })
201
+ REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS = "provider_export_public_claim_candidate"
202
+ REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS = "provider_export_claim_gates_not_met"
203
+ REPLAY_NOT_PUBLIC_CLAIM_STATUS = "replay_only_not_public_claim"
204
+ REPLAY_UNKNOWN_MIXED_CSV_STATUS = "unknown_mixed_csv"
205
+ REPLAY_PUBLIC_CLAIM_ELIGIBLE_RAW_STATUSES = frozenset({
206
+ "token_and_shifted_cost_savings_observed",
207
+ })
208
+ REPLAY_CLAIM_BOUNDARY = (
209
+ "Evidence replay is an import/replay mode. Synthetic fixtures and manual audits are never "
210
+ "hosted API token/cost savings evidence; public claims require complete provider_export "
211
+ "provenance for every report row plus the normal matched-task quality, token, cost, and "
212
+ "shifted-cost gates."
213
+ )
214
+ DEFAULT_MATRIX_CLASSIFICATIONS = ("default-on", "advisory", "experimental", "reject/rework")
215
+ DEFAULT_MATRIX_CLASSIFICATION_STRENGTH = {
216
+ "experimental": 0,
217
+ "advisory": 1,
218
+ "default-on": 2,
219
+ }
220
+ DEFAULT_MATRIX_LANES: tuple[dict[str, Any], ...] = (
221
+ {
222
+ "id": "trimming",
223
+ "label": "Trimming / digest output",
224
+ "policy_ceiling": "default-on",
225
+ "task_keywords": ("long_log_analysis", "output_transform", "trim", "trimming", "sanitize_output", "digest"),
226
+ "variant_keywords": ("trim", "trimming", "sanitize", "digest", "brief"),
227
+ },
228
+ {
229
+ "id": "artifact_escrow",
230
+ "label": "Artifact escrow / receipt handles",
231
+ "policy_ceiling": "default-on",
232
+ "task_keywords": ("artifact_receipt", "artifact", "receipt", "escrow", "output_sandbox", "response_sandbox"),
233
+ "variant_keywords": ("artifact", "receipt", "escrow", "output_sandbox", "response_sandbox"),
234
+ },
235
+ {
236
+ "id": "tool_pruning",
237
+ "label": "Tool/MCP schema pruning",
238
+ "policy_ceiling": "default-on",
239
+ "task_keywords": ("tool_schema", "tool_prune", "tool_pruning", "mcp_schema", "defer_report"),
240
+ "variant_keywords": ("tool_prune", "tool_pruning", "tool_schema", "mcp", "defer"),
241
+ },
242
+ {
243
+ "id": "cache_advice",
244
+ "label": "Cache layout advice",
245
+ "policy_ceiling": "advisory",
246
+ "task_keywords": ("cache_layout", "cache_advice", "cache_score", "provider_cache"),
247
+ "variant_keywords": ("cache_layout", "cache_advice", "cache_score", "provider_cache", "cache"),
248
+ },
249
+ {
250
+ "id": "adaptive_k",
251
+ "label": "Adaptive-k context packing",
252
+ "policy_ceiling": "advisory",
253
+ "task_keywords": ("adaptive_k", "adaptive", "top_k", "context_pack"),
254
+ "variant_keywords": ("adaptive_k", "adaptive", "top_k", "pack_adaptive"),
255
+ },
256
+ {
257
+ "id": "optional_compression",
258
+ "label": "Optional compression",
259
+ "policy_ceiling": "advisory",
260
+ "task_keywords": ("learned_compression", "compression", "compress", "context_diff"),
261
+ "variant_keywords": ("learned_compression", "compression", "compress", "context_diff"),
262
+ },
263
+ )
264
+ DEFAULT_MATRIX_LANE_IDS = tuple(str(item["id"]) for item in DEFAULT_MATRIX_LANES)
265
+ DEFAULT_MATRIX_LANE_BY_ID = {str(item["id"]): item for item in DEFAULT_MATRIX_LANES}
266
+ MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS = 20
267
+ DEFAULT_MATRIX_CLAIM_BOUNDARY = {
268
+ "id": "default_matrix_reporting_only_not_runtime_default_or_savings_claim",
269
+ "reporting_only": True,
270
+ "changes_runtime_defaults": False,
271
+ "hosted_api_token_savings_claim_allowed": False,
272
+ "hosted_api_cost_savings_claim_allowed": False,
273
+ "public_claims_must_use_report_claim_status_and_matched_pair_evidence": True,
274
+ "reason": (
275
+ "The default matrix classifies local benchmark lanes for review only; it does not "
276
+ "turn features on by default and does not authorize hosted API savings claims."
277
+ ),
278
+ }
279
+ PUBLIC_CLAIM_READINESS_GATE_IDS = (
280
+ "matched_successful_tasks",
281
+ "provider_measured_token_cost",
282
+ "quality_non_inferiority",
283
+ "shifted_cost_accounting",
284
+ "confidence_failure_notes",
285
+ "provider_export_provenance",
286
+ )
287
+ PUBLIC_CLAIM_READINESS_CLAIM_BOUNDARY = {
288
+ "id": "public_claim_readiness_authoritative_release_gate",
289
+ "reporting_only": True,
290
+ "claim_allowed_field": "public_claim_readiness.claim_allowed",
291
+ "unsupported_claims_forbidden": True,
292
+ "hosted_api_token_savings_claim_without_claim_allowed_forbidden": True,
293
+ "hosted_api_cost_savings_claim_without_claim_allowed_forbidden": True,
294
+ "fixed_percent_savings_claim_without_matched_provider_report_forbidden": True,
295
+ "requires_matched_successful_tasks": True,
296
+ "requires_provider_measured_tokens_and_cost": True,
297
+ "requires_quality_non_inferiority": True,
298
+ "requires_shifted_cost_accounting": True,
299
+ "requires_confidence_and_failure_notes": True,
300
+ "requires_provider_export_provenance": True,
301
+ "reason": (
302
+ "Public hosted token/cost savings claims are forbidden unless every readiness gate passes "
303
+ "and public_claim_readiness.claim_allowed is true."
304
+ ),
305
+ }
190
306
  MAX_SELF_HOSTED_LABEL_CHARS = 120
191
307
  MAX_SELF_HOSTED_LATENCY_MS = 7 * 24 * 60 * 60 * 1000
192
308
  MAX_SELF_HOSTED_MEMORY_MB = 10_000_000
193
309
  MAX_VARIANT_PROMPT_FILE_BYTES = 128_000
310
+ MAX_FIXTURE_FILE_BYTES = 1_000_000
311
+ MAX_CLAUDE_PROMPT_ARG_BYTES = MAX_VARIANT_PROMPT_FILE_BYTES
194
312
  CLAUDE_OUTPUT_MAX_BYTES = 1_000_000
195
313
  SUCCESS_COMMAND_OUTPUT_MAX_BYTES = 64_000
196
314
  VERSION_OUTPUT_MAX_BYTES = 16_000
@@ -317,12 +435,18 @@ def _open_regular_no_symlink(
317
435
  os.close(parent_fd)
318
436
 
319
437
 
320
- def _read_text_no_follow(path: Path) -> str:
438
+ def _read_text_no_follow(path: Path, *, max_bytes: int = MAX_FIXTURE_FILE_BYTES) -> str:
321
439
  fd = _open_regular_no_symlink(path)
322
440
  try:
323
- with os.fdopen(fd, "r", encoding="utf-8") as handle:
441
+ with os.fdopen(fd, "rb") as handle:
324
442
  fd = -1
325
- return handle.read()
443
+ raw = handle.read(max_bytes + 1)
444
+ if len(raw) > max_bytes:
445
+ raise SystemExit(f"fixture file exceeds {max_bytes} bytes: {path}")
446
+ try:
447
+ return raw.decode("utf-8")
448
+ except UnicodeDecodeError as exc:
449
+ raise SystemExit(f"fixture file must be UTF-8 text: {path}: {exc.reason}") from None
326
450
  finally:
327
451
  if fd != -1:
328
452
  os.close(fd)
@@ -400,6 +524,38 @@ class RunResult:
400
524
  self_hosted_metrics: dict[str, Any] | None = None
401
525
 
402
526
 
527
+ @dataclass
528
+ class EvidenceReplayRow:
529
+ result: RunResult
530
+ source_type: str
531
+ provider_name: str | None
532
+ capture_command_or_export_id: str | None
533
+ claim_scope: str
534
+ provider_export_provenance_complete: bool
535
+ public_claim_eligible: bool
536
+ explicit_notes: bool
537
+ line_number: int
538
+
539
+ @property
540
+ def key(self) -> tuple[str, str]:
541
+ return (self.result.task_id, self.result.variant)
542
+
543
+ def provenance_payload(self) -> dict[str, Any]:
544
+ return {
545
+ "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
546
+ "mode": "evidence_jsonl_replay",
547
+ "evidence_source_type": self.source_type,
548
+ "provider_name": self.provider_name,
549
+ "capture_command_or_export_id": self.capture_command_or_export_id,
550
+ "claim_scope": self.claim_scope,
551
+ "provider_export_provenance_complete": self.provider_export_provenance_complete,
552
+ "public_claim_eligible": self.public_claim_eligible,
553
+ "explicit_notes": self.explicit_notes,
554
+ "line_number": self.line_number,
555
+ "claim_boundary": REPLAY_CLAIM_BOUNDARY,
556
+ }
557
+
558
+
403
559
  @dataclass
404
560
  class BoundedProcessResult:
405
561
  returncode: int
@@ -470,6 +626,17 @@ def validate_variant_extra_args(extra_args: list[str], *, owner: str) -> list[st
470
626
  return extra_args
471
627
 
472
628
 
629
+ def require_argv_safe_prompt(text: str, *, owner: str) -> str:
630
+ """Keep prompt-bearing argv below a bounded size to avoid E2BIG failures."""
631
+ size = len(text.encode("utf-8", errors="replace"))
632
+ if size > MAX_CLAUDE_PROMPT_ARG_BYTES:
633
+ raise SystemExit(
634
+ f"{owner} prompt exceeds argv-safe limit "
635
+ f"({size} bytes > {MAX_CLAUDE_PROMPT_ARG_BYTES}); use a smaller fixture prompt"
636
+ )
637
+ return text
638
+
639
+
473
640
  def validate_variant_prompt_file_path(raw_path: str, *, owner: str) -> Path:
474
641
  """Return a safe relative prompt-file path, or fail before any file read."""
475
642
  rel_path = Path(raw_path)
@@ -522,26 +689,28 @@ def read_variant_prompt_file(path: Path, *, owner: str, display_path: str | None
522
689
  f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes: {label}"
523
690
  )
524
691
  try:
525
- with os.fdopen(fd, "r", encoding="utf-8") as handle:
692
+ with os.fdopen(fd, "rb") as handle:
526
693
  fd = -1
527
- text = handle.read()
528
- except UnicodeDecodeError as exc:
529
- raise SystemExit(
530
- f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
531
- f"{label}: {exc.reason}"
532
- ) from None
694
+ raw = handle.read(MAX_VARIANT_PROMPT_FILE_BYTES + 1)
533
695
  except OSError as exc:
534
696
  detail = exc.strerror or exc.__class__.__name__
535
697
  raise SystemExit(f"{owner} variant_prompt_files could not read prompt file: {label}: {detail}") from None
536
698
  finally:
537
699
  if fd != -1:
538
700
  os.close(fd)
539
- if len(text.encode("utf-8", errors="replace")) > MAX_VARIANT_PROMPT_FILE_BYTES:
701
+ if len(raw) > MAX_VARIANT_PROMPT_FILE_BYTES:
540
702
  raise SystemExit(
541
703
  f"{owner} variant_prompt_files prompt text exceeds "
542
704
  f"{MAX_VARIANT_PROMPT_FILE_BYTES} bytes after decoding: {label}"
543
705
  )
544
- return text
706
+ try:
707
+ text = raw.decode("utf-8")
708
+ except UnicodeDecodeError as exc:
709
+ raise SystemExit(
710
+ f"{owner} variant_prompt_files prompt file must be UTF-8 text: "
711
+ f"{label}: {exc.reason}"
712
+ ) from None
713
+ return require_argv_safe_prompt(text, owner=f"{owner} variant_prompt_files")
545
714
 
546
715
 
547
716
  def load_variant_prompt_files_for_targets(
@@ -977,7 +1146,11 @@ def build_claude_argv(claude_bin: str, task: TaskFixture, variant: Variant) -> l
977
1146
  argv.extend(["--allowedTools", ",".join(task.allowed_tools)])
978
1147
  argv.extend(variant.extra_args)
979
1148
  argv.append("--")
980
- argv.append(task.variant_prompt_texts.get(variant.name, task.prompt))
1149
+ prompt = require_argv_safe_prompt(
1150
+ task.variant_prompt_texts.get(variant.name, task.prompt),
1151
+ owner=f"task {task.id} variant {variant.name}",
1152
+ )
1153
+ argv.append(prompt)
981
1154
  return argv
982
1155
 
983
1156
 
@@ -1256,10 +1429,84 @@ def run_fixture(task: TaskFixture, variant: Variant, claude_bin: str,
1256
1429
  )
1257
1430
 
1258
1431
 
1259
- def append_csv(csv_path: Path, claude_ver: str, result: RunResult, *, skip_existing: bool = False) -> bool:
1432
+ def csv_file_stamp_unlocked(csv_path: Path) -> tuple[int, int, int, int] | None:
1433
+ try:
1434
+ fd = _open_regular_no_symlink(csv_path)
1435
+ except FileNotFoundError:
1436
+ return None
1437
+ try:
1438
+ st = os.fstat(fd)
1439
+ return (int(st.st_dev), int(st.st_ino), int(st.st_size), int(st.st_mtime_ns))
1440
+ finally:
1441
+ os.close(fd)
1442
+
1443
+
1444
+ def refresh_existing_key_cache_unlocked(
1445
+ csv_path: Path,
1446
+ existing_key_cache: set[tuple[str, str]],
1447
+ existing_key_cache_stamp: dict[str, tuple[int, int, int, int] | None] | None,
1448
+ ) -> None:
1449
+ current_stamp = csv_file_stamp_unlocked(csv_path)
1450
+ if existing_key_cache_stamp is not None and existing_key_cache_stamp.get("stamp") == current_stamp:
1451
+ return
1452
+ refreshed = _read_existing_keys_unlocked(csv_path)
1453
+ existing_key_cache.clear()
1454
+ existing_key_cache.update(refreshed)
1455
+ if existing_key_cache_stamp is not None:
1456
+ existing_key_cache_stamp["stamp"] = current_stamp
1457
+
1458
+
1459
+ def resume_key_present(
1460
+ csv_path: Path,
1461
+ key: tuple[str, str],
1462
+ existing_key_cache: set[tuple[str, str]],
1463
+ existing_key_cache_stamp: dict[str, tuple[int, int, int, int] | None] | None,
1464
+ ) -> bool:
1465
+ if not _csv_exists_no_follow(csv_path):
1466
+ existing_key_cache.clear()
1467
+ if existing_key_cache_stamp is not None:
1468
+ existing_key_cache_stamp["stamp"] = None
1469
+ return False
1470
+ with csv_file_lock(csv_path, create_parent=False):
1471
+ refresh_existing_key_cache_unlocked(csv_path, existing_key_cache, existing_key_cache_stamp)
1472
+ return key in existing_key_cache
1473
+
1474
+
1475
+ def resume_runnable_targets(
1476
+ csv_path: Path,
1477
+ targets: list[tuple[TaskFixture, Variant]],
1478
+ *,
1479
+ resume: bool,
1480
+ existing_key_cache: set[tuple[str, str]],
1481
+ existing_key_cache_stamp: dict[str, tuple[int, int, int, int] | None] | None,
1482
+ ) -> list[tuple[TaskFixture, Variant]]:
1483
+ if not resume:
1484
+ return list(targets)
1485
+ return [
1486
+ (task, variant)
1487
+ for task, variant in targets
1488
+ if not resume_key_present(csv_path, (task.id, variant.name), existing_key_cache, existing_key_cache_stamp)
1489
+ ]
1490
+
1491
+
1492
+ def append_csv(
1493
+ csv_path: Path,
1494
+ claude_ver: str,
1495
+ result: RunResult,
1496
+ *,
1497
+ skip_existing: bool = False,
1498
+ existing_key_cache: set[tuple[str, str]] | None = None,
1499
+ existing_key_cache_stamp: dict[str, tuple[int, int, int, int] | None] | None = None,
1500
+ ) -> bool:
1260
1501
  with csv_file_lock(csv_path, create_parent=True):
1261
- if skip_existing and (result.task_id, result.variant) in _read_existing_keys_unlocked(csv_path):
1262
- return False
1502
+ key = (result.task_id, result.variant)
1503
+ if skip_existing:
1504
+ if existing_key_cache is not None:
1505
+ refresh_existing_key_cache_unlocked(csv_path, existing_key_cache, existing_key_cache_stamp)
1506
+ if key in existing_key_cache:
1507
+ return False
1508
+ elif key in _read_existing_keys_unlocked(csv_path):
1509
+ return False
1263
1510
  flags = os.O_CREAT | os.O_APPEND | os.O_WRONLY
1264
1511
  fd = _open_regular_no_symlink(csv_path, flags, 0o600, create_parent=True)
1265
1512
  try:
@@ -1313,6 +1560,10 @@ def append_csv(csv_path: Path, claude_ver: str, result: RunResult, *, skip_exist
1313
1560
  finally:
1314
1561
  if fd != -1:
1315
1562
  os.close(fd)
1563
+ if existing_key_cache is not None:
1564
+ existing_key_cache.add(key)
1565
+ if existing_key_cache_stamp is not None:
1566
+ existing_key_cache_stamp["stamp"] = csv_file_stamp_unlocked(csv_path)
1316
1567
  return True
1317
1568
 
1318
1569
 
@@ -1361,7 +1612,13 @@ def write_text_no_follow(path: Path, text: str) -> None:
1361
1612
  os.close(fd)
1362
1613
 
1363
1614
 
1364
- def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) -> None:
1615
+ def append_cost_shift_ledger(
1616
+ path: Path,
1617
+ claude_ver: str,
1618
+ result: RunResult,
1619
+ *,
1620
+ replay_provenance: dict[str, Any] | None = None,
1621
+ ) -> None:
1365
1622
  shifted_cost_known = cost_shift_measured(result)
1366
1623
  byte_metrics_observed = bool(result.bytes_before or result.bytes_after)
1367
1624
  payload = {
@@ -1412,6 +1669,10 @@ def append_cost_shift_ledger(path: Path, claude_ver: str, result: RunResult) ->
1412
1669
  }
1413
1670
  if result.self_hosted_metrics is not None:
1414
1671
  payload["self_hosted_metrics"] = result.self_hosted_metrics
1672
+ if replay_provenance is not None:
1673
+ payload["replay_provenance"] = replay_provenance
1674
+ payload["evidence_source_type"] = replay_provenance.get("evidence_source_type")
1675
+ payload["public_claim_eligible"] = bool(replay_provenance.get("public_claim_eligible"))
1415
1676
  with csv_file_lock(path, create_parent=True):
1416
1677
  fd = _open_regular_no_symlink(path, os.O_CREAT | os.O_APPEND | os.O_WRONLY, 0o600, create_parent=True)
1417
1678
  try:
@@ -1435,7 +1696,9 @@ def _read_existing_keys_unlocked(csv_path: Path) -> set[tuple[str, str]]:
1435
1696
  reader = csv.DictReader(f)
1436
1697
  fieldnames = list(reader.fieldnames) if reader.fieldnames is not None else None
1437
1698
  validate_csv_schema(csv_path, fieldnames)
1438
- for row in reader:
1699
+ for index, row in enumerate(reader, start=1):
1700
+ if index > MAX_CSV_ROWS:
1701
+ raise SystemExit(f"CSV row limit exceeded for {csv_path}: > {MAX_CSV_ROWS}")
1439
1702
  tid = row.get("task_id") or ""
1440
1703
  var = row.get("variant") or ""
1441
1704
  if tid and var:
@@ -1459,10 +1722,16 @@ def _csv_exists_no_follow(csv_path: Path) -> bool:
1459
1722
 
1460
1723
  def existing_keys(csv_path: Path) -> set[tuple[str, str]]:
1461
1724
  """이미 적재된 (task_id, variant) 조합. resume 시 skip 판정에 사용."""
1725
+ keys, _stamp = existing_keys_snapshot(csv_path)
1726
+ return keys
1727
+
1728
+
1729
+ def existing_keys_snapshot(csv_path: Path) -> tuple[set[tuple[str, str]], tuple[int, int, int, int] | None]:
1730
+ """Loaded resume keys plus the CSV stamp observed under the same lock."""
1462
1731
  if not _csv_exists_no_follow(csv_path):
1463
- return set()
1732
+ return set(), None
1464
1733
  with csv_file_lock(csv_path, create_parent=False):
1465
- return _read_existing_keys_unlocked(csv_path)
1734
+ return _read_existing_keys_unlocked(csv_path), csv_file_stamp_unlocked(csv_path)
1466
1735
 
1467
1736
 
1468
1737
  def read_csv_rows(csv_path: Path) -> list[dict[str, str]]:
@@ -1487,6 +1756,356 @@ def read_csv_rows(csv_path: Path) -> list[dict[str, str]]:
1487
1756
  os.close(fd)
1488
1757
 
1489
1758
 
1759
+ def file_has_content_no_follow(path: Path) -> bool:
1760
+ try:
1761
+ fd = _open_regular_no_symlink(path)
1762
+ except FileNotFoundError:
1763
+ return False
1764
+ try:
1765
+ return os.fstat(fd).st_size > 0
1766
+ finally:
1767
+ os.close(fd)
1768
+
1769
+
1770
+ def require_evidence_object(raw: Any, *, owner: str) -> dict[str, Any]:
1771
+ if not isinstance(raw, dict):
1772
+ raise SystemExit(f"{owner} evidence row must be a JSON object")
1773
+ return raw
1774
+
1775
+
1776
+ def evidence_non_empty_string(raw: Any, *, field: str, owner: str, required: bool = True) -> str | None:
1777
+ if raw is None:
1778
+ if required:
1779
+ raise SystemExit(f"{owner} {field} must be a non-empty string")
1780
+ return None
1781
+ if not isinstance(raw, str):
1782
+ raise SystemExit(f"{owner} {field} must be a string")
1783
+ text = sanitize_note_text(raw)
1784
+ if not text:
1785
+ if required:
1786
+ raise SystemExit(f"{owner} {field} must be a non-empty string")
1787
+ return None
1788
+ return text
1789
+
1790
+
1791
+ def evidence_bool(raw: Any, *, field: str, owner: str, default: bool = False) -> bool:
1792
+ if raw is None:
1793
+ return default
1794
+ if not isinstance(raw, bool):
1795
+ raise SystemExit(f"{owner} {field} must be a boolean")
1796
+ return raw
1797
+
1798
+
1799
+ def evidence_nonnegative_int(
1800
+ raw: Any,
1801
+ *,
1802
+ field: str,
1803
+ owner: str,
1804
+ default: int = 0,
1805
+ maximum: int = MAX_USAGE_TOKEN_COUNT,
1806
+ ) -> int:
1807
+ if raw is None:
1808
+ return default
1809
+ value = normalize_usage_token(raw)
1810
+ if value is None or value > maximum:
1811
+ raise SystemExit(f"{owner} {field} must be a finite non-negative integer")
1812
+ return value
1813
+
1814
+
1815
+ def evidence_nonnegative_float(
1816
+ raw: Any,
1817
+ *,
1818
+ field: str,
1819
+ owner: str,
1820
+ default: float = 0.0,
1821
+ maximum: float = MAX_USAGE_COST_USD,
1822
+ ) -> float:
1823
+ if raw is None:
1824
+ return default
1825
+ if isinstance(raw, bool) or not isinstance(raw, (int, float)):
1826
+ raise SystemExit(f"{owner} {field} must be a finite non-negative number")
1827
+ value = float(raw)
1828
+ if not math.isfinite(value) or value < 0 or value > maximum:
1829
+ raise SystemExit(f"{owner} {field} must be a finite non-negative number")
1830
+ return value
1831
+
1832
+
1833
+ def evidence_first(raw: dict[str, Any], *keys: str) -> Any:
1834
+ for key in keys:
1835
+ if key in raw:
1836
+ return raw[key]
1837
+ return None
1838
+
1839
+
1840
+ def parse_evidence_provenance(raw: dict[str, Any], *, owner: str) -> dict[str, Any]:
1841
+ provenance = raw.get("provenance")
1842
+ if provenance is not None and not isinstance(provenance, dict):
1843
+ raise SystemExit(f"{owner} provenance must be a JSON object")
1844
+ source_raw = (
1845
+ provenance.get("evidence_source_type")
1846
+ if isinstance(provenance, dict) and "evidence_source_type" in provenance
1847
+ else raw.get("evidence_source_type")
1848
+ )
1849
+ source_type = evidence_non_empty_string(source_raw, field="evidence_source_type", owner=owner)
1850
+ assert source_type is not None
1851
+ if source_type not in EVIDENCE_REPLAY_SOURCE_TYPES:
1852
+ raise SystemExit(
1853
+ f"{owner} evidence_source_type must be one of: {', '.join(sorted(EVIDENCE_REPLAY_SOURCE_TYPES))}"
1854
+ )
1855
+ provider_name = evidence_non_empty_string(
1856
+ provenance.get("provider_name") if isinstance(provenance, dict) else raw.get("provider_name"),
1857
+ field="provider_name",
1858
+ owner=owner,
1859
+ required=False,
1860
+ )
1861
+ capture_id = evidence_non_empty_string(
1862
+ (
1863
+ provenance.get("capture_command_or_export_id")
1864
+ if isinstance(provenance, dict) and "capture_command_or_export_id" in provenance
1865
+ else raw.get("capture_command_or_export_id")
1866
+ ),
1867
+ field="capture_command_or_export_id",
1868
+ owner=owner,
1869
+ required=False,
1870
+ )
1871
+ claim_scope = evidence_non_empty_string(
1872
+ provenance.get("claim_scope") if isinstance(provenance, dict) else raw.get("claim_scope"),
1873
+ field="claim_scope",
1874
+ owner=owner,
1875
+ )
1876
+ assert claim_scope is not None
1877
+ provider_authority = (
1878
+ source_type == "provider_export"
1879
+ and provider_name is not None
1880
+ and capture_id is not None
1881
+ and claim_scope in PROVIDER_EXPORT_PUBLIC_CLAIM_SCOPES
1882
+ )
1883
+ return {
1884
+ "source_type": source_type,
1885
+ "provider_name": provider_name,
1886
+ "capture_command_or_export_id": capture_id,
1887
+ "claim_scope": claim_scope,
1888
+ "provider_public_claim_authority": provider_authority,
1889
+ }
1890
+
1891
+
1892
+ def parse_evidence_tokens(raw: dict[str, Any], *, owner: str) -> tuple[dict[str, int], set[str]]:
1893
+ token_block = raw.get("tokens")
1894
+ if token_block is not None and not isinstance(token_block, dict):
1895
+ raise SystemExit(f"{owner} tokens must be a JSON object")
1896
+ tokens: dict[str, int] = {}
1897
+ observed: set[str] = set()
1898
+ source = token_block if isinstance(token_block, dict) else {}
1899
+ for bucket, _keys in USAGE_KEY_GROUPS:
1900
+ value = source.get(bucket) if bucket in source else raw.get(bucket)
1901
+ if value is not None:
1902
+ observed.add(bucket)
1903
+ tokens[bucket] = evidence_nonnegative_int(value, field=bucket, owner=owner)
1904
+ return tokens, observed
1905
+
1906
+
1907
+ def parse_evidence_row(raw_value: Any, *, owner: str, line_number: int) -> EvidenceReplayRow:
1908
+ raw = require_evidence_object(raw_value, owner=owner)
1909
+ schema = evidence_non_empty_string(raw.get("schema_version"), field="schema_version", owner=owner)
1910
+ if schema != BENCH_RUN_EVIDENCE_SCHEMA_VERSION:
1911
+ raise SystemExit(
1912
+ f"{owner} schema_version must be {BENCH_RUN_EVIDENCE_SCHEMA_VERSION}"
1913
+ )
1914
+ task_id = evidence_non_empty_string(raw.get("task_id"), field="task_id", owner=owner)
1915
+ variant = evidence_non_empty_string(raw.get("variant"), field="variant", owner=owner)
1916
+ assert task_id is not None and variant is not None
1917
+ provenance = parse_evidence_provenance(raw, owner=owner)
1918
+ provider_authority = bool(provenance["provider_public_claim_authority"])
1919
+ raw_primary_tokens_measured = evidence_bool(
1920
+ raw.get("primary_tokens_measured"),
1921
+ field="primary_tokens_measured",
1922
+ owner=owner,
1923
+ )
1924
+ raw_cost_measured = evidence_bool(
1925
+ evidence_first(raw, "cost_measured", "primary_cost_measured"),
1926
+ field="cost_measured",
1927
+ owner=owner,
1928
+ )
1929
+ if provenance["source_type"] in {"synthetic_fixture", "manual_audit"}:
1930
+ primary_tokens_measured = False
1931
+ cost_measured = False
1932
+ elif provider_authority:
1933
+ primary_tokens_measured = raw_primary_tokens_measured
1934
+ cost_measured = raw_cost_measured
1935
+ else:
1936
+ if raw_primary_tokens_measured or raw_cost_measured:
1937
+ raise SystemExit(
1938
+ f"{owner} provider_export measured flags require provider_name, "
1939
+ "capture_command_or_export_id, and a provider-measured matched-task claim_scope"
1940
+ )
1941
+ primary_tokens_measured = False
1942
+ cost_measured = False
1943
+
1944
+ tokens, observed_token_buckets = parse_evidence_tokens(raw, owner=owner)
1945
+ if primary_tokens_measured and not {"input_tokens", "output_tokens"}.issubset(observed_token_buckets):
1946
+ raise SystemExit(
1947
+ f"{owner} primary_tokens_measured=true requires input_tokens and output_tokens evidence"
1948
+ )
1949
+ cost_usd = evidence_nonnegative_float(
1950
+ evidence_first(raw, "cost_usd", "primary_cost_usd"),
1951
+ field="cost_usd",
1952
+ owner=owner,
1953
+ )
1954
+ if cost_measured and "cost_usd" not in raw and "primary_cost_usd" not in raw:
1955
+ raise SystemExit(f"{owner} cost_measured=true requires cost_usd evidence")
1956
+
1957
+ if "success" not in raw:
1958
+ raise SystemExit(f"{owner} success must be a boolean")
1959
+ success = evidence_bool(raw.get("success"), field="success", owner=owner)
1960
+ notes = evidence_non_empty_string(raw.get("notes"), field="notes", owner=owner, required=False)
1961
+ explicit_notes = notes is not None
1962
+ model = evidence_non_empty_string(raw.get("model"), field="model", owner=owner, required=False) or "evidence-replay"
1963
+ effort = evidence_non_empty_string(raw.get("effort"), field="effort", owner=owner, required=False) or ""
1964
+ self_hosted_metrics = None
1965
+ if SELF_HOSTED_METRICS_KEY in raw:
1966
+ self_hosted_metrics = normalize_self_hosted_metrics(
1967
+ raw.get(SELF_HOSTED_METRICS_KEY),
1968
+ source="evidence_jsonl.self_hosted_metrics",
1969
+ )
1970
+ if self_hosted_metrics is None:
1971
+ raise SystemExit(f"{owner} self_hosted_metrics must be normalized explicit metrics")
1972
+
1973
+ result = RunResult(
1974
+ task_id=task_id,
1975
+ variant=variant,
1976
+ model=model,
1977
+ effort=effort,
1978
+ tokens=tokens,
1979
+ cost_usd=cost_usd,
1980
+ success=success,
1981
+ notes=notes or f"evidence replay ({provenance['source_type']})",
1982
+ corrections=evidence_nonnegative_int(raw.get("corrections"), field="corrections", owner=owner),
1983
+ cost_measured=cost_measured,
1984
+ wall_time_seconds=evidence_nonnegative_float(
1985
+ raw.get("wall_time_seconds"),
1986
+ field="wall_time_seconds",
1987
+ owner=owner,
1988
+ maximum=MAX_SELF_HOSTED_LATENCY_MS / 1000,
1989
+ ),
1990
+ turns=evidence_nonnegative_int(raw.get("turns"), field="turns", owner=owner),
1991
+ hook_triggers=evidence_nonnegative_int(raw.get("hook_triggers"), field="hook_triggers", owner=owner),
1992
+ bytes_before=evidence_nonnegative_int(raw.get("bytes_before"), field="bytes_before", owner=owner),
1993
+ bytes_after=evidence_nonnegative_int(raw.get("bytes_after"), field="bytes_after", owner=owner),
1994
+ artifacts_used=evidence_nonnegative_int(raw.get("artifacts_used"), field="artifacts_used", owner=owner),
1995
+ external_tokens=evidence_nonnegative_int(raw.get("external_tokens"), field="external_tokens", owner=owner),
1996
+ external_tokens_measured=evidence_bool(
1997
+ raw.get("external_tokens_measured"),
1998
+ field="external_tokens_measured",
1999
+ owner=owner,
2000
+ ),
2001
+ external_cost_usd=evidence_nonnegative_float(
2002
+ raw.get("external_cost_usd"),
2003
+ field="external_cost_usd",
2004
+ owner=owner,
2005
+ ),
2006
+ external_cost_measured=evidence_bool(
2007
+ raw.get("external_cost_measured"),
2008
+ field="external_cost_measured",
2009
+ owner=owner,
2010
+ ),
2011
+ provider_cached_tokens=evidence_nonnegative_int(
2012
+ raw.get("provider_cached_tokens"),
2013
+ field="provider_cached_tokens",
2014
+ owner=owner,
2015
+ ),
2016
+ provider_cached_tokens_measured=evidence_bool(
2017
+ raw.get("provider_cached_tokens_measured"),
2018
+ field="provider_cached_tokens_measured",
2019
+ owner=owner,
2020
+ ),
2021
+ primary_tokens_measured=primary_tokens_measured,
2022
+ self_hosted_metrics=self_hosted_metrics,
2023
+ )
2024
+ return EvidenceReplayRow(
2025
+ result=result,
2026
+ source_type=str(provenance["source_type"]),
2027
+ provider_name=provenance["provider_name"],
2028
+ capture_command_or_export_id=provenance["capture_command_or_export_id"],
2029
+ claim_scope=str(provenance["claim_scope"]),
2030
+ provider_export_provenance_complete=provider_authority,
2031
+ public_claim_eligible=False,
2032
+ explicit_notes=explicit_notes,
2033
+ line_number=line_number,
2034
+ )
2035
+
2036
+
2037
+ def read_evidence_jsonl(path: Path) -> list[EvidenceReplayRow]:
2038
+ fd = _open_regular_no_symlink(path)
2039
+ try:
2040
+ size = os.fstat(fd).st_size
2041
+ if size > MAX_EVIDENCE_JSONL_BYTES:
2042
+ raise SystemExit(
2043
+ f"evidence JSONL exceeds {MAX_EVIDENCE_JSONL_BYTES} bytes: {path}"
2044
+ )
2045
+ rows: list[EvidenceReplayRow] = []
2046
+ with os.fdopen(fd, "r", encoding="utf-8") as handle:
2047
+ fd = -1
2048
+ for line_number, line in enumerate(handle, start=1):
2049
+ if line_number > MAX_EVIDENCE_JSONL_LINES:
2050
+ raise SystemExit(
2051
+ f"evidence JSONL line limit exceeded for {path}: > {MAX_EVIDENCE_JSONL_LINES}"
2052
+ )
2053
+ if not line.strip():
2054
+ continue
2055
+ try:
2056
+ payload = json.loads(line)
2057
+ except json.JSONDecodeError as exc:
2058
+ raise SystemExit(
2059
+ f"{path}:{line_number} evidence row must be JSON: {exc.msg}"
2060
+ ) from None
2061
+ rows.append(parse_evidence_row(payload, owner=f"{path}:{line_number}", line_number=line_number))
2062
+ finally:
2063
+ if fd != -1:
2064
+ os.close(fd)
2065
+ if not rows:
2066
+ raise SystemExit(f"evidence JSONL contains no rows: {path}")
2067
+ return rows
2068
+
2069
+
2070
+ def validate_evidence_coverage(
2071
+ evidence_rows: list[EvidenceReplayRow],
2072
+ runnable_targets: list[tuple[TaskFixture, Variant]],
2073
+ ) -> dict[tuple[str, str], EvidenceReplayRow]:
2074
+ by_key: dict[tuple[str, str], EvidenceReplayRow] = {}
2075
+ for row in evidence_rows:
2076
+ if row.key in by_key:
2077
+ raise SystemExit(
2078
+ f"duplicate evidence row for {row.key[0]}/{row.key[1]} "
2079
+ f"(lines {by_key[row.key].line_number} and {row.line_number})"
2080
+ )
2081
+ by_key[row.key] = row
2082
+ missing = [
2083
+ f"{task.id}/{variant.name}"
2084
+ for task, variant in runnable_targets
2085
+ if (task.id, variant.name) not in by_key
2086
+ ]
2087
+ if missing:
2088
+ raise SystemExit(f"missing evidence row(s) for selected targets: {', '.join(missing)}")
2089
+ return {
2090
+ (task.id, variant.name): by_key[(task.id, variant.name)]
2091
+ for task, variant in runnable_targets
2092
+ }
2093
+
2094
+
2095
+ def run_evidence_fixture(task: TaskFixture, variant: Variant, evidence: EvidenceReplayRow) -> RunResult:
2096
+ result = evidence.result
2097
+ if result.task_id != task.id or result.variant != variant.name:
2098
+ raise SystemExit(
2099
+ f"evidence target mismatch: expected {task.id}/{variant.name}, "
2100
+ f"got {result.task_id}/{result.variant}"
2101
+ )
2102
+ if result.model == "evidence-replay":
2103
+ result.model = task.model
2104
+ if not result.effort:
2105
+ result.effort = task.effort or ""
2106
+ return result
2107
+
2108
+
1490
2109
  def row_int(row: dict[str, str], key: str) -> int:
1491
2110
  try:
1492
2111
  return int(float(row.get(key) or 0))
@@ -1546,6 +2165,77 @@ def row_cost_shift_measured(row: dict[str, str]) -> bool:
1546
2165
  )
1547
2166
 
1548
2167
 
2168
def measurement_baseline_contract() -> dict[str, Any]:
    """Return the descriptive measurement-baseline block embedded in reports.

    The block is informational only. It leaves the CSV schema unchanged and
    does not authorize token/cost savings claims on its own; those remain
    gated by matched successful tasks, measured primary tokens/costs,
    shifted-cost accounting, and quality gates.
    """
    # Field names grouped by what they describe about a run.
    captured_fields = {
        "task_identity": ["task_id", "variant"],
        "run_configuration": ["model", "effort", "claude_version"],
        "primary_token_buckets": [
            "input_tokens",
            "output_tokens",
            "cache_read",
            "cache_creation",
            "total_tokens",
            "primary_tokens_measured",
        ],
        "primary_cost": ["cost_usd", "cost_measured"],
        "provider_cache_telemetry": ["provider_cached_tokens", "provider_cached_tokens_measured"],
        "latency": ["wall_time_seconds"],
        "quality_and_result": ["success", "corrections", "notes"],
        "tooling_and_proxy_metrics": ["turns", "hook_triggers", "bytes_before", "bytes_after", "artifacts_used"],
        "shifted_cost_accounting": [
            "external_tokens",
            "external_tokens_measured",
            "external_cost_usd",
            "external_cost_measured",
            "total_cost_with_shift_usd",
        ],
    }

    # Conditions listed per claim type.
    claim_eligible_fields = {
        "token_savings": [
            "matched successful baseline and variant tasks",
            "primary_tokens_measured=true on both sides",
            "quality_gate=pass",
        ],
        "shifted_cost_savings": [
            "matched successful baseline and variant tasks",
            "cost_measured=true on both sides",
            "external_cost_measured=true when external_tokens are present",
            "quality_gate=pass",
        ],
    }

    proxy_only_fields = {
        "byte_metrics": ["bytes_before", "bytes_after"],
        "token_proxy": "chars_div_4_proxy_only",
        "provider_cache": "diagnostic_telemetry_not_contextguard_token_reduction",
    }

    missing_future_run_identity_fields = [
        "repo_revision",
        "agent_harness",
        "feature_flags",
        "provider_name",
        "success_command_identity",
    ]

    claim_boundary = {
        "descriptive_contract_only": True,
        "enables_savings_claims_by_itself": False,
        "requires_matched_successful_tasks": True,
        "requires_shifted_cost_accounting_for_cost_claims": True,
        "raw_proxy_estimates_are_not_hosted_api_token_savings": True,
    }

    # Key order is kept stable so serialized reports do not reorder.
    contract: dict[str, Any] = {
        "schema_version": MEASUREMENT_BASELINE_SCHEMA_VERSION,
        "csv_schema_unchanged": True,
        "csv_columns": list(CSV_COLUMNS),
    }
    contract["captured_fields"] = captured_fields
    contract["claim_eligible_fields"] = claim_eligible_fields
    contract["proxy_only_fields"] = proxy_only_fields
    contract["missing_future_run_identity_fields"] = missing_future_run_identity_fields
    contract["claim_boundary"] = claim_boundary
    return contract
2237
+
2238
+
1549
2239
  def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str) -> dict[str, Any]:
1550
2240
  by_variant: dict[str, dict[str, Any]] = {}
1551
2241
  successful_rows_by_variant_task: dict[str, dict[str, list[dict[str, str]]]] = {}
@@ -2187,10 +2877,11 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
2187
2877
  claim_status = "token_savings_observed_cost_unmeasured"
2188
2878
  elif token_savings_observed:
2189
2879
  claim_status = "token_savings_observed_cost_shift_watch"
2190
- return {
2880
+ report = {
2191
2881
  "schema": "context-guard-bench-report-v1",
2192
2882
  "baseline_variant": baseline_variant,
2193
2883
  "row_count": len(rows),
2884
+ "measurement_baseline": measurement_baseline_contract(),
2194
2885
  "summary_by_variant": by_variant,
2195
2886
  "comparisons": comparisons,
2196
2887
  "matched_pair_evidence": matched_pair_evidence,
@@ -2200,22 +2891,854 @@ def summarize_benchmark_rows(rows: list[dict[str, str]], baseline_variant: str)
2200
2891
  "shifted cost savings require measured primary cost and measured external cost when "
2201
2892
  "external tokens are present. Wall time and provider cached-token fields are diagnostic "
2202
2893
  "telemetry, not proof of ContextGuard-caused token or cost savings; provider-cache "
2203
- "discounts must stay separate from token-reduction claims."
2894
+ "discounts must stay separate from token-reduction claims. Public hosted savings "
2895
+ "claims must use public_claim_readiness.claim_allowed; unsupported claims are forbidden."
2204
2896
  ),
2205
2897
  }
2898
+ report["public_claim_readiness"] = build_public_claim_readiness(report)
2899
+ report["default_matrix"] = build_default_matrix(report)
2900
+ return report
2901
+
2902
def annotate_replay_report(
    report: dict[str, Any],
    replay_rows: list[EvidenceReplayRow],
    *,
    mixed_csv: bool,
) -> dict[str, Any]:
    """Annotate ``report`` in place with replay provenance and claim status.

    Args:
        report: Benchmark report dictionary; it is mutated and returned.
        replay_rows: Evidence rows that were replayed for this report.
        mixed_csv: Whether the report also contains rows from a CSV source.

    Returns:
        The same ``report`` object with ``raw_metric_claim_status``,
        ``public_claim_status``, ``public_claim_eligible``,
        ``replay_evidence``, ``public_claim_readiness`` and
        ``default_matrix`` set. ``claim_status`` is overwritten with the
        public status whenever the report is not public-claim eligible.
    """

    def _pair_allows_both_claims(entry: Any) -> bool:
        # A pair counts only when its claim boundary allows both the token
        # and the shifted-cost claim.
        if not isinstance(entry, dict):
            return False
        boundary = entry.get("claim_boundary")
        if not isinstance(boundary, dict):
            return False
        return bool(boundary.get("token_savings_claim_allowed")) and bool(
            boundary.get("shifted_cost_claim_allowed")
        )

    source_types = sorted({row.source_type for row in replay_rows})
    provider_names = sorted({row.provider_name for row in replay_rows if row.provider_name})
    claim_scopes = sorted({row.claim_scope for row in replay_rows})

    # Complete means: no mixed CSV rows and one replay row per report row.
    if mixed_csv:
        same_run_complete = False
    else:
        same_run_complete = len(replay_rows) == int(report.get("row_count") or 0)

    all_provider_claim_authority = bool(replay_rows) and all(
        row.provider_export_provenance_complete for row in replay_rows
    )
    raw_claim_status = str(report.get("claim_status") or "")
    pair_evidence = report.get("matched_pair_evidence")
    matched_claim_gates_allow_public_claim = (
        isinstance(pair_evidence, list)
        and bool(pair_evidence)
        and all(_pair_allows_both_claims(entry) for entry in pair_evidence)
    )
    report_claim_gates_allow_public_claim = (
        raw_claim_status in REPLAY_PUBLIC_CLAIM_ELIGIBLE_RAW_STATUSES
        and matched_claim_gates_allow_public_claim
    )

    # Only the candidate status is public-claim eligible.
    public_claim_eligible = False
    if not same_run_complete:
        public_claim_status = REPLAY_UNKNOWN_MIXED_CSV_STATUS
    elif not all_provider_claim_authority:
        public_claim_status = REPLAY_NOT_PUBLIC_CLAIM_STATUS
    elif report_claim_gates_allow_public_claim:
        public_claim_status = REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
        public_claim_eligible = True
    else:
        public_claim_status = REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS

    report["raw_metric_claim_status"] = raw_claim_status
    report["public_claim_status"] = public_claim_status
    report["public_claim_eligible"] = public_claim_eligible
    if not public_claim_eligible:
        report["claim_status"] = public_claim_status

    target_keys = [f"{row.result.task_id}/{row.result.variant}" for row in replay_rows]
    report["replay_evidence"] = {
        "schema_version": BENCH_RUN_EVIDENCE_SCHEMA_VERSION,
        "mode": "evidence_jsonl_replay",
        "row_count": len(replay_rows),
        "source_types": source_types,
        "provider_names": provider_names,
        "claim_scopes": claim_scopes,
        "same_run_complete": same_run_complete,
        "mixed_csv": mixed_csv,
        "provider_export_provenance_complete": all_provider_claim_authority,
        "report_claim_gates_allow_public_claim": report_claim_gates_allow_public_claim,
        "public_claim_status": public_claim_status,
        "public_claim_eligible": public_claim_eligible,
        "target_keys": target_keys,
        "claim_boundary": REPLAY_CLAIM_BOUNDARY,
    }
    # Both derived blocks read the fields written above, so they are rebuilt last.
    report["public_claim_readiness"] = build_public_claim_readiness(
        report,
        replay_rows=replay_rows,
        mixed_csv=mixed_csv,
    )
    report["default_matrix"] = build_default_matrix(report)
    return report
2972
+
2973
+
2974
+ def report_public_claim_status(report: dict[str, Any]) -> tuple[str, bool | None]:
2975
+ if "public_claim_status" in report:
2976
+ return str(report.get("public_claim_status")), bool(report.get("public_claim_eligible"))
2977
+ return (
2978
+ "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger",
2979
+ None,
2980
+ )
2981
+
2982
+
2983
+
2984
+ def public_claim_readiness_gate(
2985
+ gate_id: str,
2986
+ label: str,
2987
+ passed: bool,
2988
+ reason: str,
2989
+ evidence: dict[str, Any] | None = None,
2990
+ *,
2991
+ unknown: bool = False,
2992
+ ) -> dict[str, Any]:
2993
+ status = "unknown" if unknown else ("pass" if passed else "fail")
2994
+ return {
2995
+ "id": gate_id,
2996
+ "label": label,
2997
+ "required": True,
2998
+ "status": status,
2999
+ "passed": passed and not unknown,
3000
+ "reason": reason,
3001
+ "evidence": evidence or {},
3002
+ }
3003
+
3004
+
3005
def public_claim_pair_side_measured(pair: dict[str, Any], side: str, metric: str) -> bool:
    """Report whether ``pair["measurements"][side][metric]["measured"]`` is truthy.

    Any level that is missing or is not a dictionary is treated as empty, so
    malformed pairs yield ``False`` instead of raising.
    """
    node: dict[str, Any] = pair
    for step in ("measurements", side, metric):
        child = node.get(step)
        node = child if isinstance(child, dict) else {}
    return bool(node.get("measured"))
3010
+
3011
+
3012
def public_claim_numeric_values(items: list[Any]) -> list[float]:
    """Return the finite ``int``/``float`` entries of ``items`` as floats.

    Booleans, non-numeric values, NaN and infinities are dropped; input order
    is preserved.
    """
    return [
        float(entry)
        for entry in items
        if not isinstance(entry, bool)
        and isinstance(entry, (int, float))
        and math.isfinite(float(entry))
    ]
3021
+
3022
+
3023
def public_claim_readiness_evidence_text(evidence: dict[str, Any]) -> str:
    """Render a gate's evidence mapping as a compact ``key=value`` string.

    Lists and dictionaries show at most their first five entries, followed by
    ``",…"`` when more exist. Entries are joined with ``"; "``.
    """

    def _render(value: Any) -> str:
        if isinstance(value, list):
            shown = ",".join(str(entry) for entry in value[:5])
        elif isinstance(value, dict):
            shown = ",".join(f"{k}={v}" for k, v in list(value.items())[:5])
        else:
            return str(value)
        # Mark truncated containers so the reader knows entries were omitted.
        return shown + ",…" if len(value) > 5 else shown

    return "; ".join(f"{key}={_render(value)}" for key, value in evidence.items())
3038
+
3039
+
3040
def build_public_claim_readiness(
    report: dict[str, Any],
    *,
    replay_rows: list[EvidenceReplayRow] | None = None,
    mixed_csv: bool = False,
) -> dict[str, Any]:
    """Evaluate the readiness gates for a public savings claim.

    Six gates are built, in this order: matched successful tasks,
    provider-measured token/cost, quality non-inferiority, shifted-cost
    accounting, confidence/failure notes, and provider-export provenance.
    The last two are reported as ``unknown`` when no replay rows are given.

    Args:
        report: Benchmark report; read only.
        replay_rows: Replayed evidence rows, if the report came from replay.
        mixed_csv: Whether CSV rows were mixed into the report.

    Returns:
        A readiness dictionary holding the gate records, the blocking gate
        ids, and ``claim_allowed``. ``claim_allowed`` is true only when every
        gate passed, the observed public claim status equals
        ``REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS``, and the observed
        eligibility is truthy.
    """

    def _dict_entries(key: str) -> list[dict[str, Any]]:
        # Anything other than a list of dicts is treated as absent.
        raw = report.get(key)
        if not isinstance(raw, list):
            return []
        return [entry for entry in raw if isinstance(entry, dict)]

    def _measured_on_both_sides(pair: dict[str, Any], metric: str) -> bool:
        return public_claim_pair_side_measured(pair, "baseline", metric) and (
            public_claim_pair_side_measured(pair, "variant", metric)
        )

    def _shifted_cost_ok(pair: dict[str, Any]) -> bool:
        boundary = pair.get("claim_boundary")
        return (
            isinstance(boundary, dict)
            and bool(boundary.get("shifted_cost_claim_allowed"))
            and _measured_on_both_sides(pair, "total_cost_with_shift_usd")
        )

    comparisons = _dict_entries("comparisons")
    pairs = _dict_entries("matched_pair_evidence")
    row_count = int(report.get("row_count") or 0)
    raw_replay_evidence = report.get("replay_evidence")
    replay_evidence = raw_replay_evidence if isinstance(raw_replay_evidence, dict) else {}
    rows = replay_rows or []
    replay_count = len(rows)
    public_claim_status, public_claim_eligible = report_public_claim_status(report)
    raw_metric_claim_status = report.get("raw_metric_claim_status", report.get("claim_status"))

    gates: list[dict[str, Any]] = []

    # Gate 1: matched successful tasks.
    comparison_variants = [str(item.get("variant")) for item in comparisons if item.get("variant")]
    matched_counts = public_claim_numeric_values(
        [item.get("matched_successful_task_count") for item in comparisons]
    )
    missing_baseline_successes = []
    for item in comparisons:
        missing_baseline_successes.extend(item.get("missing_baseline_success_tasks") or [])
    baseline_success_counts = public_claim_numeric_values(
        [item.get("baseline_successful_task_count") for item in comparisons]
    )
    comparison_total = len(comparisons)
    matched_tasks_pass = (
        bool(comparisons)
        and bool(pairs)
        and len(matched_counts) == comparison_total
        and all(count > 0 for count in matched_counts)
        and len(baseline_success_counts) == comparison_total
        and all(count > 0 for count in baseline_success_counts)
        and not missing_baseline_successes
    )
    if matched_tasks_pass:
        matched_reason = "matched_successful_tasks_present"
    else:
        matched_reason = "missing_or_regressed_matched_successful_tasks"
    gates.append(public_claim_readiness_gate(
        "matched_successful_tasks",
        "Matched successful tasks",
        matched_tasks_pass,
        matched_reason,
        {
            "comparison_count": len(comparisons),
            "matched_pair_count": len(pairs),
            "variants": comparison_variants[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
            "min_matched_successful_task_count": min(matched_counts) if matched_counts else None,
            "missing_baseline_success_task_count": len(missing_baseline_successes),
        },
    ))

    # Gate 2: primary tokens and primary cost measured on both sides of every pair.
    provider_measured_token_cost_pass = bool(pairs) and all(
        _measured_on_both_sides(pair, "primary_tokens")
        and _measured_on_both_sides(pair, "primary_cost_usd")
        for pair in pairs
    )
    if provider_measured_token_cost_pass:
        measured_reason = "provider_measured_primary_tokens_and_cost"
    else:
        measured_reason = "missing_provider_measured_primary_tokens_or_cost"
    gates.append(public_claim_readiness_gate(
        "provider_measured_token_cost",
        "Provider-measured token and primary cost",
        provider_measured_token_cost_pass,
        measured_reason,
        {
            "matched_pair_count": len(pairs),
            "required_fields": [
                "matched_pair_evidence[*].measurements.baseline.primary_tokens.measured",
                "matched_pair_evidence[*].measurements.variant.primary_tokens.measured",
                "matched_pair_evidence[*].measurements.baseline.primary_cost_usd.measured",
                "matched_pair_evidence[*].measurements.variant.primary_cost_usd.measured",
            ],
        },
    ))

    # Gate 3: every comparison's quality gate must be "pass".
    quality_gates = sorted({str(item.get("quality_gate") or "unknown") for item in comparisons})
    failure_deltas = public_claim_numeric_values(
        [item.get("failure_rate_delta_pp") for item in comparisons]
    )
    correction_deltas = public_claim_numeric_values(
        [item.get("corrections_delta_per_successful_task") for item in comparisons]
    )
    quality_pass = bool(comparisons) and all(item.get("quality_gate") == "pass" for item in comparisons)
    gates.append(public_claim_readiness_gate(
        "quality_non_inferiority",
        "Quality non-inferiority",
        quality_pass,
        "all_quality_gates_pass" if quality_pass else "quality_gate_not_pass",
        {
            "quality_gates": quality_gates,
            "max_failure_rate_delta_pp": max(failure_deltas) if failure_deltas else None,
            "max_corrections_delta_per_successful_task": max(correction_deltas) if correction_deltas else None,
        },
    ))

    # Gate 4: shifted-cost claim allowed and shifted totals measured on both sides.
    shifted_cost_pass = bool(pairs) and all(_shifted_cost_ok(pair) for pair in pairs)
    if shifted_cost_pass:
        shifted_reason = "shifted_cost_claim_gates_pass"
    else:
        shifted_reason = "missing_shifted_cost_claim_accounting"
    gates.append(public_claim_readiness_gate(
        "shifted_cost_accounting",
        "Shifted-cost accounting",
        shifted_cost_pass,
        shifted_reason,
        {
            "matched_pair_count": len(pairs),
            "required_fields": [
                "matched_pair_evidence[*].claim_boundary.shifted_cost_claim_allowed",
                "matched_pair_evidence[*].measurements.baseline.total_cost_with_shift_usd.measured",
                "matched_pair_evidence[*].measurements.variant.total_cost_with_shift_usd.measured",
            ],
        },
    ))

    # Gate 5: explicit notes on every replay row and failure-rate fields on comparisons.
    has_replay = bool(rows)
    explicit_note_count = sum(1 for row in rows if row.explicit_notes)
    failed_rows = [row for row in rows if not row.result.success]
    failed_rows_with_notes = sum(1 for row in failed_rows if row.explicit_notes)
    required_failure_fields = (
        "baseline_failure_rate",
        "variant_failure_rate",
        "failure_rate_delta_pp",
        "paired_corrections_task_count",
    )
    comparison_failure_fields_present = bool(comparisons) and all(
        all(field in item for field in required_failure_fields)
        for item in comparisons
    )
    confidence_notes_pass = (
        has_replay
        and explicit_note_count == replay_count
        and failed_rows_with_notes == len(failed_rows)
        and comparison_failure_fields_present
    )
    if confidence_notes_pass:
        notes_reason = "explicit_replay_notes_and_failure_rate_evidence_present"
    else:
        notes_reason = "missing_explicit_replay_notes_or_failure_evidence"
    gates.append(public_claim_readiness_gate(
        "confidence_failure_notes",
        "Confidence and failure notes",
        confidence_notes_pass,
        notes_reason,
        {
            "replay_row_count": replay_count,
            "explicit_note_count": explicit_note_count,
            "failed_row_count": len(failed_rows),
            "failed_rows_with_notes": failed_rows_with_notes,
            "comparison_failure_fields_present": comparison_failure_fields_present,
        },
        unknown=not has_replay,
    ))

    # Gate 6: provider-export provenance for a complete, unmixed replay.
    if replay_evidence:
        # A previously recorded replay block takes precedence over recomputation.
        same_run_complete = bool(replay_evidence.get("same_run_complete"))
    else:
        same_run_complete = has_replay and not mixed_csv and replay_count == row_count
    source_types = sorted({row.source_type for row in rows})
    provider_names = sorted({row.provider_name for row in rows if row.provider_name})
    provider_export_pass = (
        has_replay
        and not mixed_csv
        and same_run_complete
        and replay_count == row_count
        and all(row.provider_export_provenance_complete for row in rows)
    )
    if provider_export_pass:
        provenance_reason = "complete_provider_export_same_run_provenance"
    else:
        provenance_reason = "missing_or_mixed_provider_export_provenance"
    gates.append(public_claim_readiness_gate(
        "provider_export_provenance",
        "Provider-export provenance",
        provider_export_pass,
        provenance_reason,
        {
            "replay_row_count": replay_count,
            "report_row_count": row_count,
            "mixed_csv": mixed_csv,
            "same_run_complete": same_run_complete,
            "source_types": source_types,
            "provider_names": provider_names[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
        },
        unknown=not has_replay,
    ))

    passed_required_gate_count = sum(1 for gate in gates if gate["passed"])
    blocking_gate_ids = [str(gate["id"]) for gate in gates if not gate["passed"]]
    required_gates_pass = passed_required_gate_count == len(gates)
    claim_allowed = (
        required_gates_pass
        and public_claim_status == REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
        and bool(public_claim_eligible)
    )

    if claim_allowed:
        readiness_status = REPLAY_PUBLIC_CLAIM_CANDIDATE_STATUS
        reason = "all_required_public_claim_gates_pass"
    elif not has_replay:
        readiness_status = "csv_provenance_unknown_requires_original_evidence_or_trusted_ledger"
        reason = "replay_evidence_required_for_public_claim"
    elif provider_export_pass:
        readiness_status = REPLAY_PROVIDER_CLAIM_GATES_NOT_MET_STATUS
        reason = "provider_export_present_but_readiness_gates_failed"
    else:
        readiness_status = "public_claim_blocked"
        reason = "unsupported_public_savings_claim_forbidden"

    return {
        "schema_version": PUBLIC_CLAIM_READINESS_SCHEMA_VERSION,
        "generated_from": "matched_pair_evidence_and_replay_provenance",
        "status": readiness_status,
        "reason": reason,
        "claim_allowed": claim_allowed,
        "public_claim_status_observed": public_claim_status,
        "public_claim_eligible_observed": public_claim_eligible,
        "raw_metric_claim_status_observed": raw_metric_claim_status,
        "required_gate_ids": list(PUBLIC_CLAIM_READINESS_GATE_IDS),
        "required_gate_count": len(gates),
        "passed_required_gate_count": passed_required_gate_count,
        "blocking_gate_ids": blocking_gate_ids,
        "gates": gates,
        "claim_boundary": PUBLIC_CLAIM_READINESS_CLAIM_BOUNDARY,
    }
3255
+
3256
+
3257
def default_matrix_normalized_key(value: Any) -> str:
    """Normalize ``value`` to a lowercase, underscore-separated key.

    Falsy values become ``""``. Each run of characters outside ``a-z0-9`` is
    collapsed to one underscore, and leading/trailing underscores are removed.
    """
    lowered = str(value).lower() if value else ""
    collapsed = re.sub(r"[^a-z0-9]+", "_", lowered)
    return collapsed.strip("_")
3260
+
3261
+
3262
def default_matrix_contains_key(haystack: str, needle: str) -> bool:
    """Return whether the normalized ``needle`` occurs inside ``haystack``.

    ``needle`` is normalized with ``default_matrix_normalized_key``;
    ``haystack`` is used as given. A needle that normalizes to the empty
    string never matches.
    """
    normalized = default_matrix_normalized_key(needle)
    return bool(normalized) and normalized in haystack
3267
+
3268
+
3269
def infer_default_matrix_lanes(pair: dict[str, Any]) -> list[tuple[str, str]]:
    """Match a pair to lanes by keyword, returning ``(lane_id, method)`` tuples.

    For each lane in ``DEFAULT_MATRIX_LANES`` the normalized task id is
    checked against the lane's task keywords first (method ``"exact_key"``).
    Only if none match is the normalized variant checked against the lane's
    variant keywords (method ``"name_heuristic"``). A lane contributes at most
    one tuple.
    """
    normalized_task = default_matrix_normalized_key(pair.get("task_id"))
    normalized_variant = default_matrix_normalized_key(pair.get("variant"))
    found: list[tuple[str, str]] = []
    for lane in DEFAULT_MATRIX_LANES:
        lane_id = str(lane["id"])
        task_keywords = tuple(str(word) for word in lane.get("task_keywords", ()))
        variant_keywords = tuple(str(word) for word in lane.get("variant_keywords", ()))
        task_hit = any(default_matrix_contains_key(normalized_task, word) for word in task_keywords)
        if task_hit:
            found.append((lane_id, "exact_key"))
            continue
        variant_hit = any(
            default_matrix_contains_key(normalized_variant, word) for word in variant_keywords
        )
        if variant_hit:
            found.append((lane_id, "name_heuristic"))
    return found
3282
+
3283
+
3284
+ def default_matrix_number(value: Any) -> float | None:
3285
+ if isinstance(value, bool) or not isinstance(value, (int, float)):
3286
+ return None
3287
+ numeric = float(value)
3288
+ if not math.isfinite(numeric):
3289
+ return None
3290
+ return numeric
3291
+
3292
+
3293
def default_matrix_unique(values: list[Any]) -> list[Any]:
    """Return ``values`` with later duplicates removed, keeping first-seen order.

    Duplicates are detected by equality against the values already kept, so
    unhashable entries such as lists are accepted.
    """
    kept: list[Any] = []
    for candidate in values:
        if candidate in kept:
            continue
        kept.append(candidate)
    return kept
3299
+
3300
+
3301
def default_matrix_cap(values: list[Any]) -> list[Any]:
    """Deduplicate ``values`` and keep at most ``MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS``."""
    distinct = default_matrix_unique(values)
    return distinct[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS]
3303
+
3304
+
3305
def default_matrix_lane_match_method(methods: set[str]) -> str:
    """Pick the reported match method for a lane.

    ``"exact_key"`` wins over ``"name_heuristic"``; ``"absent"`` is returned
    when neither is in ``methods``.
    """
    for preferred in ("exact_key", "name_heuristic"):
        if preferred in methods:
            return preferred
    return "absent"
3311
+
3312
+
3313
def default_matrix_clamp_classification(classification: str, ceiling: str) -> tuple[str, bool]:
    """Cap ``classification`` at ``ceiling`` and report whether it was lowered.

    ``"reject/rework"`` is never changed, and a ceiling missing from
    ``DEFAULT_MATRIX_CLASSIFICATION_STRENGTH`` imposes no cap. A
    classification missing from that table is treated as strength ``0``.

    Returns:
        ``(resulting_classification, was_clamped)``.
    """
    if classification == "reject/rework" or ceiling not in DEFAULT_MATRIX_CLASSIFICATION_STRENGTH:
        return classification, False
    strength = DEFAULT_MATRIX_CLASSIFICATION_STRENGTH.get(classification, 0)
    limit = DEFAULT_MATRIX_CLASSIFICATION_STRENGTH[ceiling]
    if strength <= limit:
        return classification, False
    return ceiling, True
3323
+
3324
+
3325
def default_matrix_token_evidence(token_values: list[float], pair_count: int, byte_proxy_positive: bool) -> str:
    """Label the strength of a lane's token evidence.

    Returns:
        ``"measured_positive"`` when every pair has a value and all values
        are positive; ``"measured_regression"`` when any value is negative;
        ``"measured_incomplete_or_mixed"`` for other measured cases;
        ``"byte_proxy_only"`` when nothing is measured but the byte proxy is
        positive; otherwise ``"unavailable"``.
    """
    if not token_values:
        # Without measured values, only the byte proxy can contribute.
        return "byte_proxy_only" if byte_proxy_positive else "unavailable"
    fully_covered = bool(pair_count) and len(token_values) == pair_count
    if fully_covered and all(value > 0 for value in token_values):
        return "measured_positive"
    if any(value < 0 for value in token_values):
        return "measured_regression"
    return "measured_incomplete_or_mixed"
3335
+
3336
+
3337
def classify_default_matrix_lane(
    lane_id: str,
    pairs: list[dict[str, Any]],
    methods: set[str],
) -> dict[str, Any]:
    """Classify one default-matrix lane from its matched pairs.

    Args:
        lane_id: Key into ``DEFAULT_MATRIX_LANE_BY_ID``.
        pairs: Matched-pair evidence entries assigned to this lane.
        methods: Match methods through which the pairs were assigned.

    Returns:
        A lane record with ``classification`` set to one of
        ``"reject/rework"``, ``"default-on"``, ``"advisory"`` or
        ``"experimental"``, then capped at the lane's ``policy_ceiling``.
        ``public_claim_allowed`` is always ``False``. A lane without pairs is
        ``"experimental"`` with reason ``"no_matched_lane_evidence"``.
    """
    lane = DEFAULT_MATRIX_LANE_BY_ID[lane_id]
    policy_ceiling = str(lane["policy_ceiling"])
    lane_claim_boundary = {
        "classification_is_reporting_only": True,
        "hosted_api_savings_claim_allowed": False,
        "requires_report_claim_status_and_matched_pair_evidence": True,
    }

    if not pairs:
        return {
            "lane": lane_id,
            "label": lane["label"],
            "classification": "experimental",
            "policy_ceiling": policy_ceiling,
            "policy_clamped": False,
            "lane_match_method": "absent",
            "matched_task_count": 0,
            "matched_tasks": [],
            "matched_variants": [],
            "quality_gate": "insufficient_evidence",
            "quality_gates": [],
            "token_evidence": "unavailable",
            "shifted_cost_evidence": "unavailable",
            "byte_proxy_evidence": "unavailable",
            "matched_pair_claim_gates": {
                "token_savings_claim_allowed": False,
                "shifted_cost_claim_allowed": False,
            },
            "public_claim_allowed": False,
            "reason_codes": ["no_matched_lane_evidence"],
            "claim_boundary": lane_claim_boundary,
        }

    def _finite_deltas(field: str) -> list[float]:
        # Collect finite numeric ``delta[field]`` values; skip pairs without a delta dict.
        collected: list[float] = []
        for pair in pairs:
            delta = pair.get("delta")
            if not isinstance(delta, dict):
                continue
            number = default_matrix_number(delta.get(field))
            if number is not None:
                collected.append(number)
        return collected

    def _every_pair_allows(flag: str) -> bool:
        # True only when every pair's claim boundary has a truthy ``flag``.
        def _allows(pair: dict[str, Any]) -> bool:
            boundary = pair.get("claim_boundary")
            return isinstance(boundary, dict) and bool(boundary.get(flag))

        return bool(pairs) and all(_allows(pair) for pair in pairs)

    pair_total = len(pairs)
    quality_gates = sorted({str(pair.get("quality_gate") or "unknown") for pair in pairs})
    quality_gate = quality_gates[0] if len(quality_gates) == 1 else "mixed"
    token_values = _finite_deltas("token_savings_pct")
    cost_values = _finite_deltas("cost_savings_pct_with_shift")
    byte_after_deltas = _finite_deltas("bytes_after_total")
    # A negative ``bytes_after_total`` delta is what counts as a positive byte proxy.
    byte_proxy_positive = bool(byte_after_deltas) and any(value < 0 for value in byte_after_deltas)
    token_claim_gate = _every_pair_allows("token_savings_claim_allowed")
    shifted_cost_claim_gate = _every_pair_allows("shifted_cost_claim_allowed")

    tokens_all_positive = len(token_values) == pair_total and all(value > 0 for value in token_values)
    costs_all_non_negative = len(cost_values) == pair_total and all(value >= 0 for value in cost_values)
    failing_quality_gates = [gate for gate in quality_gates if gate != "pass"]

    reason_codes: list[str] = []
    if failing_quality_gates:
        classification = "reject/rework"
        reason_codes.extend(f"quality_gate_{gate}" for gate in failing_quality_gates)
    elif any(value < 0 for value in token_values):
        classification = "reject/rework"
        reason_codes.append("measured_token_regression")
    elif any(value < 0 for value in cost_values):
        classification = "reject/rework"
        reason_codes.append("measured_shifted_cost_regression")
    elif tokens_all_positive and costs_all_non_negative and token_claim_gate and shifted_cost_claim_gate:
        classification = "default-on"
        reason_codes.append("quality_pass_measured_token_and_shifted_cost_non_regression")
    elif tokens_all_positive and token_claim_gate:
        classification = "advisory"
        reason_codes.append("quality_pass_measured_token_savings_shifted_cost_unproven")
    elif byte_proxy_positive:
        classification = "advisory"
        reason_codes.append("quality_pass_byte_proxy_only")
    else:
        classification = "experimental"
        reason_codes.append("quality_pass_but_no_positive_measured_or_proxy_savings")

    # The optional-compression lane is not advisory without measured token values.
    if lane_id == "optional_compression" and classification == "advisory" and not token_values:
        classification = "experimental"
        reason_codes.append("optional_compression_requires_provider_token_evidence_for_advisory")

    classification, policy_clamped = default_matrix_clamp_classification(classification, policy_ceiling)
    if policy_clamped:
        reason_codes.append(f"policy_ceiling_{policy_ceiling}")

    if cost_values and costs_all_non_negative:
        shifted_cost_evidence = "measured_non_regression"
    elif any(value < 0 for value in cost_values):
        shifted_cost_evidence = "measured_regression"
    else:
        shifted_cost_evidence = "unavailable"

    if byte_proxy_positive:
        byte_proxy_evidence = "observed_positive"
    elif byte_after_deltas:
        byte_proxy_evidence = "observed_non_positive"
    else:
        byte_proxy_evidence = "unavailable"

    return {
        "lane": lane_id,
        "label": lane["label"],
        "classification": classification,
        "policy_ceiling": policy_ceiling,
        "policy_clamped": policy_clamped,
        "lane_match_method": default_matrix_lane_match_method(methods),
        "matched_task_count": len({str(pair.get("task_id")) for pair in pairs}),
        "matched_tasks": default_matrix_cap([pair.get("task_id") for pair in pairs if pair.get("task_id")]),
        "matched_variants": default_matrix_cap([pair.get("variant") for pair in pairs if pair.get("variant")]),
        "quality_gate": quality_gate,
        "quality_gates": quality_gates,
        "token_evidence": default_matrix_token_evidence(token_values, pair_total, byte_proxy_positive),
        "shifted_cost_evidence": shifted_cost_evidence,
        "byte_proxy_evidence": byte_proxy_evidence,
        "matched_pair_claim_gates": {
            "token_savings_claim_allowed": token_claim_gate,
            "shifted_cost_claim_allowed": shifted_cost_claim_gate,
        },
        "public_claim_allowed": False,
        "reason_codes": default_matrix_unique(reason_codes),
        "claim_boundary": lane_claim_boundary,
    }
3484
+
3485
+
3486
def build_default_matrix(report: dict[str, Any]) -> dict[str, Any]:
    """Build the reporting-only default matrix from matched-pair evidence.

    Each dict entry of ``report["matched_pair_evidence"]`` is assigned to
    every lane it matches. Variants of pairs that match no lane are collected
    under ``summary.unmatched_variants``. One lane record is produced for
    each id in ``DEFAULT_MATRIX_LANE_IDS``, including lanes with no pairs.
    """
    raw_pairs = report.get("matched_pair_evidence")
    pair_entries = raw_pairs if isinstance(raw_pairs, list) else []

    buckets: dict[str, list[dict[str, Any]]] = {}
    methods: dict[str, set[str]] = {}
    for lane_id in DEFAULT_MATRIX_LANE_IDS:
        buckets[lane_id] = []
        methods[lane_id] = set()

    unmatched_variants: set[str] = set()
    for pair in pair_entries:
        if not isinstance(pair, dict):
            continue
        lane_matches = infer_default_matrix_lanes(pair)
        if lane_matches:
            for lane_id, method in lane_matches:
                buckets[lane_id].append(pair)
                methods[lane_id].add(method)
            continue
        # No lane claimed this pair; remember its variant for the summary.
        variant = pair.get("variant")
        if variant:
            unmatched_variants.add(str(variant))

    lanes = []
    for lane_id in DEFAULT_MATRIX_LANE_IDS:
        lanes.append(classify_default_matrix_lane(lane_id, buckets[lane_id], methods[lane_id]))

    observed = [lane.get("classification") for lane in lanes]
    classification_counts = {
        classification: observed.count(classification)
        for classification in DEFAULT_MATRIX_CLASSIFICATIONS
    }
    summary = {
        "lane_count": len(lanes),
        "classification_counts": classification_counts,
        "unmatched_variants": sorted(unmatched_variants)[:MAX_DEFAULT_MATRIX_EVIDENCE_ITEMS],
    }
    return {
        "schema_version": DEFAULT_MATRIX_SCHEMA_VERSION,
        "classification_set": list(DEFAULT_MATRIX_CLASSIFICATIONS),
        "generated_from": "matched_pair_evidence",
        "reporting_only": True,
        "claim_status_observed": report.get("claim_status"),
        "public_claim_allowed": False,
        "claim_boundary": DEFAULT_MATRIX_CLAIM_BOUNDARY,
        "lanes": lanes,
        "summary": summary,
    }
3525
+
3526
+
3527
def markdown_value(value: Any) -> str:
    """Format ``value`` for a Markdown table cell.

    ``None`` becomes ``"n/a"``, booleans become ``"true"``/``"false"``, and
    floats use six significant digits. Anything else is passed through
    ``sanitize_note_text`` and has ``|`` escaped; an empty result becomes
    ``"n/a"``.
    """
    if value is None:
        return "n/a"
    # bool is tested before float/other so it renders as a word.
    if isinstance(value, bool):
        if value:
            return "true"
        return "false"
    if isinstance(value, float):
        return f"{value:.6g}"
    escaped = sanitize_note_text(value).replace("|", "\\|")
    if escaped:
        return escaped
    return "n/a"
3536
+
3537
+
3538
def _dashboard_table_row(cells: list[Any]) -> str:
    """Render one Markdown table row, formatting every cell with markdown_value."""
    return "| " + " | ".join(markdown_value(cell) for cell in cells) + " |"


def _dashboard_joined_items(value: Any) -> str:
    """Join a list-like report field into comma-separated text.

    A bare string is treated as a single item (not split into characters),
    items are stringified so non-string entries cannot make ``str.join``
    raise, and any other shape yields an empty string.
    """
    if isinstance(value, str):
        items: list[Any] = [value]
    elif isinstance(value, (list, tuple, set, frozenset)):
        items = list(value)
    else:
        items = []
    return ", ".join(str(item) for item in items)


def render_dashboard_markdown(report: dict[str, Any]) -> str:
    """Render a benchmark report dict as a Markdown dashboard.

    The output contains, in order: a header with schema/baseline/row/claim
    status bullets, a fixed claim-boundary note, a per-variant summary table,
    a comparisons table, an optional "Public claim readiness" section (when
    ``report["public_claim_readiness"]`` is a dict), an optional "Default
    matrix" section (when ``report["default_matrix"]`` is a dict), either a
    "Replay evidence provenance" section (when ``report["replay_evidence"]``
    is a dict) or a generic provenance note, and a "Re-run context" section.

    Every report field is shape-checked before use, so fields with an
    unexpected type are rendered as empty/``n/a`` instead of raising.

    Args:
        report: Report mapping; all keys are read with ``.get``.

    Returns:
        The Markdown document as a single string ending with a newline.
    """
    public_claim_status, public_claim_eligible = report_public_claim_status(report)
    metric_claim_status = report.get("raw_metric_claim_status", report.get("claim_status"))
    lines = [
        "# ContextGuard Benchmark Dashboard",
        "",
        f"- Schema: `{markdown_value(report.get('schema'))}`",
        f"- Baseline variant: `{markdown_value(report.get('baseline_variant'))}`",
        f"- Rows: {markdown_value(report.get('row_count'))}",
        f"- Metric claim status: `{markdown_value(metric_claim_status)}`",
        f"- Public claim status: `{markdown_value(public_claim_status)}`",
        f"- Public claim eligible: `{markdown_value(public_claim_eligible)}`",
        "",
        "> Claim boundary: this dashboard is not a hosted savings claim unless report claim gates "
        "allow it and public-claim provenance is complete. Proxy byte reductions are diagnostic "
        "and are not hosted API token savings.",
        "",
        "## Variant summary",
        "",
        "| Variant | Runs | Successes | Failure rate | Tokens/success | Bytes saved | Token proxy saved | Quality notes |",
        "| --- | ---: | ---: | ---: | ---: | ---: | ---: | --- |",
    ]

    raw_summaries = report.get("summary_by_variant")
    summaries = raw_summaries if isinstance(raw_summaries, dict) else {}
    # Guard the comparisons field once and reuse it for both the per-variant
    # lookup and the comparisons table, so a non-list value (e.g. null) can
    # never be iterated.
    raw_comparisons = report.get("comparisons")
    comparisons = (
        [item for item in raw_comparisons if isinstance(item, dict)]
        if isinstance(raw_comparisons, list)
        else []
    )
    comparison_by_variant = {item.get("variant"): item for item in comparisons}

    for variant, summary in sorted(summaries.items()):
        if not isinstance(summary, dict):
            continue
        quality = comparison_by_variant.get(variant, {}).get("quality_gate")
        # Variants flagged as the baseline strategy have no comparison entry
        # of their own; label them instead of showing "n/a".
        if quality is None and summary.get("is_baseline_strategy"):
            quality = "baseline"
        lines.append(
            _dashboard_table_row([
                variant,
                summary.get("runs"),
                summary.get("successful_runs"),
                summary.get("failure_rate"),
                summary.get("tokens_per_successful_task"),
                summary.get("bytes_saved_successful"),
                summary.get("token_proxy_saved_successful"),
                quality,
            ])
        )

    lines.extend([
        "",
        "## Comparisons",
        "",
        "| Variant | Quality gate | Matched tasks | Token paired tasks | Token savings % | Shifted cost savings % |",
        "| --- | --- | ---: | ---: | ---: | ---: |",
    ])
    if comparisons:
        for item in comparisons:
            lines.append(
                _dashboard_table_row([
                    item.get("variant"),
                    item.get("quality_gate"),
                    item.get("matched_successful_task_count"),
                    item.get("paired_token_task_count"),
                    item.get("token_savings_pct"),
                    item.get("cost_savings_pct_with_shift"),
                ])
            )
    else:
        # Also reached when the list held only malformed (non-dict) entries,
        # so the table never ends up with a header and no rows.
        lines.append("| n/a | n/a | 0 | 0 | n/a | n/a |")

    readiness = report.get("public_claim_readiness") if isinstance(report.get("public_claim_readiness"), dict) else None
    if readiness is not None:
        lines.extend([
            "",
            "## Public claim readiness",
            "",
            f"- Status: `{markdown_value(readiness.get('status'))}`",
            f"- Claim allowed: `{markdown_value(readiness.get('claim_allowed'))}`",
            "",
            "| Gate | Status | Reason | Evidence |",
            "| --- | --- | --- | --- |",
        ])
        gates = readiness.get("gates") if isinstance(readiness.get("gates"), list) else []
        for gate in gates:
            if not isinstance(gate, dict):
                continue
            evidence = gate.get("evidence") if isinstance(gate.get("evidence"), dict) else {}
            lines.append(
                _dashboard_table_row([
                    gate.get("id"),
                    gate.get("status"),
                    gate.get("reason"),
                    public_claim_readiness_evidence_text(evidence),
                ])
            )
        boundary = readiness.get("claim_boundary")
        if isinstance(boundary, dict):
            lines.extend([
                "",
                f"- Public claim boundary: {markdown_value(boundary.get('reason'))}",
            ])

    default_matrix = report.get("default_matrix") if isinstance(report.get("default_matrix"), dict) else None
    if default_matrix is not None:
        lines.extend([
            "",
            "## Default matrix",
            "",
            "| Lane | Classification | Matched Tasks | Quality Gate | Token Evidence | Public Claim | Reason |",
            "| --- | --- | ---: | --- | --- | --- | --- |",
        ])
        lanes = default_matrix.get("lanes") if isinstance(default_matrix.get("lanes"), list) else []
        for lane in lanes:
            if not isinstance(lane, dict):
                continue
            reasons = lane.get("reason_codes") if isinstance(lane.get("reason_codes"), list) else []
            lines.append(
                _dashboard_table_row([
                    lane.get("lane"),
                    lane.get("classification"),
                    lane.get("matched_task_count"),
                    lane.get("quality_gate"),
                    lane.get("token_evidence"),
                    lane.get("public_claim_allowed"),
                    # Only the first three reason codes are shown per lane.
                    ", ".join(str(item) for item in reasons[:3]),
                ])
            )
        boundary = default_matrix.get("claim_boundary")
        if isinstance(boundary, dict):
            lines.extend([
                "",
                f"- Matrix boundary: {markdown_value(boundary.get('reason'))}",
            ])

    replay = report.get("replay_evidence") if isinstance(report.get("replay_evidence"), dict) else None
    if replay is not None:
        lines.extend([
            "",
            "## Replay evidence provenance",
            "",
            f"- Source types: `{markdown_value(_dashboard_joined_items(replay.get('source_types')))}`",
            f"- Claim scopes: `{markdown_value(_dashboard_joined_items(replay.get('claim_scopes')))}`",
            f"- Same-run complete: `{markdown_value(replay.get('same_run_complete'))}`",
            f"- Mixed/pre-existing CSV: `{markdown_value(replay.get('mixed_csv'))}`",
            f"- Boundary: {markdown_value(replay.get('claim_boundary'))}",
        ])
    else:
        lines.extend([
            "",
            "## Provenance note",
            "",
            "- CSV-only dashboards have unknown public-claim provenance unless regenerated from "
            "the original evidence JSONL or a future trusted provenance ledger.",
        ])

    lines.extend([
        "",
        "## Re-run context",
        "",
        "- Evidence replay: `context-guard-bench --tasks <tasks.json> --variants <variants.json> "
        "--evidence-jsonl <evidence.jsonl> --csv <results.csv> --report-json <report.json> "
        "--dashboard-md <dashboard.md>`",
    ])
    return "\n".join(lines) + "\n"
3708
+
3709
+
3710
def write_report_outputs(
    csv_path: Path,
    report_path: Path | None,
    dashboard_path: Path | None,
    baseline_variant: str,
    *,
    replay_rows: list[EvidenceReplayRow] | None = None,
    mixed_csv: bool = False,
) -> dict[str, Any]:
    """Summarize the benchmark CSV and write the requested derived outputs.

    The report is built from the rows of ``csv_path`` relative to
    ``baseline_variant``. When ``replay_rows`` is given (even if empty) the
    report is additionally passed through ``annotate_replay_report`` together
    with ``mixed_csv``. The JSON report and the Markdown dashboard are each
    written only when their path is not ``None``.

    Returns:
        The report dict that was summarized (and possibly annotated).
    """
    # Lock order must stay fixed for every derived write: the source CSV is
    # locked first, then the report, then the dashboard. Never add a path that
    # takes a derived-output lock before the CSV lock.
    with csv_file_lock(csv_path, create_parent=True):
        summary = summarize_benchmark_rows(read_csv_rows(csv_path), baseline_variant)
        if replay_rows is not None:
            summary = annotate_replay_report(summary, replay_rows, mixed_csv=mixed_csv)
        # Each entry pairs a destination with a renderer that is only invoked
        # once that destination's lock is held.
        derived_outputs = (
            (
                report_path,
                lambda: json.dumps(summary, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
            ),
            (dashboard_path, lambda: render_dashboard_markdown(summary)),
        )
        for target, render in derived_outputs:
            if target is None:
                continue
            with csv_file_lock(target, create_parent=True):
                write_text_no_follow(target, render())
        return summary
3735
+
2206
3736
 
2207
3737
  def write_report_json(csv_path: Path, report_path: Path, baseline_variant: str) -> dict[str, Any]:
2208
3738
  # Keep lock order stable across all report writes: source CSV first, derived
2209
3739
  # report second. Do not introduce a report -> CSV path; that can deadlock
2210
3740
  # concurrent report generation.
2211
- with csv_file_lock(csv_path, create_parent=True):
2212
- report = summarize_benchmark_rows(read_csv_rows(csv_path), baseline_variant)
2213
- with csv_file_lock(report_path, create_parent=True):
2214
- write_text_no_follow(
2215
- report_path,
2216
- json.dumps(report, ensure_ascii=False, indent=2, sort_keys=True) + "\n",
2217
- )
2218
- return report
3741
+ return write_report_outputs(csv_path, report_path, None, baseline_variant)
2219
3742
 
2220
3743
 
2221
3744
  def sanitize_note_text(value: Any) -> str:
@@ -2278,8 +3801,18 @@ def existing_file_identity(path: Path) -> tuple[int, int] | None:
2278
3801
  os.close(fd)
2279
3802
 
2280
3803
 
2281
- def validate_distinct_output_paths(csv_path: Path, ledger_path: Path | None, report_path: Path | None) -> None:
2282
- outputs = [("csv", csv_path), ("ledger-jsonl", ledger_path), ("report-json", report_path)]
3804
+ def validate_distinct_output_paths(
3805
+ csv_path: Path,
3806
+ ledger_path: Path | None,
3807
+ report_path: Path | None,
3808
+ dashboard_path: Path | None = None,
3809
+ ) -> None:
3810
+ outputs = [
3811
+ ("csv", csv_path),
3812
+ ("ledger-jsonl", ledger_path),
3813
+ ("report-json", report_path),
3814
+ ("dashboard-md", dashboard_path),
3815
+ ]
2283
3816
  seen: dict[Path, str] = {}
2284
3817
  seen_identity: dict[tuple[int, int], str] = {}
2285
3818
  for label, path in outputs:
@@ -2318,12 +3851,16 @@ def main() -> int:
2318
3851
  help="optional JSONL ledger path for cost-shift accounting per run")
2319
3852
  parser.add_argument("--report-json", default=None, type=Path,
2320
3853
  help="optional A/B summary report JSON path generated from --csv after real runs")
3854
+ parser.add_argument("--dashboard-md", default=None, type=Path,
3855
+ help="optional Markdown dashboard path generated from the benchmark report")
3856
+ parser.add_argument("--evidence-jsonl", default=None, type=Path,
3857
+ help="optional validated run-evidence JSONL replay input; skips provider invocation")
2321
3858
  parser.add_argument("--baseline-variant", default="baseline",
2322
3859
  help="variant name used as the report baseline (default: baseline)")
2323
3860
  args = parser.parse_args()
2324
3861
 
2325
3862
  require_no_follow_file_ops_supported()
2326
- validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json)
3863
+ validate_distinct_output_paths(args.csv, args.ledger_jsonl, args.report_json, args.dashboard_md)
2327
3864
 
2328
3865
  variants = parse_variants(args.variants)
2329
3866
  tasks = parse_tasks(args.tasks, variants=variants)
@@ -2332,12 +3869,96 @@ def main() -> int:
2332
3869
  print("no (task, variant) targets matched the filters", file=sys.stderr)
2333
3870
  return 1
2334
3871
 
2335
- skip_keys = existing_keys(args.csv) if args.resume else set()
2336
- runnable_targets = [
2337
- (task, variant)
2338
- for task, variant in targets
2339
- if (task.id, variant.name) not in skip_keys
2340
- ]
3872
+ if args.resume:
3873
+ skip_keys, skip_keys_loaded_stamp = existing_keys_snapshot(args.csv)
3874
+ skip_keys_stamp = {"stamp": skip_keys_loaded_stamp}
3875
+ else:
3876
+ skip_keys = set()
3877
+ skip_keys_stamp = None
3878
+ runnable_targets = resume_runnable_targets(
3879
+ args.csv,
3880
+ targets,
3881
+ resume=args.resume,
3882
+ existing_key_cache=skip_keys,
3883
+ existing_key_cache_stamp=skip_keys_stamp,
3884
+ )
3885
+ if args.evidence_jsonl is not None:
3886
+ if args.dry_run:
3887
+ for task, variant in targets:
3888
+ if args.resume and resume_key_present(args.csv, (task.id, variant.name), skip_keys, skip_keys_stamp):
3889
+ print(f"skip {task.id}/{variant.name} (already in {args.csv})")
3890
+ continue
3891
+ print(f"evidence replay dry-run: {task.id}/{variant.name} <- {args.evidence_jsonl}")
3892
+ print("completed 0 run(s); results in (dry-run; no CSV writes)")
3893
+ return 0
3894
+ csv_had_preexisting_content = file_has_content_no_follow(args.csv)
3895
+ evidence_rows = read_evidence_jsonl(args.evidence_jsonl)
3896
+ runnable_targets = resume_runnable_targets(
3897
+ args.csv,
3898
+ targets,
3899
+ resume=args.resume,
3900
+ existing_key_cache=skip_keys,
3901
+ existing_key_cache_stamp=skip_keys_stamp,
3902
+ )
3903
+ evidence_by_key = validate_evidence_coverage(evidence_rows, runnable_targets)
3904
+ runnable_keys = {(task.id, variant.name) for task, variant in runnable_targets}
3905
+ claude_ver = "evidence-replay"
3906
+ completed = 0
3907
+ replay_rows_written: list[EvidenceReplayRow] = []
3908
+ for task, variant in targets:
3909
+ if args.resume and (task.id, variant.name) not in runnable_keys:
3910
+ print(f"skip {task.id}/{variant.name} (already in {args.csv})")
3911
+ continue
3912
+ evidence = evidence_by_key[(task.id, variant.name)]
3913
+ print(f"replay {task.id}/{variant.name} ...", flush=True)
3914
+ result = run_evidence_fixture(task, variant, evidence)
3915
+ wrote = append_csv(
3916
+ args.csv,
3917
+ claude_ver,
3918
+ result,
3919
+ skip_existing=args.resume,
3920
+ existing_key_cache=skip_keys if args.resume else None,
3921
+ existing_key_cache_stamp=skip_keys_stamp,
3922
+ )
3923
+ if wrote:
3924
+ replay_rows_written.append(evidence)
3925
+ if args.ledger_jsonl is not None:
3926
+ append_cost_shift_ledger(
3927
+ args.ledger_jsonl,
3928
+ claude_ver,
3929
+ result,
3930
+ replay_provenance=evidence.provenance_payload(),
3931
+ )
3932
+ completed += 1
3933
+ status = "ok" if result.success else "FAIL"
3934
+ suffix = "" if wrote else " (CSV not updated; row already present)"
3935
+ print(
3936
+ f" {status} tokens={sum(result.tokens.values())} cost=${result.cost_usd:.4f} "
3937
+ f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
3938
+ )
3939
+ if args.report_json is not None or args.dashboard_md is not None:
3940
+ report = write_report_outputs(
3941
+ args.csv,
3942
+ args.report_json,
3943
+ args.dashboard_md,
3944
+ args.baseline_variant,
3945
+ replay_rows=replay_rows_written,
3946
+ mixed_csv=csv_had_preexisting_content or bool(skip_keys) or len(replay_rows_written) != int(completed),
3947
+ )
3948
+ if args.report_json is not None:
3949
+ print(f"report {args.report_json}: {report['claim_status']}")
3950
+ if args.dashboard_md is not None:
3951
+ print(f"dashboard {args.dashboard_md}: {report_public_claim_status(report)[0]}")
3952
+ print(f"completed {completed} run(s); results in {args.csv}")
3953
+ return 0
3954
+
3955
+ runnable_targets = resume_runnable_targets(
3956
+ args.csv,
3957
+ targets,
3958
+ resume=args.resume,
3959
+ existing_key_cache=skip_keys,
3960
+ existing_key_cache_stamp=skip_keys_stamp,
3961
+ )
2341
3962
  placeholder_targets = [
2342
3963
  f"{task.id}/{variant.name}"
2343
3964
  for task, variant in runnable_targets
@@ -2365,7 +3986,7 @@ def main() -> int:
2365
3986
 
2366
3987
  completed = 0
2367
3988
  for task, variant in targets:
2368
- if (task.id, variant.name) in skip_keys:
3989
+ if args.resume and resume_key_present(args.csv, (task.id, variant.name), skip_keys, skip_keys_stamp):
2369
3990
  print(f"skip {task.id}/{variant.name} (already in {args.csv})")
2370
3991
  continue
2371
3992
  print(f"run {task.id}/{variant.name} ...", flush=True)
@@ -2374,7 +3995,14 @@ def main() -> int:
2374
3995
  # 깎고, (b) --resume 이 그 (task, variant) 를 skip 해 실제 측정값이 영구 누락된다.
2375
3996
  wrote = True
2376
3997
  if not args.dry_run:
2377
- wrote = append_csv(args.csv, claude_ver, result, skip_existing=args.resume)
3998
+ wrote = append_csv(
3999
+ args.csv,
4000
+ claude_ver,
4001
+ result,
4002
+ skip_existing=args.resume,
4003
+ existing_key_cache=skip_keys if args.resume else None,
4004
+ existing_key_cache_stamp=skip_keys_stamp,
4005
+ )
2378
4006
  if wrote and args.ledger_jsonl is not None:
2379
4007
  append_cost_shift_ledger(args.ledger_jsonl, claude_ver, result)
2380
4008
  completed += 1
@@ -2390,9 +4018,12 @@ def main() -> int:
2390
4018
  f"wall_time={result.wall_time_seconds:.3f}s {sanitize_note_text(result.notes)}{suffix}"
2391
4019
  )
2392
4020
  target = args.csv if not args.dry_run else "(dry-run; no CSV writes)"
2393
- if args.report_json is not None and not args.dry_run:
2394
- report = write_report_json(args.csv, args.report_json, args.baseline_variant)
2395
- print(f"report {args.report_json}: {report['claim_status']}")
4021
+ if (args.report_json is not None or args.dashboard_md is not None) and not args.dry_run:
4022
+ report = write_report_outputs(args.csv, args.report_json, args.dashboard_md, args.baseline_variant)
4023
+ if args.report_json is not None:
4024
+ print(f"report {args.report_json}: {report['claim_status']}")
4025
+ if args.dashboard_md is not None:
4026
+ print(f"dashboard {args.dashboard_md}: {report_public_claim_status(report)[0]}")
2396
4027
  print(f"completed {completed} run(s); results in {target}")
2397
4028
  return 0
2398
4029