devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -5,11 +5,20 @@ from __future__ import annotations
5
5
 
6
6
  import argparse
7
7
  import json
8
+ import math
8
9
  import re
9
10
  from collections import Counter
10
11
  from pathlib import Path
11
12
  from typing import Any
12
13
 
14
+ from pair_evidence_contract import (
15
+ all_known_pair_trigger_reasons,
16
+ has_canonical_pair_trigger_reason,
17
+ has_known_pair_trigger_reason,
18
+ loads_strict_json_object,
19
+ path_has_actionable_solo_headroom_hypothesis,
20
+ )
21
+
13
22
 
14
23
  RANK = {
15
24
  "PASS": 0,
@@ -24,7 +33,29 @@ def rank(verdict: str | None) -> int:
24
33
 
25
34
 
26
35
  def load_json(path: Path) -> dict[str, Any]:
27
- return json.loads(path.read_text(encoding="utf8"))
36
+ try:
37
+ return loads_strict_json_object(path.read_text(encoding="utf8"))
38
+ except (json.JSONDecodeError, ValueError):
39
+ return {}
40
+
41
+
42
+ def object_field(payload: dict[str, Any], key: str) -> dict[str, Any]:
43
+ value = payload.get(key)
44
+ return value if isinstance(value, dict) else {}
45
+
46
+
47
+ def verdict_field(payload: dict[str, Any], key: str) -> str | None:
48
+ value = payload.get(key)
49
+ return value if isinstance(value, str) else None
50
+
51
+
52
+ def number_field(payload: dict[str, Any], key: str) -> int | float | None:
53
+ value = payload.get(key)
54
+ if isinstance(value, bool):
55
+ return None
56
+ if not isinstance(value, (int, float)) or not math.isfinite(value):
57
+ return None
58
+ return value
28
59
 
29
60
 
30
61
  def transcript_failure_reason(results_root: Path, run_id: str, arm: str) -> str | None:
@@ -51,16 +82,112 @@ def infer_fixture_id(results_root: Path, run_id: str) -> str:
51
82
  def elapsed_ratio(pair_elapsed: Any, solo_elapsed: Any) -> float | None:
52
83
  if not isinstance(pair_elapsed, (int, float)) or not isinstance(solo_elapsed, (int, float)):
53
84
  return None
54
- if solo_elapsed <= 0:
85
+ if pair_elapsed <= 0 or solo_elapsed <= 0:
55
86
  return None
56
87
  return pair_elapsed / solo_elapsed
57
88
 
58
89
 
90
+ def is_true(value: Any) -> bool:
91
+ return value is True
92
+
93
+
94
+ def pair_trigger_failures(
95
+ pair: dict[str, Any],
96
+ *,
97
+ fixture_spec: Path | None = None,
98
+ require_hypothesis_trigger: bool = False,
99
+ ) -> list[str]:
100
+ trigger = pair.get("pair_trigger")
101
+ if not isinstance(trigger, dict):
102
+ return ["pair_trigger missing or malformed"]
103
+ eligible = trigger.get("eligible")
104
+ reasons = trigger.get("reasons")
105
+ skipped_reason = trigger.get("skipped_reason")
106
+ if not isinstance(eligible, bool):
107
+ return ["pair_trigger.eligible malformed"]
108
+ if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
109
+ return ["pair_trigger.reasons malformed"]
110
+ if skipped_reason is not None and not isinstance(skipped_reason, str):
111
+ return ["pair_trigger.skipped_reason malformed"]
112
+ if eligible is True:
113
+ failures = []
114
+ if not reasons:
115
+ failures.append("pair_trigger eligible with empty reasons")
116
+ if reasons and not has_known_pair_trigger_reason(reasons):
117
+ failures.append("pair_trigger reasons missing known trigger reason")
118
+ if (
119
+ reasons
120
+ and has_known_pair_trigger_reason(reasons)
121
+ and not all_known_pair_trigger_reasons(reasons)
122
+ ):
123
+ failures.append("pair_trigger reasons contain unknown trigger reason")
124
+ if (
125
+ reasons
126
+ and all_known_pair_trigger_reasons(reasons)
127
+ and not has_canonical_pair_trigger_reason(reasons)
128
+ ):
129
+ failures.append("pair_trigger reasons missing canonical trigger reason")
130
+ if skipped_reason is not None:
131
+ failures.append("pair_trigger eligible with skipped_reason")
132
+ if (
133
+ require_hypothesis_trigger
134
+ and fixture_spec is not None
135
+ and path_has_actionable_solo_headroom_hypothesis(fixture_spec)
136
+ and "spec.solo_headroom_hypothesis" not in reasons
137
+ ):
138
+ failures.append("pair_trigger missing spec.solo_headroom_hypothesis")
139
+ return failures
140
+ if reasons:
141
+ return ["pair_trigger ineligible with reasons"]
142
+ return []
143
+
144
+
145
+ def pair_trigger_eligible(pair: dict[str, Any]) -> bool:
146
+ trigger = pair.get("pair_trigger")
147
+ return (
148
+ isinstance(trigger, dict)
149
+ and trigger.get("eligible") is True
150
+ and isinstance(trigger.get("reasons"), list)
151
+ and all(isinstance(reason, str) for reason in trigger["reasons"])
152
+ and len(trigger["reasons"]) > 0
153
+ and all_known_pair_trigger_reasons(trigger["reasons"])
154
+ and has_canonical_pair_trigger_reason(trigger["reasons"])
155
+ and trigger.get("skipped_reason") is None
156
+ )
157
+
158
+
159
+ def pair_trigger_reasons(pair: dict[str, Any]) -> list[str]:
160
+ trigger = pair.get("pair_trigger")
161
+ if not isinstance(trigger, dict):
162
+ return []
163
+ reasons = trigger.get("reasons")
164
+ if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
165
+ return []
166
+ return reasons
167
+
168
+
169
+ def pair_trigger_label(row: dict[str, Any]) -> str:
170
+ if row["pair_trigger_missed"]:
171
+ return "missed"
172
+ failures = row.get("pair_trigger_failures") or []
173
+ if failures:
174
+ return "malformed"
175
+ if row["pair_trigger_eligible"]:
176
+ return "eligible"
177
+ return "not_eligible"
178
+
179
+
59
180
  def load_gate_rows(gate_json: Path | None) -> dict[str, dict[str, Any]]:
60
181
  if gate_json is None:
61
182
  return {}
62
183
  doc = load_json(gate_json)
63
- return {row["run_id"]: row for row in doc.get("rows", [])}
184
+ rows = doc.get("rows")
185
+ if not isinstance(rows, list):
186
+ return {}
187
+ return {
188
+ row["run_id"]: row for row in rows
189
+ if isinstance(row, dict) and isinstance(row.get("run_id"), str)
190
+ }
64
191
 
65
192
 
66
193
  def min_gate_rate(value: str) -> float:
@@ -98,8 +225,21 @@ def classify(row: dict[str, Any], included: bool) -> str:
98
225
  return "failed attempt: timeout"
99
226
  if row.get("solo_failure_reason") == "provider_limit" or row.get("pair_failure_reason") == "provider_limit":
100
227
  return "failed attempt: provider limit"
228
+ if row.get("solo_environment_contamination") or row.get("pair_environment_contamination"):
229
+ return "failed attempt: environment contamination"
230
+ if row.get("solo_disqualifier") or row.get("pair_disqualifier"):
231
+ return "failed attempt: disqualifier"
232
+ if row.get("solo_invoke_failure") or row.get("pair_invoke_failure"):
233
+ return "failed attempt: invoke failure"
101
234
  if row.get("solo_invoke_exit") not in (None, 0) or row.get("pair_invoke_exit") not in (None, 0):
102
235
  return "failed attempt: nonzero invoke exit"
236
+ if row.get("malformed_compare"):
237
+ return "failed attempt: malformed compare"
238
+ if row.get("pair_trigger_missed"):
239
+ return "failed attempt: pair trigger missed"
240
+ trigger_failures = row.get("pair_trigger_failures") or []
241
+ if trigger_failures:
242
+ return "failed attempt: pair trigger contract: " + "; ".join(trigger_failures)
103
243
  if row["solo_mechanical"] == "FAIL":
104
244
  return "excluded: solo mechanical dominated"
105
245
  if row["external_lift"] or row["internal_lift"]:
@@ -114,47 +254,95 @@ def classify(row: dict[str, Any], included: bool) -> str:
114
254
  return "no verdict lift"
115
255
 
116
256
 
117
- def build_row(results_root: Path, run_id: str, gate_rows_by_id: dict[str, dict[str, Any]]) -> dict[str, Any]:
257
+ def build_row(
258
+ results_root: Path,
259
+ run_id: str,
260
+ gate_rows_by_id: dict[str, dict[str, Any]],
261
+ *,
262
+ fixtures_root: Path | None,
263
+ require_hypothesis_trigger: bool,
264
+ ) -> dict[str, Any]:
118
265
  compare_path = results_root / run_id / "compare.json"
266
+ malformed_compare = False
119
267
  if compare_path.exists():
120
268
  compare = load_json(compare_path)
269
+ malformed_compare = not bool(compare)
121
270
  else:
122
271
  compare = {
123
272
  "solo": {},
124
273
  "pair": {},
125
274
  "comparison": {"compare_missing": True},
126
275
  }
127
- solo = compare.get("solo") or {}
128
- pair = compare.get("pair") or {}
129
- comparison = compare.get("comparison") or {}
130
- pair_ratio = elapsed_ratio(pair.get("elapsed_seconds"), solo.get("elapsed_seconds"))
276
+ solo = object_field(compare, "solo")
277
+ pair = object_field(compare, "pair")
278
+ comparison = object_field(compare, "comparison")
279
+ malformed_compare = malformed_compare or any(
280
+ key in compare and not isinstance(compare.get(key), dict)
281
+ for key in ("solo", "pair", "comparison")
282
+ )
283
+ pair_ratio = elapsed_ratio(
284
+ number_field(pair, "elapsed_seconds"),
285
+ number_field(solo, "elapsed_seconds"),
286
+ )
131
287
  gate_row = gate_rows_by_id.get(run_id) or {}
288
+ solo_verdict = (
289
+ verdict_field(comparison, "solo_verdict")
290
+ or verdict_field(solo, "verify_verdict")
291
+ )
292
+ pair_verdict = (
293
+ verdict_field(comparison, "pair_verdict")
294
+ or verdict_field(pair, "verify_verdict")
295
+ )
296
+ solo_sub = object_field(solo, "sub_verdicts")
297
+ pair_sub = object_field(pair, "sub_verdicts")
298
+ fixture_id = infer_fixture_id(results_root, run_id)
299
+ fixture_spec = None
300
+ if fixtures_root is not None and fixture_id != "unknown":
301
+ fixture_spec = fixtures_root / fixture_id / "spec.md"
302
+ trigger_failures = pair_trigger_failures(
303
+ pair,
304
+ fixture_spec=fixture_spec,
305
+ require_hypothesis_trigger=require_hypothesis_trigger,
306
+ )
307
+ trigger_reasons = pair_trigger_reasons(pair)
132
308
  row = {
133
- "fixture_id": infer_fixture_id(results_root, run_id),
309
+ "fixture_id": fixture_id,
134
310
  "run_id": run_id,
135
- "solo_verdict": comparison.get("solo_verdict") or solo.get("verify_verdict"),
136
- "pair_verdict": comparison.get("pair_verdict") or pair.get("verify_verdict"),
137
- "pair_mode": bool(pair.get("pair_mode")),
138
- "external_lift": bool(comparison.get("pair_verdict_lift")),
139
- "internal_lift": bool(comparison.get("pair_internal_verdict_lift")),
140
- "pair_found_more_findings": bool(comparison.get("pair_found_more_findings")),
141
- "pair_found_more_low_or_worse": bool(comparison.get("pair_found_more_low_or_worse")),
142
- "row_failed_before_compare": bool(comparison.get("row_failed_before_compare")),
311
+ "solo_verdict": solo_verdict,
312
+ "pair_verdict": pair_verdict,
313
+ "pair_mode": is_true(pair.get("pair_mode")),
314
+ "pair_trigger_eligible": pair_trigger_eligible(pair),
315
+ "pair_trigger_reasons": trigger_reasons,
316
+ "pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(trigger_reasons),
317
+ "pair_trigger_missed": is_true(comparison.get("pair_trigger_missed")),
318
+ "pair_trigger_failures": trigger_failures,
319
+ "external_lift": is_true(comparison.get("pair_verdict_lift")),
320
+ "internal_lift": is_true(comparison.get("pair_internal_verdict_lift")),
321
+ "pair_found_more_findings": is_true(comparison.get("pair_found_more_findings")),
322
+ "pair_found_more_low_or_worse": is_true(comparison.get("pair_found_more_low_or_worse")),
323
+ "row_failed_before_compare": is_true(comparison.get("row_failed_before_compare")),
143
324
  "row_exit": comparison.get("row_exit"),
144
- "compare_missing": bool(comparison.get("compare_missing")),
325
+ "compare_missing": is_true(comparison.get("compare_missing")),
145
326
  "solo_invoke_exit": solo.get("invoke_exit"),
146
327
  "pair_invoke_exit": pair.get("invoke_exit"),
147
328
  "solo_failure_reason": solo.get("invoke_failure_reason")
148
329
  or transcript_failure_reason(results_root, run_id, "solo"),
149
330
  "pair_failure_reason": pair.get("invoke_failure_reason")
150
331
  or transcript_failure_reason(results_root, run_id, "pair"),
151
- "solo_timed_out": bool(solo.get("timed_out")),
152
- "pair_timed_out": bool(pair.get("timed_out")),
332
+ "solo_invoke_failure": is_true(solo.get("invoke_failure")),
333
+ "pair_invoke_failure": is_true(pair.get("invoke_failure")),
334
+ "solo_environment_contamination": is_true(solo.get("environment_contamination")),
335
+ "pair_environment_contamination": is_true(pair.get("environment_contamination")),
336
+ "solo_disqualifier": is_true(solo.get("disqualifier")),
337
+ "pair_disqualifier": is_true(pair.get("disqualifier")),
338
+ "solo_timed_out": is_true(solo.get("timed_out")),
339
+ "pair_timed_out": is_true(pair.get("timed_out")),
153
340
  "pair_solo_wall_ratio": pair_ratio,
154
- "solo_mechanical": (solo.get("sub_verdicts") or {}).get("mechanical"),
155
- "pair_mechanical": (pair.get("sub_verdicts") or {}).get("mechanical"),
341
+ "solo_mechanical": verdict_field(solo_sub, "mechanical"),
342
+ "pair_mechanical": verdict_field(pair_sub, "mechanical"),
156
343
  "included_in_gate": gate_row.get("status") == "PASS",
157
344
  "gate_failures": gate_row.get("failures") or [],
345
+ "malformed_compare": malformed_compare,
158
346
  }
159
347
  row["classification"] = classify(row, row["included_in_gate"])
160
348
  return row
@@ -164,6 +352,12 @@ def fmt_ratio(value: Any) -> str:
164
352
  return f"{value:.2f}x" if isinstance(value, (int, float)) else "n/a"
165
353
 
166
354
 
355
+ def fmt_trigger_reasons(value: Any) -> str:
356
+ if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
357
+ return ""
358
+ return ",".join(value)
359
+
360
+
167
361
  def write_md(path: Path, report: dict[str, Any]) -> None:
168
362
  lines = [
169
363
  f"# {report['title']}",
@@ -189,14 +383,16 @@ def write_md(path: Path, report: dict[str, Any]) -> None:
189
383
  lines.extend(
190
384
  [
191
385
  "",
192
- "| Fixture | Solo | Pair | Pair mode | Wall ratio | External lift | Internal lift | Included | Classification |",
193
- "|---|---|---|---|---:|---|---|---|---|",
386
+ "| Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Pair trigger | Triggers | Wall ratio | External lift | Internal lift | Included | Classification |",
387
+ "|---|---|---|---|---|---|---:|---|---|---|---|",
194
388
  ]
195
389
  )
196
390
  for row in report["rows"]:
197
391
  lines.append(
198
392
  f"| {row['fixture_id']} | {row['solo_verdict']} | {row['pair_verdict']} | "
199
- f"{str(row['pair_mode']).lower()} | {fmt_ratio(row.get('pair_solo_wall_ratio'))} | "
393
+ f"{str(row['pair_mode']).lower()} | {pair_trigger_label(row)} | "
394
+ f"{fmt_trigger_reasons(row.get('pair_trigger_reasons'))} | "
395
+ f"{fmt_ratio(row.get('pair_solo_wall_ratio'))} | "
200
396
  f"{str(row['external_lift']).lower()} | {str(row['internal_lift']).lower()} | "
201
397
  f"{str(row['included_in_gate']).lower()} | {row['classification']} |"
202
398
  )
@@ -207,18 +403,35 @@ def write_md(path: Path, report: dict[str, Any]) -> None:
207
403
  def main() -> int:
208
404
  parser = argparse.ArgumentParser()
209
405
  parser.add_argument("--results-root", default="benchmark/auto-resolve/results", type=Path)
406
+ parser.add_argument("--fixtures-root", type=Path)
210
407
  parser.add_argument("--run-id", action="append", required=True)
211
408
  parser.add_argument("--gate-json", type=Path)
212
409
  parser.add_argument("--title", required=True)
213
410
  parser.add_argument("--verdict", required=True)
214
411
  parser.add_argument("--min-gate-rate", type=min_gate_rate)
215
412
  parser.add_argument("--max-trailing-non-gate", type=non_negative_int)
413
+ parser.add_argument(
414
+ "--require-hypothesis-trigger",
415
+ action="store_true",
416
+ help="require fixtures with actionable solo-headroom hypotheses to expose spec.solo_headroom_hypothesis in pair_trigger.reasons",
417
+ )
216
418
  parser.add_argument("--out-json", required=True, type=Path)
217
419
  parser.add_argument("--out-md", required=True, type=Path)
218
420
  args = parser.parse_args()
421
+ if args.require_hypothesis_trigger and args.fixtures_root is None:
422
+ parser.error("--require-hypothesis-trigger requires --fixtures-root")
219
423
 
220
424
  gate_rows_by_id = load_gate_rows(args.gate_json)
221
- rows = [build_row(args.results_root, run_id, gate_rows_by_id) for run_id in args.run_id]
425
+ rows = [
426
+ build_row(
427
+ args.results_root,
428
+ run_id,
429
+ gate_rows_by_id,
430
+ fixtures_root=args.fixtures_root,
431
+ require_hypothesis_trigger=args.require_hypothesis_trigger,
432
+ )
433
+ for run_id in args.run_id
434
+ ]
222
435
  gate_rows = sum(1 for row in rows if row["included_in_gate"])
223
436
  trailing_non_gate_rows = 0
224
437
  for row in reversed(rows):
@@ -0,0 +1,288 @@
1
+ #!/usr/bin/env bash
2
+ # Regression tests for audit-headroom-rejections.py.
3
+
4
+ set -euo pipefail
5
+
6
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
7
+ SCRIPT="$SCRIPT_DIR/audit-headroom-rejections.py"
8
+ TMP_DIR="$(mktemp -d /tmp/audit-headroom-rejections-test.XXXXXX)"
9
+ trap 'rm -rf "$TMP_DIR"' EXIT
10
+
11
+ fixtures="$TMP_DIR/fixtures"
12
+ results="$TMP_DIR/results"
13
+ registry="$TMP_DIR/pair-rejected-fixtures.sh"
14
+ mkdir -p "$fixtures/F16-cli-quote-tax-rules" \
15
+ "$fixtures/F33-cli-new-candidate" \
16
+ "$fixtures/F34-cli-rejected-candidate" \
17
+ "$fixtures/F35-cli-missing-judge" \
18
+ "$fixtures/F36-unsupported-rejection" \
19
+ "$results/old-f16" \
20
+ "$results/f33-headroom" \
21
+ "$results/f33-weak-pair-pass" \
22
+ "$results/f34-headroom" \
23
+ "$results/f35-missing-judge" \
24
+ "$results/20260512-f36-headroom" \
25
+ "$results/bad-json-headroom" \
26
+ "$results/malformed-headroom" \
27
+ "$results/f16-pair-pass"
28
+
29
+ cat > "$registry" <<'SH'
30
+ rejected_pair_fixture_reason() {
31
+ local fid="$1"
32
+ case "$fid" in
33
+ F34-*|F34)
34
+ echo "measured solo ceiling"
35
+ ;;
36
+ F36-*|F36)
37
+ echo "bare 33 / solo_claude 98 in 20260512-missing-headroom"
38
+ ;;
39
+ *)
40
+ return 1
41
+ ;;
42
+ esac
43
+ }
44
+ SH
45
+
46
+ s_only_registry="$TMP_DIR/s-only-registry.sh"
47
+ cat > "$s_only_registry" <<'SH'
48
+ rejected_pair_fixture_reason() {
49
+ local fid="$1"
50
+ case "$fid" in
51
+ S3-*|S3)
52
+ echo "shadow solo ceiling"
53
+ ;;
54
+ *)
55
+ return 1
56
+ ;;
57
+ esac
58
+ }
59
+ SH
60
+ python3 - "$SCRIPT" "$s_only_registry" <<'PY'
61
+ import importlib.util
62
+ import pathlib
63
+ import sys
64
+
65
+ spec = importlib.util.spec_from_file_location("audit_headroom_rejections", sys.argv[1])
66
+ module = importlib.util.module_from_spec(spec)
67
+ assert spec.loader is not None
68
+ spec.loader.exec_module(module)
69
+ assert module.registry_short_ids(pathlib.Path(sys.argv[2])) == {"S3"}
70
+ PY
71
+
72
+ write_headroom_fail() {
73
+ local run_id="$1"
74
+ local fixture="$2"
75
+ local bare="$3"
76
+ local solo="$4"
77
+ cat > "$results/$run_id/headroom-gate.json" <<JSON
78
+ {
79
+ "run_id": "$run_id",
80
+ "verdict": "FAIL",
81
+ "rows": [
82
+ {
83
+ "fixture": "$fixture",
84
+ "status": "FAIL",
85
+ "bare_score": $bare,
86
+ "solo_score": $solo,
87
+ "reason": "solo_claude score $solo > 80"
88
+ }
89
+ ]
90
+ }
91
+ JSON
92
+ }
93
+
94
+ write_headroom_fail old-f16 F16-cli-quote-tax-rules 50 98
95
+ write_headroom_fail f33-headroom F33-cli-new-candidate 33 98
96
+ write_headroom_fail f34-headroom F34-cli-rejected-candidate 33 98
97
+
98
+ cat > "$results/f35-missing-judge/headroom-gate.json" <<'JSON'
99
+ {
100
+ "run_id": "f35-missing-judge",
101
+ "verdict": "FAIL",
102
+ "rows": [
103
+ {
104
+ "fixture": "F35-cli-missing-judge",
105
+ "status": "MISSING_JUDGE",
106
+ "reason": "judge.json missing"
107
+ }
108
+ ]
109
+ }
110
+ JSON
111
+
112
+ cat > "$results/malformed-headroom/headroom-gate.json" <<'JSON'
113
+ {
114
+ "run_id": "malformed-headroom",
115
+ "verdict": "FAIL",
116
+ "rows": []
117
+ }
118
+ JSON
119
+
120
+ printf '{not-json\n' > "$results/bad-json-headroom/headroom-gate.json"
121
+
122
+ cat > "$results/f16-pair-pass/full-pipeline-pair-gate.json" <<'JSON'
123
+ {
124
+ "run_id": "f16-pair-pass",
125
+ "verdict": "PASS",
126
+ "pair_arm": "l2_risk_probes",
127
+ "rows": [
128
+ {
129
+ "fixture": "F16-cli-quote-tax-rules",
130
+ "status": "PASS",
131
+ "bare_score": 50,
132
+ "solo_score": 75,
133
+ "pair_score": 96,
134
+ "pair_margin": 21,
135
+ "pair_mode": true,
136
+ "pair_trigger_eligible": true,
137
+ "pair_solo_wall_ratio": 1.28
138
+ }
139
+ ]
140
+ }
141
+ JSON
142
+ mkdir -p "$results/f16-pair-pass/F16-cli-quote-tax-rules/l2_risk_probes"
143
+ cat > "$results/f16-pair-pass/F16-cli-quote-tax-rules/l2_risk_probes/result.json" <<'JSON'
144
+ {
145
+ "pair_trigger": {
146
+ "eligible": true,
147
+ "reasons": ["complexity.high"],
148
+ "skipped_reason": null
149
+ }
150
+ }
151
+ JSON
152
+ python3 - "$SCRIPT" "$results" <<'PY'
153
+ import importlib.util
154
+ import pathlib
155
+ import sys
156
+
157
+ spec = importlib.util.spec_from_file_location("audit_headroom_rejections", sys.argv[1])
158
+ module = importlib.util.module_from_spec(spec)
159
+ assert spec.loader is not None
160
+ spec.loader.exec_module(module)
161
+ results_root = pathlib.Path(sys.argv[2])
162
+ kwargs = {
163
+ "results_root": results_root,
164
+ "run_id": "f16-pair-pass",
165
+ "fixture": "F16-cli-quote-tax-rules",
166
+ "pair_arm": "l2_risk_probes",
167
+ }
168
+ assert module.pair_result_trigger_reasons(**kwargs) == ["complexity.high"]
169
+ path = (
170
+ results_root
171
+ / "f16-pair-pass"
172
+ / "F16-cli-quote-tax-rules"
173
+ / "l2_risk_probes"
174
+ / "result.json"
175
+ )
176
+ path.write_text(
177
+ '{"pair_trigger":{"eligible":true,"reasons":["risk high"],"skipped_reason":null}}\n',
178
+ encoding="utf8",
179
+ )
180
+ assert module.pair_result_trigger_reasons(**kwargs) == []
181
+ path.write_text(
182
+ '{"pair_trigger":{"eligible":true,"reasons":["complexity.high"],"skipped_reason":null}}\n',
183
+ encoding="utf8",
184
+ )
185
+ PY
186
+
187
+ cat > "$results/f33-weak-pair-pass/full-pipeline-pair-gate.json" <<'JSON'
188
+ {
189
+ "run_id": "f33-weak-pair-pass",
190
+ "verdict": "PASS",
191
+ "pair_arm": "l2_risk_probes",
192
+ "rows": [
193
+ {
194
+ "fixture": "F33-cli-new-candidate",
195
+ "status": "PASS",
196
+ "bare_score": 33,
197
+ "solo_score": 98,
198
+ "pair_score": 96,
199
+ "pair_margin": -2,
200
+ "pair_mode": true,
201
+ "pair_trigger_eligible": true,
202
+ "pair_solo_wall_ratio": 1.1
203
+ }
204
+ ]
205
+ }
206
+ JSON
207
+
208
+ if python3 "$SCRIPT" \
209
+ --fixtures-root "$fixtures" \
210
+ --registry "$registry" \
211
+ --results-root "$results" \
212
+ --out-json "$TMP_DIR/audit.json" > "$TMP_DIR/audit.out" 2> "$TMP_DIR/audit.err"; then
213
+ echo "expected unrecorded F33 failure" >&2
214
+ exit 1
215
+ fi
216
+ grep -Fq 'F33-cli-new-candidate' "$TMP_DIR/audit.err"
217
+ grep -Fq 'F35-cli-missing-judge' "$TMP_DIR/audit.err"
218
+ grep -Fq 'status=MISSING_JUDGE' "$TMP_DIR/audit.err"
219
+ grep -Fq 'malformed-headroom <unknown>' "$TMP_DIR/audit.err"
220
+ grep -Fq 'status=MALFORMED_ROWS' "$TMP_DIR/audit.err"
221
+ grep -Fq 'bad-json-headroom <unknown>' "$TMP_DIR/audit.err"
222
+ grep -Fq 'status=MALFORMED_JSON' "$TMP_DIR/audit.err"
223
+ grep -Fq 'unsupported registry rejection(s)' "$TMP_DIR/audit.err"
224
+ grep -Fq 'F36-unsupported-rejection' "$TMP_DIR/audit.err"
225
+ grep -Fq 'expected_run=20260512-missing-headroom' "$TMP_DIR/audit.err"
226
+ grep -Fq 'solo_claude=98' "$TMP_DIR/audit.err"
227
+ grep -Fq 'expected_solo_claude=98' "$TMP_DIR/audit.err"
228
+ grep -Fq '"verdict": "FAIL"' "$TMP_DIR/audit.json"
229
+ grep -Fq '"fixture": "F33-cli-new-candidate"' "$TMP_DIR/audit.json"
230
+ grep -Fq '"fixture": "F35-cli-missing-judge"' "$TMP_DIR/audit.json"
231
+ grep -Fq '"fixture": "<unknown>"' "$TMP_DIR/audit.json"
232
+ grep -Fq '"unsupported_registry_rejections"' "$TMP_DIR/audit.json"
233
+ if grep -Fq 'F16-cli-quote-tax-rules' "$TMP_DIR/audit.err"; then
234
+ echo "F16 has passing pair evidence and must not be reported" >&2
235
+ cat "$TMP_DIR/audit.err" >&2
236
+ exit 1
237
+ fi
238
+ if grep -Fq 'F34-cli-rejected-candidate' "$TMP_DIR/audit.err"; then
239
+ echo "F34 is rejected and must not be reported" >&2
240
+ cat "$TMP_DIR/audit.err" >&2
241
+ exit 1
242
+ fi
243
+
244
+ python3 - "$registry" <<'PY'
245
+ from pathlib import Path
246
+ import sys
247
+ path = Path(sys.argv[1])
248
+ text = path.read_text()
249
+ text = text.replace(
250
+ ' F34-*|F34)',
251
+ ' F33-*|F33)\n'
252
+ ' echo "measured solo ceiling"\n'
253
+ ' ;;\n'
254
+ ' F35-*|F35)\n'
255
+ ' echo "missing judge artifact"\n'
256
+ ' ;;\n'
257
+ ' F34-*|F34)'
258
+ )
259
+ path.write_text(text)
260
+ PY
261
+
262
+ rm -rf "$results/malformed-headroom"
263
+ rm -rf "$results/bad-json-headroom"
264
+
265
+ write_headroom_fail 20260512-f36-headroom F36-unsupported-rejection 33 98
266
+ python3 - "$registry" <<'PY'
267
+ from pathlib import Path
268
+ import sys
269
+ path = Path(sys.argv[1])
270
+ text = path.read_text()
271
+ text = text.replace(
272
+ "bare 33 / solo_claude 98 in 20260512-missing-headroom",
273
+ "bare 33 / solo_claude 98 in 20260512-f36-headroom",
274
+ )
275
+ path.write_text(text)
276
+ PY
277
+
278
+ python3 "$SCRIPT" \
279
+ --fixtures-root "$fixtures" \
280
+ --registry "$registry" \
281
+ --results-root "$results" \
282
+ --out-json "$TMP_DIR/audit-pass.json" \
283
+ > "$TMP_DIR/audit-pass.out"
284
+ grep -Fq 'PASS audit-headroom-rejections' "$TMP_DIR/audit-pass.out"
285
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/audit-pass.json"
286
+ grep -Fq '"unsupported_registry_rejections": []' "$TMP_DIR/audit-pass.json"
287
+
288
+ echo "PASS test-audit-headroom-rejections"