devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -3,54 +3,298 @@
3
3
 
4
4
  This is stricter than headroom-gate.py. Headroom only says a candidate set is
5
5
  worth measuring. This gate says the measured L2 arm is usable evidence:
6
- bare and solo leave headroom, l2_gated is clean, gated pair actually fired, and
7
- the blind judge scores l2_gated materially above solo_claude.
6
+ bare and solo leave headroom with complete comparable artifacts, the selected
7
+ pair arm is evidence-clean, pair mode actually fired for a canonical trigger
8
+ reason, and the blind judge scores the selected pair arm materially above
9
+ solo_claude.
8
10
  """
9
11
  from __future__ import annotations
10
12
 
11
13
  import argparse
12
14
  import json
15
+ import os
13
16
  import pathlib
17
+ import re
14
18
  import sys
15
19
  from typing import Any
16
20
 
21
+ SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
22
+ if str(SCRIPT_DIR) not in sys.path:
23
+ sys.path.insert(0, str(SCRIPT_DIR))
24
+ FIXTURES_ROOT = SCRIPT_DIR.parent / "fixtures"
17
25
 
18
- def load_json(path: pathlib.Path) -> dict[str, Any] | None:
26
+ from pair_evidence_contract import (
27
+ ALLOWED_PAIR_ARMS,
28
+ all_known_pair_trigger_reasons,
29
+ has_canonical_pair_trigger_reason,
30
+ has_known_pair_trigger_reason,
31
+ is_score,
32
+ is_strict_number,
33
+ loads_strict_json_object,
34
+ path_has_actionable_solo_headroom_hypothesis,
35
+ )
36
+
37
+ KNOWN_ARMS = {"bare", "solo_claude"} | ALLOWED_PAIR_ARMS
38
+ PASS_VERDICTS = {"PASS", "PASS_WITH_ISSUES"}
39
+ REJECTED_REGISTRY = pathlib.Path(__file__).with_name("pair-rejected-fixtures.sh")
40
+
41
+
42
+ def load_json(path: pathlib.Path) -> tuple[dict[str, Any] | None, str | None]:
19
43
  if not path.is_file():
20
- return None
21
- return json.loads(path.read_text())
44
+ return None, "missing"
45
+ try:
46
+ data = loads_strict_json_object(path.read_text())
47
+ except (ValueError, json.JSONDecodeError):
48
+ return None, "malformed"
49
+ return data, None
50
+
51
+
52
+ def fixture_short(name: str) -> str:
53
+ return name.split("-", 1)[0] if "-" in name else name
54
+
55
+
56
+ def rejected_registry_path() -> pathlib.Path:
57
+ override = os.environ.get("PAIR_REJECTED_FIXTURES_REGISTRY")
58
+ return pathlib.Path(override) if override else REJECTED_REGISTRY
59
+
60
+
61
+ def load_rejected_short_ids(path: pathlib.Path) -> set[str]:
62
+ if not path.is_file():
63
+ raise ValueError(f"rejected fixture registry missing: {path}")
64
+ rejected = set()
65
+ for line in path.read_text().splitlines():
66
+ match = re.match(r"\s*([FS]\d+)-\*\|([FS]\d+)\)", line)
67
+ if match and match.group(1) == match.group(2):
68
+ rejected.add(match.group(1))
69
+ if not rejected:
70
+ raise ValueError(f"rejected fixture registry has no fixture entries: {path}")
71
+ return rejected
22
72
 
23
73
 
24
74
  def score_for(judge: dict[str, Any], arm: str) -> int | None:
25
- value = (judge.get("scores_by_arm") or {}).get(arm)
26
- return value if isinstance(value, int) else None
75
+ mapping = judge.get("_blind_mapping")
76
+ if not isinstance(mapping, dict):
77
+ return None
78
+ if arm not in {mapped for slot, mapped in mapping.items() if slot in {"A", "B", "C"}}:
79
+ return None
80
+ raw_scores = judge.get("scores_by_arm")
81
+ scores = raw_scores if isinstance(raw_scores, dict) else {}
82
+ value = scores.get(arm)
83
+ return value if is_score(value) else None
27
84
 
28
85
 
29
- def clean_failures(fixture_dir: pathlib.Path, judge: dict[str, Any], arm: str) -> list[str]:
30
- failures: list[str] = []
31
- result = load_json(fixture_dir / arm / "result.json")
32
- verify = load_json(fixture_dir / arm / "verify.json")
86
+ def verify_score_clean(payload: dict[str, Any] | None) -> bool:
87
+ if payload is None:
88
+ return False
89
+ value = payload.get("verify_score")
90
+ return is_strict_number(value) and value >= 1.0
91
+
92
+
93
+ def bool_flag_failure(value: Any, true_reason: str, malformed_reason: str) -> str | None:
94
+ if value is True:
95
+ return true_reason
96
+ if value is False or value is None:
97
+ return None
98
+ return malformed_reason
99
+
100
+
101
+ def pair_trigger_failures(result: dict[str, Any] | None, arm: str) -> list[str]:
102
+ if result is None:
103
+ return []
104
+ trigger = result.get("pair_trigger")
105
+ if not isinstance(trigger, dict):
106
+ return [f"{arm} pair_trigger missing or malformed"]
107
+ eligible = trigger.get("eligible")
108
+ reasons = trigger.get("reasons")
109
+ skipped_reason = trigger.get("skipped_reason")
110
+ if not isinstance(eligible, bool):
111
+ return [f"{arm} pair_trigger.eligible malformed"]
112
+ if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
113
+ return [f"{arm} pair_trigger.reasons malformed"]
114
+ if skipped_reason is not None and not isinstance(skipped_reason, str):
115
+ return [f"{arm} pair_trigger.skipped_reason malformed"]
116
+ if eligible is not True:
117
+ return [f"{arm} pair_trigger not eligible"]
118
+ if not reasons:
119
+ return [f"{arm} pair_trigger eligible with empty reasons"]
120
+ if not has_known_pair_trigger_reason(reasons):
121
+ return [f"{arm} pair_trigger reasons missing known trigger reason"]
122
+ if not all_known_pair_trigger_reasons(reasons):
123
+ return [f"{arm} pair_trigger reasons contain unknown trigger reason"]
124
+ if not has_canonical_pair_trigger_reason(reasons):
125
+ return [f"{arm} pair_trigger reasons missing canonical trigger reason"]
126
+ if skipped_reason is not None:
127
+ return [f"{arm} pair_trigger eligible with skipped_reason"]
128
+ return []
129
+
130
+
131
+ def pair_trigger_eligible(result: dict[str, Any] | None) -> bool:
132
+ if result is None:
133
+ return False
134
+ trigger = result.get("pair_trigger")
135
+ return (
136
+ isinstance(trigger, dict)
137
+ and trigger.get("eligible") is True
138
+ and isinstance(trigger.get("reasons"), list)
139
+ and bool(trigger.get("reasons"))
140
+ and all(isinstance(reason, str) for reason in trigger.get("reasons", []))
141
+ and all_known_pair_trigger_reasons(trigger.get("reasons", []))
142
+ and has_canonical_pair_trigger_reason(trigger.get("reasons", []))
143
+ and trigger.get("skipped_reason") is None
144
+ )
145
+
146
+
147
+ def pair_trigger_reasons(result: dict[str, Any] | None) -> list[str]:
33
148
  if result is None:
34
- failures.append(f"{arm} result.json missing")
35
- if verify is None:
36
- failures.append(f"{arm} verify.json missing")
149
+ return []
150
+ trigger = result.get("pair_trigger")
151
+ if not isinstance(trigger, dict):
152
+ return []
153
+ reasons = trigger.get("reasons")
154
+ if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
155
+ return []
156
+ return reasons
157
+
158
+
159
+ def fixture_spec_has_solo_headroom_hypothesis(fixture: str) -> bool:
160
+ return path_has_actionable_solo_headroom_hypothesis(FIXTURES_ROOT / fixture / "spec.md")
161
+
162
+
163
+ def skill_verdict_failures(result: dict[str, Any] | None, arm: str) -> list[str]:
164
+ if result is None or arm == "bare":
165
+ return []
166
+ failures: list[str] = []
167
+ terminal = result.get("terminal_verdict")
168
+ verify = result.get("verify_verdict")
169
+ if terminal not in PASS_VERDICTS:
170
+ failures.append(f"{arm} terminal verdict not pass")
171
+ if verify not in PASS_VERDICTS:
172
+ failures.append(f"{arm} verify verdict not pass")
173
+ return failures
174
+
37
175
 
38
- dq_by_arm = judge.get("disqualifiers_by_arm") or {}
39
- if bool((dq_by_arm.get(arm) or {}).get("disqualifier")):
40
- failures.append(f"{arm} judge disqualifier")
176
+ def axis_validation_counts(judge: dict[str, Any]) -> tuple[dict[str, int], int]:
177
+ raw_mapping = judge.get("_blind_mapping")
178
+ mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
179
+ raw_validation = judge.get("_axis_validation")
180
+ validation = raw_validation if isinstance(raw_validation, dict) else {}
181
+ cells = validation.get("out_of_range_cells") or []
182
+ declared_count = validation.get("out_of_range_count")
183
+ total_invalid = max(
184
+ declared_count if isinstance(declared_count, int) else 0,
185
+ len(cells) if isinstance(cells, list) else 0,
186
+ )
187
+ breakdown_to_letter = {
188
+ "a_breakdown": "A",
189
+ "b_breakdown": "B",
190
+ "c_breakdown": "C",
191
+ }
192
+ counts: dict[str, int] = {}
193
+ mapped_count = 0
194
+ if not isinstance(cells, list):
195
+ return counts, total_invalid
196
+ for cell in cells:
197
+ if not isinstance(cell, dict):
198
+ continue
199
+ letter = breakdown_to_letter.get(cell.get("breakdown"))
200
+ arm = mapping.get(letter) if letter else None
201
+ if arm in KNOWN_ARMS:
202
+ counts[arm] = counts.get(arm, 0) + 1
203
+ mapped_count += 1
204
+ return counts, max(0, total_invalid - mapped_count)
205
+
206
+
207
+ def axis_invalid_count(judge: dict[str, Any], arm: str) -> int:
208
+ counts, _ = axis_validation_counts(judge)
209
+ return counts.get(arm, 0)
210
+
211
+
212
+ def axis_unmapped_invalid_count(judge: dict[str, Any]) -> int:
213
+ _, unmapped = axis_validation_counts(judge)
214
+ return unmapped
215
+
216
+
217
+ def blind_mapping_failures(judge: dict[str, Any], required_arms: set[str]) -> list[str]:
218
+ mapping = judge.get("_blind_mapping")
219
+ if not isinstance(mapping, dict):
220
+ return ["judge blind mapping missing"]
221
+ mapped_arms = {arm for key, arm in mapping.items() if key in {"A", "B", "C"}}
222
+ missing = sorted(required_arms - mapped_arms)
223
+ if missing:
224
+ return [f"judge blind mapping missing arm(s): {', '.join(missing)}"]
225
+ return []
226
+
227
+
228
+ def clean_failures(
229
+ fixture_dir: pathlib.Path,
230
+ judge: dict[str, Any],
231
+ arm: str,
232
+ *,
233
+ require_correctness: bool,
234
+ ) -> list[str]:
235
+ failures: list[str] = []
236
+ result, result_error = load_json(fixture_dir / arm / "result.json")
237
+ verify, verify_error = load_json(fixture_dir / arm / "verify.json")
238
+ diff = fixture_dir / arm / "diff.patch"
239
+ if result_error:
240
+ failures.append(f"{arm} result.json {result_error}")
241
+ if verify_error:
242
+ failures.append(f"{arm} verify.json {verify_error}")
243
+ if not diff.is_file():
244
+ failures.append(f"{arm} diff.patch missing")
245
+
246
+ raw_dq_by_arm = judge.get("disqualifiers_by_arm")
247
+ dq_by_arm = raw_dq_by_arm if isinstance(raw_dq_by_arm, dict) else {}
248
+ dq_entry = dq_by_arm.get(arm)
249
+ dq_value = dq_entry.get("disqualifier") if isinstance(dq_entry, dict) else dq_entry
250
+ judge_dq_failure = bool_flag_failure(
251
+ dq_value,
252
+ f"{arm} judge disqualifier",
253
+ f"{arm} judge disqualifier malformed",
254
+ )
255
+ if judge_dq_failure:
256
+ failures.append(judge_dq_failure)
257
+ axis_invalid = axis_invalid_count(judge, arm)
258
+ if axis_invalid > 0:
259
+ failures.append(f"{arm} judge axis-invalid ({axis_invalid})")
41
260
  if result is not None:
42
- if bool(result.get("disqualifier")):
43
- failures.append(f"{arm} result disqualifier")
44
- if bool(result.get("timed_out")):
45
- failures.append(f"{arm} timed out")
46
- if bool(result.get("invoke_failure")):
261
+ for field, true_reason in (
262
+ ("disqualifier", f"{arm} result disqualifier"),
263
+ ("timed_out", f"{arm} timed out"),
264
+ ("environment_contamination", f"{arm} environment contamination"),
265
+ ):
266
+ failure = bool_flag_failure(
267
+ result.get(field),
268
+ true_reason,
269
+ f"{arm} result {field} malformed",
270
+ )
271
+ if failure:
272
+ failures.append(failure)
273
+ invoke_failure = bool_flag_failure(
274
+ result.get("invoke_failure"),
275
+ f"{arm} invoke failure",
276
+ f"{arm} result invoke_failure malformed",
277
+ )
278
+ if invoke_failure == f"{arm} invoke failure":
47
279
  reason = result.get("invoke_failure_reason")
48
280
  if isinstance(reason, str) and reason:
49
281
  failures.append(f"{arm} invoke failure ({reason})")
50
282
  else:
51
- failures.append(f"{arm} invoke failure")
52
- if verify is not None and bool(verify.get("disqualifier")):
53
- failures.append(f"{arm} verify disqualifier")
283
+ failures.append(invoke_failure)
284
+ elif invoke_failure:
285
+ failures.append(invoke_failure)
286
+ if require_correctness:
287
+ failures.extend(skill_verdict_failures(result, arm))
288
+ if verify is not None:
289
+ verify_dq_failure = bool_flag_failure(
290
+ verify.get("disqualifier"),
291
+ f"{arm} verify disqualifier",
292
+ f"{arm} verify disqualifier malformed",
293
+ )
294
+ if verify_dq_failure:
295
+ failures.append(verify_dq_failure)
296
+ if require_correctness and verify is not None and not verify_score_clean(verify):
297
+ failures.append(f"{arm} verify_score < 1.0")
54
298
  return failures
55
299
 
56
300
 
@@ -59,9 +303,7 @@ def elapsed_ratio(pair_result: dict[str, Any] | None, solo_result: dict[str, Any
59
303
  return None
60
304
  pair_elapsed = pair_result.get("elapsed_seconds")
61
305
  solo_elapsed = solo_result.get("elapsed_seconds")
62
- if not isinstance(pair_elapsed, (int, float)) or not isinstance(solo_elapsed, (int, float)):
63
- return None
64
- if solo_elapsed <= 0:
306
+ if not is_strict_number(pair_elapsed) or not is_strict_number(solo_elapsed):
65
307
  return None
66
308
  return pair_elapsed / solo_elapsed
67
309
 
@@ -73,53 +315,78 @@ def provider_limited(result: dict[str, Any] | None) -> bool:
73
315
  def evaluate_fixture(
74
316
  fixture_dir: pathlib.Path,
75
317
  *,
318
+ rejected_short_ids: set[str],
76
319
  pair_arm: str,
77
320
  bare_max: int,
78
321
  solo_max: int,
322
+ min_bare_headroom: int,
323
+ min_solo_headroom: int,
79
324
  min_pair_margin: int,
80
325
  max_pair_solo_wall_ratio: float | None,
326
+ require_hypothesis_trigger: bool,
81
327
  ) -> dict[str, Any]:
82
- judge = load_json(fixture_dir / "judge.json")
328
+ judge, judge_error = load_json(fixture_dir / "judge.json")
83
329
  if judge is None:
84
330
  return {
85
331
  "fixture": fixture_dir.name,
86
332
  "status": "FAIL",
87
- "reason": "judge.json missing",
333
+ "reason": f"judge.json {judge_error}",
88
334
  }
89
335
 
90
336
  bare = score_for(judge, "bare")
91
337
  solo = score_for(judge, "solo_claude")
92
338
  pair = score_for(judge, pair_arm)
93
- solo_result = load_json(fixture_dir / "solo_claude" / "result.json")
94
- pair_result = load_json(fixture_dir / pair_arm / "result.json")
339
+ bare_headroom = bare_max - bare if isinstance(bare, int) else None
340
+ solo_headroom = solo_max - solo if isinstance(solo, int) else None
341
+ solo_result, _ = load_json(fixture_dir / "solo_claude" / "result.json")
342
+ pair_result, _ = load_json(fixture_dir / pair_arm / "result.json")
95
343
  ratio = elapsed_ratio(pair_result, solo_result)
96
344
  pair_provider_limited = provider_limited(pair_result)
97
345
  if pair_provider_limited:
98
346
  ratio = None
99
347
 
100
348
  reasons: list[str] = []
349
+ if fixture_short(fixture_dir.name) in rejected_short_ids:
350
+ reasons.append("fixture rejected for pair-candidate runs")
101
351
  if bare is None:
102
352
  reasons.append("bare score missing")
103
353
  elif bare > bare_max:
104
354
  reasons.append(f"bare score {bare} > {bare_max}")
355
+ elif bare_headroom is not None and bare_headroom < min_bare_headroom:
356
+ reasons.append(f"bare headroom {bare_headroom} < {min_bare_headroom}")
105
357
  if solo is None:
106
358
  reasons.append("solo_claude score missing")
107
359
  elif solo > solo_max:
108
360
  reasons.append(f"solo_claude score {solo} > {solo_max}")
361
+ elif solo_headroom is not None and solo_headroom < min_solo_headroom:
362
+ reasons.append(f"solo_claude headroom {solo_headroom} < {min_solo_headroom}")
109
363
  if pair_provider_limited:
110
364
  pass
111
365
  elif pair is None:
112
366
  reasons.append(f"{pair_arm} score missing")
113
367
  elif solo is not None and pair - solo < min_pair_margin:
114
368
  reasons.append(f"{pair_arm} margin {pair - solo:+d} < +{min_pair_margin}")
369
+ unmapped_axis_invalid = axis_unmapped_invalid_count(judge)
370
+ if unmapped_axis_invalid > 0:
371
+ reasons.append(f"judge axis-invalid unmapped ({unmapped_axis_invalid})")
372
+ reasons.extend(blind_mapping_failures(judge, {"bare", "solo_claude", pair_arm}))
115
373
 
116
- reasons.extend(clean_failures(fixture_dir, judge, "bare"))
117
- reasons.extend(clean_failures(fixture_dir, judge, "solo_claude"))
118
- reasons.extend(clean_failures(fixture_dir, judge, pair_arm))
374
+ reasons.extend(clean_failures(fixture_dir, judge, "bare", require_correctness=False))
375
+ reasons.extend(clean_failures(fixture_dir, judge, "solo_claude", require_correctness=False))
376
+ reasons.extend(clean_failures(fixture_dir, judge, pair_arm, require_correctness=True))
119
377
 
120
378
  pair_mode = None if pair_result is None else pair_result.get("pair_mode")
121
379
  if pair_mode is not True and not pair_provider_limited:
122
380
  reasons.append(f"{pair_arm} pair_mode not true")
381
+ if not pair_provider_limited:
382
+ reasons.extend(pair_trigger_failures(pair_result, pair_arm))
383
+ if (
384
+ require_hypothesis_trigger
385
+ and
386
+ fixture_spec_has_solo_headroom_hypothesis(fixture_dir.name)
387
+ and "spec.solo_headroom_hypothesis" not in pair_trigger_reasons(pair_result)
388
+ ):
389
+ reasons.append(f"{pair_arm} pair_trigger missing spec.solo_headroom_hypothesis")
123
390
 
124
391
  if max_pair_solo_wall_ratio is not None and not pair_provider_limited:
125
392
  if ratio is None:
@@ -131,7 +398,9 @@ def evaluate_fixture(
131
398
  "fixture": fixture_dir.name,
132
399
  "status": "PASS" if not reasons else "FAIL",
133
400
  "bare_score": bare,
401
+ "bare_headroom": bare_headroom,
134
402
  "solo_score": solo,
403
+ "solo_headroom": solo_headroom,
135
404
  "pair_score": pair,
136
405
  "pair_margin": (
137
406
  None if pair_provider_limited
@@ -139,6 +408,14 @@ def evaluate_fixture(
139
408
  else None
140
409
  ),
141
410
  "pair_mode": pair_mode,
411
+ "pair_trigger_eligible": pair_trigger_eligible(pair_result),
412
+ "pair_trigger_reasons": pair_trigger_reasons(pair_result),
413
+ "pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(
414
+ pair_trigger_reasons(pair_result)
415
+ ),
416
+ "pair_trigger_has_hypothesis_reason": (
417
+ "spec.solo_headroom_hypothesis" in pair_trigger_reasons(pair_result)
418
+ ),
142
419
  "pair_solo_wall_ratio": ratio,
143
420
  "reason": "; ".join(reasons),
144
421
  }
@@ -148,27 +425,50 @@ def fmt_ratio(value: Any) -> str:
148
425
  return f"{value:.2f}x" if isinstance(value, (int, float)) else "n/a"
149
426
 
150
427
 
428
def fmt_margin(value: Any) -> str:
    """Format a score margin as a signed one-decimal string.

    Args:
        value: The margin to render; anything that is not a real, finite
            number is treated as "no data".

    Returns:
        The margin formatted with an explicit sign and one decimal place
        (for example ``"+5.0"``), or ``"n/a"`` when ``value`` is not a
        usable number.
    """
    import math  # local import keeps the block self-contained

    # bool is a subclass of int; True/False must not be rendered as +1.0/+0.0.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return "n/a"
    # NaN/inf would otherwise be printed as "+nan"/"+inf" in the report.
    if isinstance(value, float) and not math.isfinite(value):
        return "n/a"
    return f"{value:+.1f}"
430
+
431
+
432
def fmt_trigger_reasons(value: Any) -> str:
    """Join a list of trigger-reason strings with commas.

    Returns an empty string when ``value`` is not a list or when any
    element of the list is not a string.
    """
    if isinstance(value, list):
        for entry in value:
            if not isinstance(entry, str):
                # A single non-string entry invalidates the whole list.
                return ""
        return ",".join(value)
    return ""
436
+
437
+
151
438
  def write_md(path: pathlib.Path, report: dict[str, Any]) -> None:
152
439
  lines = [
153
440
  f"# Full-Pipeline Pair Gate - {report['run_id']}",
154
441
  "",
155
442
  f"Verdict: **{report['verdict']}**",
156
443
  "",
444
+ f"Fixtures passed: {report['fixtures_passed']}/{report['fixtures_total']} "
445
+ f"(minimum required: {report['min_fixtures']})",
446
+ "",
157
447
  f"Rule: at least {report['min_fixtures']} fixtures; bare <= {report['bare_max']}; "
158
- f"solo_claude <= {report['solo_max']}; {report['pair_arm']} clean; pair_mode true; "
448
+ f"bare headroom >= {report['min_bare_headroom_required']}; "
449
+ f"solo_claude <= {report['solo_max']}; "
450
+ f"solo_claude headroom >= {report['min_solo_headroom_required']}; "
451
+ f"{report['pair_arm']} evidence-clean; pair_mode true; "
452
+ "pair_trigger eligible with canonical reason; "
159
453
  f"{report['pair_arm']} - solo_claude >= {report['min_pair_margin']}.",
160
- f"Max pair/solo wall ratio: {fmt_ratio(report['max_pair_solo_wall_ratio'])}",
454
+ f"Average pair margin: {fmt_margin(report['avg_pair_margin'])}",
455
+ f"Allowed pair/solo wall ratio: {fmt_ratio(report['max_pair_solo_wall_ratio'])}",
456
+ f"Maximum observed pair/solo wall ratio: {fmt_ratio(report['max_observed_pair_solo_wall_ratio'])}",
161
457
  f"Average pair/solo wall ratio: {fmt_ratio(report['avg_pair_solo_wall_ratio'])}",
458
+ f"Hypothesis trigger required: {str(report['require_hypothesis_trigger']).lower()}",
162
459
  "",
163
- "| Fixture | Bare | Solo | Pair | Margin | Pair mode | Wall ratio | Status | Reason |",
164
- "|---|---:|---:|---:|---:|---|---:|---|---|",
460
+ "| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Pair | Margin | Pair mode | Hypothesis trigger | Triggers | Wall ratio | Status | Reason |",
461
+ "|---|---:|---:|---:|---:|---:|---:|---|---|---|---:|---|---|",
165
462
  ]
166
463
  for row in report["rows"]:
167
464
  margin = row.get("pair_margin")
168
465
  margin_text = f"{margin:+d}" if isinstance(margin, int) else "n/a"
169
466
  lines.append(
170
- f"| {row['fixture']} | {row.get('bare_score')} | {row.get('solo_score')} | "
467
+ f"| {row['fixture']} | {row.get('bare_score')} | {row.get('bare_headroom')} | "
468
+ f"{row.get('solo_score')} | {row.get('solo_headroom')} | "
171
469
  f"{row.get('pair_score')} | {margin_text} | {str(row.get('pair_mode')).lower()} | "
470
+ f"{str(row.get('pair_trigger_has_hypothesis_reason')).lower()} | "
471
+ f"{fmt_trigger_reasons(row.get('pair_trigger_reasons'))} | "
172
472
  f"{fmt_ratio(row.get('pair_solo_wall_ratio'))} | {row['status']} | {row.get('reason', '')} |"
173
473
  )
174
474
  lines.append("")
@@ -177,38 +477,82 @@ def write_md(path: pathlib.Path, report: dict[str, Any]) -> None:
177
477
 
178
478
def positive_float(value: str) -> float:
    """Parse ``value`` with ``float()`` for use as an argparse ``type=``.

    Returns the parsed float when ``is_strict_number`` accepts it.

    Raises:
        argparse.ArgumentTypeError: when ``is_strict_number`` rejects the
            parsed value (presumably non-finite or <= 0, judging by the
            message; confirm against that helper).
        ValueError: propagated from ``float()`` for unparsable text.
    """
    number = float(value)
    if is_strict_number(number):
        return number
    raise argparse.ArgumentTypeError("value must be finite and > 0")
483
+
484
+
485
def positive_int(value: str) -> int:
    """Parse ``value`` with ``int()`` and require it to be strictly positive.

    Raises:
        argparse.ArgumentTypeError: when the parsed integer is <= 0.
        ValueError: propagated from ``int()`` for unparsable text.
    """
    number = int(value)
    if number > 0:
        return number
    raise argparse.ArgumentTypeError("value must be > 0")
183
490
 
184
491
 
492
def non_negative_int(value: str) -> int:
    """Parse ``value`` with ``int()`` and require it to be zero or greater.

    Raises:
        argparse.ArgumentTypeError: when the parsed integer is negative.
        ValueError: propagated from ``int()`` for unparsable text.
    """
    number = int(value)
    if number >= 0:
        return number
    raise argparse.ArgumentTypeError("value must be >= 0")
497
+
498
+
185
499
  def main() -> int:
186
500
  parser = argparse.ArgumentParser()
187
501
  parser.add_argument("--run-id", required=True)
188
502
  parser.add_argument("--results-root", default="benchmark/auto-resolve/results", type=pathlib.Path)
189
503
  parser.add_argument("--bare-max", type=int, default=60)
190
504
  parser.add_argument("--solo-max", type=int, default=80)
191
- parser.add_argument("--min-pair-margin", type=int, default=5)
192
- parser.add_argument("--min-fixtures", type=int, default=2)
193
- parser.add_argument("--pair-arm", default="l2_gated")
194
- parser.add_argument("--max-pair-solo-wall-ratio", type=positive_float)
505
+ parser.add_argument("--min-bare-headroom", type=non_negative_int, default=5)
506
+ parser.add_argument("--min-solo-headroom", type=non_negative_int, default=5)
507
+ parser.add_argument("--min-pair-margin", type=positive_int, default=5)
508
+ parser.add_argument("--min-fixtures", type=positive_int, default=2)
509
+ parser.add_argument("--pair-arm", default="l2_risk_probes")
510
+ parser.add_argument("--max-pair-solo-wall-ratio", type=positive_float, default=3.0)
511
+ parser.add_argument(
512
+ "--require-hypothesis-trigger",
513
+ action="store_true",
514
+ help="require fixtures with actionable solo-headroom hypotheses to expose spec.solo_headroom_hypothesis in pair_trigger.reasons",
515
+ )
195
516
  parser.add_argument("--out-json", type=pathlib.Path)
196
517
  parser.add_argument("--out-md", type=pathlib.Path)
197
518
  args = parser.parse_args()
198
519
 
520
+ if args.pair_arm == "l2_forced":
521
+ print(
522
+ "pair-arm l2_forced is retired: it leaks pair-awareness before IMPLEMENT; "
523
+ "use l2_risk_probes for current proof runs or l2_gated for diagnostics.",
524
+ file=sys.stderr,
525
+ )
526
+ return 2
527
+ if args.pair_arm not in ALLOWED_PAIR_ARMS:
528
+ print(
529
+ f"pair-arm must be one of {sorted(ALLOWED_PAIR_ARMS)}: {args.pair_arm}",
530
+ file=sys.stderr,
531
+ )
532
+ return 2
533
+
199
534
  run_root = args.results_root / args.run_id
200
535
  if not run_root.is_dir():
201
536
  print(f"no results dir: {run_root}", file=sys.stderr)
202
537
  return 2
203
538
 
539
+ try:
540
+ rejected_short_ids = load_rejected_short_ids(rejected_registry_path())
541
+ except ValueError as exc:
542
+ print(str(exc), file=sys.stderr)
543
+ return 2
204
544
  rows = [
205
545
  evaluate_fixture(
206
546
  fixture_dir,
547
+ rejected_short_ids=rejected_short_ids,
207
548
  pair_arm=args.pair_arm,
208
549
  bare_max=args.bare_max,
209
550
  solo_max=args.solo_max,
551
+ min_bare_headroom=args.min_bare_headroom,
552
+ min_solo_headroom=args.min_solo_headroom,
210
553
  min_pair_margin=args.min_pair_margin,
211
554
  max_pair_solo_wall_ratio=args.max_pair_solo_wall_ratio,
555
+ require_hypothesis_trigger=args.require_hypothesis_trigger,
212
556
  )
213
557
  for fixture_dir in sorted(p for p in run_root.iterdir() if p.is_dir())
214
558
  ]
@@ -218,11 +562,24 @@ def main() -> int:
218
562
  ratios = [
219
563
  row["pair_solo_wall_ratio"]
220
564
  for row in rows
221
- if isinstance(row.get("pair_solo_wall_ratio"), (int, float))
565
+ if is_strict_number(row.get("pair_solo_wall_ratio"))
566
+ ]
567
+ margins = [
568
+ row["pair_margin"]
569
+ for row in rows
570
+ if isinstance(row.get("pair_margin"), int)
222
571
  ]
572
+ rule = (
573
+ "headroom candidates only; "
574
+ f"bare headroom >= {args.min_bare_headroom}; "
575
+ f"solo_claude headroom >= {args.min_solo_headroom}; "
576
+ f"{args.pair_arm} must be evidence-clean, pair_mode true, "
577
+ "pair_trigger eligible with a canonical reason, and beat solo_claude "
578
+ "by the configured margin"
579
+ )
223
580
  report = {
224
581
  "run_id": args.run_id,
225
- "rule": "headroom candidates only; l2_gated must be clean, pair_mode true, and beat solo_claude by the configured margin",
582
+ "rule": rule,
226
583
  "verdict": verdict,
227
584
  "fixtures_total": len(rows),
228
585
  "fixtures_passed": pass_count,
@@ -230,9 +587,14 @@ def main() -> int:
230
587
  "fixture_count_ok": fixture_count_ok,
231
588
  "bare_max": args.bare_max,
232
589
  "solo_max": args.solo_max,
590
+ "min_bare_headroom_required": args.min_bare_headroom,
591
+ "min_solo_headroom_required": args.min_solo_headroom,
233
592
  "min_pair_margin": args.min_pair_margin,
234
593
  "pair_arm": args.pair_arm,
594
+ "require_hypothesis_trigger": args.require_hypothesis_trigger,
235
595
  "max_pair_solo_wall_ratio": args.max_pair_solo_wall_ratio,
596
+ "max_observed_pair_solo_wall_ratio": max(ratios) if ratios else None,
597
+ "avg_pair_margin": (sum(margins) / len(margins)) if margins else None,
236
598
  "avg_pair_solo_wall_ratio": (sum(ratios) / len(ratios)) if ratios else None,
237
599
  "rows": rows,
238
600
  }