devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -0,0 +1,441 @@
1
+ #!/usr/bin/env python3
2
+ """Audit failed headroom artifacts against the rejected fixture registry.
3
+
4
+ The spending loop is:
5
+ headroom FAIL -> reject/rework fixture before pair spend
6
+ headroom PASS -> run pair gate and keep passing pair evidence
7
+
8
+ This audit catches the forgotten middle state: an active fixture has a failed
9
+ headroom-gate.json, no passing pair evidence, and no rejected-registry entry.
10
+ """
11
+ from __future__ import annotations
12
+
13
+ import argparse
14
+ import json
15
+ import math
16
+ import pathlib
17
+ import re
18
+ import subprocess
19
+ import sys
20
+ from typing import Any
21
+
22
+ SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
23
+ if str(SCRIPT_DIR) not in sys.path:
24
+ sys.path.insert(0, str(SCRIPT_DIR))
25
+
26
+ from pair_evidence_contract import (
27
+ all_known_pair_trigger_reasons,
28
+ has_canonical_pair_trigger_reason,
29
+ has_known_pair_trigger_reason,
30
+ loads_strict_json_object,
31
+ normalize_pair_evidence_row,
32
+ )
33
+
34
+
35
def fixture_short(name: str) -> str:
    """Return the short fixture id, i.e. the text before the first ``-``.

    A name that contains no hyphen is returned unchanged.
    """
    prefix, _, _ = name.partition("-")
    return prefix
37
+
38
+
39
def sort_fixture_key(name: str) -> tuple[int, str]:
    """Build a sort key that orders fixtures by the number in their ``F<n>`` id.

    The short id is taken from ``fixture_short``. When it is exactly ``F``
    followed by digits, that number is the primary key; any other name gets
    the rank 10_000. The full name is the secondary key.
    """
    numbered = re.fullmatch(r"F(\d+)", fixture_short(name))
    rank = 10_000 if numbered is None else int(numbered.group(1))
    return (rank, name)
43
+
44
+
45
def active_fixtures(fixtures_root: pathlib.Path) -> set[str]:
    """Collect the names of fixture directories directly under *fixtures_root*.

    Only directories whose name fully matches ``F<digits>-<something>`` are
    included; files and differently named directories are ignored.

    Raises:
        ValueError: if *fixtures_root* is not an existing directory.
    """
    if not fixtures_root.is_dir():
        raise ValueError(f"fixtures root missing: {fixtures_root}")
    names: set[str] = set()
    for entry in fixtures_root.iterdir():
        if not entry.is_dir():
            continue
        if re.fullmatch(r"F\d+-.+", entry.name):
            names.add(entry.name)
    return names
53
+
54
+
55
def registry_short_ids(registry: pathlib.Path) -> set[str]:
    """Extract the short fixture ids listed in the rejected-fixture registry.

    Each line of the registry file is matched against the shape
    ``<ID>-*|<ID>)`` (optionally indented), where ``<ID>`` is ``F`` or ``S``
    followed by digits. A line only counts when both ids on it are equal.

    Raises:
        ValueError: if *registry* is not a file, or if no line yields an id.
    """
    if not registry.is_file():
        raise ValueError(f"rejected fixture registry missing: {registry}")
    entry = re.compile(r"\s*([FS]\d+)-\*\|([FS]\d+)\)")
    hits = (entry.match(line) for line in registry.read_text().splitlines())
    short_ids = {
        hit.group(1)
        for hit in hits
        if hit and hit.group(1) == hit.group(2)
    }
    if not short_ids:
        raise ValueError(f"rejected fixture registry has no fixture entries: {registry}")
    return short_ids
66
+
67
+
68
def rejected_reason(registry: pathlib.Path, fixture: str) -> str | None:
    """Ask the registry script why *fixture* is rejected.

    Runs ``bash``, sources the registry file and calls its
    ``rejected_pair_fixture_reason`` function with the fixture name. The
    path and name are passed as positional arguments rather than being
    interpolated into the command string.

    Returns:
        The function's stdout with surrounding whitespace stripped when the
        shell exits with status 0, otherwise ``None``.
    """
    command = [
        "bash",
        "-c",
        'source "$1"; rejected_pair_fixture_reason "$2"',
        "bash",
        str(registry),
        fixture,
    ]
    completed = subprocess.run(
        command,
        text=True,
        capture_output=True,
        check=False,
    )
    if completed.returncode != 0:
        return None
    return completed.stdout.strip()
86
+
87
+
88
def load_json_object(path: pathlib.Path) -> dict[str, Any] | None:
    """Read *path* and parse it with ``loads_strict_json_object``.

    Returns ``None`` instead of raising when the file cannot be read or the
    parser rejects its content (``OSError`` / ``ValueError``).
    """
    try:
        return loads_strict_json_object(path.read_text())
    except (OSError, ValueError, json.JSONDecodeError):
        return None
94
+
95
+
96
def load_headroom_gate(path: pathlib.Path) -> tuple[dict[str, Any] | None, str | None]:
    """Load a headroom gate file, reporting a load failure as a message.

    Returns:
        ``(data, None)`` when the file was read and parsed by
        ``loads_strict_json_object``; ``(None, message)`` when reading or
        parsing raised ``OSError`` / ``ValueError``.
    """
    try:
        parsed = loads_strict_json_object(path.read_text())
    except (OSError, ValueError, json.JSONDecodeError):
        parsed, problem = None, "headroom-gate.json must be valid JSON object"
    else:
        problem = None
    return parsed, problem
102
+
103
+
104
def pair_result_trigger_reasons(
    results_root: pathlib.Path,
    *,
    run_id: str,
    fixture: str,
    pair_arm: str,
) -> list[str]:
    """Read validated pair-trigger reasons from a pair arm's ``result.json``.

    The file is looked up at
    ``<results_root>/<run_id>/<fixture>/<pair_arm>/result.json``.

    Returns:
        The ``pair_trigger.reasons`` list when it is a non-empty list of
        strings accepted by all three pair-evidence-contract predicates;
        otherwise an empty list (including when the file is unreadable or
        does not parse).
    """
    result_path = results_root / run_id / fixture / pair_arm / "result.json"
    try:
        payload = loads_strict_json_object(result_path.read_text())
    except (OSError, ValueError, json.JSONDecodeError):
        return []

    trigger = payload.get("pair_trigger")
    if not isinstance(trigger, dict):
        return []

    reasons = trigger.get("reasons")
    if not isinstance(reasons, list) or not reasons:
        return []
    if not all(isinstance(reason, str) for reason in reasons):
        return []

    # Evaluated lazily and in this order, so a failing check short-circuits.
    contract_checks = (
        has_known_pair_trigger_reason,
        all_known_pair_trigger_reasons,
        has_canonical_pair_trigger_reason,
    )
    if not all(check(reasons) for check in contract_checks):
        return []
    return reasons
130
+
131
+
132
def fixtures_with_passing_pair_evidence(
    results_root: pathlib.Path,
    *,
    min_pair_margin: int,
    max_pair_solo_wall_ratio: float,
) -> set[str]:
    """Return fixtures that have passing pair-gate evidence under *results_root*.

    Every ``*/full-pipeline-pair-gate.json`` whose verdict is ``PASS`` is
    scanned. A row counts when its status is ``PASS``, it normalizes through
    ``normalize_pair_evidence_row``, its ``pair_margin`` is at least
    *min_pair_margin* and its ``pair_solo_wall_ratio`` is at most
    *max_pair_solo_wall_ratio*.

    Args:
        results_root: directory holding one sub-directory per run.
        min_pair_margin: minimum accepted pair margin.
        max_pair_solo_wall_ratio: maximum accepted pair/solo wall-time ratio.

    Returns:
        The set of fixture names with qualifying evidence; empty when
        *results_root* is not a directory.
    """
    fixtures: set[str] = set()
    if not results_root.is_dir():
        return fixtures
    for gate_path in results_root.glob("*/full-pipeline-pair-gate.json"):
        gate = load_json_object(gate_path)
        if gate is None or gate.get("verdict") != "PASS":
            continue
        rows = gate.get("rows")
        if not isinstance(rows, list):
            # A gate whose "rows" is missing, null or not an array carries no
            # usable evidence; skip it rather than crash while iterating.
            continue
        run_id = str(gate.get("run_id") or gate_path.parent.name)
        pair_arm = gate.get("pair_arm")
        for row in rows:
            if not isinstance(row, dict) or row.get("status") != "PASS":
                continue
            fixture = row.get("fixture")
            if not isinstance(fixture, str):
                continue
            candidate_row = row
            if row.get("pair_trigger_reasons") is None and isinstance(pair_arm, str):
                # The gate row carries no trigger reasons: fall back to the
                # ones recorded in the pair arm's own result.json.
                reasons = pair_result_trigger_reasons(
                    results_root,
                    run_id=run_id,
                    fixture=fixture,
                    pair_arm=pair_arm,
                )
                if reasons:
                    # Copy so the loaded gate row is not mutated.
                    candidate_row = dict(row)
                    candidate_row["pair_trigger_reasons"] = reasons
                    candidate_row["pair_trigger_has_canonical_reason"] = True
            evidence = normalize_pair_evidence_row(
                fixture=fixture,
                run_id=run_id,
                pair_arm=pair_arm,
                row=candidate_row,
            )
            if evidence is None:
                continue
            if (
                evidence["pair_margin"] >= min_pair_margin
                and evidence["pair_solo_wall_ratio"] <= max_pair_solo_wall_ratio
            ):
                fixtures.add(fixture)
    return fixtures
179
+
180
+
181
def failed_headroom_rows(
    *,
    fixtures_root: pathlib.Path,
    results_root: pathlib.Path,
) -> dict[str, list[dict[str, Any]]]:
    """Group non-passing headroom rows of active fixtures by fixture name.

    Scans ``*/headroom-gate.json`` under *results_root* in sorted order and
    keeps only gates whose verdict is ``FAIL`` and whose ``rows`` is a list.
    Each kept row is copied and stamped with the gate's ``run_id`` (falling
    back to the run directory name).

    Returns:
        Mapping of fixture name to its observed rows; empty when
        *results_root* is not a directory.

    Raises:
        ValueError: propagated from ``active_fixtures`` when *fixtures_root*
            is missing.
    """
    known = active_fixtures(fixtures_root)
    collected: dict[str, list[dict[str, Any]]] = {}
    if not results_root.is_dir():
        return collected

    for gate_file in sorted(results_root.glob("*/headroom-gate.json")):
        gate = load_json_object(gate_file)
        if gate is None or gate.get("verdict") != "FAIL":
            continue
        entries = gate.get("rows")
        if not isinstance(entries, list):
            continue
        run_id = str(gate.get("run_id") or gate_file.parent.name)
        for entry in entries:
            if not isinstance(entry, dict):
                continue
            if entry.get("status") == "PASS":
                continue
            name = entry.get("fixture")
            if isinstance(name, str) and name in known:
                collected.setdefault(name, []).append({**entry, "run_id": run_id})
    return collected
208
+
209
+
210
def expected_run_id(reason: str) -> str | None:
    """Extract the run id that a registry reason says it was observed "in".

    Takes the token following the first standalone word ``in`` and accepts it
    only when it contains at least one digit.

    Trailing ``.`` and ``:`` are stripped from the token: both characters are
    allowed inside a run id, so sentence punctuation right after the id
    ("... in 20260510-proof.") would otherwise be captured as part of it.

    Returns:
        The run id, or ``None`` when no such token exists or it has no digit.
    """
    match = re.search(r"\bin\s+([A-Za-z0-9_.:-]+)", reason)
    if match is None:
        return None
    token = match.group(1).rstrip(".:")
    return token if any(char.isdigit() for char in token) else None
216
+
217
+
218
def expected_scores(reason: str) -> tuple[int | None, int | None]:
    """Parse the scores a registry reason claims, as ``(bare, solo)``.

    Two phrasings are recognised, tried in this order:

    * ``bare <n> / solo_claude <m>`` gives ``(n, m)``
    * ``solo_claude scored <m>`` or ``solo_claude score <m>`` gives ``(None, m)``

    Anything else yields ``(None, None)``.
    """
    both = re.search(r"\bbare\s+(\d+)\s*/\s*solo_claude\s+(\d+)\b", reason)
    if both is not None:
        bare_text, solo_text = both.groups()
        return int(bare_text), int(solo_text)

    solo_only = re.search(r"\bsolo_claude\s+(?:scored|score)\s+(\d+)\b", reason)
    if solo_only is None:
        return None, None
    return None, int(solo_only.group(1))
224
+
225
+
226
def score_matches(row: dict[str, Any], expected_bare: int | None, expected_solo: int | None) -> bool:
    """Tell whether *row* carries the expected bare and solo scores.

    An expectation of ``None`` is not checked. Otherwise the row's
    ``bare_score`` / ``solo_score`` entry must equal the expected value.
    """
    expectations = (
        ("bare_score", expected_bare),
        ("solo_score", expected_solo),
    )
    return all(
        wanted is None or row.get(field) == wanted
        for field, wanted in expectations
    )
232
+
233
+
234
def unsupported_registry_rejections(
    *,
    fixtures_root: pathlib.Path,
    registry: pathlib.Path,
    results_root: pathlib.Path,
) -> list[dict[str, Any]]:
    """List registry rejections that no failed headroom artifact backs up.

    For every active fixture (in ``sort_fixture_key`` order) that has a
    rejection reason, the run id and scores named in the reason are compared
    with the failed headroom rows recorded for that fixture. Reasons
    containing "trivial calibration fixture" or "known-limit ambiguity
    fixture" are exempt from this check.

    Returns:
        One dict per unsupported rejection, with a ``problem`` of either
        "no matching failed headroom artifact" or
        "registry score does not match headroom artifact".

    Raises:
        ValueError: propagated from ``active_fixtures``.
    """
    ordered = sorted(active_fixtures(fixtures_root), key=sort_fixture_key)
    failed_by_fixture = failed_headroom_rows(
        fixtures_root=fixtures_root,
        results_root=results_root,
    )
    exempt_markers = ("trivial calibration fixture", "known-limit ambiguity fixture")

    findings: list[dict[str, Any]] = []
    for fixture in ordered:
        reason = rejected_reason(registry, fixture)
        if not reason or any(marker in reason for marker in exempt_markers):
            continue

        observed = failed_by_fixture.get(fixture, [])
        run_id = expected_run_id(reason)
        want_bare, want_solo = expected_scores(reason)
        # Without a run id in the reason, any failed row for the fixture may match.
        if run_id is None:
            candidates = observed
        else:
            candidates = [row for row in observed if row.get("run_id") == run_id]

        if not candidates:
            problem, evidence = "no matching failed headroom artifact", observed
        elif not any(score_matches(row, want_bare, want_solo) for row in candidates):
            problem, evidence = "registry score does not match headroom artifact", candidates
        else:
            continue

        findings.append({
            "fixture": fixture,
            "reason": reason,
            "expected_run_id": run_id,
            "expected_bare_score": want_bare,
            "expected_solo_score": want_solo,
            "observed": evidence,
            "problem": problem,
        })
    return findings
280
+
281
+
282
def unrecorded_headroom_failures(
    *,
    fixtures_root: pathlib.Path,
    registry: pathlib.Path,
    results_root: pathlib.Path,
    min_pair_margin: int,
    max_pair_solo_wall_ratio: float,
) -> list[dict[str, Any]]:
    """Find failed headroom rows that were neither rejected nor pair-proven.

    Walks every ``*/headroom-gate.json`` under *results_root*. A gate that
    fails to load, or a ``FAIL`` gate whose ``rows`` is not a non-empty list,
    is reported with fixture ``<unknown>``. Otherwise each non-passing row of
    an active fixture is reported unless the fixture's short id is in the
    registry or the fixture has passing pair evidence.

    Returns:
        Failure records sorted by fixture (``sort_fixture_key``) and then by
        run id; empty when *results_root* is not a directory.

    Raises:
        ValueError: propagated from ``active_fixtures`` or
            ``registry_short_ids``.
    """
    known = active_fixtures(fixtures_root)
    registered = registry_short_ids(registry)
    proven = fixtures_with_passing_pair_evidence(
        results_root,
        min_pair_margin=min_pair_margin,
        max_pair_solo_wall_ratio=max_pair_solo_wall_ratio,
    )
    found: list[dict[str, Any]] = []
    if not results_root.is_dir():
        return found

    def gate_level_problem(run_id: str, status: str, reason: str) -> dict[str, Any]:
        # Record for a gate file that cannot be attributed to any fixture.
        return {
            "run_id": run_id,
            "fixture": "<unknown>",
            "status": status,
            "reason": reason,
            "bare_score": None,
            "solo_score": None,
        }

    for gate_file in sorted(results_root.glob("*/headroom-gate.json")):
        gate, load_error = load_headroom_gate(gate_file)
        run_id = gate_file.parent.name
        if gate:
            run_id = str(gate.get("run_id") or run_id)
        if load_error:
            found.append(gate_level_problem(run_id, "MALFORMED_JSON", load_error))
            continue
        if gate.get("verdict") != "FAIL":
            continue

        rows = gate.get("rows")
        if not (isinstance(rows, list) and rows):
            found.append(
                gate_level_problem(
                    run_id,
                    "MALFORMED_ROWS",
                    "headroom-gate.json rows must be a non-empty array",
                )
            )
            continue

        for row in rows:
            if not isinstance(row, dict) or row.get("status") == "PASS":
                continue
            fixture = row.get("fixture")
            if not isinstance(fixture, str) or fixture not in known:
                continue
            if fixture_short(fixture) in registered or fixture in proven:
                continue
            status = row.get("status")
            if not (isinstance(status, str) and status):
                status = "MALFORMED"
            found.append({
                "run_id": run_id,
                "fixture": fixture,
                "status": status,
                "reason": row.get("reason") or "",
                "bare_score": row.get("bare_score"),
                "solo_score": row.get("solo_score"),
            })

    found.sort(key=lambda item: (sort_fixture_key(item["fixture"]), item["run_id"]))
    return found
350
+
351
+
352
def main() -> int:
    """Run the audit from the command line and return the process exit code.

    Returns:
        0 when nothing is reported, 1 when there are unrecorded failures or
        unsupported registry rejections, 2 for invalid option values or a
        ``ValueError`` raised while auditing (e.g. a missing fixtures root).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--fixtures-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/fixtures"),
    )
    parser.add_argument(
        "--registry",
        type=pathlib.Path,
        default=pathlib.Path(__file__).with_name("pair-rejected-fixtures.sh"),
    )
    parser.add_argument(
        "--results-root",
        type=pathlib.Path,
        default=pathlib.Path("benchmark/auto-resolve/results"),
    )
    parser.add_argument("--out-json", type=pathlib.Path)
    parser.add_argument(
        "--min-pair-margin",
        type=int,
        default=5,
        help="minimum pair-over-solo margin required to count passing pair evidence",
    )
    parser.add_argument(
        "--max-pair-solo-wall-ratio",
        type=float,
        default=3.0,
        help="maximum pair/solo wall-time ratio allowed to count passing pair evidence",
    )
    args = parser.parse_args()

    margin = args.min_pair_margin
    wall_ratio = args.max_pair_solo_wall_ratio
    if margin < 1:
        print("error: --min-pair-margin must be >= 1", file=sys.stderr)
        return 2
    if not math.isfinite(wall_ratio) or wall_ratio <= 0:
        print("error: --max-pair-solo-wall-ratio must be finite and > 0", file=sys.stderr)
        return 2

    try:
        failures = unrecorded_headroom_failures(
            fixtures_root=args.fixtures_root,
            registry=args.registry,
            results_root=args.results_root,
            min_pair_margin=margin,
            max_pair_solo_wall_ratio=wall_ratio,
        )
        unsupported_rejections = unsupported_registry_rejections(
            fixtures_root=args.fixtures_root,
            registry=args.registry,
            results_root=args.results_root,
        )
    except ValueError as exc:
        print(f"error: {exc}", file=sys.stderr)
        return 2

    clean = not failures and not unsupported_rejections
    report = {
        "verdict": "PASS" if clean else "FAIL",
        "unrecorded_failures": failures,
        "unsupported_registry_rejections": unsupported_rejections,
    }
    # The JSON report is written for both PASS and FAIL verdicts.
    if args.out_json:
        args.out_json.parent.mkdir(parents=True, exist_ok=True)
        args.out_json.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")

    failure_line = (
        "{run_id} {fixture}: status={status} bare={bare_score} "
        "solo_claude={solo_score} reason={reason}"
    )
    rejection_line = (
        "{fixture}: problem={problem} expected_run={expected_run_id} "
        "expected_bare={expected_bare_score} expected_solo_claude={expected_solo_score} "
        "reason={reason}"
    )
    if failures:
        print("unrecorded headroom rejection(s):", file=sys.stderr)
        for item in failures:
            print(failure_line.format_map(item), file=sys.stderr)
    if unsupported_rejections:
        print("unsupported registry rejection(s):", file=sys.stderr)
        for item in unsupported_rejections:
            print(rejection_line.format_map(item), file=sys.stderr)
    if not clean:
        return 1

    print("PASS audit-headroom-rejections")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())