devlyn-cli 2.3.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +80 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +210 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -10,52 +10,210 @@ from __future__ import annotations
10
10
 
11
11
  import argparse
12
12
  import json
13
+ import os
13
14
  import pathlib
15
+ import re
14
16
  import sys
15
17
 
18
+ SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
19
+ if str(SCRIPT_DIR) not in sys.path:
20
+ sys.path.insert(0, str(SCRIPT_DIR))
16
21
 
17
- def load_json(path: pathlib.Path) -> dict | None:
22
+ from pair_evidence_contract import is_score, loads_strict_json_object
23
+
24
+ KNOWN_ARMS = {"bare", "solo_claude"}
25
+ REJECTED_REGISTRY = pathlib.Path(__file__).with_name("pair-rejected-fixtures.sh")
26
+
27
+
28
+ def load_json(path: pathlib.Path) -> tuple[dict | None, str | None]:
18
29
  if not path.is_file():
30
+ return None, "missing"
31
+ try:
32
+ data = loads_strict_json_object(path.read_text())
33
+ except (ValueError, json.JSONDecodeError):
34
+ return None, "malformed"
35
+ return data, None
36
+
37
+
38
+ def bool_flag_failure(value: object, true_reason: str, malformed_reason: str) -> str | None:
39
+ if value is True:
40
+ return true_reason
41
+ if value is False or value is None:
19
42
  return None
20
- return json.loads(path.read_text())
43
+ return malformed_reason
44
+
45
+
46
+ def fixture_short(name: str) -> str:
47
+ return name.split("-", 1)[0] if "-" in name else name
48
+
49
+
50
+ def rejected_registry_path() -> pathlib.Path:
51
+ override = os.environ.get("PAIR_REJECTED_FIXTURES_REGISTRY")
52
+ return pathlib.Path(override) if override else REJECTED_REGISTRY
53
+
54
+
55
+ def load_rejected_short_ids(path: pathlib.Path) -> set[str]:
56
+ if not path.is_file():
57
+ raise ValueError(f"rejected fixture registry missing: {path}")
58
+ rejected = set()
59
+ for line in path.read_text().splitlines():
60
+ match = re.match(r"\s*([FS]\d+)-\*\|([FS]\d+)\)", line)
61
+ if match and match.group(1) == match.group(2):
62
+ rejected.add(match.group(1))
63
+ if not rejected:
64
+ raise ValueError(f"rejected fixture registry has no fixture entries: {path}")
65
+ return rejected
21
66
 
22
67
 
23
68
  def score_for(judge: dict, arm: str) -> int | None:
24
- scores = judge.get("scores_by_arm") or {}
69
+ mapping = judge.get("_blind_mapping")
70
+ if not isinstance(mapping, dict):
71
+ return None
72
+ if arm not in {mapped for slot, mapped in mapping.items() if slot in {"A", "B", "C"}}:
73
+ return None
74
+ raw_scores = judge.get("scores_by_arm")
75
+ scores = raw_scores if isinstance(raw_scores, dict) else {}
25
76
  value = scores.get(arm)
26
- return value if isinstance(value, int) else None
77
+ return value if is_score(value) else None
78
+
79
+
80
+ def axis_validation_counts(judge: dict) -> tuple[dict[str, int], int]:
81
+ raw_mapping = judge.get("_blind_mapping")
82
+ mapping = raw_mapping if isinstance(raw_mapping, dict) else {}
83
+ raw_validation = judge.get("_axis_validation")
84
+ validation = raw_validation if isinstance(raw_validation, dict) else {}
85
+ cells = validation.get("out_of_range_cells") or []
86
+ declared_count = validation.get("out_of_range_count")
87
+ total_invalid = max(
88
+ declared_count if isinstance(declared_count, int) else 0,
89
+ len(cells) if isinstance(cells, list) else 0,
90
+ )
91
+ breakdown_to_letter = {
92
+ "a_breakdown": "A",
93
+ "b_breakdown": "B",
94
+ "c_breakdown": "C",
95
+ }
96
+ counts: dict[str, int] = {}
97
+ mapped_count = 0
98
+ if not isinstance(cells, list):
99
+ return counts, total_invalid
100
+ for cell in cells:
101
+ if not isinstance(cell, dict):
102
+ continue
103
+ letter = breakdown_to_letter.get(cell.get("breakdown"))
104
+ arm = mapping.get(letter) if letter else None
105
+ if arm in KNOWN_ARMS:
106
+ counts[arm] = counts.get(arm, 0) + 1
107
+ mapped_count += 1
108
+ return counts, max(0, total_invalid - mapped_count)
109
+
110
+
111
+ def axis_invalid_count(judge: dict, arm: str) -> int:
112
+ counts, _ = axis_validation_counts(judge)
113
+ return counts.get(arm, 0)
114
+
115
+
116
+ def axis_unmapped_invalid_count(judge: dict) -> int:
117
+ _, unmapped = axis_validation_counts(judge)
118
+ return unmapped
27
119
 
28
120
 
29
- def arm_clean_failures(fixture_dir: pathlib.Path, judge: dict, arm: str) -> list[str]:
121
+ def blind_mapping_failures(judge: dict, required_arms: set[str]) -> list[str]:
122
+ mapping = judge.get("_blind_mapping")
123
+ if not isinstance(mapping, dict):
124
+ return ["judge blind mapping missing"]
125
+ mapped_arms = {arm for key, arm in mapping.items() if key in {"A", "B", "C"}}
126
+ missing = sorted(required_arms - mapped_arms)
127
+ if missing:
128
+ return [f"judge blind mapping missing arm(s): {', '.join(missing)}"]
129
+ return []
130
+
131
+
132
+ def arm_complete_failures(fixture_dir: pathlib.Path, judge: dict, arm: str) -> list[str]:
30
133
  failures: list[str] = []
31
- result = load_json(fixture_dir / arm / "result.json")
32
- verify = load_json(fixture_dir / arm / "verify.json")
33
- if result is None:
34
- failures.append(f"{arm} result.json missing")
35
- if verify is None:
36
- failures.append(f"{arm} verify.json missing")
37
- dq_by_arm = judge.get("disqualifiers_by_arm") or {}
38
- if bool((dq_by_arm.get(arm) or {}).get("disqualifier")):
39
- failures.append(f"{arm} judge disqualifier")
134
+ result, result_error = load_json(fixture_dir / arm / "result.json")
135
+ verify, verify_error = load_json(fixture_dir / arm / "verify.json")
136
+ diff = fixture_dir / arm / "diff.patch"
137
+ if result_error:
138
+ failures.append(f"{arm} result.json {result_error}")
139
+ if verify_error:
140
+ failures.append(f"{arm} verify.json {verify_error}")
141
+ if not diff.is_file():
142
+ failures.append(f"{arm} diff.patch missing")
143
+ raw_dq_by_arm = judge.get("disqualifiers_by_arm")
144
+ dq_by_arm = raw_dq_by_arm if isinstance(raw_dq_by_arm, dict) else {}
145
+ dq_entry = dq_by_arm.get(arm)
146
+ dq_value = dq_entry.get("disqualifier") if isinstance(dq_entry, dict) else dq_entry
147
+ judge_dq_failure = bool_flag_failure(
148
+ dq_value,
149
+ f"{arm} judge disqualifier",
150
+ f"{arm} judge disqualifier malformed",
151
+ )
152
+ if judge_dq_failure:
153
+ failures.append(judge_dq_failure)
154
+ axis_invalid = axis_invalid_count(judge, arm)
155
+ if axis_invalid > 0:
156
+ failures.append(f"{arm} judge axis-invalid ({axis_invalid})")
40
157
  if result is not None:
41
- if bool(result.get("disqualifier")):
42
- failures.append(f"{arm} result disqualifier")
43
- if bool(result.get("timed_out")):
44
- failures.append(f"{arm} timed out")
45
- if bool(result.get("invoke_failure")):
46
- failures.append(f"{arm} invoke failure")
47
- if verify is not None and bool(verify.get("disqualifier")):
48
- failures.append(f"{arm} verify disqualifier")
158
+ for field, true_reason in (
159
+ ("disqualifier", f"{arm} result disqualifier"),
160
+ ("timed_out", f"{arm} timed out"),
161
+ ("invoke_failure", f"{arm} invoke failure"),
162
+ ("environment_contamination", f"{arm} environment contamination"),
163
+ ):
164
+ failure = bool_flag_failure(
165
+ result.get(field),
166
+ true_reason,
167
+ f"{arm} result {field} malformed",
168
+ )
169
+ if failure:
170
+ failures.append(failure)
171
+ if verify is not None:
172
+ verify_dq_failure = bool_flag_failure(
173
+ verify.get("disqualifier"),
174
+ f"{arm} verify disqualifier",
175
+ f"{arm} verify disqualifier malformed",
176
+ )
177
+ if verify_dq_failure:
178
+ failures.append(verify_dq_failure)
49
179
  return failures
50
180
 
51
181
 
182
+ def positive_int(value: str) -> int:
183
+ parsed = int(value)
184
+ if parsed <= 0:
185
+ raise argparse.ArgumentTypeError("value must be > 0")
186
+ return parsed
187
+
188
+
189
+ def non_negative_int(value: str) -> int:
190
+ parsed = int(value)
191
+ if parsed < 0:
192
+ raise argparse.ArgumentTypeError("value must be >= 0")
193
+ return parsed
194
+
195
+
196
+ def remaining_headroom(score: int | None, max_score: int) -> int | None:
197
+ return max_score - score if isinstance(score, int) else None
198
+
199
+
200
+ def average(values: list[int]) -> float | None:
201
+ return (sum(values) / len(values)) if values else None
202
+
203
+
204
+ def fmt_float(value: float | None) -> str:
205
+ return f"{value:.1f}" if isinstance(value, (int, float)) else "n/a"
206
+
207
+
52
208
  def main() -> int:
53
209
  parser = argparse.ArgumentParser()
54
210
  parser.add_argument("--run-id", required=True)
55
211
  parser.add_argument("--results-root", default="benchmark/auto-resolve/results")
56
212
  parser.add_argument("--bare-max", type=int, default=60)
57
213
  parser.add_argument("--solo-max", type=int, default=80)
58
- parser.add_argument("--min-fixtures", type=int, default=2)
214
+ parser.add_argument("--min-bare-headroom", type=non_negative_int, default=5)
215
+ parser.add_argument("--min-solo-headroom", type=non_negative_int, default=5)
216
+ parser.add_argument("--min-fixtures", type=positive_int, default=2)
59
217
  parser.add_argument("--out-json", default=None)
60
218
  parser.add_argument("--out-md", default=None)
61
219
  args = parser.parse_args()
@@ -66,52 +224,117 @@ def main() -> int:
66
224
  return 2
67
225
 
68
226
  rows = []
227
+ try:
228
+ rejected_short_ids = load_rejected_short_ids(rejected_registry_path())
229
+ except ValueError as exc:
230
+ print(str(exc), file=sys.stderr)
231
+ return 2
69
232
  for fixture_dir in sorted(p for p in res_root.iterdir() if p.is_dir()):
70
- judge = load_json(fixture_dir / "judge.json")
233
+ judge, judge_error = load_json(fixture_dir / "judge.json")
71
234
  if judge is None:
72
235
  rows.append({
73
236
  "fixture": fixture_dir.name,
74
237
  "status": "MISSING_JUDGE",
75
- "reason": "judge.json missing",
238
+ "reason": f"judge.json {judge_error}",
76
239
  })
77
240
  continue
78
241
  bare = score_for(judge, "bare")
79
242
  solo = score_for(judge, "solo_claude")
80
- bare_clean_failures = arm_clean_failures(fixture_dir, judge, "bare")
81
- solo_clean_failures = arm_clean_failures(fixture_dir, judge, "solo_claude")
82
- bare_ok = bare is not None and bare <= args.bare_max and not bare_clean_failures
83
- solo_ok = solo is not None and solo <= args.solo_max and not solo_clean_failures
84
- status = "PASS" if bare_ok and solo_ok else "FAIL"
243
+ bare_headroom = remaining_headroom(bare, args.bare_max)
244
+ solo_headroom = remaining_headroom(solo, args.solo_max)
245
+ bare_complete_failures = arm_complete_failures(fixture_dir, judge, "bare")
246
+ solo_complete_failures = arm_complete_failures(fixture_dir, judge, "solo_claude")
247
+ unmapped_axis_invalid = axis_unmapped_invalid_count(judge)
248
+ mapping_failures = blind_mapping_failures(judge, KNOWN_ARMS)
249
+ rejected = fixture_short(fixture_dir.name) in rejected_short_ids
250
+ bare_headroom_ok = (
251
+ isinstance(bare_headroom, int)
252
+ and bare_headroom >= args.min_bare_headroom
253
+ )
254
+ solo_headroom_ok = (
255
+ isinstance(solo_headroom, int)
256
+ and solo_headroom >= args.min_solo_headroom
257
+ )
258
+ bare_ok = (
259
+ bare is not None
260
+ and bare <= args.bare_max
261
+ and bare_headroom_ok
262
+ and not bare_complete_failures
263
+ )
264
+ solo_ok = (
265
+ solo is not None
266
+ and solo <= args.solo_max
267
+ and solo_headroom_ok
268
+ and not solo_complete_failures
269
+ )
270
+ judge_ok = unmapped_axis_invalid == 0 and not mapping_failures
271
+ status = "PASS" if bare_ok and solo_ok and judge_ok and not rejected else "FAIL"
85
272
  reasons = []
86
273
  if bare is None:
87
274
  reasons.append("bare score missing")
88
275
  elif bare > args.bare_max:
89
276
  reasons.append(f"bare score {bare} > {args.bare_max}")
277
+ elif bare_headroom is not None and bare_headroom < args.min_bare_headroom:
278
+ reasons.append(
279
+ f"bare headroom {bare_headroom} < {args.min_bare_headroom}"
280
+ )
90
281
  if solo is None:
91
282
  reasons.append("solo_claude score missing")
92
283
  elif solo > args.solo_max:
93
284
  reasons.append(f"solo_claude score {solo} > {args.solo_max}")
94
- reasons.extend(bare_clean_failures)
95
- reasons.extend(solo_clean_failures)
285
+ elif solo_headroom is not None and solo_headroom < args.min_solo_headroom:
286
+ reasons.append(
287
+ f"solo_claude headroom {solo_headroom} < {args.min_solo_headroom}"
288
+ )
289
+ if unmapped_axis_invalid > 0:
290
+ reasons.append(f"judge axis-invalid unmapped ({unmapped_axis_invalid})")
291
+ reasons.extend(mapping_failures)
292
+ if rejected:
293
+ reasons.append("fixture rejected for pair-candidate runs")
294
+ reasons.extend(bare_complete_failures)
295
+ reasons.extend(solo_complete_failures)
96
296
  rows.append({
97
297
  "fixture": fixture_dir.name,
98
298
  "status": status,
99
299
  "bare_score": bare,
100
300
  "solo_score": solo,
301
+ "bare_headroom": bare_headroom,
302
+ "solo_headroom": solo_headroom,
101
303
  "reason": "; ".join(reasons) if reasons else "",
102
304
  })
103
305
 
104
306
  pass_count = sum(1 for row in rows if row["status"] == "PASS")
105
307
  fixture_count_ok = len(rows) >= args.min_fixtures
106
308
  verdict = "PASS" if pass_count == len(rows) and rows and fixture_count_ok else "FAIL"
309
+ bare_headrooms = [
310
+ value for row in rows
311
+ if isinstance((value := row.get("bare_headroom")), int)
312
+ ]
313
+ solo_headrooms = [
314
+ value for row in rows
315
+ if isinstance((value := row.get("solo_headroom")), int)
316
+ ]
107
317
  payload = {
108
318
  "run_id": args.run_id,
109
- "rule": f"at least {args.min_fixtures} candidate fixtures; each must satisfy bare <= {args.bare_max} and solo_claude <= {args.solo_max}, with both arms clean",
319
+ "rule": (
320
+ f"at least {args.min_fixtures} candidate fixtures; each must satisfy "
321
+ f"bare <= {args.bare_max} with headroom >= {args.min_bare_headroom}, "
322
+ f"solo_claude <= {args.solo_max} with headroom >= {args.min_solo_headroom}, "
323
+ "with both baseline arms evidence-complete"
324
+ ),
110
325
  "verdict": verdict,
111
326
  "fixtures_total": len(rows),
112
327
  "fixtures_passed": pass_count,
113
328
  "min_fixtures": args.min_fixtures,
329
+ "bare_max": args.bare_max,
330
+ "solo_max": args.solo_max,
331
+ "min_bare_headroom_required": args.min_bare_headroom,
332
+ "min_solo_headroom_required": args.min_solo_headroom,
114
333
  "fixture_count_ok": fixture_count_ok,
334
+ "avg_bare_headroom": average(bare_headrooms),
335
+ "min_bare_headroom": min(bare_headrooms) if bare_headrooms else None,
336
+ "avg_solo_headroom": average(solo_headrooms),
337
+ "min_solo_headroom": min(solo_headrooms) if solo_headrooms else None,
115
338
  "rows": rows,
116
339
  }
117
340
 
@@ -123,16 +346,24 @@ def main() -> int:
123
346
  "",
124
347
  f"Verdict: **{verdict}**",
125
348
  "",
126
- f"Rule: at least {args.min_fixtures} fixtures; bare <= {args.bare_max}, "
127
- f"solo_claude <= {args.solo_max}, both arms clean.",
349
+ f"Fixtures passed: {pass_count}/{len(rows)} (minimum required: {args.min_fixtures})",
350
+ "",
351
+ f"Rule: at least {args.min_fixtures} fixtures; bare <= {args.bare_max} "
352
+ f"with headroom >= {args.min_bare_headroom}, solo_claude <= {args.solo_max} "
353
+ f"with headroom >= {args.min_solo_headroom}, both baseline arms evidence-complete.",
354
+ f"Average bare headroom: {fmt_float(payload['avg_bare_headroom'])}",
355
+ f"Minimum bare headroom: {payload['min_bare_headroom'] if payload['min_bare_headroom'] is not None else 'n/a'}",
356
+ f"Average solo_claude headroom: {fmt_float(payload['avg_solo_headroom'])}",
357
+ f"Minimum solo_claude headroom: {payload['min_solo_headroom'] if payload['min_solo_headroom'] is not None else 'n/a'}",
128
358
  "",
129
- "| Fixture | Bare | Solo | Status | Reason |",
130
- "|---|---:|---:|---|---|",
359
+ "| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Status | Reason |",
360
+ "|---|---:|---:|---:|---:|---|---|",
131
361
  ]
132
362
  for row in rows:
133
363
  lines.append(
134
- f"| {row['fixture']} | {row.get('bare_score')} | {row.get('solo_score')} | "
135
- f"{row['status']} | {row.get('reason', '')} |"
364
+ f"| {row['fixture']} | {row.get('bare_score')} | {row.get('bare_headroom')} | "
365
+ f"{row.get('solo_score')} | {row.get('solo_headroom')} | {row['status']} | "
366
+ f"{row.get('reason', '')} |"
136
367
  )
137
368
  report = "\n".join(lines) + "\n"
138
369
  if args.out_md: