devlyn-cli 2.3.0 → 2.3.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
Files changed (219)
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +80 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +210 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -48,10 +48,151 @@ test -x "$CASE_DIR/setup.sh"
48
48
  grep -q 'SWE-bench local__repo-1' "$CASE_DIR/spec.md"
49
49
  grep -q -- '--pair-mode gated' "$CASE_DIR/run-command.txt"
50
50
 
51
- python3 "$ROOT/benchmark/auto-resolve/scripts/fetch-swebench-instances.py" \
52
- --dataset lite \
53
- --limit 1 \
54
- --out "$TMP/fetched-lite.jsonl" > "$TMP/fetch.json"
51
+ set +e
52
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py" \
53
+ --instance-json "$TMP/instance.json" \
54
+ --model-patch "$TMP/model.patch" \
55
+ --cases-root "$TMP/cases-bad-timeout" \
56
+ --repos-root "$TMP/repos-bad-timeout" \
57
+ --repo-dir "$REPO" \
58
+ --timeout-seconds 0 > "$TMP/prepare-bad-timeout.log" 2>&1
59
+ bad_timeout_status=$?
60
+ set -e
61
+ [ "$bad_timeout_status" -ne 0 ]
62
+ grep -Fq 'must be > 0' "$TMP/prepare-bad-timeout.log"
63
+
64
+ python3 - "$TMP/instance.json" "$TMP/instance-bad-repo.json" "$TMP/instance-bad-base.json" <<'PY'
65
+ import json, pathlib, sys
66
+ instance = json.loads(pathlib.Path(sys.argv[1]).read_text())
67
+ bad_repo = dict(instance, repo="../bad/repo")
68
+ bad_base = dict(instance, base_commit="../bad")
69
+ pathlib.Path(sys.argv[2]).write_text(json.dumps(bad_repo) + "\n")
70
+ pathlib.Path(sys.argv[3]).write_text(json.dumps(bad_base) + "\n")
71
+ PY
72
+ set +e
73
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py" \
74
+ --instance-json "$TMP/instance-bad-repo.json" \
75
+ --model-patch "$TMP/model.patch" \
76
+ --cases-root "$TMP/cases-bad-repo" \
77
+ --repos-root "$TMP/repos-bad-repo" \
78
+ --repo-dir "$REPO" > "$TMP/prepare-bad-repo.log" 2>&1
79
+ bad_repo_status=$?
80
+ set -e
81
+ [ "$bad_repo_status" -ne 0 ]
82
+ grep -Fq 'unsafe SWE-bench repo' "$TMP/prepare-bad-repo.log"
83
+
84
+ set +e
85
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py" \
86
+ --instance-json "$TMP/instance-bad-base.json" \
87
+ --model-patch "$TMP/model.patch" \
88
+ --cases-root "$TMP/cases-bad-base" \
89
+ --repos-root "$TMP/repos-bad-base" \
90
+ --repo-dir "$REPO" > "$TMP/prepare-bad-base.log" 2>&1
91
+ bad_base_status=$?
92
+ set -e
93
+ [ "$bad_base_status" -ne 0 ]
94
+ grep -Fq 'unsafe SWE-bench base_commit' "$TMP/prepare-bad-base.log"
95
+
96
+ cat > "$TMP/instance-nan-case.json" <<JSON
97
+ {
98
+ "instance_id": "local__repo-1",
99
+ "repo": NaN,
100
+ "base_commit": "$BASE_SHA",
101
+ "problem_statement": "Change app.txt."
102
+ }
103
+ JSON
104
+ set +e
105
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py" \
106
+ --instance-json "$TMP/instance-nan-case.json" \
107
+ --model-patch "$TMP/model.patch" \
108
+ --cases-root "$TMP/cases-nan-case" \
109
+ --repos-root "$TMP/repos-nan-case" \
110
+ --repo-dir "$REPO" > "$TMP/prepare-nan-case.log" 2>&1
111
+ prepare_nan_case_status=$?
112
+ set -e
113
+ [ "$prepare_nan_case_status" -ne 0 ]
114
+ grep -Fq 'invalid JSON numeric constant: NaN' "$TMP/prepare-nan-case.log"
115
+
116
+ set +e
117
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py" \
118
+ --instances-jsonl "$TMP/instance-bad-repo.json" \
119
+ --instance-id local__repo-1 \
120
+ --repos-root "$TMP/repos-solver-bad-repo" \
121
+ --worktrees-root "$TMP/worktrees-bad-repo" > "$TMP/solver-bad-repo.log" 2>&1
122
+ solver_bad_repo_status=$?
123
+ set -e
124
+ [ "$solver_bad_repo_status" -ne 0 ]
125
+ grep -Fq 'unsafe SWE-bench repo' "$TMP/solver-bad-repo.log"
126
+
127
+ set +e
128
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py" \
129
+ --instances-jsonl "$TMP/instance-bad-base.json" \
130
+ --instance-id local__repo-1 \
131
+ --repos-root "$TMP/repos-solver-bad-base" \
132
+ --worktrees-root "$TMP/worktrees-bad-base" > "$TMP/solver-bad-base.log" 2>&1
133
+ solver_bad_base_status=$?
134
+ set -e
135
+ [ "$solver_bad_base_status" -ne 0 ]
136
+ grep -Fq 'unsafe SWE-bench base_commit' "$TMP/solver-bad-base.log"
137
+
138
+ cat > "$TMP/instance-nan-solver.jsonl" <<'EOF'
139
+ {"instance_id": "local__repo-1", "repo": NaN, "base_commit": "abc123", "problem_statement": "Change app.txt."}
140
+ EOF
141
+ set +e
142
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py" \
143
+ --instances-jsonl "$TMP/instance-nan-solver.jsonl" \
144
+ --instance-id local__repo-1 \
145
+ --repos-root "$TMP/repos-solver-nan" \
146
+ --worktrees-root "$TMP/worktrees-nan" > "$TMP/solver-nan.log" 2>&1
147
+ solver_nan_status=$?
148
+ set -e
149
+ [ "$solver_nan_status" -ne 0 ]
150
+ grep -Fq 'invalid JSON numeric constant: NaN' "$TMP/solver-nan.log"
151
+
152
+ python3 - "$ROOT/benchmark/auto-resolve/scripts/fetch-swebench-instances.py" "$TMP/fetched-lite.jsonl" > "$TMP/fetch.json" <<'PY'
153
+ import importlib.util
154
+ import pathlib
155
+ import sys
156
+
157
+ script = pathlib.Path(sys.argv[1])
158
+ out = pathlib.Path(sys.argv[2])
159
+ spec = importlib.util.spec_from_file_location("fetch_swebench_instances", script)
160
+ module = importlib.util.module_from_spec(spec)
161
+ assert spec.loader is not None
162
+ sys.path.insert(0, str(script.parent))
163
+ spec.loader.exec_module(module)
164
+
165
+ def fake_fetch_rows(dataset, split, offset, length):
166
+ assert dataset == "princeton-nlp/SWE-bench_Lite"
167
+ assert split == "test"
168
+ assert offset == 0
169
+ assert length == 1
170
+ return {
171
+ "num_rows_total": 1,
172
+ "rows": [
173
+ {
174
+ "row": {
175
+ "instance_id": "local__repo-1",
176
+ "repo": "local/repo",
177
+ "base_commit": "abc123",
178
+ "problem_statement": "Change app.txt.",
179
+ }
180
+ }
181
+ ],
182
+ }
183
+
184
+ module.fetch_rows = fake_fetch_rows
185
+ sys.argv = [
186
+ "fetch-swebench-instances.py",
187
+ "--dataset",
188
+ "lite",
189
+ "--limit",
190
+ "1",
191
+ "--out",
192
+ str(out),
193
+ ]
194
+ raise SystemExit(module.main())
195
+ PY
55
196
  grep -q '"rows_written": 1' "$TMP/fetch.json"
56
197
  python3 - "$TMP/fetched-lite.jsonl" <<'PY'
57
198
  import json, pathlib, sys
@@ -60,6 +201,52 @@ for key in ("instance_id", "repo", "base_commit", "problem_statement"):
60
201
  assert row.get(key), key
61
202
  PY
62
203
 
204
+ python3 - "$ROOT/benchmark/auto-resolve/scripts/fetch-swebench-instances.py" <<'PY'
205
+ import importlib.util
206
+ import pathlib
207
+ import sys
208
+
209
+ script = pathlib.Path(sys.argv[1])
210
+ spec = importlib.util.spec_from_file_location("fetch_swebench_instances", script)
211
+ module = importlib.util.module_from_spec(spec)
212
+ assert spec.loader is not None
213
+ sys.path.insert(0, str(script.parent))
214
+ spec.loader.exec_module(module)
215
+
216
+ def fake_fetch_rows(dataset, split, offset, length):
217
+ return {
218
+ "num_rows_total": 1,
219
+ "rows": [{"row": ["not", "an", "object"]}],
220
+ }
221
+
222
+ module.fetch_rows = fake_fetch_rows
223
+ sys.argv = [
224
+ "fetch-swebench-instances.py",
225
+ "--dataset",
226
+ "lite",
227
+ "--limit",
228
+ "1",
229
+ "--out",
230
+ "/tmp/fetch-malformed-row.jsonl",
231
+ ]
232
+ try:
233
+ module.main()
234
+ except ValueError as exc:
235
+ assert "malformed fetched row 1: row must be object" in str(exc), str(exc)
236
+ else:
237
+ raise AssertionError("malformed fetched row was accepted")
238
+ PY
239
+
240
+ set +e
241
+ python3 "$ROOT/benchmark/auto-resolve/scripts/fetch-swebench-instances.py" \
242
+ --dataset lite \
243
+ --limit 0 \
244
+ --out "$TMP/fetched-empty-limit.jsonl" > "$TMP/fetch-empty-limit.log" 2>&1
245
+ fetch_empty_limit_status=$?
246
+ set -e
247
+ [ "$fetch_empty_limit_status" -ne 0 ]
248
+ grep -Fq 'must be > 0' "$TMP/fetch-empty-limit.log"
249
+
63
250
  python3 - "$TMP/instance.json" "$TMP/instances.jsonl" "$TMP/model.patch" "$TMP/predictions.jsonl" <<'PY'
64
251
  import json, pathlib, sys
65
252
  instance = json.loads(pathlib.Path(sys.argv[1]).read_text())
@@ -88,6 +275,51 @@ assert row["model_name_or_path"] == "local-patch-root"
88
275
  assert row["model_patch"].endswith("\n")
89
276
  PY
90
277
 
278
+ python3 - "$TMP/instance.json" "$TMP/instances-with-missing.jsonl" <<'PY'
279
+ import json, pathlib, sys
280
+ instance = json.loads(pathlib.Path(sys.argv[1]).read_text())
281
+ missing = dict(instance, instance_id="local__repo-missing")
282
+ pathlib.Path(sys.argv[2]).write_text(json.dumps(instance) + "\n" + json.dumps(missing) + "\n")
283
+ PY
284
+ set +e
285
+ python3 "$ROOT/benchmark/auto-resolve/scripts/collect-swebench-predictions.py" \
286
+ --patch-root "$TMP/patch-root" \
287
+ --instances-jsonl "$TMP/instances-with-missing.jsonl" \
288
+ --model-name local-patch-root \
289
+ --out "$TMP/collected-missing-predictions.jsonl" > "$TMP/collect-missing.log" 2>&1
290
+ collect_missing_status=$?
291
+ set -e
292
+ [ "$collect_missing_status" -ne 0 ]
293
+ grep -q 'missing patch.diff for instance ids: local__repo-missing' "$TMP/collect-missing.log"
294
+
295
+ mkdir -p "$TMP/patch-root-empty/local__repo-1"
296
+ : > "$TMP/patch-root-empty/local__repo-1/patch.diff"
297
+ set +e
298
+ python3 "$ROOT/benchmark/auto-resolve/scripts/collect-swebench-predictions.py" \
299
+ --patch-root "$TMP/patch-root-empty" \
300
+ --instances-jsonl "$TMP/instances.jsonl" \
301
+ --model-name local-patch-root \
302
+ --out "$TMP/collected-empty-predictions.jsonl" \
303
+ --allow-empty > "$TMP/collect-empty.log" 2>&1
304
+ collect_empty_status=$?
305
+ set -e
306
+ [ "$collect_empty_status" -ne 0 ]
307
+ grep -q 'no non-empty patches collected' "$TMP/collect-empty.log"
308
+
309
+ cat > "$TMP/instances-nan.jsonl" <<'EOF'
310
+ {"instance_id": NaN, "repo": "local/repo", "base_commit": "abc123", "problem_statement": "Change app.txt."}
311
+ EOF
312
+ set +e
313
+ python3 "$ROOT/benchmark/auto-resolve/scripts/collect-swebench-predictions.py" \
314
+ --patch-root "$TMP/patch-root" \
315
+ --instances-jsonl "$TMP/instances-nan.jsonl" \
316
+ --model-name local-patch-root \
317
+ --out "$TMP/collected-nan-instances.jsonl" > "$TMP/collect-nan-instances.log" 2>&1
318
+ collect_nan_instances_status=$?
319
+ set -e
320
+ [ "$collect_nan_instances_status" -ne 0 ]
321
+ grep -Fq 'invalid JSON numeric constant: NaN' "$TMP/collect-nan-instances.log"
322
+
91
323
  rm -rf "$TMP/cases-batch" "$TMP/repos-batch"
92
324
  python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py" \
93
325
  --instances-jsonl "$TMP/instances.jsonl" \
@@ -99,6 +331,85 @@ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py"
99
331
  grep -q '"prepared_count": 1' "$TMP/manifest.json"
100
332
  test -f "$TMP/cases-batch/local__repo-1/model.patch"
101
333
 
334
+ set +e
335
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py" \
336
+ --instances-jsonl "$TMP/instances.jsonl" \
337
+ --predictions-jsonl "$TMP/predictions.jsonl" \
338
+ --cases-root "$TMP/cases-batch-empty-limit" \
339
+ --repos-root "$TMP/repos-batch-empty-limit" \
340
+ --repo-dir "$REPO" \
341
+ --limit 0 > "$TMP/batch-empty-limit.log" 2>&1
342
+ batch_empty_limit_status=$?
343
+ set -e
344
+ [ "$batch_empty_limit_status" -ne 0 ]
345
+ grep -Fq 'must be > 0' "$TMP/batch-empty-limit.log"
346
+
347
+ touch "$TMP/empty-predictions.jsonl"
348
+ set +e
349
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py" \
350
+ --instances-jsonl "$TMP/instances.jsonl" \
351
+ --predictions-jsonl "$TMP/empty-predictions.jsonl" \
352
+ --cases-root "$TMP/cases-batch-empty-predictions" \
353
+ --repos-root "$TMP/repos-batch-empty-predictions" \
354
+ --repo-dir "$REPO" > "$TMP/batch-empty-predictions.log" 2>&1
355
+ batch_empty_predictions_status=$?
356
+ set -e
357
+ [ "$batch_empty_predictions_status" -ne 0 ]
358
+ grep -q 'no prediction instances selected' "$TMP/batch-empty-predictions.log"
359
+
360
+ set +e
361
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py" \
362
+ --instances-jsonl "$TMP/instances.jsonl" \
363
+ --predictions-jsonl "$TMP/predictions.jsonl" \
364
+ --cases-root "$TMP/cases-batch-bad-timeout" \
365
+ --repos-root "$TMP/repos-batch-bad-timeout" \
366
+ --repo-dir "$REPO" \
367
+ --timeout-seconds 0 > "$TMP/batch-bad-timeout.log" 2>&1
368
+ batch_bad_timeout_status=$?
369
+ set -e
370
+ [ "$batch_bad_timeout_status" -ne 0 ]
371
+ grep -Fq 'must be > 0' "$TMP/batch-bad-timeout.log"
372
+
373
+ cat > "$TMP/predictions-nan.jsonl" <<'EOF'
374
+ {"instance_id": "local__repo-1", "model_name_or_path": "local-test", "model_patch": NaN}
375
+ EOF
376
+ set +e
377
+ python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py" \
378
+ --instances-jsonl "$TMP/instances.jsonl" \
379
+ --predictions-jsonl "$TMP/predictions-nan.jsonl" \
380
+ --cases-root "$TMP/cases-batch-nan-predictions" \
381
+ --repos-root "$TMP/repos-batch-nan-predictions" \
382
+ --repo-dir "$REPO" > "$TMP/batch-nan-predictions.log" 2>&1
383
+ batch_nan_predictions_status=$?
384
+ set -e
385
+ [ "$batch_nan_predictions_status" -ne 0 ]
386
+ grep -Fq 'invalid JSON numeric constant: NaN' "$TMP/batch-nan-predictions.log"
387
+
388
+ python3 - "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py" <<'PY'
389
+ import importlib.util
390
+ import pathlib
391
+ import sys
392
+
393
+ path = pathlib.Path(sys.argv[1])
394
+ sys.path.insert(0, str(path.parent))
395
+ spec = importlib.util.spec_from_file_location("prepare_swebench_frozen_corpus", path)
396
+ module = importlib.util.module_from_spec(spec)
397
+ assert spec.loader is not None
398
+ spec.loader.exec_module(module)
399
+
400
+ for stdout, expected in [
401
+ ("[]", "expected JSON object"),
402
+ ('{"instance_id": NaN}', "invalid JSON numeric constant: NaN"),
403
+ ('{"instance_id":"x","case_dir":"c","repo_dir":"r","run_command":[]}', "missing non-empty 'run_command'"),
404
+ ]:
405
+ try:
406
+ module.parse_prepared_case(stdout, "child")
407
+ except ValueError as exc:
408
+ assert expected in str(exc), str(exc)
409
+ else:
410
+ raise AssertionError(f"expected ValueError for {stdout}")
411
+ PY
412
+
102
413
  bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
103
414
  --fixture local__repo-1 \
104
415
  --fixtures-root "$TMP/cases" \
@@ -109,12 +420,83 @@ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
109
420
  --timeout-seconds 7 \
110
421
  --prepare-only > "$TMP/runner.log"
111
422
 
423
+ grep -Fq 'Command: ' "$TMP/runner.log"
424
+ grep -Fq '# Frozen VERIFY Pair Summary' "$TMP/runner.log"
112
425
  grep -q 'Timeout: 7s per arm' "$TMP/runner.log"
426
+ test -f "$ROOT/benchmark/auto-resolve/results/swebench-frozen-case-test/compare.md"
427
+ grep -Fq '| Arm | Verdict | Pair mode | Triggers | Findings | Elapsed | Invoke exit | Failure |' \
428
+ "$ROOT/benchmark/auto-resolve/results/swebench-frozen-case-test/compare.md"
429
+ grep -Fq '| pair |' "$ROOT/benchmark/auto-resolve/results/swebench-frozen-case-test/compare.md"
113
430
  grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/app.txt
114
431
  grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/app.txt
115
432
  test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/.devlyn/spec-verify.json
116
433
  test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/.devlyn/spec-verify.json
117
434
 
435
+ cp -R "$CASE_DIR" "$TMP/cases/local__repo-1-nan-metadata"
436
+ cat > "$TMP/cases/local__repo-1-nan-metadata/metadata.json" <<'EOF'
437
+ {"timeout_seconds": NaN}
438
+ EOF
439
+ set +e
440
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
441
+ --fixture local__repo-1-nan-metadata \
442
+ --fixtures-root "$TMP/cases" \
443
+ --base-repo "$BASE_REPO" \
444
+ --diff "$CASE_DIR/model.patch" \
445
+ --prepare-only > "$TMP/runner-nan-metadata.log" 2>&1
446
+ runner_nan_metadata_status=$?
447
+ set -e
448
+ [ "$runner_nan_metadata_status" -ne 0 ]
449
+ grep -Fq 'invalid JSON numeric constant: NaN' "$TMP/runner-nan-metadata.log"
450
+
451
+ cp -R "$CASE_DIR" "$TMP/cases/local__repo-1-nan-expected"
452
+ cat > "$TMP/cases/local__repo-1-nan-expected/expected.json" <<'EOF'
453
+ {"verification_commands": NaN}
454
+ EOF
455
+ set +e
456
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
457
+ --fixture local__repo-1-nan-expected \
458
+ --fixtures-root "$TMP/cases" \
459
+ --base-repo "$BASE_REPO" \
460
+ --diff "$CASE_DIR/model.patch" \
461
+ --prepare-only > "$TMP/runner-nan-expected.log" 2>&1
462
+ runner_nan_expected_status=$?
463
+ set -e
464
+ [ "$runner_nan_expected_status" -ne 0 ]
465
+ grep -Fq 'invalid JSON numeric constant: NaN' "$TMP/runner-nan-expected.log"
466
+
467
+ set +e
468
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
469
+ --fixture > "$TMP/runner-missing-value.log" 2>&1
470
+ runner_missing_value_status=$?
471
+ set -e
472
+ [ "$runner_missing_value_status" -ne 0 ]
473
+ grep -Fq -- '--fixture requires a value' "$TMP/runner-missing-value.log"
474
+
475
+ set +e
476
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
477
+ --fixture '../bad' \
478
+ --fixtures-root "$TMP/cases" \
479
+ --base-repo "$BASE_REPO" \
480
+ --diff "$CASE_DIR/model.patch" \
481
+ --prepare-only > "$TMP/runner-unsafe-fixture.log" 2>&1
482
+ runner_unsafe_fixture_status=$?
483
+ set -e
484
+ [ "$runner_unsafe_fixture_status" -ne 0 ]
485
+ grep -Fq -- '--fixture must match [A-Za-z0-9_.-]+' "$TMP/runner-unsafe-fixture.log"
486
+
487
+ set +e
488
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
489
+ --fixture local__repo-1 \
490
+ --fixtures-root "$TMP/cases" \
491
+ --base-repo "$BASE_REPO" \
492
+ --diff "$CASE_DIR/model.patch" \
493
+ --run-id '../bad-run' \
494
+ --prepare-only > "$TMP/runner-unsafe-run-id.log" 2>&1
495
+ runner_unsafe_run_id_status=$?
496
+ set -e
497
+ [ "$runner_unsafe_run_id_status" -ne 0 ]
498
+ grep -Fq -- '--run-id must match [A-Za-z0-9_.-]+' "$TMP/runner-unsafe-run-id.log"
499
+
118
500
  RESULTS_DIR="$ROOT/benchmark/auto-resolve/results"
119
501
  RESUME_RUN_ID="swebench-resume-arm-test-local__repo-1"
120
502
  mkdir -p "$RESULTS_DIR/$RESUME_RUN_ID/solo" "$TMP/fakebin"
@@ -146,6 +528,393 @@ grep -Fq '[frozen-verify] solo: reuse completed summary' "$TMP/resume-arm.log"
146
528
  grep -Fq 'fake claude invoked' "$RESULTS_DIR/$RESUME_RUN_ID/pair/transcript.txt"
147
529
  grep -q '"invoke_exit": 0' "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json"
148
530
 
531
+ BOOL_ELAPSED_RUN_ID="swebench-bool-elapsed-test-local__repo-1"
532
+ mkdir -p "$RESULTS_DIR/$BOOL_ELAPSED_RUN_ID/solo" "$RESULTS_DIR/$BOOL_ELAPSED_RUN_ID/pair"
533
+ cat > "$RESULTS_DIR/$BOOL_ELAPSED_RUN_ID/solo/summary.json" <<'EOF'
534
+ {
535
+ "elapsed_seconds": true,
536
+ "invoke_exit": 0,
537
+ "timed_out": false,
538
+ "verify_verdict": "PASS_WITH_ISSUES",
539
+ "terminal_verdict": "PASS"
540
+ }
541
+ EOF
542
+ cat > "$RESULTS_DIR/$BOOL_ELAPSED_RUN_ID/pair/summary.json" <<'EOF'
543
+ {
544
+ "elapsed_seconds": false,
545
+ "invoke_exit": 0,
546
+ "timed_out": false,
547
+ "verify_verdict": "NEEDS_WORK",
548
+ "terminal_verdict": "PASS",
549
+ "pair_mode": true,
550
+ "sub_verdicts": {"judge": "PASS_WITH_ISSUES", "pair_judge": "NEEDS_WORK"}
551
+ }
552
+ EOF
553
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
554
+ --fixture local__repo-1 \
555
+ --fixtures-root "$TMP/cases" \
556
+ --base-repo "$BASE_REPO" \
557
+ --diff "$CASE_DIR/model.patch" \
558
+ --run-id "$BOOL_ELAPSED_RUN_ID" \
559
+ --pair-mode gated \
560
+ --timeout-seconds 3 \
561
+ --resume-completed-arms > "$TMP/bool-elapsed-runner.log" 2>&1
562
+ grep -Fq '[frozen-verify] solo: reuse completed summary' "$TMP/bool-elapsed-runner.log"
563
+ grep -q '"pair_solo_wall_ratio": null' "$RESULTS_DIR/$BOOL_ELAPSED_RUN_ID/compare.json"
564
+ grep -Fq '| n/a |' "$RESULTS_DIR/$BOOL_ELAPSED_RUN_ID/compare.md"
565
+
566
+ MALFORMED_PAIR_JUDGE_RUN_ID="swebench-malformed-pair-judge-test-local__repo-1"
567
+ mkdir -p "$RESULTS_DIR/$MALFORMED_PAIR_JUDGE_RUN_ID/solo" "$RESULTS_DIR/$MALFORMED_PAIR_JUDGE_RUN_ID/pair"
568
+ cat > "$RESULTS_DIR/$MALFORMED_PAIR_JUDGE_RUN_ID/solo/summary.json" <<'EOF'
569
+ {
570
+ "elapsed_seconds": 100,
571
+ "invoke_exit": 0,
572
+ "timed_out": false,
573
+ "verify_verdict": "PASS_WITH_ISSUES",
574
+ "terminal_verdict": "PASS"
575
+ }
576
+ EOF
577
+ cat > "$RESULTS_DIR/$MALFORMED_PAIR_JUDGE_RUN_ID/pair/summary.json" <<'EOF'
578
+ {
579
+ "elapsed_seconds": 150,
580
+ "invoke_exit": 0,
581
+ "timed_out": false,
582
+ "verify_verdict": "NEEDS_WORK",
583
+ "terminal_verdict": "PASS",
584
+ "pair_mode": false,
585
+ "sub_verdicts": {"judge": "PASS_WITH_ISSUES", "pair_judge": ""}
586
+ }
587
+ EOF
588
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
589
+ --fixture local__repo-1 \
590
+ --fixtures-root "$TMP/cases" \
591
+ --base-repo "$BASE_REPO" \
592
+ --diff "$CASE_DIR/model.patch" \
593
+ --run-id "$MALFORMED_PAIR_JUDGE_RUN_ID" \
594
+ --pair-mode gated \
595
+ --timeout-seconds 3 \
596
+ --resume-completed-arms > "$TMP/malformed-pair-judge-runner.log" 2>&1
597
+ grep -q '"pair_mode": false' "$RESULTS_DIR/$MALFORMED_PAIR_JUDGE_RUN_ID/compare.json"
598
+ grep -q '"pair_verdict_lift": false' "$RESULTS_DIR/$MALFORMED_PAIR_JUDGE_RUN_ID/compare.json"
599
+ grep -q '"pair_internal_verdict_lift": false' "$RESULTS_DIR/$MALFORMED_PAIR_JUDGE_RUN_ID/compare.json"
600
+
601
+ MALFORMED_PAIR_TRIGGER_RUN_ID="swebench-malformed-pair-trigger-test-local__repo-1"
602
+ mkdir -p "$RESULTS_DIR/$MALFORMED_PAIR_TRIGGER_RUN_ID/solo" "$RESULTS_DIR/$MALFORMED_PAIR_TRIGGER_RUN_ID/pair"
603
+ cat > "$RESULTS_DIR/$MALFORMED_PAIR_TRIGGER_RUN_ID/solo/summary.json" <<'EOF'
604
+ {
605
+ "elapsed_seconds": 100,
606
+ "invoke_exit": 0,
607
+ "timed_out": false,
608
+ "verify_verdict": "PASS_WITH_ISSUES",
609
+ "terminal_verdict": "PASS",
610
+ "verify_findings_count": 1,
611
+ "severity_counts": {"LOW": 1, "MEDIUM": 0, "HIGH": 0, "CRITICAL": 0}
612
+ }
613
+ EOF
614
+ cat > "$RESULTS_DIR/$MALFORMED_PAIR_TRIGGER_RUN_ID/pair/summary.json" <<'EOF'
615
+ {
616
+ "elapsed_seconds": 150,
617
+ "invoke_exit": 0,
618
+ "timed_out": false,
619
+ "verify_verdict": "PASS_WITH_ISSUES",
620
+ "terminal_verdict": "PASS",
621
+ "pair_mode": false,
622
+ "pair_trigger": "eligible",
623
+ "verify_findings_count": "2",
624
+ "severity_counts": "bad"
625
+ }
626
+ EOF
627
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
628
+ --fixture local__repo-1 \
629
+ --fixtures-root "$TMP/cases" \
630
+ --base-repo "$BASE_REPO" \
631
+ --diff "$CASE_DIR/model.patch" \
632
+ --run-id "$MALFORMED_PAIR_TRIGGER_RUN_ID" \
633
+ --pair-mode gated \
634
+ --timeout-seconds 3 \
635
+ --resume-completed-arms > "$TMP/malformed-pair-trigger-runner.log" 2>&1
636
+ grep -q '"pair_trigger_missed": false' "$RESULTS_DIR/$MALFORMED_PAIR_TRIGGER_RUN_ID/compare.json"
637
+ grep -q '"pair_found_more_findings": false' "$RESULTS_DIR/$MALFORMED_PAIR_TRIGGER_RUN_ID/compare.json"
638
+ grep -q '"pair_found_more_low_or_worse": false' "$RESULTS_DIR/$MALFORMED_PAIR_TRIGGER_RUN_ID/compare.json"
639
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
640
+ --results-root "$RESULTS_DIR" \
641
+ --title "Malformed Trigger Matrix" \
642
+ --verdict FAIL \
643
+ --run-id "$MALFORMED_PAIR_TRIGGER_RUN_ID" \
644
+ --out-json "$TMP/malformed-trigger-matrix.json" \
645
+ --out-md "$TMP/malformed-trigger-matrix.md" > "$TMP/malformed-trigger-matrix.log"
646
+ grep -q '"pair_trigger_eligible": false' "$TMP/malformed-trigger-matrix.json"
647
+ grep -Fq '"pair_trigger_failures": [' "$TMP/malformed-trigger-matrix.json"
648
+ grep -q '"pair_trigger missing or malformed"' "$TMP/malformed-trigger-matrix.json"
649
+ grep -q '"classification": "failed attempt: pair trigger contract: pair_trigger missing or malformed"' "$TMP/malformed-trigger-matrix.json"
650
+ grep -Fq '| Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Pair trigger | Triggers | Wall ratio | External lift | Internal lift | Included | Classification |' "$TMP/malformed-trigger-matrix.md"
651
+ grep -Fq '| unknown | PASS_WITH_ISSUES | PASS_WITH_ISSUES | false | malformed |' "$TMP/malformed-trigger-matrix.md"
652
+
653
+ UNKNOWN_PAIR_TRIGGER_RUN_ID="swebench-unknown-pair-trigger-test-local__repo-1"
654
+ mkdir -p "$RESULTS_DIR/$UNKNOWN_PAIR_TRIGGER_RUN_ID/pair"
655
+ cat > "$RESULTS_DIR/$UNKNOWN_PAIR_TRIGGER_RUN_ID/pair/input.md" <<'EOF'
656
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
657
+ EOF
658
+ cat > "$RESULTS_DIR/$UNKNOWN_PAIR_TRIGGER_RUN_ID/compare.json" <<'EOF'
659
+ {
660
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
661
+ "pair": {
662
+ "invoke_exit": 0,
663
+ "timed_out": false,
664
+ "verify_verdict": "NEEDS_WORK",
665
+ "pair_mode": true,
666
+ "pair_trigger": {"eligible": true, "reasons": ["looks-hard"], "skipped_reason": null},
667
+ "elapsed_seconds": 200
668
+ },
669
+ "comparison": {
670
+ "pair_trigger_missed": false,
671
+ "pair_verdict_lift": true,
672
+ "solo_verdict": "PASS_WITH_ISSUES",
673
+ "pair_verdict": "NEEDS_WORK"
674
+ }
675
+ }
676
+ EOF
677
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
678
+ --results-root "$RESULTS_DIR" \
679
+ --title "Unknown Trigger Matrix" \
680
+ --verdict FAIL \
681
+ --run-id "$UNKNOWN_PAIR_TRIGGER_RUN_ID" \
682
+ --out-json "$TMP/unknown-trigger-matrix.json" \
683
+ --out-md "$TMP/unknown-trigger-matrix.md" > "$TMP/unknown-trigger-matrix.log"
684
+ grep -q '"pair_trigger_eligible": false' "$TMP/unknown-trigger-matrix.json"
685
+ grep -q '"pair_trigger reasons missing known trigger reason"' "$TMP/unknown-trigger-matrix.json"
686
+ grep -q '"classification": "failed attempt: pair trigger contract: pair_trigger reasons missing known trigger reason"' "$TMP/unknown-trigger-matrix.json"
687
+ grep -Fq '| local__repo-1 | PASS_WITH_ISSUES | NEEDS_WORK | true | malformed |' "$TMP/unknown-trigger-matrix.md"
688
+
689
+ NORMALIZED_PAIR_TRIGGER_RUN_ID="swebench-normalized-pair-trigger-test-local__repo-1"
690
+ mkdir -p "$RESULTS_DIR/$NORMALIZED_PAIR_TRIGGER_RUN_ID/pair"
691
+ cat > "$RESULTS_DIR/$NORMALIZED_PAIR_TRIGGER_RUN_ID/pair/input.md" <<'EOF'
692
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
693
+ EOF
694
+ cat > "$RESULTS_DIR/$NORMALIZED_PAIR_TRIGGER_RUN_ID/compare.json" <<'EOF'
695
+ {
696
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
697
+ "pair": {
698
+ "invoke_exit": 0,
699
+ "timed_out": false,
700
+ "verify_verdict": "NEEDS_WORK",
701
+ "pair_mode": true,
702
+ "pair_trigger": {"eligible": true, "reasons": ["risk high"], "skipped_reason": null},
703
+ "elapsed_seconds": 200
704
+ },
705
+ "comparison": {
706
+ "pair_trigger_missed": false,
707
+ "pair_verdict_lift": true,
708
+ "solo_verdict": "PASS_WITH_ISSUES",
709
+ "pair_verdict": "NEEDS_WORK"
710
+ }
711
+ }
712
+ EOF
713
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
714
+ --results-root "$RESULTS_DIR" \
715
+ --title "Normalized Trigger Matrix" \
716
+ --verdict FAIL \
717
+ --run-id "$NORMALIZED_PAIR_TRIGGER_RUN_ID" \
718
+ --out-json "$TMP/normalized-trigger-matrix.json" \
719
+ --out-md "$TMP/normalized-trigger-matrix.md" > "$TMP/normalized-trigger-matrix.log"
720
+ grep -q '"pair_trigger_eligible": false' "$TMP/normalized-trigger-matrix.json"
721
+ grep -q '"pair_trigger reasons missing known trigger reason"' "$TMP/normalized-trigger-matrix.json"
722
+ grep -q '"classification": "failed attempt: pair trigger contract: pair_trigger reasons missing known trigger reason"' "$TMP/normalized-trigger-matrix.json"
723
+ grep -Fq '| local__repo-1 | PASS_WITH_ISSUES | NEEDS_WORK | true | malformed |' "$TMP/normalized-trigger-matrix.md"
724
+
725
+ MIXED_UNKNOWN_PAIR_TRIGGER_RUN_ID="swebench-mixed-unknown-pair-trigger-test-local__repo-1"
726
+ mkdir -p "$RESULTS_DIR/$MIXED_UNKNOWN_PAIR_TRIGGER_RUN_ID/pair"
727
+ cat > "$RESULTS_DIR/$MIXED_UNKNOWN_PAIR_TRIGGER_RUN_ID/pair/input.md" <<'EOF'
728
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
729
+ EOF
730
+ cat > "$RESULTS_DIR/$MIXED_UNKNOWN_PAIR_TRIGGER_RUN_ID/compare.json" <<'EOF'
731
+ {
732
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
733
+ "pair": {
734
+ "invoke_exit": 0,
735
+ "timed_out": false,
736
+ "verify_verdict": "NEEDS_WORK",
737
+ "pair_mode": true,
738
+ "pair_trigger": {"eligible": true, "reasons": ["mode.verify-only", "looks-hard"], "skipped_reason": null},
739
+ "elapsed_seconds": 200
740
+ },
741
+ "comparison": {
742
+ "pair_trigger_missed": false,
743
+ "pair_verdict_lift": true,
744
+ "solo_verdict": "PASS_WITH_ISSUES",
745
+ "pair_verdict": "NEEDS_WORK"
746
+ }
747
+ }
748
+ EOF
749
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
750
+ --results-root "$RESULTS_DIR" \
751
+ --title "Mixed Unknown Trigger Matrix" \
752
+ --verdict FAIL \
753
+ --run-id "$MIXED_UNKNOWN_PAIR_TRIGGER_RUN_ID" \
754
+ --out-json "$TMP/mixed-unknown-trigger-matrix.json" \
755
+ --out-md "$TMP/mixed-unknown-trigger-matrix.md" > "$TMP/mixed-unknown-trigger-matrix.log"
756
+ grep -q '"pair_trigger_eligible": false' "$TMP/mixed-unknown-trigger-matrix.json"
757
+ grep -q '"pair_trigger reasons contain unknown trigger reason"' "$TMP/mixed-unknown-trigger-matrix.json"
758
+ grep -q '"classification": "failed attempt: pair trigger contract: pair_trigger reasons contain unknown trigger reason"' "$TMP/mixed-unknown-trigger-matrix.json"
759
+ grep -Fq '| local__repo-1 | PASS_WITH_ISSUES | NEEDS_WORK | true | malformed |' "$TMP/mixed-unknown-trigger-matrix.md"
760
+
761
+ HISTORICAL_ONLY_TRIGGER_RUN_ID="swebench-historical-only-pair-trigger-test-local__repo-1"
762
+ mkdir -p "$RESULTS_DIR/$HISTORICAL_ONLY_TRIGGER_RUN_ID/pair"
763
+ cat > "$RESULTS_DIR/$HISTORICAL_ONLY_TRIGGER_RUN_ID/pair/input.md" <<'EOF'
764
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
765
+ EOF
766
+ cat > "$RESULTS_DIR/$HISTORICAL_ONLY_TRIGGER_RUN_ID/compare.json" <<'EOF'
767
+ {
768
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
769
+ "pair": {
770
+ "invoke_exit": 0,
771
+ "timed_out": false,
772
+ "verify_verdict": "NEEDS_WORK",
773
+ "pair_mode": true,
774
+ "pair_trigger": {"eligible": true, "reasons": ["risk_profile.high_risk"], "skipped_reason": null},
775
+ "elapsed_seconds": 200
776
+ },
777
+ "comparison": {
778
+ "pair_trigger_missed": false,
779
+ "pair_verdict_lift": true,
780
+ "solo_verdict": "PASS_WITH_ISSUES",
781
+ "pair_verdict": "NEEDS_WORK"
782
+ }
783
+ }
784
+ EOF
785
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
786
+ --results-root "$RESULTS_DIR" \
787
+ --title "Historical Trigger Matrix" \
788
+ --verdict FAIL \
789
+ --run-id "$HISTORICAL_ONLY_TRIGGER_RUN_ID" \
790
+ --out-json "$TMP/historical-trigger-matrix.json" \
791
+ --out-md "$TMP/historical-trigger-matrix.md" > "$TMP/historical-trigger-matrix.log"
792
+ grep -q '"pair_trigger_eligible": false' "$TMP/historical-trigger-matrix.json"
793
+ grep -q '"pair_trigger reasons missing canonical trigger reason"' "$TMP/historical-trigger-matrix.json"
794
+ grep -q '"classification": "failed attempt: pair trigger contract: pair_trigger reasons missing canonical trigger reason"' "$TMP/historical-trigger-matrix.json"
795
+ grep -Fq '| local__repo-1 | PASS_WITH_ISSUES | NEEDS_WORK | true | malformed |' "$TMP/historical-trigger-matrix.md"
796
+
797
+ HYPOTHESIS_TRIGGER_RUN_ID="swebench-missing-hypothesis-trigger-test-local__repo-hypothesis"
798
+ mkdir -p "$TMP/cases/local__repo-hypothesis" "$RESULTS_DIR/$HYPOTHESIS_TRIGGER_RUN_ID/pair"
799
+ cat > "$TMP/cases/local__repo-hypothesis/spec.md" <<'EOF'
800
+ ## Verification
801
+
802
+ - Solo-headroom hypothesis: `solo_claude` is expected to miss the frozen review defect; observable miss command: `python -m pytest tests/test_review.py`.
803
+ EOF
804
+ cat > "$RESULTS_DIR/$HYPOTHESIS_TRIGGER_RUN_ID/pair/input.md" <<'EOF'
805
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-hypothesis.md.
806
+ EOF
807
+ cat > "$RESULTS_DIR/$HYPOTHESIS_TRIGGER_RUN_ID/compare.json" <<'EOF'
808
+ {
809
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
810
+ "pair": {
811
+ "invoke_exit": 0,
812
+ "timed_out": false,
813
+ "verify_verdict": "NEEDS_WORK",
814
+ "pair_mode": true,
815
+ "pair_trigger": {"eligible": true, "reasons": ["mode.verify-only"], "skipped_reason": null},
816
+ "elapsed_seconds": 200
817
+ },
818
+ "comparison": {
819
+ "pair_trigger_missed": false,
820
+ "pair_verdict_lift": true,
821
+ "solo_verdict": "PASS_WITH_ISSUES",
822
+ "pair_verdict": "NEEDS_WORK"
823
+ }
824
+ }
825
+ EOF
826
+ set +e
827
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
828
+ --results-root "$RESULTS_DIR" \
829
+ --title "Hypothesis Trigger Matrix" \
830
+ --verdict FAIL \
831
+ --run-id "$HYPOTHESIS_TRIGGER_RUN_ID" \
832
+ --require-hypothesis-trigger \
833
+ --out-json "$TMP/hypothesis-trigger-missing-root.json" \
834
+ --out-md "$TMP/hypothesis-trigger-missing-root.md" > "$TMP/hypothesis-trigger-missing-root.log" 2>&1
835
+ missing_root_status=$?
836
+ set -e
837
+ [ "$missing_root_status" -ne 0 ]
838
+ grep -Fq -- '--require-hypothesis-trigger requires --fixtures-root' "$TMP/hypothesis-trigger-missing-root.log"
839
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
840
+ --results-root "$RESULTS_DIR" \
841
+ --fixtures-root "$TMP/cases" \
842
+ --title "Hypothesis Trigger Matrix" \
843
+ --verdict FAIL \
844
+ --run-id "$HYPOTHESIS_TRIGGER_RUN_ID" \
845
+ --require-hypothesis-trigger \
846
+ --out-json "$TMP/hypothesis-trigger-matrix.json" \
847
+ --out-md "$TMP/hypothesis-trigger-matrix.md" > "$TMP/hypothesis-trigger-matrix.log"
848
+ grep -q '"pair_trigger missing spec.solo_headroom_hypothesis"' "$TMP/hypothesis-trigger-matrix.json"
849
+ grep -q '"classification": "failed attempt: pair trigger contract: pair_trigger missing spec.solo_headroom_hypothesis"' "$TMP/hypothesis-trigger-matrix.json"
850
+ grep -Fq '| local__repo-hypothesis | PASS_WITH_ISSUES | NEEDS_WORK | true | malformed |' "$TMP/hypothesis-trigger-matrix.md"
851
+
852
+ set +e
853
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
854
+ --manifest "$TMP/manifest.json" \
855
+ --min-runs > "$TMP/corpus-missing-value.log" 2>&1
856
+ missing_value_status=$?
857
+ set -e
858
+ [ "$missing_value_status" -ne 0 ]
859
+ grep -Fq -- '--min-runs requires a value' "$TMP/corpus-missing-value.log"
860
+
861
+ cat > "$TMP/manifest-non-object.json" <<'EOF'
862
+ []
863
+ EOF
864
+ set +e
865
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
866
+ --manifest "$TMP/manifest-non-object.json" \
867
+ --prepare-only > "$TMP/corpus-manifest-non-object.log" 2>&1
868
+ manifest_non_object_status=$?
869
+ set -e
870
+ [ "$manifest_non_object_status" -ne 0 ]
871
+ grep -Fq 'manifest malformed: expected JSON object' "$TMP/corpus-manifest-non-object.log"
872
+
873
+ cat > "$TMP/manifest-nan.json" <<'EOF'
874
+ {
875
+ "cases_root": NaN,
876
+ "prepared": []
877
+ }
878
+ EOF
879
+ set +e
880
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
881
+ --manifest "$TMP/manifest-nan.json" \
882
+ --prepare-only > "$TMP/corpus-manifest-nan.log" 2>&1
883
+ manifest_nan_status=$?
884
+ set -e
885
+ [ "$manifest_nan_status" -ne 0 ]
886
+ grep -Fq 'invalid JSON numeric constant: NaN' "$TMP/corpus-manifest-nan.log"
887
+
888
+ cat > "$TMP/manifest-empty-prepared.json" <<EOF
889
+ {
890
+ "cases_root": "$TMP/cases-batch",
891
+ "prepared": []
892
+ }
893
+ EOF
894
+ set +e
895
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
896
+ --manifest "$TMP/manifest-empty-prepared.json" \
897
+ --prepare-only > "$TMP/corpus-manifest-empty-prepared.log" 2>&1
898
+ manifest_empty_prepared_status=$?
899
+ set -e
900
+ [ "$manifest_empty_prepared_status" -ne 0 ]
901
+ grep -Fq 'manifest malformed: prepared must be a non-empty array' "$TMP/corpus-manifest-empty-prepared.log"
902
+
903
+ cat > "$TMP/manifest-bad-row.json" <<EOF
904
+ {
905
+ "cases_root": "$TMP/cases-batch",
906
+ "prepared": ["not an object"]
907
+ }
908
+ EOF
909
+ set +e
910
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
911
+ --manifest "$TMP/manifest-bad-row.json" \
912
+ --prepare-only > "$TMP/corpus-manifest-bad-row.log" 2>&1
913
+ manifest_bad_row_status=$?
914
+ set -e
915
+ [ "$manifest_bad_row_status" -ne 0 ]
916
+ grep -Fq 'manifest malformed: prepared[1] expected JSON object' "$TMP/corpus-manifest-bad-row.log"
917
+
149
918
  bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
150
919
  --manifest "$TMP/manifest.json" \
151
920
  --run-prefix swebench-frozen-corpus-test \
@@ -154,6 +923,7 @@ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
154
923
  --out-json "$TMP/gate.json" \
155
924
  --out-md "$TMP/gate.md" \
156
925
  --prepare-only > "$TMP/corpus-runner.log"
926
+ grep -Fq 'Command: ' "$TMP/corpus-runner.log"
157
927
  grep -q 'prepare-only complete; gate skipped' "$TMP/corpus-runner.log"
158
928
  grep -q 'Timeout: 7s per arm' "$TMP/corpus-runner.log"
159
929
  grep -q '^swebench-frozen-corpus-test-1-local__repo-1$' "$TMP/prepare-run-ids.txt"
@@ -238,6 +1008,136 @@ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
238
1008
  grep -q '"classification": "failed attempt: provider limit"' "$TMP/provider-limit-matrix.json"
239
1009
  grep -Fq 'failed attempt: provider limit' "$TMP/provider-limit-matrix.md"
240
1010
 
1011
+ DIRTY_MATRIX_RUN_ID="swebench-dirty-matrix-test-local__repo-1"
1012
+ mkdir -p "$RESULTS_DIR/$DIRTY_MATRIX_RUN_ID/solo" "$RESULTS_DIR/$DIRTY_MATRIX_RUN_ID/pair"
1013
+ cat > "$RESULTS_DIR/$DIRTY_MATRIX_RUN_ID/solo/input.md" <<'EOF'
1014
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
1015
+ EOF
1016
+ cat > "$RESULTS_DIR/$DIRTY_MATRIX_RUN_ID/compare.json" <<'EOF'
1017
+ {
1018
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
1019
+ "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": 200, "environment_contamination": true},
1020
+ "comparison": {
1021
+ "pair_trigger_missed": false,
1022
+ "pair_verdict_lift": true,
1023
+ "solo_verdict": "PASS_WITH_ISSUES",
1024
+ "pair_verdict": "NEEDS_WORK"
1025
+ }
1026
+ }
1027
+ EOF
1028
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
1029
+ --title "Local SWE-bench Dirty Matrix" \
1030
+ --verdict FAIL \
1031
+ --run-id "$DIRTY_MATRIX_RUN_ID" \
1032
+ --out-json "$TMP/dirty-matrix.json" \
1033
+ --out-md "$TMP/dirty-matrix.md" > "$TMP/dirty-matrix.log"
1034
+ grep -q '"classification": "failed attempt: environment contamination"' "$TMP/dirty-matrix.json"
1035
+ grep -Fq 'failed attempt: environment contamination' "$TMP/dirty-matrix.md"
1036
+
1037
+ MALFORMED_MATRIX_RUN_ID="swebench-malformed-matrix-test-local__repo-1"
1038
+ mkdir -p "$RESULTS_DIR/$MALFORMED_MATRIX_RUN_ID/solo" "$RESULTS_DIR/$MALFORMED_MATRIX_RUN_ID/pair"
1039
+ cat > "$RESULTS_DIR/$MALFORMED_MATRIX_RUN_ID/solo/input.md" <<'EOF'
1040
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
1041
+ EOF
1042
+ cat > "$RESULTS_DIR/$MALFORMED_MATRIX_RUN_ID/compare.json" <<'EOF'
1043
+ {
1044
+ "solo": ["not", "a", "dict"],
1045
+ "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": ["bad"], "pair_mode": true, "elapsed_seconds": true},
1046
+ "comparison": {"pair_verdict_lift": true, "solo_verdict": ["bad"], "pair_verdict": ["bad"]}
1047
+ }
1048
+ EOF
1049
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
1050
+ --title "Local SWE-bench Malformed Matrix" \
1051
+ --verdict FAIL \
1052
+ --run-id "$MALFORMED_MATRIX_RUN_ID" \
1053
+ --out-json "$TMP/malformed-matrix.json" \
1054
+ --out-md "$TMP/malformed-matrix.md" > "$TMP/malformed-matrix.log"
1055
+ grep -q '"classification": "failed attempt: malformed compare"' "$TMP/malformed-matrix.json"
1056
+ grep -q '"solo_verdict": null' "$TMP/malformed-matrix.json"
1057
+ grep -q '"pair_solo_wall_ratio": null' "$TMP/malformed-matrix.json"
1058
+ grep -Fq 'failed attempt: malformed compare' "$TMP/malformed-matrix.md"
1059
+
1060
+ NAN_MATRIX_RUN_ID="swebench-nan-matrix-test-local__repo-1"
1061
+ mkdir -p "$RESULTS_DIR/$NAN_MATRIX_RUN_ID/solo" "$RESULTS_DIR/$NAN_MATRIX_RUN_ID/pair"
1062
+ cat > "$RESULTS_DIR/$NAN_MATRIX_RUN_ID/solo/input.md" <<'EOF'
1063
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
1064
+ EOF
1065
+ cat > "$RESULTS_DIR/$NAN_MATRIX_RUN_ID/compare.json" <<'EOF'
1066
+ {
1067
+ "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
1068
+ "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": NaN},
1069
+ "comparison": {"pair_verdict_lift": true, "solo_verdict": "PASS_WITH_ISSUES", "pair_verdict": "NEEDS_WORK"}
1070
+ }
1071
+ EOF
1072
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
1073
+ --title "Local SWE-bench NaN Matrix" \
1074
+ --verdict FAIL \
1075
+ --run-id "$NAN_MATRIX_RUN_ID" \
1076
+ --out-json "$TMP/nan-matrix.json" \
1077
+ --out-md "$TMP/nan-matrix.md" > "$TMP/nan-matrix.log"
1078
+ grep -q '"classification": "failed attempt: malformed compare"' "$TMP/nan-matrix.json"
1079
+ grep -q '"malformed_compare": true' "$TMP/nan-matrix.json"
1080
+ grep -Fq 'failed attempt: malformed compare' "$TMP/nan-matrix.md"
1081
+
1082
+ STRING_BOOL_MATRIX_RUN_ID="swebench-string-bool-matrix-test-local__repo-1"
1083
+ mkdir -p "$RESULTS_DIR/$STRING_BOOL_MATRIX_RUN_ID/solo" "$RESULTS_DIR/$STRING_BOOL_MATRIX_RUN_ID/pair"
1084
+ cat > "$RESULTS_DIR/$STRING_BOOL_MATRIX_RUN_ID/solo/input.md" <<'EOF'
1085
+ Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
1086
+ EOF
1087
+ cat > "$RESULTS_DIR/$STRING_BOOL_MATRIX_RUN_ID/compare.json" <<'EOF'
1088
+ {
1089
+ "solo": {"invoke_exit": 0, "timed_out": "true", "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
1090
+ "pair": {"invoke_exit": 0, "timed_out": "false", "verify_verdict": "NEEDS_WORK", "pair_mode": "true", "elapsed_seconds": 200, "environment_contamination": "true"},
1091
+ "comparison": {
1092
+ "pair_trigger_missed": "false",
1093
+ "pair_verdict_lift": "true",
1094
+ "pair_internal_verdict_lift": "true",
1095
+ "solo_verdict": "PASS_WITH_ISSUES",
1096
+ "pair_verdict": "NEEDS_WORK"
1097
+ }
1098
+ }
1099
+ EOF
1100
+ python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
1101
+ --title "Local SWE-bench String Bool Matrix" \
1102
+ --verdict FAIL \
1103
+ --run-id "$STRING_BOOL_MATRIX_RUN_ID" \
1104
+ --out-json "$TMP/string-bool-matrix.json" \
1105
+ --out-md "$TMP/string-bool-matrix.md" > "$TMP/string-bool-matrix.log"
1106
+ grep -q '"pair_mode": false' "$TMP/string-bool-matrix.json"
1107
+ grep -q '"external_lift": false' "$TMP/string-bool-matrix.json"
1108
+ grep -q '"internal_lift": false' "$TMP/string-bool-matrix.json"
1109
+ grep -q '"pair_environment_contamination": false' "$TMP/string-bool-matrix.json"
1110
+
1111
+ touch "$TMP/empty-run-ids.txt"
1112
+ set +e
1113
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
1114
+ --manifest "$TMP/manifest.json" \
1115
+ --gate-only-run-ids "$TMP/empty-run-ids.txt" > "$TMP/gate-empty-run-ids.log" 2>&1
1116
+ empty_run_ids_status=$?
1117
+ set -e
1118
+ [ "$empty_run_ids_status" -ne 0 ]
1119
+ grep -Fq 'run ids malformed: no run ids' "$TMP/gate-empty-run-ids.log"
1120
+
1121
+ printf 'valid-run\n\n' > "$TMP/blank-run-ids.txt"
1122
+ set +e
1123
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
1124
+ --manifest "$TMP/manifest.json" \
1125
+ --gate-only-run-ids "$TMP/blank-run-ids.txt" > "$TMP/gate-blank-run-ids.log" 2>&1
1126
+ blank_run_ids_status=$?
1127
+ set -e
1128
+ [ "$blank_run_ids_status" -ne 0 ]
1129
+ grep -Fq 'run ids malformed: line 2 is empty' "$TMP/gate-blank-run-ids.log"
1130
+
1131
+ printf 'bad/run\n' > "$TMP/unsafe-run-ids.txt"
1132
+ set +e
1133
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
1134
+ --manifest "$TMP/manifest.json" \
1135
+ --gate-only-run-ids "$TMP/unsafe-run-ids.txt" > "$TMP/gate-unsafe-run-ids.log" 2>&1
1136
+ unsafe_run_ids_status=$?
1137
+ set -e
1138
+ [ "$unsafe_run_ids_status" -ne 0 ]
1139
+ grep -Fq 'run ids malformed: line 1 has unsafe run id' "$TMP/gate-unsafe-run-ids.log"
1140
+
241
1141
  RUN_ID="swebench-gate-only-test-local__repo-1"
242
1142
  mkdir -p "$RESULTS_DIR/$RUN_ID/pair"
243
1143
  cat > "$RESULTS_DIR/$RUN_ID/pair/input.md" <<'EOF'
@@ -246,7 +1146,14 @@ EOF
246
1146
  cat > "$RESULTS_DIR/$RUN_ID/compare.json" <<'EOF'
247
1147
  {
248
1148
  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
249
- "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": 200},
1149
+ "pair": {
1150
+ "invoke_exit": 0,
1151
+ "timed_out": false,
1152
+ "verify_verdict": "NEEDS_WORK",
1153
+ "pair_mode": true,
1154
+ "pair_trigger": {"eligible": true, "reasons": ["mode.verify-only"], "skipped_reason": null},
1155
+ "elapsed_seconds": 200
1156
+ },
250
1157
  "comparison": {
251
1158
  "pair_trigger_missed": false,
252
1159
  "pair_verdict_lift": true,
@@ -284,9 +1191,14 @@ grep -q '"gate_rows": 1' "$TMP/matrix.json"
284
1191
  grep -q '"gate_rate": 1.0' "$TMP/matrix.json"
285
1192
  grep -q '"trailing_non_gate_rows": 0' "$TMP/matrix.json"
286
1193
  grep -q '"yield_verdict": "PASS"' "$TMP/matrix.json"
1194
+ grep -q '"pair_trigger_reasons": \[' "$TMP/matrix.json"
1195
+ grep -q '"mode.verify-only"' "$TMP/matrix.json"
1196
+ grep -q '"pair_trigger_has_canonical_reason": true' "$TMP/matrix.json"
287
1197
  grep -Fq 'Local SWE-bench Matrix' "$TMP/matrix.md"
288
1198
  grep -Fq 'Gate rate: 1.000' "$TMP/matrix.md"
289
1199
  grep -Fq 'Yield verdict: **PASS**' "$TMP/matrix.md"
1200
+ grep -Fq '| Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Pair trigger | Triggers | Wall ratio | External lift | Internal lift | Included | Classification |' "$TMP/matrix.md"
1201
+ grep -Fq '| local__repo-1 | PASS_WITH_ISSUES | NEEDS_WORK | true | eligible | mode.verify-only | 2.00x | true | false | true | gate: external lift |' "$TMP/matrix.md"
290
1202
 
291
1203
  rm -rf /tmp/bench-swebench-frozen-case-test-local__repo-1-solo
292
1204
  rm -rf /tmp/bench-swebench-frozen-case-test-local__repo-1-pair
@@ -296,7 +1208,14 @@ rm -rf "$ROOT/benchmark/auto-resolve/results/swebench-frozen-case-test"
296
1208
  rm -rf "$ROOT/benchmark/auto-resolve/results/swebench-frozen-corpus-test-1-local__repo-1"
297
1209
  rm -rf "$ROOT/benchmark/auto-resolve/results/swebench-frozen-corpus-fail-test-1-local__repo-1"
298
1210
  rm -rf "$RESULTS_DIR/$RESUME_RUN_ID"
1211
+ rm -rf "$RESULTS_DIR/$BOOL_ELAPSED_RUN_ID"
1212
+ rm -rf "$RESULTS_DIR/$MALFORMED_PAIR_JUDGE_RUN_ID"
1213
+ rm -rf "$RESULTS_DIR/$MALFORMED_PAIR_TRIGGER_RUN_ID"
299
1214
  rm -rf "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID"
1215
+ rm -rf "$RESULTS_DIR/$DIRTY_MATRIX_RUN_ID"
1216
+ rm -rf "$RESULTS_DIR/$MALFORMED_MATRIX_RUN_ID"
1217
+ rm -rf "$RESULTS_DIR/$NAN_MATRIX_RUN_ID"
1218
+ rm -rf "$RESULTS_DIR/$STRING_BOOL_MATRIX_RUN_ID"
300
1219
  rm -rf "$RESULTS_DIR/$RUN_ID"
301
1220
 
302
1221
  echo "PASS test-swebench-frozen-case"