devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -0,0 +1,401 @@
1
+ #!/usr/bin/env bash
2
+ # Regression tests for run-headroom-candidate.sh argument and output guards. Combines static greps of the runner source with live invocations that are expected to fail or to dry-run.
3
+
4
+ set -euo pipefail  # any failing check below (for example a grep -q miss) aborts the test run
5
+
6
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"  # absolute directory containing this test script
7
+ RUNNER="$SCRIPT_DIR/run-headroom-candidate.sh"  # script under test
8
+ REJECTED="$SCRIPT_DIR/pair-rejected-fixtures.sh"  # sibling script that is only grepped by this test
9
+ TMP_DIR="$(mktemp -d /tmp/run-headroom-candidate-test.XXXXXX)"  # scratch dir for captured command output
10
+ BENCH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"  # parent directory of the scripts directory
11
+ TEST_RUN="headroom-cli-replay-$(basename "$TMP_DIR")"  # run-id prefix; reuses the random mktemp suffix
12
+ TEST_SHADOW="$BENCH_ROOT/shadow-fixtures/S98-runner-hypothesis"  # synthetic shadow fixture this test creates later
13
+ trap 'rm -rf "$TMP_DIR" "$BENCH_ROOT/results/$TEST_RUN"* "$TEST_SHADOW"' EXIT  # cleanup on exit: scratch dir, results paths starting with the run-id prefix, and the synthetic fixture
14
+
15
+ expect_fail_contains() {  # usage: expect_fail_contains LABEL NEEDLE CMD [ARGS...]; passes only if CMD exits non-zero and its output contains NEEDLE
16
+ local label="$1"  # names the capture file and appears in failure messages
17
+ local needle="$2"  # literal text the command output must contain
18
+ shift 2  # the remaining arguments are the command to run
19
+ local out="$TMP_DIR/$label.out"
20
+ if "$@" > "$out" 2>&1; then  # stdout and stderr are both captured; success here is the failure case
21
+ echo "expected failure for $label" >&2
22
+ cat "$out" >&2
23
+ exit 1
24
+ fi
25
+ if ! grep -Fq -- "$needle" "$out"; then  # -F: fixed string, not a regex; -- lets the needle start with a dash
26
+ echo "missing expected text for $label: $needle" >&2
27
+ cat "$out" >&2  # dump the captured output to aid debugging
28
+ exit 1  # called at top level, so this ends the whole test script
29
+ fi
30
+ }
31
+
32
+ bash "$RUNNER" --help > "$TMP_DIR/help.out" 2>&1  # under set -e this also requires --help to exit 0
33
+ grep -Fq 'usage:' "$TMP_DIR/help.out"
34
+ grep -Fq -- '--bare-max N' "$TMP_DIR/help.out"  # -- stops grep from parsing the pattern as an option
35
+ grep -Fq -- '--solo-max N' "$TMP_DIR/help.out"
36
+ grep -Fq -- '--min-bare-headroom N' "$TMP_DIR/help.out"
37
+ grep -Fq -- '--min-solo-headroom N' "$TMP_DIR/help.out"
38
+ grep -Fq -- '--min-fixtures N' "$TMP_DIR/help.out"
39
+ grep -Fq -- '--allow-rejected-fixtures' "$TMP_DIR/help.out"
40
+ grep -Fq -- '--dry-run' "$TMP_DIR/help.out"
41
+ grep -Fq 'print_command' "$RUNNER"  # static checks from here on: these grep the runner source text rather than run it
42
+ grep -Fq 'Command: ' "$RUNNER"
43
+ grep -Fq 'DEVLYN_BENCHMARK_CLI_SUBCOMMAND' "$RUNNER"
44
+ grep -Fq 'cmd=(npx devlyn-cli benchmark headroom --run-id "$RUN_ID")' "$RUNNER"  # single quotes keep the variable reference literal in the pattern
45
+ grep -Fq 'cmd=(bash "$0" --run-id "$RUN_ID")' "$RUNNER"
46
+ grep -Fq 'cmd+=(--bare-max "$BARE_MAX")' "$RUNNER"
47
+ grep -Fq 'cmd+=(--solo-max "$SOLO_MAX")' "$RUNNER"
48
+ grep -Fq 'cmd+=(--min-bare-headroom "$MIN_BARE_HEADROOM")' "$RUNNER"
49
+ grep -Fq 'cmd+=(--min-solo-headroom "$MIN_SOLO_HEADROOM")' "$RUNNER"
50
+ grep -Fq 'cmd+=(--min-fixtures "$MIN_FIXTURES")' "$RUNNER"
51
+ grep -Fq 'cmd+=(--allow-rejected-fixtures)' "$RUNNER"
52
+ grep -Fq 'cmd+=(--dry-run)' "$RUNNER"
53
+ grep -Fq 'baseline evidence-complete' "$RUNNER"
54
+ grep -Fq 'headroom gate passed — candidate set accepted' "$RUNNER"
55
+ grep -Fq 'headroom gate failed — candidate set rejected' "$RUNNER"
56
+ grep -Fq -- '--bare-max "$BARE_MAX"' "$RUNNER"  # -- is required here because the pattern begins with a dash
57
+ grep -Fq -- '--solo-max "$SOLO_MAX"' "$RUNNER"
58
+ grep -Fq -- '--min-bare-headroom "$MIN_BARE_HEADROOM"' "$RUNNER"
59
+ grep -Fq -- '--min-solo-headroom "$MIN_SOLO_HEADROOM"' "$RUNNER"
60
+ grep -Fq -- '--min-fixtures "$MIN_FIXTURES"' "$RUNNER"
61
+ grep -Fq 'cat "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"' "$RUNNER"
62
+ grep -Fq 'headroom gate report missing' "$RUNNER"
63
+ grep -Fq 'validate_fixtures' "$RUNNER"
64
+ grep -Fq 'fixture_has_solo_ceiling_avoidance_note' "$RUNNER"
65
+ grep -Fq 'shadow fixture NOTES.md needs ## Solo ceiling avoidance' "$RUNNER"
66
+ grep -Fq 'fixture not found in fixtures/ or shadow-fixtures/' "$RUNNER"
67
+ grep -Fq '[FS][0-9]*) FIXTURES+=("$1")' "$RUNNER"  # -F makes the brackets and star literal, so this matches the case-arm text itself
68
+ grep -Fq 'retired_fixture_exists' "$RUNNER"
69
+ grep -Fq 'fixture is retired and is not rerun by pair-candidate runners' "$RUNNER"
70
+ grep -Fq 'fixture_smoke_only' "$RUNNER"
71
+ grep -Fq 'fixture is smoke-only and cannot run providers' "$RUNNER"
72
+ grep -Fq 'rejected_pair_fixture_reason' "$RUNNER"
73
+ grep -Fq 'source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"' "$RUNNER"
74
+ grep -Fq 'declare -F rejected_pair_fixture_reason' "$RUNNER"
75
+ grep -Fq '20260511-f3-http-error-headroom' "$REJECTED"  # the rejected-fixtures script must contain these dated identifiers (presumably evidence run ids; confirm)
76
+ grep -Fq '20260507-f10-f11-tier1-full-pipeline' "$REJECTED"
77
+ grep -Fq '20260511-f12-webhook-headroom' "$REJECTED"
78
+ grep -Fq '20260511-f15-concurrency-headroom' "$REJECTED"
79
+ grep -Fq '20260511-f28-policy-oraclefix-reverified-pair' "$REJECTED"
80
+ grep -Fq '20260511-f30-headroom-v1' "$REJECTED"
81
+ grep -Fq '20260513-s2-inventory-headroom' "$REJECTED"
82
+ grep -Fq '20260513-s3-ticket-headroom' "$REJECTED"
83
+ grep -Fq '20260513-s4-return-headroom' "$REJECTED"
84
+ grep -Fq '20260513-s5-credit-headroom' "$REJECTED"
85
+ grep -Fq 'Use --allow-rejected-fixtures for diagnostics only' "$RUNNER"
86
+
87
+ expect_fail_contains missing-fixture 'usage:' \
88
+ bash "$RUNNER" --run-id headroom-arg-test  # no fixture argument is given
89
+
90
+ expect_fail_contains unknown-arg 'unknown arg: --bad-flag' \
91
+ bash "$RUNNER" --bad-flag F21-cli-scheduler-priority
92
+
93
+ expect_fail_contains missing-bare-max-value '--bare-max requires a value' \
94
+ bash "$RUNNER" --bare-max  # flag passed as the last argument, without its value
95
+
96
+ expect_fail_contains invalid-bare-max '--bare-max must be an integer: nope' \
97
+ bash "$RUNNER" --bare-max nope F21-cli-scheduler-priority
98
+
99
+ expect_fail_contains invalid-min-fixtures '--min-fixtures must be >= 1' \
100
+ bash "$RUNNER" --min-fixtures 0 F21-cli-scheduler-priority
101
+
102
+ expect_fail_contains invalid-min-bare-headroom '--min-bare-headroom must be an integer: nope' \
103
+ bash "$RUNNER" --min-bare-headroom nope F21-cli-scheduler-priority
104
+
105
+ expect_fail_contains negative-min-bare-headroom '--min-bare-headroom must be an integer: -1' \
106
+ bash "$RUNNER" --min-bare-headroom -1 F21-cli-scheduler-priority  # a negative value is expected to get the same must-be-an-integer message
107
+
108
+ expect_fail_contains negative-min-solo-headroom '--min-solo-headroom must be an integer: -1' \
109
+ bash "$RUNNER" --min-solo-headroom -1 F21-cli-scheduler-priority
110
+
111
+ expect_fail_contains missing-fixture-fast \
112
+ 'fixture not found in fixtures/ or shadow-fixtures/: F999-not-a-fixture' \
113
+ bash "$RUNNER" --run-id "$TEST_RUN-missing" F999-not-a-fixture  # fixture id that does not exist
114
+
115
+ expect_fail_contains rejected-f1-fixture \
116
+ 'fixture rejected for pair-candidate runs: F1-cli-trivial-flag' \
117
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f1" --dry-run --min-fixtures 1 F1-cli-trivial-flag  # the rejection is expected even in --dry-run mode; same pattern for every rejected fixture below
118
+
119
+ expect_fail_contains rejected-f2-fixture \
120
+ 'fixture rejected for pair-candidate runs: F2-cli-medium-subcommand' \
121
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f2" --dry-run --min-fixtures 1 F2-cli-medium-subcommand
122
+
123
+ expect_fail_contains rejected-fixture \
124
+ 'fixture rejected for pair-candidate runs: F26-cli-payout-ledger-rules' \
125
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected" --dry-run --min-fixtures 1 F26-cli-payout-ledger-rules
126
+
127
+ expect_fail_contains rejected-f3-fixture \
128
+ 'fixture rejected for pair-candidate runs: F3-backend-contract-risk' \
129
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f3" --dry-run --min-fixtures 1 F3-backend-contract-risk
130
+
131
+ expect_fail_contains rejected-f4-fixture \
132
+ 'fixture rejected for pair-candidate runs: F4-web-browser-design' \
133
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f4" --dry-run --min-fixtures 1 F4-web-browser-design
134
+
135
+ expect_fail_contains rejected-f5-fixture \
136
+ 'fixture rejected for pair-candidate runs: F5-fix-loop-red-green' \
137
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f5" --dry-run --min-fixtures 1 F5-fix-loop-red-green
138
+
139
+ expect_fail_contains rejected-f6-fixture \
140
+ 'fixture rejected for pair-candidate runs: F6-dep-audit-native-module' \
141
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f6" --dry-run --min-fixtures 1 F6-dep-audit-native-module
142
+
143
+ expect_fail_contains rejected-f7-fixture \
144
+ 'fixture rejected for pair-candidate runs: F7-out-of-scope-trap' \
145
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f7" --dry-run --min-fixtures 1 F7-out-of-scope-trap
146
+
147
+ expect_fail_contains rejected-f8-fixture \
148
+ 'fixture rejected for pair-candidate runs: F8-known-limit-ambiguous' \
149
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f8" --dry-run --min-fixtures 1 F8-known-limit-ambiguous
150
+
151
+ expect_fail_contains rejected-f9-fixture \
152
+ 'fixture rejected for pair-candidate runs: F9-e2e-ideate-to-resolve' \
153
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f9" --dry-run --min-fixtures 1 F9-e2e-ideate-to-resolve
154
+
155
+ expect_fail_contains rejected-f10-fixture \
156
+ 'fixture rejected for pair-candidate runs: F10-persist-write-collision' \
157
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f10" --dry-run --min-fixtures 1 F10-persist-write-collision
158
+
159
+ expect_fail_contains rejected-f11-fixture \
160
+ 'fixture rejected for pair-candidate runs: F11-batch-import-all-or-nothing' \
161
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f11" --dry-run --min-fixtures 1 F11-batch-import-all-or-nothing
162
+
163
+ expect_fail_contains rejected-f12-fixture \
164
+ 'fixture rejected for pair-candidate runs: F12-webhook-raw-body-signature' \
165
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f12" --dry-run --min-fixtures 1 F12-webhook-raw-body-signature
166
+
167
+ expect_fail_contains rejected-f15-fixture \
168
+ 'fixture rejected for pair-candidate runs: F15-frozen-diff-race-review' \
169
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f15" --dry-run --min-fixtures 1 F15-frozen-diff-race-review
170
+
171
+ expect_fail_contains rejected-f31-fixture \
172
+ 'fixture rejected for pair-candidate runs: F31-cli-seat-rebalance' \
173
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f31" --dry-run --min-fixtures 1 F31-cli-seat-rebalance
174
+
175
+ expect_fail_contains rejected-f32-fixture \
176
+ 'fixture rejected for pair-candidate runs: F32-cli-subscription-renewal' \
177
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-f32" --dry-run --min-fixtures 1 F32-cli-subscription-renewal
178
+
179
+ expect_fail_contains rejected-s2-shadow-fixture \
180
+ 'fixture rejected for pair-candidate runs: S2-cli-inventory-reservation' \
181
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-s2" --dry-run --min-fixtures 1 S2-cli-inventory-reservation
182
+
183
+ expect_fail_contains rejected-s3-shadow-fixture \
184
+ 'fixture rejected for pair-candidate runs: S3-cli-ticket-assignment' \
185
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-s3" --dry-run --min-fixtures 1 S3-cli-ticket-assignment
186
+
187
+ expect_fail_contains rejected-s4-shadow-fixture \
188
+ 'fixture rejected for pair-candidate runs: S4-cli-return-routing' \
189
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-s4" --dry-run --min-fixtures 1 S4-cli-return-routing
190
+
191
+ expect_fail_contains rejected-s5-shadow-fixture \
192
+ 'fixture rejected for pair-candidate runs: S5-cli-credit-grant-ledger' \
193
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-s5" --dry-run --min-fixtures 1 S5-cli-credit-grant-ledger
194
+
195
+ expect_fail_contains rejected-s6-shadow-fixture \
196
+ 'fixture rejected for pair-candidate runs: S6-cli-refund-window-ledger' \
197
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-s6" --dry-run --min-fixtures 1 S6-cli-refund-window-ledger
198
+
199
+ expect_fail_contains retired-fixture \
200
+ 'fixture is retired and is not rerun by pair-candidate runners: F28-cli-return-authorization' \
201
+ bash "$RUNNER" --run-id "$TEST_RUN-retired" --dry-run --min-fixtures 1 F28-cli-return-authorization
202
+
203
+ expect_fail_contains smoke-only-s1-provider-run \
204
+ 'fixture is smoke-only and cannot run providers: S1-cli-lang-flag' \
205
+ bash "$RUNNER" --run-id "$TEST_RUN-smoke-only" --min-fixtures 1 S1-cli-lang-flag  # no --dry-run here, unlike the cases above; the expected message refuses a provider run
206
+
207
+ expect_fail_contains cli-replay-command \
208
+ "Command: npx devlyn-cli benchmark headroom --run-id $TEST_RUN" \
209
+ env DEVLYN_BENCHMARK_CLI_SUBCOMMAND=headroom \
210
+ bash "$RUNNER" --run-id "$TEST_RUN" --min-fixtures 2 F999-not-a-fixture  # with the env var set, the printed replay command is expected in its npx form even though the run fails
211
+
212
+ expect_fail_contains dry-run-min-fixtures \
213
+ '[headroom] DRY RUN failed' \
214
+ bash "$RUNNER" --run-id "$TEST_RUN-dry-run-fail" --dry-run F21-cli-scheduler-priority  # one fixture and no --min-fixtures override; presumably the default minimum is above one (confirm in the runner)
215
+
216
+ bash "$RUNNER" --run-id "$TEST_RUN-dry-run" --dry-run --min-fixtures 1 F21-cli-scheduler-priority \
217
+ > "$TMP_DIR/dry-run.out" 2>&1  # under set -e this dry run must exit 0; its output is inspected below
218
+ grep -Fq 'Mode: DRY RUN (no model/provider invocations)' "$TMP_DIR/dry-run.out"
219
+ grep -Fq 'Command: ' "$TMP_DIR/dry-run.out"
220
+ grep -Fq -- '--dry-run' "$TMP_DIR/dry-run.out"
221
+ grep -Fq -- '--min-bare-headroom 5' "$TMP_DIR/dry-run.out"  # 5 is not passed on the command line above, so presumably the runner default (confirm)
222
+ grep -Fq -- '--min-solo-headroom 5' "$TMP_DIR/dry-run.out"
223
+ grep -Fq -- '--min-fixtures 1' "$TMP_DIR/dry-run.out"
224
+ grep -Fq '[headroom] DRY RUN complete' "$TMP_DIR/dry-run.out"
225
+
226
+ bash "$RUNNER" --run-id "$TEST_RUN-shadow-dry-run" --dry-run --min-fixtures 1 S1-cli-lang-flag \
227
+ > "$TMP_DIR/shadow-dry-run.out" 2>&1  # the same shadow fixture is refused without --dry-run earlier in this script
228
+ grep -Fq 'Fixtures: S1-cli-lang-flag' "$TMP_DIR/shadow-dry-run.out"
229
+ grep -Fq '[headroom] DRY RUN complete' "$TMP_DIR/shadow-dry-run.out"
230
+
231
+ mkdir -p "$TEST_SHADOW"
232
+ cat > "$TEST_SHADOW/metadata.json" <<'EOF'
233
+ {
234
+ "id": "S98-runner-hypothesis",
235
+ "category": "high-risk"
236
+ }
237
+ EOF
238
+ cat > "$TEST_SHADOW/spec.md" <<'EOF'
239
+ # Runner hypothesis fixture
240
+
241
+ Add idempotency handling for duplicate requests.
242
+ EOF
243
+ cat > "$TEST_SHADOW/expected.json" <<'EOF'
244
+ {
245
+ "verification_commands": [
246
+ {
247
+ "cmd": "node -e \"process.exit(0)\"",
248
+ "exit_code": 0
249
+ }
250
+ ]
251
+ }
252
+ EOF
253
+ cat > "$TEST_SHADOW/NOTES.md" <<'EOF'
254
+ # Notes
255
+
256
+ Synthetic runner guard fixture.
257
+ EOF
258
+ expect_fail_contains missing-solo-headroom-hypothesis \
259
+ 'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S98-runner-hypothesis' \
260
+ bash "$RUNNER" --run-id "$TEST_RUN-missing-hypothesis" --dry-run --min-fixtures 1 S98-runner-hypothesis
261
+ cat >> "$TEST_SHADOW/spec.md" <<'EOF'
262
+
263
+ ## Solo-headroom hypothesis
264
+
265
+ A capable solo_claude baseline is expected to miss duplicate idempotency ordering.
266
+ EOF
267
+ expect_fail_contains weak-solo-headroom-hypothesis \
268
+ 'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S98-runner-hypothesis' \
269
+ bash "$RUNNER" --run-id "$TEST_RUN-weak-hypothesis" --dry-run --min-fixtures 1 S98-runner-hypothesis
270
+ cat >> "$TEST_SHADOW/spec.md" <<'EOF'
271
+
272
+ Implementation marker: `duplicate-idempotency`.
273
+ EOF
274
+ expect_fail_contains unrelated-backtick-solo-headroom-hypothesis \
275
+ 'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S98-runner-hypothesis' \
276
+ bash "$RUNNER" --run-id "$TEST_RUN-unrelated-backtick-hypothesis" --dry-run --min-fixtures 1 S98-runner-hypothesis
277
+ cat >> "$TEST_SHADOW/spec.md" <<'EOF'
278
+
279
+ Observable command: `node -e "process.exit(0)"` exposes behavior.
280
+ EOF
281
+ expect_fail_contains observable-without-miss-solo-headroom-hypothesis \
282
+ 'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S98-runner-hypothesis' \
283
+ bash "$RUNNER" --run-id "$TEST_RUN-observable-without-miss-hypothesis" --dry-run --min-fixtures 1 S98-runner-hypothesis
284
+ cat >> "$TEST_SHADOW/spec.md" <<'EOF'
285
+
286
+ Observable command: `node -e "process.exit(0)"` exposes the miss.
287
+ EOF
288
+ expect_fail_contains missing-solo-ceiling-avoidance \
289
+ 'shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: S98-runner-hypothesis' \
290
+ bash "$RUNNER" --run-id "$TEST_RUN-missing-ceiling" --dry-run --min-fixtures 1 S98-runner-hypothesis
291
+ cat >> "$TEST_SHADOW/NOTES.md" <<'EOF'
292
+
293
+ ## Solo ceiling avoidance
294
+
295
+ This candidate mentions solo_claude but gives no control comparison.
296
+ EOF
297
+ expect_fail_contains weak-solo-ceiling-avoidance \
298
+ 'shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: S98-runner-hypothesis' \
299
+ bash "$RUNNER" --run-id "$TEST_RUN-weak-ceiling" --dry-run --min-fixtures 1 S98-runner-hypothesis
300
+ cat >> "$TEST_SHADOW/NOTES.md" <<'EOF'
301
+
302
+ Unlike solo-saturated S2-S6 controls, this fixture should preserve
303
+ solo_claude headroom because it targets a multi-run state dependency.
304
+ EOF
305
+ bash "$RUNNER" --run-id "$TEST_RUN-hypothesis" --dry-run --min-fixtures 1 S98-runner-hypothesis \
306
+ > "$TMP_DIR/hypothesis.out" 2>&1
307
+ grep -Fq '[headroom] DRY RUN complete' "$TMP_DIR/hypothesis.out"
308
+
309
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-override" --dry-run --min-fixtures 1 \
310
+ --allow-rejected-fixtures F26-cli-payout-ledger-rules \
311
+ > "$TMP_DIR/rejected-override.out" 2>&1
312
+ grep -Fq -- '--allow-rejected-fixtures' "$TMP_DIR/rejected-override.out"
313
+ grep -Fq '[headroom] DRY RUN complete' "$TMP_DIR/rejected-override.out"
314
+
315
+ bash "$RUNNER" --run-id "$TEST_RUN-shadow-rejected-override" --dry-run --min-fixtures 1 \
316
+ --allow-rejected-fixtures S3-cli-ticket-assignment \
317
+ > "$TMP_DIR/shadow-rejected-override.out" 2>&1
318
+ grep -Fq -- '--allow-rejected-fixtures' "$TMP_DIR/shadow-rejected-override.out"
319
+ grep -Fq '[headroom] DRY RUN complete' "$TMP_DIR/shadow-rejected-override.out"
320
+
321
+ STUB_REPO="$TMP_DIR/stub-repo"
322
+ STUB_BENCH="$STUB_REPO/benchmark/auto-resolve"
323
+ mkdir -p \
324
+ "$STUB_BENCH/scripts" \
325
+ "$STUB_BENCH/fixtures/F21-cli-scheduler-priority" \
326
+ "$STUB_REPO/config/skills/devlyn:resolve"
327
+ cp "$RUNNER" "$STUB_BENCH/scripts/run-headroom-candidate.sh"
328
+ cp "$REJECTED" "$STUB_BENCH/scripts/pair-rejected-fixtures.sh"
329
+ chmod +x "$STUB_BENCH/scripts/run-headroom-candidate.sh"
330
+ chmod +x "$STUB_BENCH/scripts/pair-rejected-fixtures.sh"
331
+ printf -- '---\nname: devlyn:resolve\n---\n' > "$STUB_REPO/config/skills/devlyn:resolve/SKILL.md"
332
+ cat > "$STUB_BENCH/scripts/run-fixture.sh" <<'EOF'
333
+ #!/usr/bin/env bash
334
+ set -euo pipefail
335
+ echo "[stub-run-fixture] $*"
336
+ EOF
337
+ chmod +x "$STUB_BENCH/scripts/run-fixture.sh"
338
+ cat > "$STUB_BENCH/scripts/judge.sh" <<'EOF'
339
+ #!/usr/bin/env bash
340
+ set -euo pipefail
341
+ echo "[stub-judge] $*"
342
+ EOF
343
+ chmod +x "$STUB_BENCH/scripts/judge.sh"
344
+ cat > "$STUB_BENCH/scripts/headroom-gate.py" <<'PY'
345
+ #!/usr/bin/env python3
346
+ import json
347
+ import os
348
+ import pathlib
349
+ import sys
350
+
351
+ out_json = None
352
+ out_md = None
353
+ args = sys.argv[1:]
354
+ for index, arg in enumerate(args):
355
+ if arg == "--out-json":
356
+ out_json = pathlib.Path(args[index + 1])
357
+ if arg == "--out-md":
358
+ out_md = pathlib.Path(args[index + 1])
359
+ payload = {"verdict": "PASS" if os.environ.get("STUB_HEADROOM_EXIT", "0") == "0" else "FAIL"}
360
+ if out_json:
361
+ out_json.parent.mkdir(parents=True, exist_ok=True)
362
+ out_json.write_text(json.dumps(payload) + "\n", encoding="utf8")
363
+ if out_md:
364
+ out_md.parent.mkdir(parents=True, exist_ok=True)
365
+ out_md.write_text(
366
+ "# stub headroom\n\n"
367
+ "Verdict: **%s**\n\n"
368
+ "| fixture | bare | solo_claude | solo_claude-bare |\n"
369
+ "| --- | ---: | ---: | ---: |\n"
370
+ "| F21-cli-scheduler-priority | 50 | 75 | 25 |\n"
371
+ % payload["verdict"],
372
+ encoding="utf8",
373
+ )
374
+ sys.exit(int(os.environ.get("STUB_HEADROOM_EXIT", "0")))
375
+ PY
376
+
377
+ STUB_RUNNER="$STUB_BENCH/scripts/run-headroom-candidate.sh"
378
+ STUB_HEADROOM_EXIT=0 \
379
+ bash "$STUB_RUNNER" --run-id "$TEST_RUN-stub-success" --min-fixtures 1 F21-cli-scheduler-priority \
380
+ > "$TMP_DIR/stub-success.out" 2>&1
381
+ grep -Fq '[headroom] headroom gate passed — candidate set accepted.' "$TMP_DIR/stub-success.out"
382
+ grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 25 |' "$TMP_DIR/stub-success.out"
383
+ grep -Fq '[stub-run-fixture] --fixture F21-cli-scheduler-priority --arm bare' "$TMP_DIR/stub-success.out"
384
+ grep -Fq '[stub-run-fixture] --fixture F21-cli-scheduler-priority --arm solo_claude' "$TMP_DIR/stub-success.out"
385
+
386
+ if STUB_HEADROOM_EXIT=1 \
387
+ bash "$STUB_RUNNER" --run-id "$TEST_RUN-stub-fail" --min-fixtures 1 F21-cli-scheduler-priority \
388
+ > "$TMP_DIR/stub-fail.out" 2>&1; then
389
+ echo "expected stub headroom gate failure" >&2
390
+ cat "$TMP_DIR/stub-fail.out" >&2
391
+ exit 1
392
+ fi
393
+ grep -Fq '[headroom] headroom gate failed — candidate set rejected.' "$TMP_DIR/stub-fail.out"
394
+ grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 25 |' "$TMP_DIR/stub-fail.out"
395
+ if grep -Fq '[headroom] headroom gate passed — candidate set accepted.' "$TMP_DIR/stub-fail.out"; then
396
+ echo "accepted message must not print after headroom gate failure" >&2
397
+ cat "$TMP_DIR/stub-fail.out" >&2
398
+ exit 1
399
+ fi
400
+
401
+ echo "PASS test-run-headroom-candidate"
@@ -0,0 +1,111 @@
1
+ #!/usr/bin/env bash
2
+ # Regression tests for run-swebench-solver-batch.sh argument guards.
3
+
4
+ set -euo pipefail
5
+
6
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
7
+ RUNNER="$SCRIPT_DIR/run-swebench-solver-batch.sh"
8
+ TMP_DIR="$(mktemp -d /tmp/run-swebench-solver-batch-test.XXXXXX)"
9
+ trap 'rm -rf "$TMP_DIR"' EXIT
10
+ FAKEBIN="$TMP_DIR/fakebin"
11
+ mkdir -p "$FAKEBIN"
12
+ cat > "$FAKEBIN/claude" <<'EOF'
13
+ #!/usr/bin/env bash
14
+ echo "fake claude should not be reached" >&2
15
+ exit 1
16
+ EOF
17
+ chmod +x "$FAKEBIN/claude"
18
+
19
+ expect_fail_contains() {
20
+ local label="$1"
21
+ local needle="$2"
22
+ shift 2
23
+ local out="$TMP_DIR/$label.out"
24
+ if "$@" > "$out" 2>&1; then
25
+ echo "expected failure for $label" >&2
26
+ cat "$out" >&2
27
+ exit 1
28
+ fi
29
+ if ! grep -Fq -- "$needle" "$out"; then
30
+ echo "missing expected text for $label: $needle" >&2
31
+ cat "$out" >&2
32
+ exit 1
33
+ fi
34
+ }
35
+
36
+ bash "$RUNNER" --help > "$TMP_DIR/help.out" 2>&1
37
+ grep -Fq 'usage:' "$TMP_DIR/help.out"
38
+ grep -Fq -- '--instances-jsonl <path>' "$TMP_DIR/help.out"
39
+ grep -Fq -- '--predictions-out <path>' "$TMP_DIR/help.out"
40
+ grep -Fq -- '--timeout-seconds N' "$TMP_DIR/help.out"
41
+ grep -Fq 'require_value()' "$RUNNER"
42
+
43
+ expect_fail_contains missing-instances-jsonl-value \
44
+ '--instances-jsonl requires a value' \
45
+ bash "$RUNNER" --instances-jsonl
46
+
47
+ expect_fail_contains missing-predictions-out-value \
48
+ '--predictions-out requires a value' \
49
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out
50
+
51
+ expect_fail_contains missing-model-name-value \
52
+ '--model-name requires a value' \
53
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --model-name
54
+
55
+ expect_fail_contains missing-repos-root-value \
56
+ '--repos-root requires a value' \
57
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --repos-root
58
+
59
+ expect_fail_contains missing-worktrees-root-value \
60
+ '--worktrees-root requires a value' \
61
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --worktrees-root
62
+
63
+ expect_fail_contains missing-timeout-value \
64
+ '--timeout-seconds requires a value' \
65
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --timeout-seconds
66
+
67
+ expect_fail_contains missing-limit-value \
68
+ '--limit requires a value' \
69
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --limit
70
+
71
+ expect_fail_contains missing-instance-id-value \
72
+ '--instance-id requires a value' \
73
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --instance-id
74
+
75
+ touch "$TMP_DIR/instances.jsonl"
76
+ expect_fail_contains invalid-timeout \
77
+ '--timeout-seconds must be an integer' \
78
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --timeout-seconds nope
79
+
80
+ expect_fail_contains zero-timeout \
81
+ '--timeout-seconds must be > 0' \
82
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --timeout-seconds 0
83
+
84
+ expect_fail_contains invalid-limit \
85
+ '--limit must be an integer' \
86
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --limit nope
87
+
88
+ expect_fail_contains zero-limit \
89
+ '--limit must be > 0' \
90
+ bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl" --limit 0
91
+
92
+ expect_fail_contains missing-claude \
93
+ 'claude command not found' \
94
+ env PATH="/usr/bin:/bin" bash "$RUNNER" --instances-jsonl "$TMP_DIR/instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl"
95
+
96
+ printf '[]\n' > "$TMP_DIR/non-object-instances.jsonl"
97
+ expect_fail_contains non-object-instance-row \
98
+ 'expected JSON object' \
99
+ env PATH="$FAKEBIN:/usr/bin:/bin" bash "$RUNNER" --instances-jsonl "$TMP_DIR/non-object-instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl"
100
+
101
+ printf '{"instance_id": NaN}\n' > "$TMP_DIR/nan-instances.jsonl"
102
+ expect_fail_contains nan-instance-row \
103
+ 'invalid JSON numeric constant: NaN' \
104
+ env PATH="$FAKEBIN:/usr/bin:/bin" bash "$RUNNER" --instances-jsonl "$TMP_DIR/nan-instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl"
105
+
106
+ printf '{"repo":"local/repo"}\n' > "$TMP_DIR/missing-id-instances.jsonl"
107
+ expect_fail_contains missing-instance-id-row \
108
+ 'missing instance_id' \
109
+ env PATH="$FAKEBIN:/usr/bin:/bin" bash "$RUNNER" --instances-jsonl "$TMP_DIR/missing-id-instances.jsonl" --predictions-out "$TMP_DIR/predictions.jsonl"
110
+
111
+ echo "PASS test-run-swebench-solver-batch"