devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -8,25 +8,93 @@ set -euo pipefail
8
8
 
9
9
  usage() {
10
10
  local code="${1:-1}"
11
- echo "usage: $0 [--run-id ID] <fixture> [<fixture> ...]" >&2
11
+ cat >&2 <<'EOF'
12
+ usage: run-headroom-candidate.sh [options] <fixture> [<fixture> ...]
13
+
14
+ Options:
15
+ --run-id ID
16
+ --bare-max N (default: 60)
17
+ --solo-max N (default: 80)
18
+ --min-bare-headroom N (default: 5)
19
+ --min-solo-headroom N (default: 5)
20
+ --min-fixtures N (default: 2)
21
+ --allow-rejected-fixtures
22
+ allow rejected/ceiling fixtures for diagnostics only
23
+ --dry-run validate args/fixtures and print replay command only
24
+ EOF
12
25
  exit "$code"
13
26
  }
14
27
 
28
+ require_value() {
29
+ local flag="$1"
30
+ local value="${2:-}"
31
+ if [ -z "$value" ] || [[ "$value" == --* ]]; then
32
+ echo "$flag requires a value" >&2
33
+ exit 1
34
+ fi
35
+ }
36
+
15
37
  RUN_ID=""
38
+ BARE_MAX=60
39
+ SOLO_MAX=80
40
+ MIN_BARE_HEADROOM=5
41
+ MIN_SOLO_HEADROOM=5
42
+ MIN_FIXTURES=2
43
+ ALLOW_REJECTED_FIXTURES=0
44
+ DRY_RUN=0
16
45
  FIXTURES=()
17
46
  while [ $# -gt 0 ]; do
18
47
  case "$1" in
19
- --run-id) RUN_ID="$2"; shift 2;;
48
+ --run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
49
+ --bare-max) require_value "$1" "${2:-}"; BARE_MAX="$2"; shift 2;;
50
+ --solo-max) require_value "$1" "${2:-}"; SOLO_MAX="$2"; shift 2;;
51
+ --min-bare-headroom) require_value "$1" "${2:-}"; MIN_BARE_HEADROOM="$2"; shift 2;;
52
+ --min-solo-headroom) require_value "$1" "${2:-}"; MIN_SOLO_HEADROOM="$2"; shift 2;;
53
+ --min-fixtures) require_value "$1" "${2:-}"; MIN_FIXTURES="$2"; shift 2;;
54
+ --allow-rejected-fixtures) ALLOW_REJECTED_FIXTURES=1; shift;;
55
+ --dry-run) DRY_RUN=1; shift;;
20
56
  -h|--help) usage 0;;
21
- F[0-9]*) FIXTURES+=("$1"); shift;;
57
+ [FS][0-9]*) FIXTURES+=("$1"); shift;;
22
58
  *) echo "unknown arg: $1" >&2; usage;;
23
59
  esac
24
60
  done
25
61
 
62
+ for threshold in BARE_MAX SOLO_MAX MIN_BARE_HEADROOM MIN_SOLO_HEADROOM MIN_FIXTURES; do
63
+ value="${!threshold}"
64
+ case "$threshold" in
65
+ BARE_MAX) flag="bare-max" ;;
66
+ SOLO_MAX) flag="solo-max" ;;
67
+ MIN_BARE_HEADROOM) flag="min-bare-headroom" ;;
68
+ MIN_SOLO_HEADROOM) flag="min-solo-headroom" ;;
69
+ MIN_FIXTURES) flag="min-fixtures" ;;
70
+ esac
71
+ if [[ ! "$value" =~ ^[0-9]+$ ]]; then
72
+ echo "--$flag must be an integer: $value" >&2
73
+ exit 1
74
+ fi
75
+ done
76
+ if [ "$MIN_FIXTURES" -lt 1 ]; then
77
+ echo "--min-fixtures must be >= 1" >&2
78
+ exit 1
79
+ fi
80
+ if [ "$MIN_BARE_HEADROOM" -lt 0 ]; then
81
+ echo "--min-bare-headroom must be >= 0" >&2
82
+ exit 1
83
+ fi
84
+ if [ "$MIN_SOLO_HEADROOM" -lt 0 ]; then
85
+ echo "--min-solo-headroom must be >= 0" >&2
86
+ exit 1
87
+ fi
88
+
26
89
  [ ${#FIXTURES[@]} -gt 0 ] || usage
27
90
 
28
91
  BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
29
92
  REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"
93
+ source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"
94
+ if ! declare -F rejected_pair_fixture_reason >/dev/null; then
95
+ echo "rejected fixture registry must define rejected_pair_fixture_reason" >&2
96
+ exit 1
97
+ fi
30
98
 
31
99
  if [ -z "$RUN_ID" ]; then
32
100
  TS=$(date -u +%Y%m%dT%H%M%SZ)
@@ -34,16 +102,177 @@ if [ -z "$RUN_ID" ]; then
34
102
  RUN_ID="${TS}-${SHA}-headroom"
35
103
  fi
36
104
 
105
+ print_command() {
106
+ local cmd
107
+ if [ "${DEVLYN_BENCHMARK_CLI_SUBCOMMAND:-}" = "headroom" ]; then
108
+ cmd=(npx devlyn-cli benchmark headroom --run-id "$RUN_ID")
109
+ else
110
+ cmd=(bash "$0" --run-id "$RUN_ID")
111
+ fi
112
+ cmd+=(--bare-max "$BARE_MAX")
113
+ cmd+=(--solo-max "$SOLO_MAX")
114
+ cmd+=(--min-bare-headroom "$MIN_BARE_HEADROOM")
115
+ cmd+=(--min-solo-headroom "$MIN_SOLO_HEADROOM")
116
+ cmd+=(--min-fixtures "$MIN_FIXTURES")
117
+ [ "$ALLOW_REJECTED_FIXTURES" -eq 0 ] || cmd+=(--allow-rejected-fixtures)
118
+ [ "$DRY_RUN" -eq 0 ] || cmd+=(--dry-run)
119
+ cmd+=("${FIXTURES[@]}")
120
+ printf 'Command: '
121
+ printf '%q ' "${cmd[@]}"
122
+ printf '\n'
123
+ }
124
+
125
+ fixture_exists() {
126
+ local fid="$1"
127
+ [ -d "$BENCH_ROOT/fixtures/$fid" ] || [ -d "$BENCH_ROOT/shadow-fixtures/$fid" ]
128
+ }
129
+
130
+ fixture_dir() {
131
+ local fid="$1"
132
+ if [ -d "$BENCH_ROOT/fixtures/$fid" ]; then
133
+ printf '%s\n' "$BENCH_ROOT/fixtures/$fid"
134
+ else
135
+ printf '%s\n' "$BENCH_ROOT/shadow-fixtures/$fid"
136
+ fi
137
+ }
138
+
139
+ is_shadow_fixture() {
140
+ local fid="$1"
141
+ [ -d "$BENCH_ROOT/shadow-fixtures/$fid" ]
142
+ }
143
+
144
+ retired_fixture_exists() {
145
+ local fid="$1"
146
+ [ -d "$BENCH_ROOT/fixtures/retired/$fid" ]
147
+ }
148
+
149
+ fixture_smoke_only() {
150
+ local fid="$1"
151
+ [[ "$fid" == S1 || "$fid" == S1-* ]]
152
+ }
153
+
154
+ fixture_category() {
155
+ local dir="$1"
156
+ python3 - "$dir/metadata.json" <<'PY'
157
+ import json
158
+ import sys
159
+
160
+ try:
161
+ with open(sys.argv[1], encoding="utf-8") as handle:
162
+ print(json.load(handle).get("category", ""))
163
+ except FileNotFoundError:
164
+ print("")
165
+ PY
166
+ }
167
+
168
+ fixture_has_solo_headroom_hypothesis() {
169
+ local dir="$1"
170
+ python3 "$BENCH_ROOT/scripts/solo-headroom-hypothesis.py" --expected-json "$dir/expected.json" "$dir/spec.md"
171
+ }
172
+
173
+ fixture_has_solo_ceiling_avoidance_note() {
174
+ local dir="$1"
175
+ python3 "$BENCH_ROOT/scripts/solo-ceiling-avoidance.py" "$dir/NOTES.md"
176
+ }
177
+
178
+ fixture_has_pair_evidence() {
179
+ local fid="$1"
180
+ python3 - "$BENCH_ROOT/results" "$fid" <<'PY'
181
+ import json
182
+ import pathlib
183
+ import sys
184
+
185
+ results = pathlib.Path(sys.argv[1])
186
+ fixture = sys.argv[2]
187
+ if not results.is_dir():
188
+ sys.exit(1)
189
+ for path in results.glob("*/full-pipeline-pair-gate.json"):
190
+ try:
191
+ data = json.loads(path.read_text(encoding="utf-8"))
192
+ except (OSError, json.JSONDecodeError):
193
+ continue
194
+ if data.get("verdict") != "PASS":
195
+ continue
196
+ rows = data.get("rows")
197
+ if not isinstance(rows, list):
198
+ continue
199
+ for row in rows:
200
+ if isinstance(row, dict) and row.get("fixture") == fixture and row.get("status") == "PASS":
201
+ sys.exit(0)
202
+ sys.exit(1)
203
+ PY
204
+ }
205
+
206
+ validate_fixtures() {
207
+ local missing=0
208
+ local fid reason dir category
209
+ for fid in "${FIXTURES[@]}"; do
210
+ if ! fixture_exists "$fid"; then
211
+ if retired_fixture_exists "$fid"; then
212
+ echo "fixture is retired and is not rerun by pair-candidate runners: $fid. Use preserved results/docs for historical replay." >&2
213
+ missing=1
214
+ continue
215
+ fi
216
+ echo "fixture not found in fixtures/ or shadow-fixtures/: $fid" >&2
217
+ missing=1
218
+ continue
219
+ fi
220
+ if [ "$DRY_RUN" -eq 0 ] && fixture_smoke_only "$fid"; then
221
+ echo "fixture is smoke-only and cannot run providers: $fid. Use --dry-run for runner/package validation." >&2
222
+ missing=1
223
+ continue
224
+ fi
225
+ reason="$(rejected_pair_fixture_reason "$fid" || true)"
226
+ if [ "$ALLOW_REJECTED_FIXTURES" -eq 0 ]; then
227
+ if [ -n "$reason" ]; then
228
+ echo "fixture rejected for pair-candidate runs: $fid ($reason). Use --allow-rejected-fixtures for diagnostics only." >&2
229
+ missing=1
230
+ continue
231
+ fi
232
+ fi
233
+ if [ -z "$reason" ]; then
234
+ dir="$(fixture_dir "$fid")"
235
+ category="$(fixture_category "$dir")"
236
+ if [ "$category" = "high-risk" ] && ! fixture_has_pair_evidence "$fid"; then
237
+ if ! fixture_has_solo_headroom_hypothesis "$dir"; then
238
+ echo "fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: $fid" >&2
239
+ missing=1
240
+ fi
241
+ if is_shadow_fixture "$fid" && ! fixture_has_solo_ceiling_avoidance_note "$dir"; then
242
+ echo "shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: $fid" >&2
243
+ missing=1
244
+ fi
245
+ fi
246
+ fi
247
+ done
248
+ [ "$missing" -eq 0 ] || exit 1
249
+ }
250
+
37
251
  echo ""
38
252
  echo "═══ Headroom Candidate Run ═══"
39
253
  echo "Run-id: $RUN_ID"
40
254
  echo "Fixtures: ${FIXTURES[*]}"
41
255
  echo "Arms: bare solo_claude"
42
- if [ ${#FIXTURES[@]} -lt 2 ]; then
43
- echo "Gate: will FAIL set gate unless at least 2 fixtures are supplied"
256
+ echo "Gate: bare <= $BARE_MAX (headroom >= $MIN_BARE_HEADROOM), solo_claude <= $SOLO_MAX (headroom >= $MIN_SOLO_HEADROOM), baseline evidence-complete, min fixtures $MIN_FIXTURES"
257
+ [ "$DRY_RUN" -eq 0 ] || echo "Mode: DRY RUN (no model/provider invocations)"
258
+ print_command
259
+ if [ ${#FIXTURES[@]} -lt "$MIN_FIXTURES" ]; then
260
+ echo "Gate: will FAIL set gate unless at least $MIN_FIXTURES fixtures are supplied"
44
261
  fi
45
262
  echo ""
46
263
 
264
+ validate_fixtures
265
+
266
+ if [ "$DRY_RUN" -eq 1 ] && [ "${#FIXTURES[@]}" -lt "$MIN_FIXTURES" ]; then
267
+ echo "[headroom] DRY RUN failed — ${#FIXTURES[@]} fixture(s) supplied, --min-fixtures requires $MIN_FIXTURES." >&2
268
+ exit 1
269
+ fi
270
+
271
+ if [ "$DRY_RUN" -eq 1 ]; then
272
+ echo "[headroom] DRY RUN complete — fixtures resolved, no arms or judges executed."
273
+ exit 0
274
+ fi
275
+
47
276
  SRC_SKILLS="$REPO_ROOT/config/skills"
48
277
  DST_SKILLS="$REPO_ROOT/.claude/skills"
49
278
  mkdir -p "$DST_SKILLS"
@@ -84,10 +313,24 @@ echo ""
84
313
  set +e
85
314
  python3 "$BENCH_ROOT/scripts/headroom-gate.py" \
86
315
  --run-id "$RUN_ID" \
316
+ --bare-max "$BARE_MAX" \
317
+ --solo-max "$SOLO_MAX" \
318
+ --min-bare-headroom "$MIN_BARE_HEADROOM" \
319
+ --min-solo-headroom "$MIN_SOLO_HEADROOM" \
320
+ --min-fixtures "$MIN_FIXTURES" \
87
321
  --out-json "$BENCH_ROOT/results/$RUN_ID/headroom-gate.json" \
88
322
  --out-md "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
89
323
  GATE_EXIT=$?
90
324
  set -e
91
325
 
92
- cat "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
326
+ if [ -f "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md" ]; then
327
+ cat "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
328
+ else
329
+ echo "[headroom] headroom gate report missing: $BENCH_ROOT/results/$RUN_ID/headroom-gate.md" >&2
330
+ fi
331
+ if [ "$GATE_EXIT" -eq 0 ]; then
332
+ echo "[headroom] headroom gate passed — candidate set accepted."
333
+ else
334
+ echo "[headroom] headroom gate failed — candidate set rejected."
335
+ fi
93
336
  exit "$GATE_EXIT"
@@ -96,16 +96,29 @@ echo "[run-iter-0033c] RUN_ID=$RUN_ID"
96
96
  echo "[run-iter-0033c] RESULTS_DIR=$RESULTS_DIR"
97
97
 
98
98
  # --- Determine pair-eligible set from manifest input bundle ---
99
- # Build a draft manifest using the C1 summary as the L1 placeholder; we'll
100
- # rebuild with the real L1 rerun summary at the end. For now we just need
101
- # the pair-eligible set for arm-selection per fixture.
99
+ # Pair eligibility is pre-registered from C1/F9 before any iter-0033c arms run.
100
+ # The later L1 rerun summary is archived into the final manifest for provenance;
101
+ # it must not change the arm-selection set after execution has begun.
102
102
  DRAFT_MANIFEST="$RESULTS_DIR/manifest-draft.json"
103
103
  python3 benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
104
104
  --c1-summary "$C1_SUMMARY" \
105
105
  --f9-judge "$F9_JUDGE" \
106
106
  --l1-rerun-summary "$C1_SUMMARY" \
107
107
  --output "$DRAFT_MANIFEST"
108
- PAIR_ELIGIBLE=$(python3 -c "import json;print(' '.join(json.load(open('$DRAFT_MANIFEST'))['fixtures_pair_eligible']))")
108
+ PAIR_ELIGIBLE=$(python3 - "$DRAFT_MANIFEST" "$REPO_ROOT/benchmark/auto-resolve/scripts" <<'PY'
109
+ import pathlib
110
+ import sys
111
+
112
+ sys.path.insert(0, sys.argv[2])
113
+ from pair_evidence_contract import loads_strict_json_object
114
+
115
+ manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
116
+ fixtures = manifest.get("fixtures_pair_eligible")
117
+ if not isinstance(fixtures, list) or not all(isinstance(item, str) for item in fixtures):
118
+ raise SystemExit("manifest fixtures_pair_eligible must be a string array")
119
+ print(" ".join(fixtures))
120
+ PY
121
+ )
109
122
  echo "[run-iter-0033c] pair-eligible: $PAIR_ELIGIBLE"
110
123
 
111
124
  # --- Per-fixture interleaved arm loop ---
@@ -161,50 +174,11 @@ done
161
174
 
162
175
  # --- Build L1 rerun summary from solo_claude arm result.json + judge.json ---
163
176
  L1_RERUN_SUMMARY="$RESULTS_DIR/l1-rerun-summary.json"
164
- python3 - "$RESULTS_DIR" "$L1_RERUN_SUMMARY" "$RUN_ID" "$HEAD_SHA" <<'PY'
165
- import json, sys
166
- from pathlib import Path
167
- results_dir = Path(sys.argv[1])
168
- out_path = Path(sys.argv[2])
169
- run_id = sys.argv[3]
170
- head_sha = sys.argv[4]
171
- rows = []
172
- for fx_dir in sorted(results_dir.iterdir()):
173
- if not fx_dir.is_dir():
174
- continue
175
- judge_p = fx_dir / "judge.json"
176
- if not judge_p.is_file():
177
- continue
178
- judge = json.loads(judge_p.read_text())
179
- mapping = judge.get("_blind_mapping") or {}
180
- inv = {v: k for k, v in mapping.items()}
181
- arms = {}
182
- for arm_name in ("solo_claude", "l2_gated", "l2_forced", "bare"):
183
- letter = inv.get(arm_name)
184
- if not letter:
185
- continue
186
- arm_dir = fx_dir / arm_name
187
- result = {}
188
- if (arm_dir / "result.json").is_file():
189
- result = json.loads((arm_dir / "result.json").read_text())
190
- arms[arm_name] = {
191
- "score": judge.get(f"{letter}_score"),
192
- "wall_s": result.get("elapsed_seconds"),
193
- "verify_score": result.get("verify_score"),
194
- "files_changed": result.get("files_changed"),
195
- "timed_out": result.get("timed_out"),
196
- "disqualifier": result.get("disqualifier"),
197
- }
198
- rows.append({"fixture": fx_dir.name, "arms": arms})
199
- out = {
200
- "run_id": run_id,
201
- "git_sha": head_sha,
202
- "fixtures_total": len(rows),
203
- "rows": rows,
204
- }
205
- out_path.write_text(json.dumps(out, indent=2) + "\n")
206
- print(f"[l1-rerun-summary] wrote {out_path} (fixtures={len(rows)})")
207
- PY
177
+ python3 benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py \
178
+ --results-dir "$RESULTS_DIR" \
179
+ --out "$L1_RERUN_SUMMARY" \
180
+ --run-id "$RUN_ID" \
181
+ --git-sha "$HEAD_SHA"
208
182
 
209
183
  # --- Build final manifest with real L1 rerun summary ---
210
184
  FINAL_MANIFEST="$RESULTS_DIR/iter-0033c-pair-eligible.json"
@@ -6,13 +6,13 @@
6
6
  #
7
7
  # Usage:
8
8
  # run-suite.sh # all fixtures, n=1 smoke
9
- # run-suite.sh --n 3 # 3 runs per fixture for ship decisions
10
9
  # run-suite.sh F2 F5 # specific fixtures only
11
10
  # run-suite.sh --dry-run # skip model invocations, validate setup
12
11
  # run-suite.sh --judge-only --run-id X # re-judge an existing run
13
12
  # run-suite.sh --label v3.6 # tag this run
14
13
  # run-suite.sh --bless # if ship-gate PASS, promote to baselines/shipped.json
15
14
  # run-suite.sh --resolve-skill new # invoke /devlyn:resolve --spec (the only supported value post iter-0034 cutover; flag kept as accepted no-op for historical runners)
15
+ # run-suite.sh --suite shadow --dry-run # list shadow tasks; shadow suite refuses provider/judge runs
16
16
  #
17
17
  # Exits 0 on PASS, 1 on FAIL.
18
18
 
@@ -32,17 +32,26 @@ SUITE="golden"
32
32
  RESOLVE_SKILL="new"
33
33
  FIXTURES=()
34
34
 
35
# Abort argument parsing when an option that takes a value was given none.
#
# Arguments:
#   $1 - the option name as typed; used verbatim in the error message
#   $2 - the candidate value (callers pass "${2:-}", so it may be empty)
# Outputs:
#   "<flag> requires a value" on stderr when the value is rejected
# Exits:
#   1 when the value is empty or starts with "--", i.e. the next option would
#   otherwise be consumed as this option's value. Returns 0 when accepted.
require_value() {
  local flag="$1"
  local value="${2:-}"
  if [[ -z "$value" || "$value" == --* ]]; then
    # printf rather than echo: the message begins with option text ("-…").
    printf '%s requires a value\n' "$flag" >&2
    exit 1
  fi
}
43
+
35
44
  while [ $# -gt 0 ]; do
36
45
  case "$1" in
37
- --n) N="$2"; shift 2;;
38
- --label) LABEL="$2"; shift 2;;
46
+ --n) require_value "$1" "${2:-}"; N="$2"; shift 2;;
47
+ --label) require_value "$1" "${2:-}"; LABEL="$2"; shift 2;;
39
48
  --dry-run) DRY_RUN=1; shift;;
40
49
  --judge-only) JUDGE_ONLY=1; shift;;
41
- --run-id) RUN_ID_ARG="$2"; shift 2;;
50
+ --run-id) require_value "$1" "${2:-}"; RUN_ID_ARG="$2"; shift 2;;
42
51
  --bless) BLESS=1; shift;;
43
52
  --accept-missing) ACCEPT_MISSING=1; shift;;
44
- --suite) SUITE="$2"; shift 2;;
45
- --resolve-skill) RESOLVE_SKILL="$2"; shift 2;;
53
+ --suite) require_value "$1" "${2:-}"; SUITE="$2"; shift 2;;
54
+ --resolve-skill) require_value "$1" "${2:-}"; RESOLVE_SKILL="$2"; shift 2;;
46
55
  -h|--help)
47
56
  head -22 "$0" | sed -n '3,22p'; exit 0;;
48
57
  [FS][0-9]*) FIXTURES+=("$1"); shift;;
@@ -69,8 +78,15 @@ case "$SUITE" in
69
78
  *) echo "error: --suite must be 'golden' or 'shadow' (got '$SUITE')" >&2; exit 1;;
70
79
  esac
71
80
 
81
+ if [ "$SUITE" = "shadow" ] && [ "$DRY_RUN" -eq 0 ]; then
82
+ echo "shadow suite run-suite is dry-run only. Use benchmark headroom/pair with explicit S* candidates for real provider measurement." >&2
83
+ exit 1
84
+ fi
85
+
72
86
  # n must be 1 while iteration semantics aren't wired through judge/report.
73
87
  # Remove this block when compile-report.py gains multi-iter aggregation.
88
+ case "$N" in ''|*[!0-9]*) echo "error: --n must be an integer" >&2; exit 1;; esac
89
+ [ "$N" -gt 0 ] || { echo "error: --n must be > 0" >&2; exit 1; }
74
90
  if [ "$N" -ne 1 ]; then
75
91
  echo "error: --n $N not yet supported — judge/report currently expect a single iteration per fixture." >&2
76
92
  echo " Track progress in benchmark/auto-resolve/BENCHMARK-DESIGN.md (#multi-iter-roadmap)." >&2
@@ -101,6 +117,22 @@ fi
101
117
  RES_DIR="$BENCH_ROOT/results/$RUN_ID"
102
118
  mkdir -p "$RES_DIR"
103
119
 
120
# Print one shell-quoted command line rebuilt from the parsed option variables.
#
# Globals (read):
#   N, SUITE, RESOLVE_SKILL, RUN_ID  - always emitted with their flags
#   LABEL                            - emitted as --label only when non-empty
#   DRY_RUN, JUDGE_ONLY, BLESS, ACCEPT_MISSING
#                                    - numeric toggles; flag emitted when non-zero
#   FIXTURES                         - array of positional fixture ids, appended last
# Outputs:
#   A single stdout line: "Command: " followed by every word passed through
#   printf %q, each followed by one space.
print_command() {
  local -a cmd=(bash "$0" --n "$N" --suite "$SUITE" --resolve-skill "$RESOLVE_SKILL")
  if [[ -n "$LABEL" ]]; then
    cmd+=(--label "$LABEL")
  fi
  cmd+=(--run-id "$RUN_ID")
  # Arithmetic tests treat an empty toggle as "off". A `[ $VAR -eq 0 ] || …`
  # test errors out on an empty value and then appends the flag anyway.
  if (( ${DRY_RUN:-0} )); then cmd+=(--dry-run); fi
  if (( ${JUDGE_ONLY:-0} )); then cmd+=(--judge-only); fi
  if (( ${BLESS:-0} )); then cmd+=(--bless); fi
  if (( ${ACCEPT_MISSING:-0} )); then cmd+=(--accept-missing); fi
  if (( ${#FIXTURES[@]} > 0 )); then
    cmd+=("${FIXTURES[@]}")
  fi
  printf 'Command: '
  printf '%q ' "${cmd[@]}"
  printf '\n'
}
135
+
104
136
  echo ""
105
137
  echo "═══ Benchmark Suite Run ═══"
106
138
  echo "Run-id: $RUN_ID"
@@ -111,6 +143,7 @@ echo "n: $N"
111
143
  echo "Resolve skill: $RESOLVE_SKILL"
112
144
  [ $DRY_RUN -eq 1 ] && echo "Mode: DRY RUN (no model invocations)"
113
145
  [ $JUDGE_ONLY -eq 1 ] && echo "Mode: JUDGE ONLY (re-judging existing artifacts)"
146
+ print_command
114
147
  echo ""
115
148
 
116
149
  # ---- Mirror committed skills into .claude/skills (iter-0017) --------------
@@ -201,7 +234,11 @@ done
201
234
  if [ $DRY_RUN -eq 1 ]; then
202
235
  echo ""
203
236
  echo "[suite] DRY RUN complete — results in $RES_DIR"
204
- echo "Run without --dry-run to invoke models."
237
+ if [ "$SUITE" = "shadow" ]; then
238
+ echo "Use benchmark headroom/pair with explicit S* candidates for real provider measurement."
239
+ else
240
+ echo "Run without --dry-run to invoke models."
241
+ fi
205
242
  exit 0
206
243
  fi
207
244
 
@@ -19,6 +19,15 @@ EOF
19
19
  exit "${1:-1}"
20
20
  }
21
21
 
22
# Guard for options that take a value: stop the script when none was supplied.
#
# Arguments:
#   $1 - the option name as typed; echoed back in the error message
#   $2 - the word following the option (callers pass "${2:-}", so it may be empty)
# Outputs:
#   "<flag> requires a value" on stderr when the value is rejected
# Exits:
#   1 when the value is empty or begins with "--" (another option, not a
#   value). Returns 0 when the value is accepted.
require_value() {
  local flag="$1"
  local value="${2:-}"
  if [[ -z "$value" || "$value" == --* ]]; then
    # printf rather than echo: the message begins with option text ("-…").
    printf '%s requires a value\n' "$flag" >&2
    exit 1
  fi
}
30
+
22
31
  MANIFEST=""
23
32
  RUN_PREFIX=""
24
33
  PAIR_MODE="gated"
@@ -33,18 +42,18 @@ RUN_IDS_OUT=""
33
42
  RESUME_COMPLETED_ARMS=0
34
43
  while [ $# -gt 0 ]; do
35
44
  case "$1" in
36
- --manifest) MANIFEST="$2"; shift 2;;
37
- --run-prefix) RUN_PREFIX="$2"; shift 2;;
38
- --pair-mode) PAIR_MODE="$2"; shift 2;;
39
- --min-runs) MIN_RUNS="$2"; shift 2;;
40
- --out-json) OUT_JSON="$2"; shift 2;;
41
- --out-md) OUT_MD="$2"; shift 2;;
42
- --max-pair-solo-wall-ratio) MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
43
- --timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
44
- --run-ids-out) RUN_IDS_OUT="$2"; shift 2;;
45
+ --manifest) require_value "$1" "${2:-}"; MANIFEST="$2"; shift 2;;
46
+ --run-prefix) require_value "$1" "${2:-}"; RUN_PREFIX="$2"; shift 2;;
47
+ --pair-mode) require_value "$1" "${2:-}"; PAIR_MODE="$2"; shift 2;;
48
+ --min-runs) require_value "$1" "${2:-}"; MIN_RUNS="$2"; shift 2;;
49
+ --out-json) require_value "$1" "${2:-}"; OUT_JSON="$2"; shift 2;;
50
+ --out-md) require_value "$1" "${2:-}"; OUT_MD="$2"; shift 2;;
51
+ --max-pair-solo-wall-ratio) require_value "$1" "${2:-}"; MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
52
+ --timeout-seconds) require_value "$1" "${2:-}"; TIMEOUT_SECONDS="$2"; shift 2;;
53
+ --run-ids-out) require_value "$1" "${2:-}"; RUN_IDS_OUT="$2"; shift 2;;
45
54
  --resume-completed-arms) RESUME_COMPLETED_ARMS=1; shift;;
46
55
  --prepare-only) PREPARE_ONLY=1; shift;;
47
- --gate-only-run-ids) GATE_ONLY_RUN_IDS="$2"; shift 2;;
56
+ --gate-only-run-ids) require_value "$1" "${2:-}"; GATE_ONLY_RUN_IDS="$2"; shift 2;;
48
57
  -h|--help) usage 0;;
49
58
  *) echo "unknown arg: $1" >&2; usage 1;;
50
59
  esac
@@ -79,12 +88,91 @@ if [ -z "$RUN_PREFIX" ]; then
79
88
  RUN_PREFIX="$(date -u +%Y%m%dT%H%M%SZ)-swebench-frozen"
80
89
  fi
81
90
 
91
# Print one shell-quoted command line rebuilt from the parsed option variables.
#
# Globals (read):
#   MANIFEST, RUN_PREFIX, PAIR_MODE, MIN_RUNS - always emitted with their flags
#   OUT_JSON, OUT_MD, MAX_PAIR_SOLO_WALL_RATIO, TIMEOUT_SECONDS, RUN_IDS_OUT,
#   GATE_ONLY_RUN_IDS                         - emitted only when non-empty
#   RESUME_COMPLETED_ARMS, PREPARE_ONLY       - numeric toggles; flag emitted
#                                               when non-zero
# Outputs:
#   A single stdout line: "Command: " followed by every word passed through
#   printf %q, each followed by one space.
print_command() {
  local -a cmd=(
    bash "$0"
    --manifest "$MANIFEST"
    --run-prefix "$RUN_PREFIX"
    --pair-mode "$PAIR_MODE"
    --min-runs "$MIN_RUNS"
  )
  if [[ -n "$OUT_JSON" ]]; then cmd+=(--out-json "$OUT_JSON"); fi
  if [[ -n "$OUT_MD" ]]; then cmd+=(--out-md "$OUT_MD"); fi
  if [[ -n "$MAX_PAIR_SOLO_WALL_RATIO" ]]; then
    cmd+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
  fi
  if [[ -n "$TIMEOUT_SECONDS" ]]; then cmd+=(--timeout-seconds "$TIMEOUT_SECONDS"); fi
  if [[ -n "$RUN_IDS_OUT" ]]; then cmd+=(--run-ids-out "$RUN_IDS_OUT"); fi
  # Arithmetic tests treat an empty toggle as "off". A `[ "$VAR" -eq 0 ] || …`
  # test errors out on an empty value and then appends the flag anyway.
  if (( ${RESUME_COMPLETED_ARMS:-0} )); then cmd+=(--resume-completed-arms); fi
  if (( ${PREPARE_ONLY:-0} )); then cmd+=(--prepare-only); fi
  if [[ -n "$GATE_ONLY_RUN_IDS" ]]; then
    cmd+=(--gate-only-run-ids "$GATE_ONLY_RUN_IDS")
  fi
  printf 'Command: '
  printf '%q ' "${cmd[@]}"
  printf '\n'
}
107
+
108
+ echo ""
109
+ echo "═══ SWE-bench Frozen VERIFY Corpus Run ═══"
110
+ echo "Run-prefix: $RUN_PREFIX"
111
+ echo "Pair mode: $PAIR_MODE"
112
+ echo "Min runs: $MIN_RUNS"
113
+ [ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || echo "Wall cap: pair/solo <= ${MAX_PAIR_SOLO_WALL_RATIO}x"
114
+ print_command
115
+ echo ""
116
+
82
117
  TMP_RUN_IDS="$(mktemp)"
83
118
  trap 'rm -f "$TMP_RUN_IDS"' EXIT
84
119
  ROW_FAILURES=0
85
120
 
121
+ python3 - "$MANIFEST" "$GATE_ONLY_RUN_IDS" "$SCRIPT_DIR" <<'PY'
122
+ import pathlib
123
+ import sys
124
+
125
+ sys.path.insert(0, sys.argv[3])
126
+ from pair_evidence_contract import loads_strict_json_object
127
+
128
+ manifest_path = pathlib.Path(sys.argv[1])
129
+ gate_only_run_ids = sys.argv[2]
130
+ try:
131
+ manifest = loads_strict_json_object(manifest_path.read_text())
132
+ except ValueError as exc:
133
+ if str(exc) == "top-level JSON value must be an object":
134
+ raise SystemExit("manifest malformed: expected JSON object") from exc
135
+ raise
136
+ if not isinstance(manifest, dict):
137
+ raise SystemExit("manifest malformed: expected JSON object")
138
+ cases_root = manifest.get("cases_root")
139
+ if not isinstance(cases_root, str) or not cases_root.strip():
140
+ raise SystemExit("manifest malformed: missing non-empty cases_root")
141
+ if gate_only_run_ids:
142
+ raise SystemExit(0)
143
+ prepared = manifest.get("prepared")
144
+ if not isinstance(prepared, list) or not prepared:
145
+ raise SystemExit("manifest malformed: prepared must be a non-empty array")
146
+ for index, row in enumerate(prepared, start=1):
147
+ if not isinstance(row, dict):
148
+ raise SystemExit(f"manifest malformed: prepared[{index}] expected JSON object")
149
+ for key in ("instance_id", "case_dir", "repo_dir"):
150
+ value = row.get(key)
151
+ if not isinstance(value, str) or not value.strip():
152
+ raise SystemExit(f"manifest malformed: prepared[{index}] missing non-empty {key}")
153
+ PY
154
+
86
155
  if [ -n "$GATE_ONLY_RUN_IDS" ]; then
87
- cp "$GATE_ONLY_RUN_IDS" "$TMP_RUN_IDS"
156
+ python3 - "$GATE_ONLY_RUN_IDS" "$TMP_RUN_IDS" <<'PY'
157
+ import pathlib
158
+ import re
159
+ import sys
160
+
161
+ source = pathlib.Path(sys.argv[1])
162
+ dest = pathlib.Path(sys.argv[2])
163
+ safe = re.compile(r"^[A-Za-z0-9_.-]+$")
164
+ run_ids: list[str] = []
165
+ for line_no, line in enumerate(source.read_text(encoding="utf8").splitlines(), start=1):
166
+ run_id = line.strip()
167
+ if not run_id:
168
+ raise SystemExit(f"run ids malformed: line {line_no} is empty")
169
+ if not safe.match(run_id):
170
+ raise SystemExit(f"run ids malformed: line {line_no} has unsafe run id")
171
+ run_ids.append(run_id)
172
+ if not run_ids:
173
+ raise SystemExit("run ids malformed: no run ids")
174
+ dest.write_text("\n".join(run_ids) + "\n", encoding="utf8")
175
+ PY
88
176
  else
89
177
  while IFS=$'\t' read -r index instance_id cases_root repo_dir diff_path; do
90
178
  [ -n "$instance_id" ] || continue
@@ -157,10 +245,14 @@ if not compare_path.exists():
157
245
  PY
158
246
  fi
159
247
  printf '%s\n' "$safe_run_id" >> "$TMP_RUN_IDS"
160
- done < <(python3 - "$MANIFEST" <<'PY'
161
- import json, pathlib, sys
162
- manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
163
- for index, row in enumerate(manifest.get("prepared") or [], start=1):
248
+ done < <(python3 - "$MANIFEST" "$SCRIPT_DIR" <<'PY'
249
+ import pathlib, sys
250
+
251
+ sys.path.insert(0, sys.argv[2])
252
+ from pair_evidence_contract import loads_strict_json_object
253
+
254
+ manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
255
+ for index, row in enumerate(manifest["prepared"], start=1):
164
256
  instance_id = row["instance_id"]
165
257
  case_dir = pathlib.Path(row["case_dir"])
166
258
  repo_dir = pathlib.Path(row["repo_dir"])
@@ -192,13 +284,22 @@ fi
192
284
  run_count="$(wc -l < "$TMP_RUN_IDS" | tr -d ' ')"
193
285
  [ "$run_count" -gt 0 ] || { echo "manifest prepared no runs" >&2; exit 1; }
194
286
 
195
- fixtures_root="$(python3 - "$MANIFEST" <<'PY'
196
- import json, pathlib, sys
197
- manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
287
+ fixtures_root="$(python3 - "$MANIFEST" "$SCRIPT_DIR" <<'PY'
288
+ import pathlib, sys
289
+
290
+ sys.path.insert(0, sys.argv[2])
291
+ from pair_evidence_contract import loads_strict_json_object
292
+
293
+ manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
198
294
  print(manifest["cases_root"])
199
295
  PY
200
296
  )"
201
- gate_args=(python3 "$SCRIPT_DIR/frozen-verify-gate.py" --fixtures-root "$fixtures_root" --min-runs "$MIN_RUNS")
297
+ gate_args=(
298
+ python3 "$SCRIPT_DIR/frozen-verify-gate.py"
299
+ --fixtures-root "$fixtures_root"
300
+ --min-runs "$MIN_RUNS"
301
+ --require-hypothesis-trigger
302
+ )
202
303
  [ -z "$OUT_JSON" ] || gate_args+=(--out-json "$OUT_JSON")
203
304
  [ -z "$OUT_MD" ] || gate_args+=(--out-md "$OUT_MD")
204
305
  [ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || gate_args+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")