devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -0,0 +1,497 @@
#!/usr/bin/env bash
# Regression tests for run-full-pipeline-pair-candidate.sh argument guards.
#
# Prologue: locate the runner and the rejected-fixture list next to this
# script, create a private scratch directory for captured output, and register
# cleanup for the scratch directory, any results directories named after this
# run, and the temporary shadow fixture.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
RUNNER="$SCRIPT_DIR/run-full-pipeline-pair-candidate.sh"
REJECTED="$SCRIPT_DIR/pair-rejected-fixtures.sh"
BENCH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# Honor TMPDIR so the suite also runs where /tmp is not the writable scratch
# location; /tmp remains the fallback.
TMP_DIR="$(mktemp -d "${TMPDIR:-/tmp}/run-full-pipeline-pair-candidate-test.XXXXXX")"
# The random mktemp suffix makes the run id unique per invocation.
TEST_RUN="arg-test-$(basename "$TMP_DIR")"
TEST_SHADOW="$BENCH_ROOT/shadow-fixtures/S97-runner-hypothesis"

# Remove everything this script may have created. The glob after $TEST_RUN is
# intentionally unquoted so every "<run-id>-<suffix>" results directory matches.
cleanup() {
  rm -rf -- "$TMP_DIR" "$BENCH_ROOT/results/$TEST_RUN"* \
    "$BENCH_ROOT/results/src-$TEST_RUN" "$TEST_SHADOW"
}
trap cleanup EXIT
#######################################
# Run a command that is expected to fail and check what it printed.
# Globals:   TMP_DIR (read) - directory that receives the captured output
# Arguments: $1 - label; also the basename of the capture file
#            $2 - literal text that must appear in the combined output
#            $3... - command to run
# Outputs:   on a failed expectation, an explanation followed by the
#            captured output, both on stderr
# Returns:   0 when the command failed and printed the text; otherwise the
#            calling shell exits with status 1
#######################################
expect_fail_contains() {
  # Without a command, "$@" would expand to a bare redirection that succeeds
  # and would be misreported as "expected failure".
  if [ "$#" -lt 3 ]; then
    printf 'expect_fail_contains: no command given (usage: LABEL NEEDLE COMMAND...)\n' >&2
    exit 1
  fi
  local label="$1"
  local needle="$2"
  shift 2
  local out="$TMP_DIR/$label.out"
  if "$@" > "$out" 2>&1; then
    printf 'expected failure for %s\n' "$label" >&2
    cat -- "$out" >&2
    exit 1
  fi
  # -F: the needle is literal text. --: a needle may itself begin with a dash.
  if ! grep -Fq -- "$needle" "$out"; then
    printf 'missing expected text for %s: %s\n' "$label" "$needle" >&2
    cat -- "$out" >&2
    exit 1
  fi
}
# Static contract checks: the runner's --help output, the runner's source, and
# the rejected-fixture list must each contain the literal strings listed below.
# A bare `grep -q` under `set -e` would stop the script without saying which
# expectation failed, so every miss is named on stderr before exiting.

#######################################
# Require that a file contains every given literal string.
# Arguments: $1 - file to search
#            $2... - literal strings (matched with grep -F)
# Outputs:   the first missing string on stderr
# Returns:   0 when all strings are present; otherwise exits with status 1
#######################################
require_literals() {
  local file="$1"
  shift
  local needle
  for needle in "$@"; do
    if ! grep -Fq -- "$needle" "$file"; then
      printf 'missing expected text in %s: %s\n' "$file" "$needle" >&2
      exit 1
    fi
  done
}

help_out="$TMP_DIR/help.out"
if ! bash "$RUNNER" --help > "$help_out" 2>&1; then
  # The capture file is removed on exit, so show it now.
  printf 'runner --help exited non-zero\n' >&2
  cat -- "$help_out" >&2
  exit 1
fi

help_literals=(
  'default: l2_risk_probes'
  '--min-bare-headroom N'
  '--min-solo-headroom N'
  '--max-pair-solo-wall-ratio N (default: 3)'
  '--allow-rejected-fixtures'
  '--dry-run'
)

runner_literals=(
  'run_gate_with_report'
  'Command: '
  'DEVLYN_BENCHMARK_CLI_SUBCOMMAND'
  'cmd=(npx devlyn-cli benchmark pair --run-id "$RUN_ID")'
  'cmd=(bash "$0" --run-id "$RUN_ID")'
  'cmd+=(--min-bare-headroom "$MIN_BARE_HEADROOM")'
  'cmd+=(--min-solo-headroom "$MIN_SOLO_HEADROOM")'
  'cmd+=(--allow-rejected-fixtures)'
  'cmd+=(--dry-run)'
  'baseline evidence-complete'
  '$PAIR_ARM evidence-clean'
  'MAX_PAIR_SOLO_WALL_RATIO=3'
  'headroom-gate.md'
  'full-pipeline-pair-gate.md'
  'cat "$report"'
  'headroom gate failed — pair arm not executed'
  'headroom gate passed — executing $PAIR_ARM'
  'pair gate failed — pair evidence rejected'
  'pair gate passed — pair evidence accepted'
  'if ! run_gate_with_report \'
  'mirror_skills()'
  'validate_fixtures'
  'fixture_has_solo_ceiling_avoidance_note'
  'shadow fixture NOTES.md needs ## Solo ceiling avoidance'
  'fixture not found in fixtures/ or shadow-fixtures/'
  '[FS][0-9]*) FIXTURES+=("$1")'
  'retired_fixture_exists'
  'fixture is retired and is not rerun by pair-candidate runners'
  'fixture_smoke_only'
  'fixture is smoke-only and cannot run providers'
  'rejected_pair_fixture_reason'
  'source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"'
  'declare -F rejected_pair_fixture_reason'
)

rejected_list_literals=(
  '20260511-f3-http-error-headroom'
  '20260507-f10-f11-tier1-full-pipeline'
  '20260511-f12-webhook-headroom'
  '20260511-f15-concurrency-headroom'
  '20260511-f28-policy-oraclefix-reverified-pair'
  '20260511-f30-headroom-v1'
  '20260513-s2-inventory-headroom'
  '20260513-s3-ticket-headroom'
  '20260513-s4-return-headroom'
  '20260513-s5-credit-headroom'
)

runner_reuse_literals=(
  'Use --allow-rejected-fixtures for diagnostics only'
  'if [ -z "$REUSE_CALIBRATED_FROM" ]; then'
  'if [ -n "$REUSE_CALIBRATED_FROM" ]; then'
)

require_literals "$help_out" "${help_literals[@]}"
require_literals "$RUNNER" "${runner_literals[@]}"
require_literals "$REJECTED" "${rejected_list_literals[@]}"
require_literals "$RUNNER" "${runner_reuse_literals[@]}"
# Argument-guard cases: every runner invocation below must exit non-zero and
# print the quoted diagnostic.
guard_fixture='F21-cli-scheduler-priority'

# Thin wrapper: $1 = case label, $2 = expected diagnostic, rest = runner args.
runner_must_reject() {
  local case_name="$1" diagnostic="$2"
  shift 2
  expect_fail_contains "$case_name" "$diagnostic" bash "$RUNNER" "$@"
}

# --pair-arm accepts only the two current arms; l2_forced gets its own message.
runner_must_reject invalid-pair-arm \
  'pair-arm must be l2_risk_probes or l2_gated' \
  --run-id arg-test --pair-arm variant "$guard_fixture"
runner_must_reject retired-pair-arm \
  'pair-arm l2_forced is retired' \
  --run-id arg-test --pair-arm l2_forced "$guard_fixture"

# --bare-max: flag with nothing after it, then a non-integer value.
runner_must_reject missing-bare-max-value \
  '--bare-max requires a value' \
  --bare-max
runner_must_reject invalid-bare-max \
  '--bare-max must be an integer: nope' \
  --bare-max nope "$guard_fixture"

runner_must_reject invalid-min-fixtures \
  '--min-fixtures must be >= 1' \
  --min-fixtures 0 "$guard_fixture"

# Headroom thresholds: non-numeric and negative values are both refused.
runner_must_reject invalid-min-solo-headroom \
  '--min-solo-headroom must be an integer: nope' \
  --min-solo-headroom nope "$guard_fixture"
runner_must_reject negative-min-bare-headroom \
  '--min-bare-headroom must be an integer: -1' \
  --min-bare-headroom -1 "$guard_fixture"
runner_must_reject negative-min-solo-headroom \
  '--min-solo-headroom must be an integer: -1' \
  --min-solo-headroom -1 "$guard_fixture"

# Wall ratio: non-numeric and zero have distinct diagnostics.
runner_must_reject invalid-wall-ratio \
  '--max-pair-solo-wall-ratio must be a positive number: nope' \
  --max-pair-solo-wall-ratio nope "$guard_fixture"
runner_must_reject zero-wall-ratio \
  '--max-pair-solo-wall-ratio must be > 0' \
  --max-pair-solo-wall-ratio 0 "$guard_fixture"

# A fixture id that exists in neither fixture directory.
runner_must_reject missing-fixture-fast \
  'fixture not found in fixtures/ or shadow-fixtures/: F999-not-a-fixture' \
  --run-id "$TEST_RUN-missing-fixture" F999-not-a-fixture
# Rejected-fixture cases: a dry run naming any of these fixtures must fail with
# "fixture rejected for pair-candidate runs: <fixture>".
# Flat table, three cells per case: label, run-id suffix, fixture id.
rejected_cases=(
  rejected-f1-fixture        rejected-f1   F1-cli-trivial-flag
  rejected-f2-fixture        rejected-f2   F2-cli-medium-subcommand
  rejected-fixture           rejected      F26-cli-payout-ledger-rules
  rejected-f3-fixture        rejected-f3   F3-backend-contract-risk
  rejected-f4-fixture        rejected-f4   F4-web-browser-design
  rejected-f5-fixture        rejected-f5   F5-fix-loop-red-green
  rejected-f6-fixture        rejected-f6   F6-dep-audit-native-module
  rejected-f7-fixture        rejected-f7   F7-out-of-scope-trap
  rejected-f8-fixture        rejected-f8   F8-known-limit-ambiguous
  rejected-f9-fixture        rejected-f9   F9-e2e-ideate-to-resolve
  rejected-f10-fixture       rejected-f10  F10-persist-write-collision
  rejected-f11-fixture       rejected-f11  F11-batch-import-all-or-nothing
  rejected-f12-fixture       rejected-f12  F12-webhook-raw-body-signature
  rejected-f15-fixture       rejected-f15  F15-frozen-diff-race-review
  rejected-f31-fixture       rejected-f31  F31-cli-seat-rebalance
  rejected-f32-fixture       rejected-f32  F32-cli-subscription-renewal
  rejected-s2-shadow-fixture rejected-s2   S2-cli-inventory-reservation
  rejected-s3-shadow-fixture rejected-s3   S3-cli-ticket-assignment
  rejected-s4-shadow-fixture rejected-s4   S4-cli-return-routing
  rejected-s5-shadow-fixture rejected-s5   S5-cli-credit-grant-ledger
  rejected-s6-shadow-fixture rejected-s6   S6-cli-refund-window-ledger
)

for ((rej_idx = 0; rej_idx < ${#rejected_cases[@]}; rej_idx += 3)); do
  rej_label="${rejected_cases[rej_idx]}"
  rej_run_suffix="${rejected_cases[rej_idx + 1]}"
  rej_fixture="${rejected_cases[rej_idx + 2]}"
  expect_fail_contains "$rej_label" \
    "fixture rejected for pair-candidate runs: $rej_fixture" \
    bash "$RUNNER" --run-id "$TEST_RUN-$rej_run_suffix" --dry-run --min-fixtures 1 "$rej_fixture"
done
# Remaining failure cases. Each builds the command as an array first, then
# hands it to expect_fail_contains together with the diagnostic it must print.

# A retired fixture is refused even in dry-run mode.
probe=(bash "$RUNNER" --run-id "$TEST_RUN-retired" --dry-run --min-fixtures 1
  F28-cli-return-authorization)
expect_fail_contains retired-fixture \
  'fixture is retired and is not rerun by pair-candidate runners: F28-cli-return-authorization' \
  "${probe[@]}"

# Same fixture id as the shadow dry-run elsewhere in this script, but without
# --dry-run: the runner must refuse to run providers for it.
probe=(bash "$RUNNER" --run-id "$TEST_RUN-smoke-only" --min-fixtures 1 S1-cli-lang-flag)
expect_fail_contains smoke-only-s1-provider-run \
  'fixture is smoke-only and cannot run providers: S1-cli-lang-flag' \
  "${probe[@]}"

# --reuse-calibrated-from pointing at a source run that does not exist.
absent_source="src-$TEST_RUN-missing"
probe=(bash "$RUNNER" --run-id "$TEST_RUN-source-missing"
  --reuse-calibrated-from "$absent_source"
  F21-cli-scheduler-priority)
expect_fail_contains reuse-source-missing \
  'reuse source missing' \
  "${probe[@]}"

# With DEVLYN_BENCHMARK_CLI_SUBCOMMAND=pair in the environment, the printed
# Command line must use the `npx devlyn-cli benchmark pair` form.
probe=(env DEVLYN_BENCHMARK_CLI_SUBCOMMAND=pair
  bash "$RUNNER" --run-id "$TEST_RUN-cli-replay"
  --reuse-calibrated-from "$absent_source"
  F21-cli-scheduler-priority)
expect_fail_contains cli-replay-command \
  "Command: npx devlyn-cli benchmark pair --run-id $TEST_RUN-cli-replay" \
  "${probe[@]}"

# A dry run with one fixture and no --min-fixtures override must fail.
probe=(bash "$RUNNER" --run-id "$TEST_RUN-dry-run-fail" --dry-run F21-cli-scheduler-priority)
expect_fail_contains dry-run-min-fixtures \
  '[full-pipeline-pair] DRY RUN failed' \
  "${probe[@]}"
239
+ bash "$RUNNER" --run-id "$TEST_RUN-dry-run" --dry-run --min-fixtures 1 F21-cli-scheduler-priority \
240
+ > "$TMP_DIR/dry-run.out" 2>&1
241
+ grep -Fq 'Mode: DRY RUN (no model/provider invocations)' "$TMP_DIR/dry-run.out"
242
+ grep -Fq 'Command: ' "$TMP_DIR/dry-run.out"
243
+ grep -Fq -- '--dry-run' "$TMP_DIR/dry-run.out"
244
+ grep -Fq -- '--min-bare-headroom 5' "$TMP_DIR/dry-run.out"
245
+ grep -Fq -- '--min-solo-headroom 5' "$TMP_DIR/dry-run.out"
246
+ grep -Fq -- '--min-fixtures 1' "$TMP_DIR/dry-run.out"
247
+ grep -Fq -- '--max-pair-solo-wall-ratio 3' "$TMP_DIR/dry-run.out"
248
+ grep -Fq 'Pair: l2_risk_probes evidence-clean, canonical trigger, margin >= +5, wall ratio <= 3' "$TMP_DIR/dry-run.out"
249
+ grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/dry-run.out"
250
+
251
+ bash "$RUNNER" --run-id "$TEST_RUN-shadow-dry-run" --dry-run --min-fixtures 1 S1-cli-lang-flag \
252
+ > "$TMP_DIR/shadow-dry-run.out" 2>&1
253
+ grep -Fq 'Fixtures: S1-cli-lang-flag' "$TMP_DIR/shadow-dry-run.out"
254
+ grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/shadow-dry-run.out"
255
+
256
+ mkdir -p "$TEST_SHADOW"
257
+ cat > "$TEST_SHADOW/metadata.json" <<'EOF'
258
+ {
259
+ "id": "S97-runner-hypothesis",
260
+ "category": "high-risk"
261
+ }
262
+ EOF
263
+ cat > "$TEST_SHADOW/spec.md" <<'EOF'
264
+ # Runner hypothesis fixture
265
+
266
+ Add idempotency handling for duplicate requests.
267
+ EOF
268
+ cat > "$TEST_SHADOW/expected.json" <<'EOF'
269
+ {
270
+ "verification_commands": [
271
+ {
272
+ "cmd": "node -e \"process.exit(0)\"",
273
+ "exit_code": 0
274
+ }
275
+ ]
276
+ }
277
+ EOF
278
+ cat > "$TEST_SHADOW/NOTES.md" <<'EOF'
279
+ # Notes
280
+
281
+ Synthetic runner guard fixture.
282
+ EOF
283
+ expect_fail_contains missing-solo-headroom-hypothesis \
284
+ 'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S97-runner-hypothesis' \
285
+ bash "$RUNNER" --run-id "$TEST_RUN-missing-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis
286
+ cat >> "$TEST_SHADOW/spec.md" <<'EOF'
287
+
288
+ ## Solo-headroom hypothesis
289
+
290
+ A capable solo_claude baseline is expected to miss duplicate idempotency ordering.
291
+ EOF
292
+ expect_fail_contains weak-solo-headroom-hypothesis \
293
+ 'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S97-runner-hypothesis' \
294
+ bash "$RUNNER" --run-id "$TEST_RUN-weak-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis
295
+ cat >> "$TEST_SHADOW/spec.md" <<'EOF'
296
+
297
+ Implementation marker: `duplicate-idempotency`.
298
+ EOF
299
+ expect_fail_contains unrelated-backtick-solo-headroom-hypothesis \
300
+ 'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S97-runner-hypothesis' \
301
+ bash "$RUNNER" --run-id "$TEST_RUN-unrelated-backtick-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis
302
+ cat >> "$TEST_SHADOW/spec.md" <<'EOF'
303
+
304
+ Observable command: `node -e "process.exit(0)"` exposes behavior.
305
+ EOF
306
+ expect_fail_contains observable-without-miss-solo-headroom-hypothesis \
307
+ 'fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: S97-runner-hypothesis' \
308
+ bash "$RUNNER" --run-id "$TEST_RUN-observable-without-miss-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis
309
+ cat >> "$TEST_SHADOW/spec.md" <<'EOF'
310
+
311
+ Observable command: `node -e "process.exit(0)"` exposes the miss.
312
+ EOF
313
+ expect_fail_contains missing-solo-ceiling-avoidance \
314
+ 'shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: S97-runner-hypothesis' \
315
+ bash "$RUNNER" --run-id "$TEST_RUN-missing-ceiling" --dry-run --min-fixtures 1 S97-runner-hypothesis
316
+ cat >> "$TEST_SHADOW/NOTES.md" <<'EOF'
317
+
318
+ ## Solo ceiling avoidance
319
+
320
+ This candidate mentions solo_claude but gives no control comparison.
321
+ EOF
322
+ expect_fail_contains weak-solo-ceiling-avoidance \
323
+ 'shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: S97-runner-hypothesis' \
324
+ bash "$RUNNER" --run-id "$TEST_RUN-weak-ceiling" --dry-run --min-fixtures 1 S97-runner-hypothesis
325
+ cat >> "$TEST_SHADOW/NOTES.md" <<'EOF'
326
+
327
+ Unlike solo-saturated S2-S6 controls, this fixture should preserve
328
+ solo_claude headroom because it targets a multi-run state dependency.
329
+ EOF
330
+ bash "$RUNNER" --run-id "$TEST_RUN-hypothesis" --dry-run --min-fixtures 1 S97-runner-hypothesis \
331
+ > "$TMP_DIR/hypothesis.out" 2>&1
332
+ grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/hypothesis.out"
333
+
334
+ bash "$RUNNER" --run-id "$TEST_RUN-shadow-rejected-override" --dry-run --min-fixtures 1 \
335
+ --allow-rejected-fixtures S3-cli-ticket-assignment \
336
+ > "$TMP_DIR/shadow-rejected-override.out" 2>&1
337
+ grep -Fq -- '--allow-rejected-fixtures' "$TMP_DIR/shadow-rejected-override.out"
338
+ grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/shadow-rejected-override.out"
339
+
340
+ bash "$RUNNER" --run-id "$TEST_RUN-rejected-override" --dry-run --min-fixtures 1 \
341
+ --allow-rejected-fixtures F26-cli-payout-ledger-rules \
342
+ > "$TMP_DIR/rejected-override.out" 2>&1
343
+ grep -Fq -- '--allow-rejected-fixtures' "$TMP_DIR/rejected-override.out"
344
+ grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP_DIR/rejected-override.out"
345
+
346
+ STUB_REPO="$TMP_DIR/stub-repo"
347
+ STUB_BENCH="$STUB_REPO/benchmark/auto-resolve"
348
+ mkdir -p \
349
+ "$STUB_BENCH/scripts" \
350
+ "$STUB_BENCH/fixtures/F21-cli-scheduler-priority" \
351
+ "$STUB_REPO/config/skills/devlyn:resolve"
352
+ cp "$RUNNER" "$STUB_BENCH/scripts/run-full-pipeline-pair-candidate.sh"
353
+ cp "$REJECTED" "$STUB_BENCH/scripts/pair-rejected-fixtures.sh"
354
+ chmod +x "$STUB_BENCH/scripts/run-full-pipeline-pair-candidate.sh"
355
+ chmod +x "$STUB_BENCH/scripts/pair-rejected-fixtures.sh"
356
+ printf -- '---\nname: devlyn:resolve\n---\n' > "$STUB_REPO/config/skills/devlyn:resolve/SKILL.md"
357
+ cat > "$STUB_BENCH/scripts/run-fixture.sh" <<'EOF'
358
+ #!/usr/bin/env bash
359
+ set -euo pipefail
360
+ echo "[stub-run-fixture] $*"
361
+ EOF
362
+ chmod +x "$STUB_BENCH/scripts/run-fixture.sh"
363
+ cat > "$STUB_BENCH/scripts/judge.sh" <<'EOF'
364
+ #!/usr/bin/env bash
365
+ set -euo pipefail
366
+ echo "[stub-judge] $*"
367
+ EOF
368
+ chmod +x "$STUB_BENCH/scripts/judge.sh"
369
+ cat > "$STUB_BENCH/scripts/headroom-gate.py" <<'PY'
370
+ #!/usr/bin/env python3
371
+ import json
372
+ import os
373
+ import pathlib
374
+ import sys
375
+
376
+ out_json = None
377
+ out_md = None
378
+ args = sys.argv[1:]
379
+ for index, arg in enumerate(args):
380
+ if arg == "--out-json":
381
+ out_json = pathlib.Path(args[index + 1])
382
+ if arg == "--out-md":
383
+ out_md = pathlib.Path(args[index + 1])
384
+ payload = {"verdict": "PASS" if os.environ.get("STUB_HEADROOM_EXIT", "0") == "0" else "FAIL"}
385
+ if out_json:
386
+ out_json.parent.mkdir(parents=True, exist_ok=True)
387
+ out_json.write_text(json.dumps(payload) + "\n", encoding="utf8")
388
+ if out_md:
389
+ out_md.parent.mkdir(parents=True, exist_ok=True)
390
+ out_md.write_text(
391
+ "# stub headroom\n\n"
392
+ "Verdict: **%s**\n\n"
393
+ "| fixture | bare | solo_claude | solo_claude-bare |\n"
394
+ "| --- | ---: | ---: | ---: |\n"
395
+ "| F21-cli-scheduler-priority | 50 | 75 | 25 |\n"
396
+ % payload["verdict"],
397
+ encoding="utf8",
398
+ )
399
+ sys.exit(int(os.environ.get("STUB_HEADROOM_EXIT", "0")))
400
+ PY
401
+ cat > "$STUB_BENCH/scripts/full-pipeline-pair-gate.py" <<'PY'
402
+ #!/usr/bin/env python3
403
+ import json
404
+ import os
405
+ import pathlib
406
+ import sys
407
+
408
+ out_json = None
409
+ out_md = None
410
+ args = sys.argv[1:]
411
+ for index, arg in enumerate(args):
412
+ if arg == "--out-json":
413
+ out_json = pathlib.Path(args[index + 1])
414
+ if arg == "--out-md":
415
+ out_md = pathlib.Path(args[index + 1])
416
+ payload = {"verdict": "PASS" if os.environ.get("STUB_PAIR_EXIT", "0") == "0" else "FAIL"}
417
+ print("stub-pair-gate args: " + " ".join(args))
418
+ if out_json:
419
+ out_json.parent.mkdir(parents=True, exist_ok=True)
420
+ out_json.write_text(json.dumps(payload) + "\n", encoding="utf8")
421
+ if out_md:
422
+ out_md.parent.mkdir(parents=True, exist_ok=True)
423
+ out_md.write_text(
424
+ "# stub pair\n\n"
425
+ "Verdict: **%s**\n\n"
426
+ "| fixture | bare | solo_claude | pair | pair-solo_claude |\n"
427
+ "| --- | ---: | ---: | ---: | ---: |\n"
428
+ "| F21-cli-scheduler-priority | 50 | 75 | 96 | 21 |\n"
429
+ % payload["verdict"],
430
+ encoding="utf8",
431
+ )
432
+ sys.exit(int(os.environ.get("STUB_PAIR_EXIT", "0")))
433
+ PY
434
+
435
+ STUB_RUNNER="$STUB_BENCH/scripts/run-full-pipeline-pair-candidate.sh"
436
+ STUB_HEADROOM_EXIT=0 STUB_PAIR_EXIT=0 \
437
+ bash "$STUB_RUNNER" --run-id "$TEST_RUN-stub-success" --min-fixtures 1 F21-cli-scheduler-priority \
438
+ > "$TMP_DIR/stub-success.out" 2>&1
439
+ grep -Fq '[full-pipeline-pair] headroom gate passed — executing l2_risk_probes.' "$TMP_DIR/stub-success.out"
440
+ grep -Fq '[full-pipeline-pair] pair gate passed — pair evidence accepted.' "$TMP_DIR/stub-success.out"
441
+ grep -Fq '[full-pipeline-pair] release audit: npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' "$TMP_DIR/stub-success.out"
442
+ grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 25 |' "$TMP_DIR/stub-success.out"
443
+ grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 96 | 21 |' "$TMP_DIR/stub-success.out"
444
+ grep -Fq '[stub-run-fixture] --fixture F21-cli-scheduler-priority --arm l2_risk_probes' "$TMP_DIR/stub-success.out"
445
+ grep -Fq -- '--require-hypothesis-trigger' "$TMP_DIR/stub-success.out"
446
+
447
+ if STUB_HEADROOM_EXIT=1 STUB_PAIR_EXIT=0 \
448
+ bash "$STUB_RUNNER" --run-id "$TEST_RUN-stub-headroom-fail" --min-fixtures 1 F21-cli-scheduler-priority \
449
+ > "$TMP_DIR/stub-headroom-fail.out" 2>&1; then
450
+ echo "expected stub headroom failure" >&2
451
+ cat "$TMP_DIR/stub-headroom-fail.out" >&2
452
+ exit 1
453
+ fi
454
+ grep -Fq '[full-pipeline-pair] headroom gate failed — pair arm not executed.' "$TMP_DIR/stub-headroom-fail.out"
455
+ grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 25 |' "$TMP_DIR/stub-headroom-fail.out"
456
+ if grep -Fq '[stub-run-fixture] --fixture F21-cli-scheduler-priority --arm l2_risk_probes' "$TMP_DIR/stub-headroom-fail.out"; then
457
+ echo "pair arm must not run after headroom failure" >&2
458
+ cat "$TMP_DIR/stub-headroom-fail.out" >&2
459
+ exit 1
460
+ fi
461
+
462
+ if STUB_HEADROOM_EXIT=0 STUB_PAIR_EXIT=1 \
463
+ bash "$STUB_RUNNER" --run-id "$TEST_RUN-stub-pair-fail" --min-fixtures 1 F21-cli-scheduler-priority \
464
+ > "$TMP_DIR/stub-pair-fail.out" 2>&1; then
465
+ echo "expected stub pair gate failure" >&2
466
+ cat "$TMP_DIR/stub-pair-fail.out" >&2
467
+ exit 1
468
+ fi
469
+ grep -Fq '[full-pipeline-pair] headroom gate passed — executing l2_risk_probes.' "$TMP_DIR/stub-pair-fail.out"
470
+ grep -Fq '[full-pipeline-pair] pair gate failed — pair evidence rejected.' "$TMP_DIR/stub-pair-fail.out"
471
+ grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 25 |' "$TMP_DIR/stub-pair-fail.out"
472
+ grep -Fq '| F21-cli-scheduler-priority | 50 | 75 | 96 | 21 |' "$TMP_DIR/stub-pair-fail.out"
473
+ if grep -Fq '[full-pipeline-pair] pair gate passed — pair evidence accepted.' "$TMP_DIR/stub-pair-fail.out"; then
474
+ echo "pair accepted message must not print after pair gate failure" >&2
475
+ cat "$TMP_DIR/stub-pair-fail.out" >&2
476
+ exit 1
477
+ fi
478
+
479
+ mkdir -p "$BENCH_ROOT/results/src-$TEST_RUN/F21-cli-scheduler-priority/bare"
480
+ printf '{}\n' > "$BENCH_ROOT/results/src-$TEST_RUN/F21-cli-scheduler-priority/bare/result.json"
481
+ printf '{}\n' > "$BENCH_ROOT/results/src-$TEST_RUN/F21-cli-scheduler-priority/bare/verify.json"
482
+ expect_fail_contains reuse-source-incomplete \
483
+ "reuse source missing diff.patch" \
484
+ bash "$RUNNER" --run-id "$TEST_RUN-source-incomplete" \
485
+ --reuse-calibrated-from "src-$TEST_RUN" \
486
+ F21-cli-scheduler-priority
487
+
488
+ mkdir -p "$BENCH_ROOT/results/$TEST_RUN-destination-incomplete/F21-cli-scheduler-priority/bare"
489
+ printf '{}\n' > "$BENCH_ROOT/results/$TEST_RUN-destination-incomplete/F21-cli-scheduler-priority/bare/result.json"
490
+ printf '{}\n' > "$BENCH_ROOT/results/$TEST_RUN-destination-incomplete/F21-cli-scheduler-priority/bare/verify.json"
491
+ expect_fail_contains reuse-destination-incomplete \
492
+ "reuse destination incomplete diff.patch" \
493
+ bash "$RUNNER" --run-id "$TEST_RUN-destination-incomplete" \
494
+ --reuse-calibrated-from "src-$TEST_RUN" \
495
+ F21-cli-scheduler-priority
496
+
497
+ echo "PASS test-run-full-pipeline-pair-candidate"