devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -0,0 +1,933 @@
#!/usr/bin/env bash
# Regression tests for benchmark runner argument parsing.
#
# Globals defined here:
#   ROOT        - directory three levels above the directory holding this script
#   TMP         - scratch directory created by mktemp; removed on exit
#   BENCH_ROOT  - "$ROOT/benchmark/auto-resolve"
set -euo pipefail

ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
TMP="$(mktemp -d)"
BENCH_ROOT="$ROOT/benchmark/auto-resolve"

# Run ids whose directories under "$BENCH_ROOT/results" are deleted on exit.
ARG_PARSE_RESULT_RUN_IDS=(
  arg-parse-command-test
  arg-parse-discovery-test
  arg-parse-shadow-suite-dry-run
  arg-parse-shadow-cli-suite-dry-run
  arg-parse-variant-path
  arg-parse-headroom-cli-replay
  arg-parse-pair-cli-replay
  arg-parse-shadow-judge
  arg-parse-opus-bad-mapping
  arg-parse-opus-malformed-mapping
  arg-parse-opus-malformed-score
  arg-parse-opus-invalid-generated-score
  arg-parse-opus-invalid-generated-dq
  arg-parse-opus-summary-mapping
  arg-parse-opus-summary-null-margin
)

# EXIT handler: removes the scratch directory, every result directory listed
# above, and the matching /tmp/bench-arg-parse-* paths.
cleanup_arg_parse_artifacts() {
  local run_id
  rm -rf "$TMP"
  for run_id in "${ARG_PARSE_RESULT_RUN_IDS[@]}"; do
    rm -rf "$BENCH_ROOT/results/$run_id"
  done
  # The globs are intentionally unquoted so the shell expands them.
  rm -rf /tmp/bench-arg-parse-variant-path-* /tmp/bench-arg-parse-headroom-cli-replay-*
}
trap cleanup_arg_parse_artifacts EXIT
9
+
#######################################
# Run a command that is expected to fail and check its combined output.
# Globals:   TMP (read) - directory that receives the captured output
# Arguments: $1   - case name; stdout+stderr are saved to "$TMP/$1.out"
#            $2   - literal text (matched with grep -F) required in the output
#            $3.. - the command and its arguments
# Outputs:   a diagnostic on stderr when an expectation is not met (the
#            captured output is included when the expected text is missing)
# Returns:   0 when the command exits non-zero and its output contains $2;
#            otherwise the shell exits with status 1
#######################################
expect_fail_contains() {
  local name="$1"
  local expected="$2"
  shift 2
  local status=0
  # `|| status=$?` keeps errexit from firing on the expected failure without
  # toggling `set +e` / `set -e`, so the caller's shell options stay untouched.
  "$@" > "$TMP/$name.out" 2>&1 || status=$?
  if [ "$status" -eq 0 ]; then
    echo "expected failure for $name" >&2
    exit 1
  fi
  # `--` protects expected strings that start with a dash (e.g. "--n requires a value").
  if ! grep -Fq -- "$expected" "$TMP/$name.out"; then
    echo "missing expected output for $name: $expected" >&2
    cat "$TMP/$name.out" >&2
    exit 1
  fi
}
28
+
29
+ expect_fail_contains suite-missing-n "--n requires a value" \
30
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" --n
31
+ expect_fail_contains suite-bad-n "error: --n must be an integer" \
32
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" --n abc --dry-run
33
+ expect_fail_contains suite-missing-run-id "--run-id requires a value" \
34
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" --judge-only --run-id
35
+
36
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" --help > "$TMP/run-suite-help.out" 2>&1
37
+ grep -Fq 'run-suite.sh --suite shadow --dry-run' "$TMP/run-suite-help.out"
38
+ grep -Fq 'shadow suite refuses provider/judge runs' "$TMP/run-suite-help.out"
39
+
40
+ expect_fail_contains fixture-missing-arm "--arm requires a value" \
41
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" --fixture F1 --arm
42
+ expect_fail_contains fixture-missing-resolve-skill "--resolve-skill requires a value" \
43
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" \
44
+ --fixture F1 --arm bare --run-id arg-parse --resolve-skill
45
+
46
+ expect_fail_contains judge-missing-fixture "--fixture requires a value" \
47
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge.sh" --fixture
48
+ expect_fail_contains judge-missing-run-id "--run-id requires a value" \
49
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge.sh" --fixture F1 --run-id
50
+
51
+ grep -Fq 'shadow-fixtures/$FIXTURE' "$ROOT/benchmark/auto-resolve/scripts/judge.sh"
52
+
53
+ SHADOW_JUDGE_DIR="$BENCH_ROOT/results/arg-parse-shadow-judge/S1-cli-lang-flag"
54
+ mkdir -p "$SHADOW_JUDGE_DIR/bare" "$SHADOW_JUDGE_DIR/solo_claude" "$TMP/fakebin"
55
+ cat > "$SHADOW_JUDGE_DIR/bare/diff.patch" <<'EOF'
56
+ diff --git a/bin/cli.js b/bin/cli.js
57
+ --- a/bin/cli.js
58
+ +++ b/bin/cli.js
59
+ @@ -1 +1 @@
60
+ -old
61
+ +bare
62
+ EOF
63
+ cat > "$SHADOW_JUDGE_DIR/solo_claude/diff.patch" <<'EOF'
64
+ diff --git a/bin/cli.js b/bin/cli.js
65
+ --- a/bin/cli.js
66
+ +++ b/bin/cli.js
67
+ @@ -1 +1 @@
68
+ -old
69
+ +solo
70
+ EOF
71
+ printf '{"arm":"bare","verify_score":0.5}\n' > "$SHADOW_JUDGE_DIR/bare/verify.json"
72
+ printf '{"arm":"solo_claude","verify_score":0.75}\n' > "$SHADOW_JUDGE_DIR/solo_claude/verify.json"
# Install a fake `codex` executable into the directory given as $1.
# The stub prints a version banner for `--version`. Otherwise it prints a fixed
# judge verdict as JSON on stdout and, when `--output-last-message <file>` is
# present, also writes the same JSON to <file>.
write_fake_codex() {
  local bin_dir="$1"
  cat > "$bin_dir/codex" <<'EOF'
#!/usr/bin/env bash
if [ "${1:-}" = "--version" ]; then
  echo "codex-cli fake"
  exit 0
fi
last=""
while [ $# -gt 0 ]; do
  case "$1" in
    --output-last-message)
      last="${2:-}"
      # `shift 2` leaves the arguments untouched when fewer than two remain,
      # which would make this loop spin forever on a trailing flag.
      if [ $# -ge 2 ]; then
        shift 2
      else
        shift
      fi
      ;;
    *)
      shift
      ;;
  esac
done
json='{"a_score":50,"b_score":75,"winner":"B","a_breakdown":{"spec":12,"constraint":13,"scope":12,"quality":13,"notes":"ok"},"b_breakdown":{"spec":19,"constraint":19,"scope":18,"quality":19,"notes":"ok"},"critical_findings":{"A":[],"B":[]},"disqualifiers":{"A":false,"A_reason":"","B":false,"B_reason":""},"overall_reasoning":"fake judge output for shadow fixture resolver regression."}'
[ -z "$last" ] || printf '%s\n' "$json" > "$last"
printf '%s\n' "$json"
EOF
  chmod +x "$bin_dir/codex"
}
write_fake_codex "$TMP/fakebin"
93
+ PATH="$TMP/fakebin:$PATH" \
94
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge.sh" --fixture S1-cli-lang-flag --run-id arg-parse-shadow-judge \
95
+ > "$TMP/shadow-judge.out" 2>&1
96
+ grep -Fq '[judge]' "$TMP/shadow-judge.out"
97
+ grep -Fq '"solo_claude"' "$SHADOW_JUDGE_DIR/judge.json"
98
+
99
+ expect_fail_contains opus-missing-run-id "--run-id requires a value" \
100
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" --run-id
101
+
102
+ OPUS_BAD_MAPPING_DIR="$BENCH_ROOT/results/arg-parse-opus-bad-mapping/F9-e2e-ideate-to-resolve"
103
+ mkdir -p "$OPUS_BAD_MAPPING_DIR"
104
+ : > "$OPUS_BAD_MAPPING_DIR/judge-prompt.txt"
105
+ cat > "$OPUS_BAD_MAPPING_DIR/judge.json" <<'JSON'
106
+ {
107
+ "_blind_mapping": {"A": "variant", "B": "bare", "seed": 1},
108
+ "scores_by_arm": {"variant": 70, "bare": 50, "solo_claude": 60}
109
+ }
110
+ JSON
111
+ expect_fail_contains opus-bad-mapping "judge blind mapping missing arm(s): solo_claude" \
112
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" --run-id arg-parse-opus-bad-mapping
113
+
114
+ OPUS_MALFORMED_MAPPING_DIR="$BENCH_ROOT/results/arg-parse-opus-malformed-mapping/F9-e2e-ideate-to-resolve"
115
+ mkdir -p "$OPUS_MALFORMED_MAPPING_DIR"
116
+ : > "$OPUS_MALFORMED_MAPPING_DIR/judge-prompt.txt"
117
+ cat > "$OPUS_MALFORMED_MAPPING_DIR/judge.json" <<'JSON'
118
+ {
119
+ "_blind_mapping": "not-a-dict",
120
+ "scores_by_arm": {"variant": 70, "bare": 50, "solo_claude": 60}
121
+ }
122
+ JSON
123
+ expect_fail_contains opus-malformed-mapping "judge blind mapping missing" \
124
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" --run-id arg-parse-opus-malformed-mapping
125
+
126
+ OPUS_MALFORMED_SCORE_DIR="$BENCH_ROOT/results/arg-parse-opus-malformed-score/F9-e2e-ideate-to-resolve"
127
+ mkdir -p "$OPUS_MALFORMED_SCORE_DIR"
128
+ : > "$OPUS_MALFORMED_SCORE_DIR/judge-prompt.txt"
129
+ cat > "$OPUS_MALFORMED_SCORE_DIR/judge.json" <<'JSON'
130
+ {
131
+ "_blind_mapping": {"A": "bare", "B": "solo_claude", "C": "variant", "seed": 1},
132
+ "scores_by_arm": {"bare": 50, "solo_claude": true, "variant": 101}
133
+ }
134
+ JSON
135
+ expect_fail_contains opus-malformed-score "scores_by_arm malformed score(s): solo_claude, variant" \
136
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" --run-id arg-parse-opus-malformed-score
137
+
138
+ OPUS_SUMMARY_MAPPING_DIR="$BENCH_ROOT/results/arg-parse-opus-summary-mapping/F99-opus-summary-mapping"
139
+ mkdir -p "$OPUS_SUMMARY_MAPPING_DIR"
140
+ : > "$OPUS_SUMMARY_MAPPING_DIR/judge-prompt.txt"
141
+ cat > "$OPUS_SUMMARY_MAPPING_DIR/judge.json" <<'JSON'
142
+ {
143
+ "_blind_mapping": {"A": "bare", "B": "solo_claude", "seed": 1},
144
+ "scores_by_arm": {"bare": 50, "solo_claude": 60},
145
+ "margins": {"solo_over_bare": 999, "variant_over_bare": 888},
146
+ "winner_arm": "variant",
147
+ "breakdowns_by_arm": {
148
+ "bare": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
149
+ "solo_claude": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11}
150
+ }
151
+ }
152
+ JSON
153
+ FAKE_CLAUDE_DIR="$TMP/fake-claude-bin"
154
+ mkdir -p "$FAKE_CLAUDE_DIR"
155
+ cat > "$FAKE_CLAUDE_DIR/claude" <<'EOF'
156
+ #!/usr/bin/env bash
157
+ if [ "${1:-}" = "--version" ]; then
158
+ echo "claude fake"
159
+ exit 0
160
+ fi
161
+ if [ "${FAKE_CLAUDE_INVALID_SCORE:-}" = "1" ]; then
162
+ cat <<'JSON'
163
+ {
164
+ "a_score": true,
165
+ "b_score": 101,
166
+ "winner": "B",
167
+ "disqualifiers": {"A": false, "A_reason": "", "B": false, "B_reason": ""},
168
+ "critical_findings": {"A": [], "B": []},
169
+ "a_breakdown": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
170
+ "b_breakdown": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11},
171
+ "overall_reasoning": "invalid scores for regression test"
172
+ }
173
+ JSON
174
+ exit 0
175
+ fi
176
+ if [ "${FAKE_CLAUDE_INVALID_DQ:-}" = "1" ]; then
177
+ cat <<'JSON'
178
+ {
179
+ "a_score": 40,
180
+ "b_score": 70,
181
+ "winner": "B",
182
+ "disqualifiers": {"A": "false", "A_reason": "", "B": false, "B_reason": ""},
183
+ "critical_findings": {"A": [], "B": []},
184
+ "a_breakdown": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
185
+ "b_breakdown": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11},
186
+ "overall_reasoning": "invalid disqualifier for regression test"
187
+ }
188
+ JSON
189
+ exit 0
190
+ fi
191
+ cat <<'JSON'
192
+ {
193
+ "a_score": 40,
194
+ "b_score": 70,
195
+ "winner": "B",
196
+ "disqualifiers": ["not", "a", "dict"],
197
+ "a_breakdown": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
198
+ "b_breakdown": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11}
199
+ }
200
+ JSON
201
+ EOF
202
+ chmod +x "$FAKE_CLAUDE_DIR/claude"
203
+ OPUS_INVALID_GENERATED_SCORE_DIR="$BENCH_ROOT/results/arg-parse-opus-invalid-generated-score/F99-opus-invalid-generated-score"
204
+ mkdir -p "$OPUS_INVALID_GENERATED_SCORE_DIR"
205
+ : > "$OPUS_INVALID_GENERATED_SCORE_DIR/judge-prompt.txt"
206
+ cat > "$OPUS_INVALID_GENERATED_SCORE_DIR/judge.json" <<'JSON'
207
+ {
208
+ "_blind_mapping": {"A": "bare", "B": "solo_claude", "seed": 1},
209
+ "scores_by_arm": {"bare": 50, "solo_claude": 60}
210
+ }
211
+ JSON
212
+ expect_fail_contains opus-invalid-generated-score "invalid opus score value(s): a_score, b_score" \
213
+ env FAKE_CLAUDE_INVALID_SCORE=1 PATH="$FAKE_CLAUDE_DIR:$PATH" \
214
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" \
215
+ --run-id arg-parse-opus-invalid-generated-score
216
+ OPUS_INVALID_GENERATED_DQ_DIR="$BENCH_ROOT/results/arg-parse-opus-invalid-generated-dq/F99-opus-invalid-generated-dq"
217
+ mkdir -p "$OPUS_INVALID_GENERATED_DQ_DIR"
218
+ : > "$OPUS_INVALID_GENERATED_DQ_DIR/judge-prompt.txt"
219
+ cat > "$OPUS_INVALID_GENERATED_DQ_DIR/judge.json" <<'JSON'
220
+ {
221
+ "_blind_mapping": {"A": "bare", "B": "solo_claude", "seed": 1},
222
+ "scores_by_arm": {"bare": 50, "solo_claude": 60}
223
+ }
224
+ JSON
225
+ expect_fail_contains opus-invalid-generated-dq "invalid opus disqualifier value(s): A" \
226
+ env FAKE_CLAUDE_INVALID_DQ=1 PATH="$FAKE_CLAUDE_DIR:$PATH" \
227
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" \
228
+ --run-id arg-parse-opus-invalid-generated-dq
229
+ PATH="$FAKE_CLAUDE_DIR:$PATH" \
230
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" \
231
+ --run-id arg-parse-opus-summary-mapping > "$TMP/opus-summary-mapping.out" 2>&1
232
+ python3 - "$BENCH_ROOT/results/arg-parse-opus-summary-mapping/cross-judge-summary.json" <<'PY'
233
+ import json
234
+ import pathlib
235
+ import sys
236
+
237
+ summary = json.loads(pathlib.Path(sys.argv[1]).read_text())
238
+ row = summary["rows"][0]
239
+ assert row["gpt_scores"] == {"bare": 50, "solo_claude": 60}, row
240
+ assert row["gpt_margin_l1_l0"] == 10, row
241
+ assert row["gpt_margin_v_l0"] is None, row
242
+ assert row["gpt_winner"] is None, row
243
+ assert row["opus_winner"] == "solo_claude", row
244
+ assert row["winner_agree"] is False, row
245
+ assert summary["sign_valid_count_variant_over_bare"] == 0, summary
246
+ PY
247
+
248
+ OPUS_SUMMARY_NULL_MARGIN_DIR="$BENCH_ROOT/results/arg-parse-opus-summary-null-margin/F99-opus-summary-null-margin"
249
+ mkdir -p "$OPUS_SUMMARY_NULL_MARGIN_DIR"
250
+ : > "$OPUS_SUMMARY_NULL_MARGIN_DIR/judge-prompt.txt"
251
+ cat > "$OPUS_SUMMARY_NULL_MARGIN_DIR/judge.json" <<'JSON'
252
+ {
253
+ "_blind_mapping": {"A": "bare", "B": "solo_claude", "seed": 1},
254
+ "breakdowns_by_arm": {
255
+ "bare": {"spec": 10, "constraint": 10, "scope": 10, "quality": 10},
256
+ "solo_claude": {"spec": 11, "constraint": 11, "scope": 11, "quality": 11}
257
+ }
258
+ }
259
+ JSON
260
+ PATH="$FAKE_CLAUDE_DIR:$PATH" \
261
+ bash "$ROOT/benchmark/auto-resolve/scripts/judge-opus-pass.sh" \
262
+ --run-id arg-parse-opus-summary-null-margin > "$TMP/opus-summary-null-margin.out" 2>&1
263
+ grep -Fq 'gpt_l1_l0_avg=na' "$TMP/opus-summary-null-margin.out"
264
+ grep -Fq 'suite_avg_diff=na' "$TMP/opus-summary-null-margin.out"
265
+ python3 - "$BENCH_ROOT/results/arg-parse-opus-summary-null-margin/cross-judge-summary.json" <<'PY'
266
+ import json
267
+ import pathlib
268
+ import sys
269
+
270
+ summary = json.loads(pathlib.Path(sys.argv[1]).read_text())
271
+ row = summary["rows"][0]
272
+ assert row["gpt_scores"] == {}, row
273
+ assert row["gpt_margin_l1_l0"] is None, row
274
+ assert summary["suite_avg_l1_l0"]["gpt"] is None, summary
275
+ assert summary["suite_avg_l1_l0"]["gpt_valid_count"] == 0, summary
276
+ PY
277
+
278
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
279
+ --dry-run \
280
+ --run-id arg-parse-command-test \
281
+ F0 > "$TMP/suite-command.out" 2>&1
282
+ grep -Fq 'Command: ' "$TMP/suite-command.out"
283
+ grep -Fq -- '--dry-run' "$TMP/suite-command.out"
284
+ grep -Fq -- '--run-id arg-parse-command-test' "$TMP/suite-command.out"
285
+
286
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
287
+ --dry-run \
288
+ --judge-only \
289
+ --run-id arg-parse-discovery-test > "$TMP/suite-discovery.out" 2>&1
290
+ grep -Fq 'F25-cli-cart-promotion-rules' "$TMP/suite-discovery.out"
291
+ if grep -Fq 'F27-cli-subscription-proration' "$TMP/suite-discovery.out"; then
292
+ echo "retired F27 must not be auto-discovered by the golden suite" >&2
293
+ cat "$TMP/suite-discovery.out" >&2
294
+ exit 1
295
+ fi
296
+ if grep -Fq 'F28-cli-return-authorization' "$TMP/suite-discovery.out"; then
297
+ echo "retired F28 must not be auto-discovered by the golden suite" >&2
298
+ cat "$TMP/suite-discovery.out" >&2
299
+ exit 1
300
+ fi
301
+
302
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
303
+ --suite shadow \
304
+ --dry-run \
305
+ --run-id arg-parse-shadow-suite-dry-run > "$TMP/shadow-suite-dry-run.out" 2>&1
306
+ grep -Fq 'Suite: shadow' "$TMP/shadow-suite-dry-run.out"
307
+ grep -Fq 'S1-cli-lang-flag' "$TMP/shadow-suite-dry-run.out"
308
+ grep -Fq '[suite] DRY RUN complete' "$TMP/shadow-suite-dry-run.out"
309
+ grep -Fq 'Use benchmark headroom/pair with explicit S* candidates for real provider measurement.' "$TMP/shadow-suite-dry-run.out"
310
+ if grep -Fq 'Run without --dry-run to invoke models.' "$TMP/shadow-suite-dry-run.out"; then
311
+ echo "shadow suite dry-run must not invite a blocked non-dry-run suite invocation" >&2
312
+ cat "$TMP/shadow-suite-dry-run.out" >&2
313
+ exit 1
314
+ fi
315
+
316
+ expect_fail_contains shadow-suite-provider-run \
317
+ "shadow suite run-suite is dry-run only" \
318
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
319
+ --suite shadow \
320
+ --run-id arg-parse-shadow-suite-block
321
+
322
+ expect_fail_contains shadow-suite-judge-only-provider-run \
323
+ "shadow suite run-suite is dry-run only" \
324
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-suite.sh" \
325
+ --suite shadow \
326
+ --judge-only \
327
+ --run-id arg-parse-shadow-suite-judge-only-block
328
+
329
+ node "$ROOT/bin/devlyn.js" benchmark suite \
330
+ --suite shadow \
331
+ --dry-run \
332
+ --run-id arg-parse-shadow-cli-suite-dry-run > "$TMP/shadow-cli-suite-dry-run.out" 2>&1
333
+ grep -Fq 'Suite: shadow' "$TMP/shadow-cli-suite-dry-run.out"
334
+ grep -Fq 'S1-cli-lang-flag' "$TMP/shadow-cli-suite-dry-run.out"
335
+ grep -Fq 'Use benchmark headroom/pair with explicit S* candidates for real provider measurement.' "$TMP/shadow-cli-suite-dry-run.out"
336
+
337
+ node "$ROOT/bin/devlyn.js" --help > "$TMP/devlyn-help.out" 2>&1
338
+ grep -Fq 'npx devlyn-cli benchmark Run the resolve benchmark suite' "$TMP/devlyn-help.out"
339
+ grep -Fq 'npx devlyn-cli benchmark recent Show compact recent benchmark results' "$TMP/devlyn-help.out"
340
+ grep -Fq 'npx devlyn-cli benchmark frontier Show pair candidate frontier scores/triggers without providers' "$TMP/devlyn-help.out"
341
+ grep -Fq 'npx devlyn-cli benchmark audit Audit pair evidence readiness' "$TMP/devlyn-help.out"
342
+ grep -Fq 'npx devlyn-cli benchmark audit-headroom Audit failed headroom results' "$TMP/devlyn-help.out"
343
+ grep -Fq 'npx devlyn-cli benchmark headroom <fixtures...> Score bare vs solo_claude headroom' "$TMP/devlyn-help.out"
344
+ grep -Fq 'npx devlyn-cli benchmark pair <fixtures...> Score solo_claude vs pair path' "$TMP/devlyn-help.out"
345
+ if grep -Fq -- '--n 3' "$TMP/devlyn-help.out"; then
346
+ echo "help must not advertise unsupported --n 3 benchmark runs" >&2
347
+ cat "$TMP/devlyn-help.out" >&2
348
+ exit 1
349
+ fi
350
+ node "$ROOT/bin/devlyn.js" benchmark --help > "$TMP/devlyn-benchmark-help.out" 2>&1
351
+ grep -Fq 'npx devlyn-cli benchmark [suite] [options] [fixtures...]' "$TMP/devlyn-benchmark-help.out"
352
+ grep -Fq 'npx devlyn-cli benchmark recent [options]' "$TMP/devlyn-benchmark-help.out"
353
+ grep -Fq 'npx devlyn-cli benchmark frontier [options]' "$TMP/devlyn-benchmark-help.out"
354
+ grep -Fq 'npx devlyn-cli benchmark audit [options]' "$TMP/devlyn-benchmark-help.out"
355
+ grep -Fq 'npx devlyn-cli benchmark audit-headroom [options]' "$TMP/devlyn-benchmark-help.out"
356
+ grep -Fq 'npx devlyn-cli benchmark suite --suite shadow --dry-run' "$TMP/devlyn-benchmark-help.out"
357
+ grep -Fq 'use headroom/pair with explicit S* ids for real measurement' "$TMP/devlyn-benchmark-help.out"
358
+ grep -Fq 'Show compact, wrap-safe recent benchmark results' "$TMP/devlyn-benchmark-help.out"
359
+ grep -Fq 'npx devlyn-cli benchmark headroom [options] <fixtures...>' "$TMP/devlyn-benchmark-help.out"
360
+ grep -Fq 'npx devlyn-cli benchmark pair [options] <fixtures...>' "$TMP/devlyn-benchmark-help.out"
361
+ grep -Fq 'Show active rejected/evidence/unmeasured pair candidates, scores, and triggers without providers' "$TMP/devlyn-benchmark-help.out"
362
+ grep -Fq 'Fail on unmeasured pair candidates and invalid headroom rejections' "$TMP/devlyn-benchmark-help.out"
363
+ grep -Fq 'Prints frontier score rows plus headroom and pair quality handoff rows' "$TMP/devlyn-benchmark-help.out"
364
+ grep -Fq 'Fail on active failed or unsupported headroom rejections' "$TMP/devlyn-benchmark-help.out"
365
+ grep -Fq 'Score bare vs solo_claude before spending the pair arm' "$TMP/devlyn-benchmark-help.out"
366
+ grep -Fq 'Score solo_claude vs the selected pair path and print gate tables' "$TMP/devlyn-benchmark-help.out"
367
+ grep -Fq 'npx devlyn-cli benchmark recent --out-md /tmp/devlyn-recent-benchmark.md' "$TMP/devlyn-benchmark-help.out"
368
+ grep -Fq 'npx devlyn-cli benchmark pair --min-fixtures 3 --max-pair-solo-wall-ratio 3 F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules' "$TMP/devlyn-benchmark-help.out"
369
+
370
+ node "$ROOT/bin/devlyn.js" benchmark recent --help > "$TMP/devlyn-benchmark-recent-help.out" 2>&1
371
+ grep -Fq 'npx devlyn-cli benchmark recent [options]' "$TMP/devlyn-benchmark-recent-help.out"
372
+ grep -Fq -- '--out-json PATH' "$TMP/devlyn-benchmark-recent-help.out"
373
+ grep -Fq -- '--out-md PATH' "$TMP/devlyn-benchmark-recent-help.out"
374
+ grep -Fq -- '--fixtures-root PATH' "$TMP/devlyn-benchmark-recent-help.out"
375
+ grep -Fq -- '--registry PATH' "$TMP/devlyn-benchmark-recent-help.out"
376
+ grep -Fq -- '--results-root PATH' "$TMP/devlyn-benchmark-recent-help.out"
377
+ grep -Fq -- '--max-width N default: 92' "$TMP/devlyn-benchmark-recent-help.out"
378
+ grep -Fq 'Prints compact, wrap-safe benchmark status and pair-evidence cards without wide tables' "$TMP/devlyn-benchmark-recent-help.out"
379
+ grep -Fq 'npx devlyn-cli benchmark recent --out-md /tmp/devlyn-recent-benchmark.md' "$TMP/devlyn-benchmark-recent-help.out"
380
+
381
+ node "$ROOT/bin/devlyn.js" benchmark audit --help > "$TMP/devlyn-benchmark-audit-help.out" 2>&1
382
+ grep -Fq 'npx devlyn-cli benchmark audit [options]' "$TMP/devlyn-benchmark-audit-help.out"
383
+ grep -Fq -- '--out-dir PATH' "$TMP/devlyn-benchmark-audit-help.out"
384
+ grep -Fq -- '--fixtures-root PATH' "$TMP/devlyn-benchmark-audit-help.out"
385
+ grep -Fq -- '--registry PATH' "$TMP/devlyn-benchmark-audit-help.out"
386
+ grep -Fq -- '--results-root PATH' "$TMP/devlyn-benchmark-audit-help.out"
387
+ grep -Fq -- '--min-pair-evidence N default: 4' "$TMP/devlyn-benchmark-audit-help.out"
388
+ grep -Fq -- '--min-pair-margin N default: 5' "$TMP/devlyn-benchmark-audit-help.out"
389
+ grep -Fq -- '--max-pair-solo-wall-ratio N default: 3' "$TMP/devlyn-benchmark-audit-help.out"
390
+ grep -Fq -- '--require-hypothesis-trigger' "$TMP/devlyn-benchmark-audit-help.out"
391
+ grep -Fq 'Prints frontier score rows plus headroom_rejections=PASS/FAIL, pair_evidence_quality=PASS/FAIL, pair_trigger_reasons=PASS/FAIL, pair_evidence_hypotheses=PASS/FAIL, pair_evidence_hypothesis_triggers=PASS/WARN/FAIL, historical-alias, and hypothesis-trigger gap handoff rows' "$TMP/devlyn-benchmark-audit-help.out"
392
+ grep -Fq 'npx devlyn-cli benchmark audit --out-dir /tmp/devlyn-benchmark-audit' "$TMP/devlyn-benchmark-audit-help.out"
393
+ grep -Fq 'npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' "$TMP/devlyn-benchmark-audit-help.out"
394
+
395
+ node "$ROOT/bin/devlyn.js" benchmark frontier --help > "$TMP/devlyn-benchmark-frontier-help.out" 2>&1
396
+ grep -Fq 'npx devlyn-cli benchmark frontier [options]' "$TMP/devlyn-benchmark-frontier-help.out"
397
+ grep -Fq -- '--out-json PATH' "$TMP/devlyn-benchmark-frontier-help.out"
398
+ grep -Fq -- '--out-md PATH' "$TMP/devlyn-benchmark-frontier-help.out"
399
+ grep -Fq -- '--fixtures-root PATH' "$TMP/devlyn-benchmark-frontier-help.out"
400
+ grep -Fq -- '--registry PATH' "$TMP/devlyn-benchmark-frontier-help.out"
401
+ grep -Fq -- '--results-root PATH' "$TMP/devlyn-benchmark-frontier-help.out"
402
+ grep -Fq -- '--fail-on-unmeasured' "$TMP/devlyn-benchmark-frontier-help.out"
403
+ grep -Fq -- '--min-pair-margin N default: 5' "$TMP/devlyn-benchmark-frontier-help.out"
404
+ grep -Fq -- '--max-pair-solo-wall-ratio N default: 3' "$TMP/devlyn-benchmark-frontier-help.out"
405
+ grep -Fq 'Prints pair evidence score rows with trigger reasons; --out-md includes a Triggers column' "$TMP/devlyn-benchmark-frontier-help.out"
406
+ grep -Fq 'npx devlyn-cli benchmark frontier --out-md /tmp/devlyn-pair-frontier.md' "$TMP/devlyn-benchmark-frontier-help.out"
407
+
408
+ node "$ROOT/bin/devlyn.js" benchmark audit-headroom --help > "$TMP/devlyn-benchmark-audit-headroom-help.out" 2>&1
409
+ grep -Fq 'npx devlyn-cli benchmark audit-headroom [options]' "$TMP/devlyn-benchmark-audit-headroom-help.out"
410
+ grep -Fq -- '--out-json PATH' "$TMP/devlyn-benchmark-audit-headroom-help.out"
411
+ grep -Fq -- '--fixtures-root PATH' "$TMP/devlyn-benchmark-audit-headroom-help.out"
412
+ grep -Fq -- '--registry PATH' "$TMP/devlyn-benchmark-audit-headroom-help.out"
413
+ grep -Fq -- '--results-root PATH' "$TMP/devlyn-benchmark-audit-headroom-help.out"
414
+ grep -Fq 'npx devlyn-cli benchmark audit-headroom --out-json /tmp/devlyn-headroom-audit.json' "$TMP/devlyn-benchmark-audit-headroom-help.out"
415
+
416
+ node "$ROOT/bin/devlyn.js" benchmark audit-headroom --out-json "$TMP/headroom-audit.json" > "$TMP/devlyn-benchmark-audit-headroom.out" 2>&1
417
+ grep -Fq 'PASS audit-headroom-rejections' "$TMP/devlyn-benchmark-audit-headroom.out"
418
+ python3 - "$TMP/headroom-audit.json" <<'PY'
419
+ import json
420
+ import sys
421
+
422
+ report = json.load(open(sys.argv[1], encoding="utf8"))
423
+ assert report["verdict"] == "PASS"
424
+ assert report["unrecorded_failures"] == []
425
+ assert report["unsupported_registry_rejections"] == []
426
+ PY
427
+
428
+ node "$ROOT/bin/devlyn.js" benchmark recent \
429
+ --out-json "$TMP/recent.json" \
430
+ --out-md "$TMP/recent.md" \
431
+ --max-width 92 > "$TMP/devlyn-benchmark-recent.out" 2>&1
432
+ grep -Fq 'Recent Benchmark Snapshot' "$TMP/devlyn-benchmark-recent.out"
433
+ grep -Fq 'Pair evidence rows: 4' "$TMP/devlyn-benchmark-recent.out"
434
+ grep -Fq 'Unmeasured candidates: 0' "$TMP/devlyn-benchmark-recent.out"
435
+ grep -Fq 'F21 cli scheduler priority' "$TMP/devlyn-benchmark-recent.out"
436
+ grep -Fq 'triggers: complexity.high, risk.high, risk_probes.enabled, spec.solo_headroom_hypothesis' "$TMP/devlyn-benchmark-recent.out"
437
+ grep -Fq '# Recent Benchmark Snapshot' "$TMP/recent.md"
438
+ grep -Fq '## Pair Evidence' "$TMP/recent.md"
439
+ if grep -Fq '| Fixture |' "$TMP/recent.md"; then
440
+ echo "recent benchmark markdown must use wrap-safe cards, not a wide table" >&2
441
+ cat "$TMP/recent.md" >&2
442
+ exit 1
443
+ fi
444
+ python3 - "$TMP/devlyn-benchmark-recent.out" "$TMP/recent.json" <<'PY'
445
+ import json
446
+ import pathlib
447
+ import sys
448
+
449
+ text = pathlib.Path(sys.argv[1]).read_text(encoding="utf8")
450
+ long_lines = [(i, len(line), line) for i, line in enumerate(text.splitlines(), 1) if len(line) > 92]
451
+ assert not long_lines, long_lines
452
+ report = json.load(open(sys.argv[2], encoding="utf8"))
453
+ assert report["verdict"] == "PASS"
454
+ assert report["pair_evidence_count"] == 4
455
+ assert report["unmeasured_count"] == 0
456
+ assert report["pair_margin_avg"] == 27.25
457
+ assert report["pair_solo_wall_ratio_max"] == 2.25
458
+ PY
459
+
460
+ node "$ROOT/bin/devlyn.js" benchmark audit --out-dir "$TMP/audit" > "$TMP/devlyn-benchmark-audit.out" 2>&1
461
+ grep -Fq '[audit] frontier' "$TMP/devlyn-benchmark-audit.out"
462
+ grep -Fq 'fixtures=21 rejected=17 candidates=4 pair_evidence=4 unmeasured=0 verdict=PASS' "$TMP/devlyn-benchmark-audit.out"
463
+ grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' "$TMP/devlyn-benchmark-audit.out"
464
+ grep -Fq 'verdict=pair_evidence_passed' "$TMP/devlyn-benchmark-audit.out"
465
+ grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' "$TMP/devlyn-benchmark-audit.out"
466
+ grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=2.25x max_wall_allowed=3.00x' "$TMP/devlyn-benchmark-audit.out"
467
+ grep -Fq 'pair_trigger_reasons=PASS canonical=4 historical_alias=0 exposed=4 total=4 summary=4 rows_match=true' "$TMP/devlyn-benchmark-audit.out"
468
+ grep -Fq 'pair_evidence_hypothesis_triggers=PASS matched=4 documented=4 total=4' "$TMP/devlyn-benchmark-audit.out"
469
+ if grep -Fq 'pair_trigger_historical_aliases=' "$TMP/devlyn-benchmark-audit.out" \
470
+ || grep -Fq 'pair_evidence_hypothesis_trigger_gaps=' "$TMP/devlyn-benchmark-audit.out"; then
471
+ echo "current benchmark audit must not report historical aliases or hypothesis-trigger gaps" >&2
472
+ cat "$TMP/devlyn-benchmark-audit.out" >&2
473
+ exit 1
474
+ fi
475
+ grep -Fq 'PASS audit-pair-evidence' "$TMP/devlyn-benchmark-audit.out"
476
+ test -f "$TMP/audit/frontier.json"
477
+ test -f "$TMP/audit/frontier.stdout"
478
+ test -f "$TMP/audit/frontier.stderr"
479
+ test -f "$TMP/audit/headroom-audit.json"
480
+ test -f "$TMP/audit/headroom-rejections.stdout"
481
+ test -f "$TMP/audit/headroom-rejections.stderr"
482
+ test -f "$TMP/audit/audit.json"
483
+ grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' "$TMP/audit/frontier.stdout"
484
+ grep -Fq 'verdict=pair_evidence_passed' "$TMP/audit/frontier.stdout"
485
+ python3 - "$TMP/audit/audit.json" "$TMP/audit/frontier.json" <<'PY'
486
+ import json
487
+ import sys
488
+
489
+ audit = json.load(open(sys.argv[1], encoding="utf8"))
490
+ frontier = json.load(open(sys.argv[2], encoding="utf8"))
491
+ assert audit["verdict"] == "PASS"
492
+ assert audit["min_pair_evidence"] == 4
493
+ assert audit["min_pair_margin"] == 5
494
+ assert audit["max_pair_solo_wall_ratio"] == 3.0
495
+ assert audit["checks"]["frontier"]["status"] == "PASS"
496
+ assert audit["checks"]["headroom_rejections"]["status"] == "PASS"
497
+ assert audit["checks"]["headroom_rejections"]["exit_code"] == 0
498
+ assert audit["checks"]["headroom_rejections"]["report_check_exit_code"] == 0
499
+ assert audit["checks"]["headroom_rejections"]["verdict"] == "PASS"
500
+ assert audit["checks"]["headroom_rejections"]["unrecorded_failure_count"] == 0
501
+ assert audit["checks"]["headroom_rejections"]["unsupported_registry_rejection_count"] == 0
502
+ assert audit["checks"]["frontier_report"]["status"] == "PASS"
503
+ assert audit["checks"]["frontier_report"]["verdict"] == frontier["verdict"]
504
+ assert audit["checks"]["frontier_report"]["unmeasured_count"] == frontier["unmeasured_count"]
505
+ assert audit["checks"]["frontier_stdout"]["status"] == "PASS"
506
+ assert audit["checks"]["frontier_stdout"]["summary_rows"] == 1
507
+ assert audit["checks"]["frontier_stdout"]["aggregate_rows"] == 1
508
+ assert audit["checks"]["frontier_stdout"]["final_verdict_rows"] == 1
509
+ assert audit["checks"]["frontier_stdout"]["expected_rows"] == len(audit["pair_evidence_rows"])
510
+ assert audit["checks"]["frontier_stdout"]["stdout_rows"] == len(audit["pair_evidence_rows"])
511
+ assert audit["checks"]["frontier_stdout"]["trigger_rows"] == len(audit["pair_evidence_rows"])
512
+ assert audit["checks"]["frontier_stdout"]["hypothesis_trigger_rows"] == len(audit["pair_evidence_rows"])
513
+ assert audit["checks"]["frontier_stdout"]["rows_match_count"] is True
514
+ assert audit["checks"]["frontier_stdout"]["trigger_rows_match_count"] is True
515
+ assert audit["checks"]["frontier_stdout"]["hypothesis_trigger_rows_match_count"] is True
516
+ assert audit["checks"]["min_pair_evidence"]["status"] == "PASS"
517
+ assert audit["checks"]["min_pair_evidence"]["actual_rows"] == len(audit["pair_evidence_rows"])
518
+ assert audit["checks"]["min_pair_evidence"]["actual_rows"] >= audit["min_pair_evidence"]
519
+ assert audit["checks"]["pair_evidence_quality"]["status"] == "PASS"
520
+ assert audit["checks"]["pair_evidence_quality"]["min_pair_margin_actual"] == frontier["pair_margin_min"]
521
+ assert audit["checks"]["pair_evidence_quality"]["max_pair_solo_wall_ratio_actual"] == frontier["pair_solo_wall_ratio_max"]
522
+ assert audit["checks"]["pair_trigger_reasons"]["status"] == "PASS"
523
+ assert audit["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == 4
524
+ assert audit["checks"]["pair_trigger_reasons"]["canonical_rows"] == 4
525
+ assert audit["checks"]["pair_trigger_reasons"]["historical_alias_rows"] == 0
526
+ assert audit["checks"]["pair_trigger_reasons"]["historical_alias_details"] == []
527
+ assert audit["checks"]["pair_trigger_reasons"]["exposed_rows"] == 4
528
+ assert audit["checks"]["pair_trigger_reasons"]["total_rows"] == 4
529
+ assert audit["checks"]["pair_trigger_reasons"]["rows_match_count"] is True
530
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "PASS"
531
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 0
532
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["required"] is False
533
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == 4
534
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["documented_rows"] == 4
535
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["total_rows"] == 4
536
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == []
537
+ assert audit["artifacts"]["frontier_stdout"] == "frontier.stdout"
538
+ assert audit["artifacts"]["headroom_rejections_stdout"] == "headroom-rejections.stdout"
539
+ assert audit["frontier_summary"]["pair_margin_avg"] == frontier["pair_margin_avg"]
540
+ assert audit["frontier_summary"]["unmeasured_count"] == frontier["unmeasured_count"]
541
+ assert len(audit["pair_evidence_rows"]) == frontier["pair_evidence_count"]
542
+ for row in audit["pair_evidence_rows"]:
543
+ assert isinstance(row["fixture"], str) and row["fixture"]
544
+ assert row["verdict"] == "pair_evidence_passed"
545
+ assert isinstance(row["run_id"], str) and row["run_id"]
546
+ assert isinstance(row["pair_arm"], str) and row["pair_arm"]
547
+ assert isinstance(row["bare_score"], int) and not isinstance(row["bare_score"], bool)
548
+ assert isinstance(row["solo_score"], int) and not isinstance(row["solo_score"], bool)
549
+ assert isinstance(row["pair_score"], int) and not isinstance(row["pair_score"], bool)
550
+ assert isinstance(row["pair_margin"], int) and not isinstance(row["pair_margin"], bool)
551
+ assert row["pair_mode"] is True
552
+ assert row["pair_trigger_eligible"] is True
553
+ assert isinstance(row["pair_solo_wall_ratio"], (int, float))
554
+ assert not isinstance(row["pair_solo_wall_ratio"], bool)
555
+ assert frontier["verdict"] == "PASS"
556
+ assert frontier["min_pair_margin"] == 5
557
+ assert frontier["max_pair_solo_wall_ratio"] == 3.0
558
+ assert frontier["unmeasured_count"] == 0
559
+ assert frontier["pair_margin_avg"] is not None
560
+ assert frontier["pair_margin_min"] is not None
561
+ PY
562
+
563
+ actual_pair_evidence=$(python3 - "$TMP/audit/audit.json" <<'PY'
564
+ import json
565
+ import sys
566
+
567
+ audit = json.load(open(sys.argv[1], encoding="utf8"))
568
+ actual = audit["checks"]["min_pair_evidence"]["actual_rows"]
569
+ assert isinstance(actual, int) and not isinstance(actual, bool)
570
+ print(actual)
571
+ PY
572
+ )
573
+ required_pair_evidence=$((actual_pair_evidence + 1))
574
+ if node "$ROOT/bin/devlyn.js" benchmark audit \
575
+ --min-pair-evidence "$required_pair_evidence" \
576
+ --out-dir "$TMP/audit-fail" \
577
+ > "$TMP/devlyn-benchmark-audit-fail.out" 2>&1; then
578
+ echo "benchmark audit must fail when min pair evidence exceeds current evidence rows" >&2
579
+ cat "$TMP/devlyn-benchmark-audit-fail.out" >&2
580
+ exit 1
581
+ fi
582
+ grep -Fq "pair evidence count ${actual_pair_evidence} below required minimum ${required_pair_evidence}" "$TMP/devlyn-benchmark-audit-fail.out"
583
+ grep -Fq 'pair_margin_avg=+27.25 pair_margin_min=+21 wall_avg=1.66x wall_max=2.25x' "$TMP/devlyn-benchmark-audit-fail.out"
584
+ grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' "$TMP/devlyn-benchmark-audit-fail.out"
585
+ grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' "$TMP/devlyn-benchmark-audit-fail.out"
586
+ grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=2.25x max_wall_allowed=3.00x' "$TMP/devlyn-benchmark-audit-fail.out"
587
+ grep -Fq 'pair_trigger_reasons=PASS canonical=4 historical_alias=0 exposed=4 total=4 summary=4 rows_match=true' "$TMP/devlyn-benchmark-audit-fail.out"
588
+ grep -Fq 'pair_evidence_hypothesis_triggers=PASS matched=4 documented=4 total=4' "$TMP/devlyn-benchmark-audit-fail.out"
589
+ grep -Fq 'FAIL audit-pair-evidence' "$TMP/devlyn-benchmark-audit-fail.out"
590
+ python3 - "$TMP/audit-fail/audit.json" "$actual_pair_evidence" "$required_pair_evidence" <<'PY'
591
+ import json
592
+ import sys
593
+
594
+ audit = json.load(open(sys.argv[1], encoding="utf8"))
595
+ actual = int(sys.argv[2])
596
+ required = int(sys.argv[3])
597
+ assert audit["verdict"] == "FAIL"
598
+ assert audit["checks"]["frontier"]["status"] == "PASS"
599
+ assert audit["checks"]["headroom_rejections"]["status"] == "PASS"
600
+ assert audit["checks"]["headroom_rejections"]["report_check_exit_code"] == 0
601
+ assert audit["checks"]["headroom_rejections"]["verdict"] == "PASS"
602
+ assert audit["checks"]["headroom_rejections"]["unrecorded_failure_count"] == 0
603
+ assert audit["checks"]["headroom_rejections"]["unsupported_registry_rejection_count"] == 0
604
+ assert audit["checks"]["min_pair_evidence"]["status"] == "FAIL"
605
+ assert audit["checks"]["min_pair_evidence"]["required"] == required
606
+ assert audit["checks"]["min_pair_evidence"]["actual_rows"] == actual
607
+ assert audit["checks"]["pair_evidence_quality"]["status"] == "PASS"
608
+ assert audit["checks"]["pair_trigger_reasons"]["status"] == "PASS"
609
+ assert audit["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == actual
610
+ assert audit["checks"]["pair_trigger_reasons"]["historical_alias_rows"] == 0
611
+ assert audit["checks"]["pair_trigger_reasons"]["rows_match_count"] is True
612
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "PASS"
613
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == actual
614
+ PY
615
+
616
+ node "$ROOT/bin/devlyn.js" benchmark audit \
617
+ --require-hypothesis-trigger \
618
+ --out-dir "$TMP/audit-strict-trigger" \
619
+ > "$TMP/devlyn-benchmark-audit-strict-trigger.out" 2>&1
620
+ grep -Fq 'pair_evidence_hypothesis_triggers=PASS matched=4 documented=4 total=4' "$TMP/devlyn-benchmark-audit-strict-trigger.out"
621
+ grep -Fq 'PASS audit-pair-evidence' "$TMP/devlyn-benchmark-audit-strict-trigger.out"
622
+ if grep -Fq 'pair_evidence_hypothesis_trigger_gaps=' "$TMP/devlyn-benchmark-audit-strict-trigger.out"; then
623
+ echo "strict benchmark audit must not report current hypothesis-trigger gaps" >&2
624
+ cat "$TMP/devlyn-benchmark-audit-strict-trigger.out" >&2
625
+ exit 1
626
+ fi
627
+ python3 - "$TMP/audit-strict-trigger/audit.json" <<'PY'
628
+ import json
629
+ import sys
630
+
631
+ audit = json.load(open(sys.argv[1], encoding="utf8"))
632
+ assert audit["verdict"] == "PASS"
633
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["status"] == "PASS"
634
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["exit_code"] == 0
635
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["required"] is True
636
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["matched_rows"] == 4
637
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["documented_rows"] == 4
638
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["total_rows"] == 4
639
+ assert audit["checks"]["pair_evidence_hypothesis_triggers"]["gap_details"] == []
640
+ PY
641
+
642
+ node "$ROOT/bin/devlyn.js" benchmark frontier --out-json "$TMP/frontier.json" > "$TMP/devlyn-benchmark-frontier.out" 2>&1
643
+ grep -Fq 'fixtures=' "$TMP/devlyn-benchmark-frontier.out"
644
+ grep -Fq 'rejected=' "$TMP/devlyn-benchmark-frontier.out"
645
+ grep -Fq 'candidates=' "$TMP/devlyn-benchmark-frontier.out"
646
+ grep -Fq 'pair_evidence=' "$TMP/devlyn-benchmark-frontier.out"
647
+ grep -Fq 'pair_margin_avg=' "$TMP/devlyn-benchmark-frontier.out"
648
+ grep -Fq 'PASS pair-candidate-frontier' "$TMP/devlyn-benchmark-frontier.out"
649
+ python3 - "$TMP/frontier.json" <<'PY'
650
+ import json
651
+ import sys
652
+
653
+ report = json.load(open(sys.argv[1], encoding="utf8"))
654
+ assert report["verdict"] in {"PASS", "FAIL"}
655
+ assert report["fixtures_total"] >= 1
656
+ assert "unmeasured_count" in report
657
+ assert "pair_margin_avg" in report
658
+ assert "rows" in report
659
+ PY
660
+
661
+ frontier_fail_fixtures="$TMP/frontier-fail-fixtures"
662
+ frontier_fail_results="$TMP/frontier-fail-results"
663
+ frontier_fail_registry="$TMP/frontier-fail-rejected.sh"
664
+ mkdir -p "$frontier_fail_fixtures/F21-cli-scheduler-priority" "$frontier_fail_results"
665
+ cat > "$frontier_fail_registry" <<'SH'
666
+ rejected_pair_fixture_reason() {
667
+ local fid="$1"
668
+ case "$fid" in
669
+ F2-*|F2)
670
+ echo "measured ceiling"
671
+ ;;
672
+ *)
673
+ return 1
674
+ ;;
675
+ esac
676
+ }
677
+ SH
678
+ if node "$ROOT/bin/devlyn.js" benchmark frontier \
679
+ --fixtures-root "$frontier_fail_fixtures" \
680
+ --registry "$frontier_fail_registry" \
681
+ --results-root "$frontier_fail_results" \
682
+ --fail-on-unmeasured \
683
+ --out-json "$TMP/frontier-fail.json" \
684
+ > "$TMP/devlyn-benchmark-frontier-fail.out" 2>&1; then
685
+ echo "benchmark frontier must fail when active unmeasured candidates remain" >&2
686
+ cat "$TMP/devlyn-benchmark-frontier-fail.out" >&2
687
+ exit 1
688
+ fi
689
+ grep -Fq 'fixtures=1 rejected=0 candidates=1 pair_evidence=0 unmeasured=1 verdict=FAIL' "$TMP/devlyn-benchmark-frontier-fail.out"
690
+ grep -Fq 'unmeasured candidate fixture(s): F21-cli-scheduler-priority' "$TMP/devlyn-benchmark-frontier-fail.out"
691
+ grep -Fq 'FAIL pair-candidate-frontier' "$TMP/devlyn-benchmark-frontier-fail.out"
692
+ python3 - "$TMP/frontier-fail.json" <<'PY'
693
+ import json
694
+ import sys
695
+
696
+ report = json.load(open(sys.argv[1], encoding="utf8"))
697
+ assert report["verdict"] == "FAIL"
698
+ assert report["fixtures_total"] == 1
699
+ assert report["candidate_count"] == 1
700
+ assert report["unmeasured_count"] == 1
701
+ assert report["rows"][0]["status"] == "candidate_unmeasured"
702
+ PY
703
+
704
+ set +e
705
+ node "$ROOT/bin/devlyn.js" benchmark frontier \
706
+ --fixtures-root "$frontier_fail_fixtures" \
707
+ --registry "$frontier_fail_registry" \
708
+ --results-root "$frontier_fail_results" \
709
+ --fail-on-unmeasured \
710
+ > "$TMP/devlyn-benchmark-frontier-json-fail.json" \
711
+ 2> "$TMP/devlyn-benchmark-frontier-json-fail.stderr"
712
+ frontier_json_fail_status=$?
713
+ set -e
714
+ if [ "$frontier_json_fail_status" -eq 0 ]; then
715
+ echo "benchmark frontier pure JSON failure path must fail" >&2
716
+ exit 1
717
+ fi
718
+ grep -Fq 'unmeasured candidate fixture(s): F21-cli-scheduler-priority' "$TMP/devlyn-benchmark-frontier-json-fail.stderr"
719
+ grep -Fq 'FAIL pair-candidate-frontier' "$TMP/devlyn-benchmark-frontier-json-fail.stderr"
720
+ if grep -Fq 'FAIL pair-candidate-frontier' "$TMP/devlyn-benchmark-frontier-json-fail.json"; then
721
+ echo "benchmark frontier pure JSON stdout must not include final text verdict" >&2
722
+ cat "$TMP/devlyn-benchmark-frontier-json-fail.json" >&2
723
+ exit 1
724
+ fi
725
+ python3 - "$TMP/devlyn-benchmark-frontier-json-fail.json" <<'PY'
726
+ import json
727
+ import sys
728
+
729
+ report = json.load(open(sys.argv[1], encoding="utf8"))
730
+ assert report["verdict"] == "FAIL"
731
+ assert report["fixtures_total"] == 1
732
+ assert report["unmeasured_count"] == 1
733
+ assert report["rows"][0]["status"] == "candidate_unmeasured"
734
+ PY
735
+
736
+ node "$ROOT/bin/devlyn.js" benchmark suite --dry-run --run-id arg-parse-command-test F0 \
737
+ > "$TMP/devlyn-benchmark-suite.out" 2>&1
738
+ grep -Fq '═══ Benchmark Suite Run ═══' "$TMP/devlyn-benchmark-suite.out"
739
+ grep -Fq -- '--run-id arg-parse-command-test' "$TMP/devlyn-benchmark-suite.out"
740
+
741
+ node "$ROOT/bin/devlyn.js" benchmark headroom --help > "$TMP/devlyn-benchmark-headroom-help.out" 2>&1
742
+ grep -Fq 'npx devlyn-cli benchmark headroom [options] <fixtures...>' "$TMP/devlyn-benchmark-headroom-help.out"
743
+ grep -Fq 'use 3 for F16/F23/F25 proof reruns; audit requires 4 passing evidence rows' "$TMP/devlyn-benchmark-headroom-help.out"
744
+ grep -Fq 'npx devlyn-cli benchmark headroom --min-fixtures 3 F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules' "$TMP/devlyn-benchmark-headroom-help.out"
745
+ grep -Fq -- '--min-bare-headroom N' "$TMP/devlyn-benchmark-headroom-help.out"
746
+ grep -Fq -- '--min-solo-headroom N' "$TMP/devlyn-benchmark-headroom-help.out"
747
+ grep -Fq -- '--allow-rejected-fixtures' "$TMP/devlyn-benchmark-headroom-help.out"
748
+ grep -Fq -- '--dry-run' "$TMP/devlyn-benchmark-headroom-help.out"
749
+ if grep -Fq 'run-headroom-candidate.sh' "$TMP/devlyn-benchmark-headroom-help.out"; then
750
+ echo "headroom CLI help must not expose internal runner name" >&2
751
+ cat "$TMP/devlyn-benchmark-headroom-help.out" >&2
752
+ exit 1
753
+ fi
754
+ node "$ROOT/bin/devlyn.js" benchmark pair --help > "$TMP/devlyn-benchmark-pair-help.out" 2>&1
755
+ grep -Fq 'npx devlyn-cli benchmark pair [options] <fixtures...>' "$TMP/devlyn-benchmark-pair-help.out"
756
+ grep -Fq 'use 3 for F16/F23/F25 proof reruns; audit requires 4 passing evidence rows' "$TMP/devlyn-benchmark-pair-help.out"
757
+ grep -Fq 'default: l2_risk_probes; l2_gated is diagnostic' "$TMP/devlyn-benchmark-pair-help.out"
758
+ grep -Fq -- '--min-bare-headroom N' "$TMP/devlyn-benchmark-pair-help.out"
759
+ grep -Fq -- '--min-solo-headroom N' "$TMP/devlyn-benchmark-pair-help.out"
760
+ grep -Fq -- '--max-pair-solo-wall-ratio N default: 3' "$TMP/devlyn-benchmark-pair-help.out"
761
+ grep -Fq -- '--allow-rejected-fixtures' "$TMP/devlyn-benchmark-pair-help.out"
762
+ grep -Fq 'npx devlyn-cli benchmark pair --min-fixtures 3 --max-pair-solo-wall-ratio 3 F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules' "$TMP/devlyn-benchmark-pair-help.out"
763
+ grep -Fq -- '--dry-run' "$TMP/devlyn-benchmark-pair-help.out"
764
+ if grep -Fq 'run-full-pipeline-pair-candidate.sh' "$TMP/devlyn-benchmark-pair-help.out"; then
765
+ echo "pair CLI help must not expose internal runner name" >&2
766
+ cat "$TMP/devlyn-benchmark-pair-help.out" >&2
767
+ exit 1
768
+ fi
769
+ grep -Fq 'DEVLYN_BENCHMARK_CLI_SUBCOMMAND: benchmarkMode' "$ROOT/bin/devlyn.js"
770
+
771
+ expect_fail_contains devlyn-headroom-cli-replay \
772
+ 'Command: npx devlyn-cli benchmark headroom --run-id arg-parse-headroom-cli-replay' \
773
+ node "$ROOT/bin/devlyn.js" benchmark headroom \
774
+ --run-id arg-parse-headroom-cli-replay \
775
+ --min-fixtures 2 \
776
+ F999-not-a-fixture
777
+
778
+ expect_fail_contains devlyn-pair-cli-replay \
779
+ 'Command: npx devlyn-cli benchmark pair --run-id arg-parse-pair-cli-replay' \
780
+ node "$ROOT/bin/devlyn.js" benchmark pair \
781
+ --run-id arg-parse-pair-cli-replay \
782
+ --reuse-calibrated-from arg-parse-missing-calibration \
783
+ F21-cli-scheduler-priority
784
+
785
+ node "$ROOT/bin/devlyn.js" benchmark headroom \
786
+ --run-id arg-parse-headroom-dry-run \
787
+ --dry-run \
788
+ --min-fixtures 1 \
789
+ F21-cli-scheduler-priority > "$TMP/devlyn-headroom-dry-run.out" 2>&1
790
+ grep -Fq 'Command: npx devlyn-cli benchmark headroom --run-id arg-parse-headroom-dry-run' "$TMP/devlyn-headroom-dry-run.out"
791
+ grep -Fq -- '--min-bare-headroom 5' "$TMP/devlyn-headroom-dry-run.out"
792
+ grep -Fq -- '--min-solo-headroom 5' "$TMP/devlyn-headroom-dry-run.out"
793
+ grep -Fq -- '--dry-run' "$TMP/devlyn-headroom-dry-run.out"
794
+ grep -Fq '[headroom] DRY RUN complete' "$TMP/devlyn-headroom-dry-run.out"
795
+
796
+ node "$ROOT/bin/devlyn.js" benchmark headroom \
797
+ --run-id arg-parse-shadow-headroom-dry-run \
798
+ --dry-run \
799
+ --min-fixtures 1 \
800
+ S1-cli-lang-flag > "$TMP/devlyn-shadow-headroom-dry-run.out" 2>&1
801
+ grep -Fq 'Fixtures: S1-cli-lang-flag' "$TMP/devlyn-shadow-headroom-dry-run.out"
802
+ grep -Fq '[headroom] DRY RUN complete' "$TMP/devlyn-shadow-headroom-dry-run.out"
803
+
804
+ expect_fail_contains smoke-only-s1-cli-headroom \
805
+ "fixture is smoke-only and cannot run providers: S1-cli-lang-flag" \
806
+ node "$ROOT/bin/devlyn.js" benchmark headroom \
807
+ --run-id arg-parse-shadow-headroom-block \
808
+ --min-fixtures 1 \
809
+ S1-cli-lang-flag
810
+
811
+ node "$ROOT/bin/devlyn.js" benchmark pair \
812
+ --run-id arg-parse-pair-dry-run \
813
+ --dry-run \
814
+ --min-fixtures 1 \
815
+ F21-cli-scheduler-priority > "$TMP/devlyn-pair-dry-run.out" 2>&1
816
+ grep -Fq 'Command: npx devlyn-cli benchmark pair --run-id arg-parse-pair-dry-run' "$TMP/devlyn-pair-dry-run.out"
817
+ grep -Fq -- '--min-bare-headroom 5' "$TMP/devlyn-pair-dry-run.out"
818
+ grep -Fq -- '--min-solo-headroom 5' "$TMP/devlyn-pair-dry-run.out"
819
+ grep -Fq -- '--max-pair-solo-wall-ratio 3' "$TMP/devlyn-pair-dry-run.out"
820
+ grep -Fq -- '--dry-run' "$TMP/devlyn-pair-dry-run.out"
821
+ grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP/devlyn-pair-dry-run.out"
822
+
823
+ node "$ROOT/bin/devlyn.js" benchmark pair \
824
+ --run-id arg-parse-shadow-pair-dry-run \
825
+ --dry-run \
826
+ --min-fixtures 1 \
827
+ S1-cli-lang-flag > "$TMP/devlyn-shadow-pair-dry-run.out" 2>&1
828
+ grep -Fq 'Fixtures: S1-cli-lang-flag' "$TMP/devlyn-shadow-pair-dry-run.out"
829
+ grep -Fq '[full-pipeline-pair] DRY RUN complete' "$TMP/devlyn-shadow-pair-dry-run.out"
830
+
831
+ expect_fail_contains smoke-only-s1-cli-pair \
832
+ "fixture is smoke-only and cannot run providers: S1-cli-lang-flag" \
833
+ node "$ROOT/bin/devlyn.js" benchmark pair \
834
+ --run-id arg-parse-shadow-pair-block \
835
+ --min-fixtures 1 \
836
+ S1-cli-lang-flag
837
+
838
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" \
839
+ --fixture F1-cli-trivial-flag \
840
+ --arm variant \
841
+ --run-id arg-parse-variant-path \
842
+ --dry-run > "$TMP/variant-dry-run.out" 2>&1
843
+ grep -Fq -- '--engine claude --risk-probes' \
844
+ "$BENCH_ROOT/results/arg-parse-variant-path/F1-cli-trivial-flag/variant/input.md"
845
+ if grep -Fq -- '--engine auto' \
846
+ "$BENCH_ROOT/results/arg-parse-variant-path/F1-cli-trivial-flag/variant/input.md"; then
847
+ echo "variant arm must not use retired --engine auto route" >&2
848
+ cat "$BENCH_ROOT/results/arg-parse-variant-path/F1-cli-trivial-flag/variant/input.md" >&2
849
+ exit 1
850
+ fi
851
+ mkdir -p "$BENCH_ROOT/shadow-fixtures"
852
+ rm -rf "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-metadata" \
853
+ "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-expected" \
854
+ "$BENCH_ROOT/results/arg-parse-nan-metadata" \
855
+ "$BENCH_ROOT/results/arg-parse-nan-expected"
856
+ cp -R "$BENCH_ROOT/fixtures/F1-cli-trivial-flag" "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-metadata"
857
+ printf '{"timeout_seconds": NaN}\n' > "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-metadata/metadata.json"
858
+ expect_fail_contains fixture-nan-metadata "invalid JSON numeric constant: NaN" \
859
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" \
860
+ --fixture arg-parse-nan-metadata \
861
+ --arm bare \
862
+ --run-id arg-parse-nan-metadata \
863
+ --dry-run
864
+ cp -R "$BENCH_ROOT/fixtures/F1-cli-trivial-flag" "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-expected"
865
+ printf '{"verification_commands": NaN}\n' > "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-expected/expected.json"
866
+ expect_fail_contains fixture-nan-expected "invalid JSON numeric constant: NaN" \
867
+ bash "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh" \
868
+ --fixture arg-parse-nan-expected \
869
+ --arm variant \
870
+ --run-id arg-parse-nan-expected \
871
+ --dry-run
872
+ rm -rf "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-metadata" \
873
+ "$BENCH_ROOT/shadow-fixtures/arg-parse-nan-expected" \
874
+ "$BENCH_ROOT/results/arg-parse-nan-metadata" \
875
+ "$BENCH_ROOT/results/arg-parse-nan-expected"
876
+ grep -Fq 'data = raw_oracle' \
877
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
878
+ grep -Fq 'expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())' \
879
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
880
+ grep -Fq 'oracle artifact malformed or unreadable' \
881
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
882
+ grep -Fq 'findings = raw_findings if isinstance(raw_findings, list) else []' \
883
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
884
+ grep -Fq 'if not isinstance(finding, dict):' \
885
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
886
+ grep -Fq 'loads_strict_json_object(pathlib.Path(result_dir, "timing.json").read_text())' \
887
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
888
+ grep -Fq 'loads_strict_json_object(pathlib.Path(result_dir, "verify.json").read_text())' \
889
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
890
+ grep -Fq 'loads_strict_json_object(pathlib.Path(state_path).read_text())' \
891
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
892
+ grep -Fq '"type": "oracle-error"' \
893
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
894
+ grep -Fq 'verify["oracle_disqualifier"] = True' \
895
+ "$ROOT/benchmark/auto-resolve/scripts/run-fixture.sh"
896
+
897
+ SCOPE_REPO="$TMP/scope-repo"
898
+ mkdir -p "$SCOPE_REPO"
899
+ git -C "$SCOPE_REPO" init -q
900
+ git -C "$SCOPE_REPO" config user.email bench@example.com
901
+ git -C "$SCOPE_REPO" config user.name bench
902
+ printf 'console.log("ok")\n' > "$SCOPE_REPO/app.js"
903
+ git -C "$SCOPE_REPO" add app.js
904
+ git -C "$SCOPE_REPO" commit -q -m base
905
+ SCOPE_SHA="$(git -C "$SCOPE_REPO" rev-parse HEAD)"
906
+
907
+ cat > "$TMP/expected-nan.json" <<'JSON'
908
+ {"tier_a_waivers": NaN, "spec_output_files": ["app.js"]}
909
+ JSON
910
+ python3 "$ROOT/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py" \
911
+ --work "$SCOPE_REPO" \
912
+ --scaffold "$SCOPE_SHA" \
913
+ --expected "$TMP/expected-nan.json" > "$TMP/scope-tier-a-nan.json"
914
+ grep -Fq '"error": "expected.json unreadable: invalid JSON numeric constant: NaN"' \
915
+ "$TMP/scope-tier-a-nan.json"
916
+ python3 "$ROOT/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py" \
917
+ --work "$SCOPE_REPO" \
918
+ --scaffold "$SCOPE_SHA" \
919
+ --expected "$TMP/expected-nan.json" > "$TMP/scope-tier-b-nan.json"
920
+ grep -Fq '"error": "expected.json unreadable: invalid JSON numeric constant: NaN"' \
921
+ "$TMP/scope-tier-b-nan.json"
922
+
923
+ cat > "$TMP/expected-bad-tier-c.json" <<'JSON'
924
+ {"tier_a_waivers": [], "spec_output_files": "app.js"}
925
+ JSON
926
+ python3 "$ROOT/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py" \
927
+ --work "$SCOPE_REPO" \
928
+ --scaffold "$SCOPE_SHA" \
929
+ --expected "$TMP/expected-bad-tier-c.json" > "$TMP/scope-tier-b-bad-tier-c.json"
930
+ grep -Fq '"error": "expected.json malformed: spec_output_files must be a string array"' \
931
+ "$TMP/scope-tier-b-bad-tier-c.json"
932
+
933
+ echo "PASS test-benchmark-arg-parsing"