devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -2,8 +2,9 @@
2
2
  # run-full-pipeline-pair-candidate.sh — measure full-pipeline L2/pair candidates.
3
3
  #
4
4
  # Runs bare + solo_claude first and applies headroom-gate.py. Only if the set
5
- # leaves room for L2 does it run l2_gated, rejudge, and apply
6
- # full-pipeline-pair-gate.py.
5
+ # leaves room for L2 does it run the selected pair arm, rejudge, and apply
6
+ # full-pipeline-pair-gate.py. Default pair arm is l2_risk_probes because that is
7
+ # the current measured solo<pair proof path.
7
8
 
8
9
  set -euo pipefail
9
10
 
@@ -16,43 +17,120 @@ Options:
16
17
  --run-id ID
17
18
  --bare-max N
18
19
  --solo-max N
20
+ --min-bare-headroom N
21
+ --min-solo-headroom N
19
22
  --min-fixtures N
20
23
  --min-pair-margin N
21
- --max-pair-solo-wall-ratio N
22
- --pair-arm ARM
24
+ --max-pair-solo-wall-ratio N (default: 3)
25
+ --pair-arm ARM (default: l2_risk_probes; use l2_gated only for diagnostics)
23
26
  --reuse-calibrated-from RUN_ID
27
+ --allow-rejected-fixtures
28
+ allow rejected/ceiling fixtures for diagnostics only
29
+ --dry-run validate args/fixtures and print replay command only
24
30
  EOF
25
31
  exit "$code"
26
32
  }
27
33
 
34
+ require_value() {
35
+ local flag="$1"
36
+ local value="${2:-}"
37
+ if [ -z "$value" ] || [[ "$value" == --* ]]; then
38
+ echo "$flag requires a value" >&2
39
+ exit 1
40
+ fi
41
+ }
42
+
28
43
  RUN_ID=""
29
44
  BARE_MAX=60
30
45
  SOLO_MAX=80
46
+ MIN_BARE_HEADROOM=5
47
+ MIN_SOLO_HEADROOM=5
31
48
  MIN_FIXTURES=2
32
49
  MIN_PAIR_MARGIN=5
33
- MAX_PAIR_SOLO_WALL_RATIO=""
34
- PAIR_ARM="l2_gated"
50
+ MAX_PAIR_SOLO_WALL_RATIO=3
51
+ PAIR_ARM="l2_risk_probes"
35
52
  REUSE_CALIBRATED_FROM=""
53
+ ALLOW_REJECTED_FIXTURES=0
54
+ DRY_RUN=0
36
55
  FIXTURES=()
37
56
  while [ $# -gt 0 ]; do
38
57
  case "$1" in
39
- --run-id) RUN_ID="$2"; shift 2;;
40
- --bare-max) BARE_MAX="$2"; shift 2;;
41
- --solo-max) SOLO_MAX="$2"; shift 2;;
42
- --min-fixtures) MIN_FIXTURES="$2"; shift 2;;
43
- --min-pair-margin) MIN_PAIR_MARGIN="$2"; shift 2;;
44
- --max-pair-solo-wall-ratio) MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
45
- --pair-arm) PAIR_ARM="$2"; shift 2;;
46
- --reuse-calibrated-from) REUSE_CALIBRATED_FROM="$2"; shift 2;;
58
+ --run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
59
+ --bare-max) require_value "$1" "${2:-}"; BARE_MAX="$2"; shift 2;;
60
+ --solo-max) require_value "$1" "${2:-}"; SOLO_MAX="$2"; shift 2;;
61
+ --min-bare-headroom) require_value "$1" "${2:-}"; MIN_BARE_HEADROOM="$2"; shift 2;;
62
+ --min-solo-headroom) require_value "$1" "${2:-}"; MIN_SOLO_HEADROOM="$2"; shift 2;;
63
+ --min-fixtures) require_value "$1" "${2:-}"; MIN_FIXTURES="$2"; shift 2;;
64
+ --min-pair-margin) require_value "$1" "${2:-}"; MIN_PAIR_MARGIN="$2"; shift 2;;
65
+ --max-pair-solo-wall-ratio) require_value "$1" "${2:-}"; MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
66
+ --pair-arm) require_value "$1" "${2:-}"; PAIR_ARM="$2"; shift 2;;
67
+ --reuse-calibrated-from) require_value "$1" "${2:-}"; REUSE_CALIBRATED_FROM="$2"; shift 2;;
68
+ --allow-rejected-fixtures) ALLOW_REJECTED_FIXTURES=1; shift;;
69
+ --dry-run) DRY_RUN=1; shift;;
47
70
  -h|--help) usage 0;;
48
- F[0-9]*) FIXTURES+=("$1"); shift;;
71
+ [FS][0-9]*) FIXTURES+=("$1"); shift;;
49
72
  *) echo "unknown arg: $1" >&2; usage;;
50
73
  esac
51
74
  done
75
+
76
+ for threshold in BARE_MAX SOLO_MAX MIN_BARE_HEADROOM MIN_SOLO_HEADROOM MIN_FIXTURES MIN_PAIR_MARGIN; do
77
+ value="${!threshold}"
78
+ case "$threshold" in
79
+ BARE_MAX) flag="bare-max" ;;
80
+ SOLO_MAX) flag="solo-max" ;;
81
+ MIN_BARE_HEADROOM) flag="min-bare-headroom" ;;
82
+ MIN_SOLO_HEADROOM) flag="min-solo-headroom" ;;
83
+ MIN_FIXTURES) flag="min-fixtures" ;;
84
+ MIN_PAIR_MARGIN) flag="min-pair-margin" ;;
85
+ esac
86
+ if [[ ! "$value" =~ ^[0-9]+$ ]]; then
87
+ echo "--$flag must be an integer: $value" >&2
88
+ exit 1
89
+ fi
90
+ done
91
+ if [ "$MIN_FIXTURES" -lt 1 ]; then
92
+ echo "--min-fixtures must be >= 1" >&2
93
+ exit 1
94
+ fi
95
+ if [ "$MIN_BARE_HEADROOM" -lt 0 ]; then
96
+ echo "--min-bare-headroom must be >= 0" >&2
97
+ exit 1
98
+ fi
99
+ if [ "$MIN_SOLO_HEADROOM" -lt 0 ]; then
100
+ echo "--min-solo-headroom must be >= 0" >&2
101
+ exit 1
102
+ fi
103
+ if [ -n "$MAX_PAIR_SOLO_WALL_RATIO" ]; then
104
+ if ! [[ "$MAX_PAIR_SOLO_WALL_RATIO" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
105
+ echo "--max-pair-solo-wall-ratio must be a positive number: $MAX_PAIR_SOLO_WALL_RATIO" >&2
106
+ exit 1
107
+ fi
108
+ if ! awk "BEGIN { exit !($MAX_PAIR_SOLO_WALL_RATIO > 0) }"; then
109
+ echo "--max-pair-solo-wall-ratio must be > 0" >&2
110
+ exit 1
111
+ fi
112
+ fi
52
113
  [ ${#FIXTURES[@]} -gt 0 ] || usage
53
114
 
115
+ case "$PAIR_ARM" in
116
+ l2_risk_probes|l2_gated) ;;
117
+ l2_forced)
118
+ echo "pair-arm l2_forced is retired: it leaks pair-awareness before IMPLEMENT; use l2_risk_probes for current proof runs or l2_gated for diagnostics." >&2
119
+ exit 1
120
+ ;;
121
+ *)
122
+ echo "pair-arm must be l2_risk_probes or l2_gated (diagnostic): $PAIR_ARM" >&2
123
+ exit 1
124
+ ;;
125
+ esac
126
+
54
127
  BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
55
128
  REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"
129
+ source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"
130
+ if ! declare -F rejected_pair_fixture_reason >/dev/null; then
131
+ echo "rejected fixture registry must define rejected_pair_fixture_reason" >&2
132
+ exit 1
133
+ fi
56
134
 
57
135
  if [ -z "$RUN_ID" ]; then
58
136
  TS=$(date -u +%Y%m%dT%H%M%SZ)
@@ -60,50 +138,245 @@ if [ -z "$RUN_ID" ]; then
60
138
  RUN_ID="${TS}-${SHA}-full-pipeline-pair"
61
139
  fi
62
140
 
141
+ print_command() {
142
+ local cmd
143
+ if [ "${DEVLYN_BENCHMARK_CLI_SUBCOMMAND:-}" = "pair" ]; then
144
+ cmd=(npx devlyn-cli benchmark pair --run-id "$RUN_ID")
145
+ else
146
+ cmd=(bash "$0" --run-id "$RUN_ID")
147
+ fi
148
+ cmd+=(--bare-max "$BARE_MAX")
149
+ cmd+=(--solo-max "$SOLO_MAX")
150
+ cmd+=(--min-bare-headroom "$MIN_BARE_HEADROOM")
151
+ cmd+=(--min-solo-headroom "$MIN_SOLO_HEADROOM")
152
+ cmd+=(--min-fixtures "$MIN_FIXTURES")
153
+ cmd+=(--min-pair-margin "$MIN_PAIR_MARGIN")
154
+ [ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || cmd+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
155
+ cmd+=(--pair-arm "$PAIR_ARM")
156
+ [ -z "$REUSE_CALIBRATED_FROM" ] || cmd+=(--reuse-calibrated-from "$REUSE_CALIBRATED_FROM")
157
+ [ "$ALLOW_REJECTED_FIXTURES" -eq 0 ] || cmd+=(--allow-rejected-fixtures)
158
+ [ "$DRY_RUN" -eq 0 ] || cmd+=(--dry-run)
159
+ cmd+=("${FIXTURES[@]}")
160
+ printf 'Command: '
161
+ printf '%q ' "${cmd[@]}"
162
+ printf '\n'
163
+ }
164
+
165
+ run_gate_with_report() {
166
+ local label="$1"
167
+ local report="$2"
168
+ shift 2
169
+ set +e
170
+ "$@"
171
+ local status=$?
172
+ set -e
173
+ if [ -f "$report" ]; then
174
+ cat "$report"
175
+ else
176
+ echo "[full-pipeline-pair] ${label} report missing: $report" >&2
177
+ fi
178
+ return "$status"
179
+ }
180
+
181
+ fixture_exists() {
182
+ local fid="$1"
183
+ [ -d "$BENCH_ROOT/fixtures/$fid" ] || [ -d "$BENCH_ROOT/shadow-fixtures/$fid" ]
184
+ }
185
+
186
+ fixture_dir() {
187
+ local fid="$1"
188
+ if [ -d "$BENCH_ROOT/fixtures/$fid" ]; then
189
+ printf '%s\n' "$BENCH_ROOT/fixtures/$fid"
190
+ else
191
+ printf '%s\n' "$BENCH_ROOT/shadow-fixtures/$fid"
192
+ fi
193
+ }
194
+
195
+ is_shadow_fixture() {
196
+ local fid="$1"
197
+ [ -d "$BENCH_ROOT/shadow-fixtures/$fid" ]
198
+ }
199
+
200
+ retired_fixture_exists() {
201
+ local fid="$1"
202
+ [ -d "$BENCH_ROOT/fixtures/retired/$fid" ]
203
+ }
204
+
205
+ fixture_smoke_only() {
206
+ local fid="$1"
207
+ [[ "$fid" == S1 || "$fid" == S1-* ]]
208
+ }
209
+
210
+ fixture_category() {
211
+ local dir="$1"
212
+ python3 - "$dir/metadata.json" <<'PY'
213
+ import json
214
+ import sys
215
+
216
+ try:
217
+ with open(sys.argv[1], encoding="utf-8") as handle:
218
+ print(json.load(handle).get("category", ""))
219
+ except FileNotFoundError:
220
+ print("")
221
+ PY
222
+ }
223
+
224
+ fixture_has_solo_headroom_hypothesis() {
225
+ local dir="$1"
226
+ python3 "$BENCH_ROOT/scripts/solo-headroom-hypothesis.py" --expected-json "$dir/expected.json" "$dir/spec.md"
227
+ }
228
+
229
+ fixture_has_solo_ceiling_avoidance_note() {
230
+ local dir="$1"
231
+ python3 "$BENCH_ROOT/scripts/solo-ceiling-avoidance.py" "$dir/NOTES.md"
232
+ }
233
+
234
+ fixture_has_pair_evidence() {
235
+ local fid="$1"
236
+ python3 - "$BENCH_ROOT/results" "$fid" <<'PY'
237
+ import json
238
+ import pathlib
239
+ import sys
240
+
241
+ results = pathlib.Path(sys.argv[1])
242
+ fixture = sys.argv[2]
243
+ if not results.is_dir():
244
+ sys.exit(1)
245
+ for path in results.glob("*/full-pipeline-pair-gate.json"):
246
+ try:
247
+ data = json.loads(path.read_text(encoding="utf-8"))
248
+ except (OSError, json.JSONDecodeError):
249
+ continue
250
+ if data.get("verdict") != "PASS":
251
+ continue
252
+ rows = data.get("rows")
253
+ if not isinstance(rows, list):
254
+ continue
255
+ for row in rows:
256
+ if isinstance(row, dict) and row.get("fixture") == fixture and row.get("status") == "PASS":
257
+ sys.exit(0)
258
+ sys.exit(1)
259
+ PY
260
+ }
261
+
262
+ validate_fixtures() {
263
+ local missing=0
264
+ local fid reason dir category
265
+ for fid in "${FIXTURES[@]}"; do
266
+ if ! fixture_exists "$fid"; then
267
+ if retired_fixture_exists "$fid"; then
268
+ echo "fixture is retired and is not rerun by pair-candidate runners: $fid. Use preserved results/docs for historical replay." >&2
269
+ missing=1
270
+ continue
271
+ fi
272
+ echo "fixture not found in fixtures/ or shadow-fixtures/: $fid" >&2
273
+ missing=1
274
+ continue
275
+ fi
276
+ if [ "$DRY_RUN" -eq 0 ] && fixture_smoke_only "$fid"; then
277
+ echo "fixture is smoke-only and cannot run providers: $fid. Use --dry-run for runner/package validation." >&2
278
+ missing=1
279
+ continue
280
+ fi
281
+ reason="$(rejected_pair_fixture_reason "$fid" || true)"
282
+ if [ "$ALLOW_REJECTED_FIXTURES" -eq 0 ]; then
283
+ if [ -n "$reason" ]; then
284
+ echo "fixture rejected for pair-candidate runs: $fid ($reason). Use --allow-rejected-fixtures for diagnostics only." >&2
285
+ missing=1
286
+ continue
287
+ fi
288
+ fi
289
+ if [ -z "$reason" ]; then
290
+ dir="$(fixture_dir "$fid")"
291
+ category="$(fixture_category "$dir")"
292
+ if [ "$category" = "high-risk" ] && ! fixture_has_pair_evidence "$fid"; then
293
+ if ! fixture_has_solo_headroom_hypothesis "$dir"; then
294
+ echo "fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: $fid" >&2
295
+ missing=1
296
+ fi
297
+ if is_shadow_fixture "$fid" && ! fixture_has_solo_ceiling_avoidance_note "$dir"; then
298
+ echo "shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: $fid" >&2
299
+ missing=1
300
+ fi
301
+ fi
302
+ fi
303
+ done
304
+ [ "$missing" -eq 0 ] || exit 1
305
+ }
306
+
63
307
  echo ""
64
308
  echo "═══ Full-Pipeline Pair Candidate Run ═══"
65
309
  echo "Run-id: $RUN_ID"
66
310
  echo "Fixtures: ${FIXTURES[*]}"
67
311
  echo "Arms: bare solo_claude $PAIR_ARM"
312
+ echo "Headroom: bare <= $BARE_MAX (headroom >= $MIN_BARE_HEADROOM), solo_claude <= $SOLO_MAX (headroom >= $MIN_SOLO_HEADROOM), baseline evidence-complete, min fixtures $MIN_FIXTURES"
313
+ echo "Pair: $PAIR_ARM evidence-clean, canonical trigger, margin >= +$MIN_PAIR_MARGIN${MAX_PAIR_SOLO_WALL_RATIO:+, wall ratio <= $MAX_PAIR_SOLO_WALL_RATIO}"
68
314
  [ -z "$REUSE_CALIBRATED_FROM" ] || echo "Reuse: bare+solo from $REUSE_CALIBRATED_FROM"
315
+ [ "$DRY_RUN" -eq 0 ] || echo "Mode: DRY RUN (no model/provider invocations)"
316
+ print_command
69
317
  echo ""
70
318
 
71
- SRC_SKILLS="$REPO_ROOT/config/skills"
72
- DST_SKILLS="$REPO_ROOT/.claude/skills"
73
- mkdir -p "$DST_SKILLS"
74
- mirrored=0
75
- for src_dir in "$SRC_SKILLS"/*/; do
76
- [ -d "$src_dir" ] || continue
77
- name=$(basename "$src_dir")
78
- case "$name" in
79
- devlyn:auto-resolve-workspace|devlyn:ideate-workspace|preflight-workspace|roadmap-archival-workspace)
80
- continue ;;
81
- esac
82
- staging="$DST_SKILLS/.${name}.staging"
83
- rm -rf "$staging"
84
- cp -R "$src_dir" "$staging"
85
- rm -rf "$DST_SKILLS/$name"
86
- mv "$staging" "$DST_SKILLS/$name"
87
- mirrored=$((mirrored + 1))
88
- done
89
- echo "[full-pipeline-pair] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
319
+ validate_fixtures
320
+
321
+ if [ "$DRY_RUN" -eq 1 ] && [ "${#FIXTURES[@]}" -lt "$MIN_FIXTURES" ]; then
322
+ echo "[full-pipeline-pair] DRY RUN failed — ${#FIXTURES[@]} fixture(s) supplied, --min-fixtures requires $MIN_FIXTURES." >&2
323
+ exit 1
324
+ fi
325
+
326
+ if [ "$DRY_RUN" -eq 1 ]; then
327
+ echo "[full-pipeline-pair] DRY RUN complete — fixtures resolved, no arms or judges executed."
328
+ exit 0
329
+ fi
330
+
331
+ mirror_skills() {
332
+ local src_skills="$REPO_ROOT/config/skills"
333
+ local dst_skills="$REPO_ROOT/.claude/skills"
334
+ local mirrored=0
335
+ local src_dir name staging
336
+ mkdir -p "$dst_skills"
337
+ for src_dir in "$src_skills"/*/; do
338
+ [ -d "$src_dir" ] || continue
339
+ name=$(basename "$src_dir")
340
+ case "$name" in
341
+ devlyn:auto-resolve-workspace|devlyn:ideate-workspace|preflight-workspace|roadmap-archival-workspace)
342
+ continue ;;
343
+ esac
344
+ staging="$dst_skills/.${name}.staging"
345
+ rm -rf "$staging"
346
+ cp -R "$src_dir" "$staging"
347
+ rm -rf "$dst_skills/$name"
348
+ mv "$staging" "$dst_skills/$name"
349
+ mirrored=$((mirrored + 1))
350
+ done
351
+ echo "[full-pipeline-pair] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
352
+ }
90
353
 
91
354
  copy_calibrated_arm() {
92
355
  local fid="$1"
93
356
  local arm="$2"
94
357
  local src="$BENCH_ROOT/results/$REUSE_CALIBRATED_FROM/$fid/$arm"
95
358
  local dst="$BENCH_ROOT/results/$RUN_ID/$fid/$arm"
96
- if [ -f "$dst/result.json" ]; then
359
+ if [ -e "$dst" ]; then
360
+ [ -d "$dst" ] || { echo "reuse destination is not a directory: $dst" >&2; exit 1; }
361
+ for required in result.json verify.json diff.patch; do
362
+ [ -f "$dst/$required" ] || { echo "reuse destination incomplete $required: $dst" >&2; exit 1; }
363
+ done
97
364
  echo "[full-pipeline-pair] reuse skip: $fid / $arm already exists in $RUN_ID"
98
365
  return 0
99
366
  fi
100
367
  [ -d "$src" ] || { echo "reuse source missing: $src" >&2; exit 1; }
101
- [ -f "$src/result.json" ] || { echo "reuse source missing result.json: $src" >&2; exit 1; }
368
+ for required in result.json verify.json diff.patch; do
369
+ [ -f "$src/$required" ] || { echo "reuse source missing $required: $src" >&2; exit 1; }
370
+ done
102
371
  mkdir -p "$(dirname "$dst")"
103
372
  cp -R "$src" "$dst"
104
373
  echo "[full-pipeline-pair] reused $fid / $arm from $REUSE_CALIBRATED_FROM"
105
374
  }
106
375
 
376
+ if [ -z "$REUSE_CALIBRATED_FROM" ]; then
377
+ mirror_skills
378
+ fi
379
+
107
380
  for fid in "${FIXTURES[@]}"; do
108
381
  if [ -n "$REUSE_CALIBRATED_FROM" ]; then
109
382
  copy_calibrated_arm "$fid" bare
@@ -129,11 +402,24 @@ headroom_args=(
129
402
  --run-id "$RUN_ID"
130
403
  --bare-max "$BARE_MAX"
131
404
  --solo-max "$SOLO_MAX"
405
+ --min-bare-headroom "$MIN_BARE_HEADROOM"
406
+ --min-solo-headroom "$MIN_SOLO_HEADROOM"
132
407
  --min-fixtures "$MIN_FIXTURES"
133
408
  --out-json "$BENCH_ROOT/results/$RUN_ID/headroom-gate.json"
134
409
  --out-md "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
135
410
  )
136
- python3 "$BENCH_ROOT/scripts/headroom-gate.py" "${headroom_args[@]}"
411
+ if ! run_gate_with_report \
412
+ "headroom gate" \
413
+ "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md" \
414
+ python3 "$BENCH_ROOT/scripts/headroom-gate.py" "${headroom_args[@]}"; then
415
+ echo "[full-pipeline-pair] headroom gate failed — pair arm not executed."
416
+ exit 1
417
+ fi
418
+ echo "[full-pipeline-pair] headroom gate passed — executing $PAIR_ARM."
419
+
420
+ if [ -n "$REUSE_CALIBRATED_FROM" ]; then
421
+ mirror_skills
422
+ fi
137
423
 
138
424
  for fid in "${FIXTURES[@]}"; do
139
425
  echo "[full-pipeline-pair] ► $fid / $PAIR_ARM"
@@ -150,13 +436,23 @@ pair_args=(
150
436
  --run-id "$RUN_ID"
151
437
  --bare-max "$BARE_MAX"
152
438
  --solo-max "$SOLO_MAX"
439
+ --min-bare-headroom "$MIN_BARE_HEADROOM"
440
+ --min-solo-headroom "$MIN_SOLO_HEADROOM"
153
441
  --min-fixtures "$MIN_FIXTURES"
154
442
  --min-pair-margin "$MIN_PAIR_MARGIN"
155
443
  --pair-arm "$PAIR_ARM"
444
+ --require-hypothesis-trigger
156
445
  --out-json "$BENCH_ROOT/results/$RUN_ID/full-pipeline-pair-gate.json"
157
446
  --out-md "$BENCH_ROOT/results/$RUN_ID/full-pipeline-pair-gate.md"
158
447
  )
159
448
  [ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || pair_args+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
160
449
 
161
- python3 "$BENCH_ROOT/scripts/full-pipeline-pair-gate.py" "${pair_args[@]}"
162
- cat "$BENCH_ROOT/results/$RUN_ID/full-pipeline-pair-gate.md"
450
+ if ! run_gate_with_report \
451
+ "full-pipeline pair gate" \
452
+ "$BENCH_ROOT/results/$RUN_ID/full-pipeline-pair-gate.md" \
453
+ python3 "$BENCH_ROOT/scripts/full-pipeline-pair-gate.py" "${pair_args[@]}"; then
454
+ echo "[full-pipeline-pair] pair gate failed — pair evidence rejected."
455
+ exit 1
456
+ fi
457
+ echo "[full-pipeline-pair] pair gate passed — pair evidence accepted."
458
+ echo "[full-pipeline-pair] release audit: npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict"