devlyn-cli 2.3.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +80 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +210 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -64,7 +64,12 @@ forces invariant derivation — the discriminating axis.
64
64
 
65
65
  ## Rotation trigger
66
66
 
67
- Retire when both arms consistently land > 90 across two shipped versions,
68
- OR when "all-or-nothing batch" becomes a recognized pattern such that
69
- solo arm reliably validates-first on the initial implementation pass.
67
+ Headroom run `20260507-f10-f11-tier1-full-pipeline` rejected this fixture as
68
+ pair-lift evidence: bare scored 98 and solo_claude scored 97. Keep it as an
69
+ atomic batch control unless the visible contract is reworked to expose lower
70
+ bare/solo ceilings.
71
+
72
+ Retire when both `bare` and `solo_claude` consistently land > 90 across two
73
+ shipped versions, OR when "all-or-nothing batch" becomes a recognized pattern
74
+ such that the solo arm reliably validates-first on the initial implementation pass.
70
75
  Whichever comes first.
@@ -77,7 +77,13 @@ keywords. Raw-body trap is intentionally left without explicit
77
77
 
78
78
  ## Rotation trigger
79
79
 
80
- Retire when both arms consistently land > 90 across two shipped versions
81
- on this fixture. If the raw-body verifier (#5) becomes saturated faster
82
- than the others, replace it with a different platform blindspot rather
83
- than retiring the whole fixture.
80
+ Headroom run `20260511-f12-webhook-headroom` rejected this fixture as pair-lift
81
+ evidence: bare scored 85 and solo_claude scored 99. Bare still missed one of
82
+ seven verifiers, but the `bare` and `solo_claude` judge scores exceed the
83
+ headroom ceilings. Keep it as a webhook/security control unless the visible
84
+ contract is reworked to expose lower bare/solo ceilings.
85
+
86
+ Retire when both `bare` and `solo_claude` consistently land > 90 across two
87
+ shipped versions on this fixture. If the raw-body verifier (#5) becomes
88
+ saturated faster than the others, replace it with a different platform blindspot
89
+ rather than retiring the whole fixture.
@@ -92,7 +92,13 @@ that doesn't read the await sequence carefully will gloss over these.
92
92
 
93
93
  ## Rotation trigger
94
94
 
95
- Retire when both bare and solo arms consistently land > 85 across two
96
- shipped versions. If 2026 baseline reliably catches the awaited RMW race
97
- on cold read of someone else's code, the frozen-diff review thesis also
98
- needs updating not just the seeded bug.
95
+ Headroom run `20260511-f15-concurrency-headroom` rejected this fixture as
96
+ pair-lift evidence: bare scored 99 and solo_claude scored 94, so `bare` and
97
+ `solo_claude` are above the headroom ceilings (`bare <= 60`, `solo_claude <=
98
+ 80`). Keep the fixture as a frozen-diff review control unless the visible
99
+ contract is reworked to expose a lower solo ceiling.
100
+
101
+ Retire when both `bare` and `solo_claude` consistently land > 85 across two
102
+ shipped versions. If 2026 baseline reliably catches the awaited RMW race on
103
+ cold read of someone else's code, the frozen-diff review thesis also needs
104
+ updating — not just the seeded bug.
@@ -19,6 +19,18 @@ F1/F2 test CLI shape, but not business-rule arithmetic. F10/F11/F12 test
19
19
  server behavior and persistence. F15 tests review behavior. None combine
20
20
  hidden product math, exact machine output, and source-of-truth pricing.
21
21
 
22
+ ## Measurement status
23
+
24
+ Pair evidence passed in `20260510-f16-f23-f25-combined-proof`:
25
+ bare `50`, solo_claude `75`, pair `96`, margin `+21`, wall `1.28x`,
26
+ arm `l2_risk_probes`, verdict `pair_evidence_passed`.
27
+
28
+ ## Solo-headroom hypothesis
29
+
30
+ A capable solo_claude baseline is expected to miss duplicate-SKU aggregation
31
+ before stock validation and exact integer tax/shipping totals; observable
32
+ command `node "$BENCH_FIXTURE_DIR/verifiers/exact-success.js"` exposes the miss.
33
+
22
34
  ## Retirement
23
35
 
24
36
  Retire or replace this fixture if both bare and solo consistently score
@@ -55,3 +55,9 @@ the output must be machine-readable.
55
55
  - A quote over combined stock exits `2`, prints one JSON error to stderr, and prints no stdout.
56
56
  - The stock error object includes `sku`, `available`, and `requested`.
57
57
  - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched (the pricing seed comes from setup, not the arm).
58
+
59
+ ## Solo-headroom hypothesis
60
+
61
+ A capable solo_claude baseline is expected to miss duplicate-SKU aggregation
62
+ before stock validation or tax/discount calculation; observable command
63
+ `node "$BENCH_FIXTURE_DIR/verifiers/exact-success.js"` exposes the miss.
@@ -39,16 +39,19 @@ small enough that every arm can plausibly finish in < 10 minutes.
39
39
 
40
40
  ## When should this fixture be retired or replaced?
41
41
 
42
- When both arms score > 95 for two consecutive shipped versions — i.e., the
43
- fixture saturates and no longer differentiates. Candidate replacement: a
44
- similar-size CLI task with multiple interacting flags or a subcommand that
45
- spawns a child process.
42
+ When both `bare` and `solo_claude` score > 95 for two consecutive shipped
43
+ versions — i.e., the fixture saturates and no longer differentiates. Candidate
44
+ replacement: a similar-size CLI task with multiple interacting flags or a
45
+ subcommand that spawns a child process.
46
46
 
47
47
  ## Calibration history
48
48
 
49
49
  - v3.4 skill 57 / bare 45 / margin +12 (gpt-5.3-codex judge)
50
50
  - v3.4.1 skill 59 / bare 43 / margin +16 (gpt-5.3-codex judge)
51
51
  - v3.5 skill 92 / bare 81 / margin +11 (gpt-5.4 xhigh judge) — huge absolute jump; bare silent-catch caught
52
+ - 20260512-f2-medium-headroom bare 83 / solo_claude 95 — rejected as
53
+ pair-lift evidence because both baseline scores exceed current headroom
54
+ ceilings.
52
55
 
53
56
  Absolute scores jumped with the stronger judge. Margin stays solid (+11;
54
57
  after stdlib calibration it is expected to open a few points more).
@@ -20,6 +20,18 @@ F16 covers checkout arithmetic. F10/F11/F12/F15 cover server behavior. None
20
20
  exercise a CLI algorithm where the correct result depends on sorting,
21
21
  interval arithmetic, and output ordering at once.
22
22
 
23
+ ## Measurement status
24
+
25
+ Pair evidence passed in `20260511-f21-current-riskprobes-v1`: bare `33`,
26
+ solo_claude `66`, pair `99`, margin `+33`, wall `1.47x`,
27
+ arm `l2_risk_probes`, verdict `pair_evidence_passed`.
28
+
29
+ ## Solo-headroom hypothesis
30
+
31
+ A capable solo_claude baseline is expected to miss global priority ordering
32
+ combined with blocked-interval earliest-fit placement; observable command
33
+ `node "$BENCH_FIXTURE_DIR/verifiers/priority-blocked.js"` exposes the miss.
34
+
23
35
  ## Retirement
24
36
 
25
37
  Retire or replace when both bare and solo consistently exceed the headroom
@@ -59,3 +59,9 @@ failure reasons must be deterministic.
59
59
  - Unknown resources are reported in `rejected` without aborting the whole run.
60
60
  - Duplicate request ids are invalid input: exit `2`, one JSON error to stderr, no stdout.
61
61
  - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched.
62
+
63
+ ## Solo-headroom hypothesis
64
+
65
+ A capable solo_claude baseline is expected to miss global priority ordering
66
+ across resources while preserving blocked-interval earliest-fit placement;
67
+ observable command `node "$BENCH_FIXTURE_DIR/verifiers/priority-blocked.js"` exposes the miss.
@@ -20,6 +20,14 @@ F16 covers order quote arithmetic, but not ledger idempotency or full-input
20
20
  validation before mutation. F21 covers interval scheduling. Server fixtures
21
21
  cover API behavior rather than CLI reconciliation.
22
22
 
23
+ ## Measurement status
24
+
25
+ Headroom runs reject F22 as full-pipeline pair-lift evidence. In
26
+ `20260507-f21-f22-full-pipeline-pair`, F22 scored bare 91 / solo_claude 98 and
27
+ failed the headroom gate. In `20260508-f22-exact-error-headroom`, F22 scored
28
+ bare 94 / solo_claude 98 after the exact-error fixture revision. Keep it as a ledger
29
+ reconciliation control, not as counted `solo < pair` evidence.
30
+
23
31
  ## Retirement
24
32
 
25
33
  Retire or replace if both bare and solo consistently score above the headroom
@@ -20,6 +20,18 @@ F21 covers interval scheduling. F16 covers quote arithmetic. F22 was too easy
20
20
  for bare in the first calibration run. This fixture targets allocation rollback
21
21
  and inventory consumption across multiple dimensions.
22
22
 
23
+ ## Measurement status
24
+
25
+ Pair evidence passed in `20260510-f16-f23-f25-combined-proof`: bare `33`,
26
+ solo_claude `66`, pair `97`, margin `+31`, wall `2.25x`,
27
+ arm `l2_risk_probes`, verdict `pair_evidence_passed`.
28
+
29
+ ## Solo-headroom hypothesis
30
+
31
+ A capable solo_claude baseline is expected to miss all-or-nothing rollback after
32
+ a higher-priority order consumes stock first; observable command
33
+ `node "$BENCH_FIXTURE_DIR/verifiers/priority-rollback.js"` exposes the miss.
34
+
23
35
  ## Retirement
24
36
 
25
37
  Retire or replace if both bare and solo consistently exceed the headroom
@@ -68,3 +68,9 @@ orders must be deterministic.
68
68
  - Lot choice is FEFO by expiry date, then lot id.
69
69
  - `remaining` is sorted by warehouse id, then sku, then expires, then lot.
70
70
  - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched.
71
+
72
+ ## Solo-headroom hypothesis
73
+
74
+ A capable solo_claude baseline is expected to miss all-or-nothing rollback after
75
+ a higher-priority order tentatively allocates FEFO lots; observable command
76
+ `node "$BENCH_FIXTURE_DIR/verifiers/priority-rollback.js"` exposes the miss.
@@ -17,12 +17,24 @@ examples rather than only checking a happy path.
17
17
  ## Why existing fixtures do not cover it
18
18
 
19
19
  F16 covers quote tax rules, but not multiple line-promotion types plus an order
20
- coupon. F21/F23 cover scheduling/allocation but became oracle-control fixtures.
20
+ coupon. F21/F23 cover scheduling/allocation, not checkout interaction ordering.
21
21
  This fixture keeps the F16-style fair visible-contract shape while testing a
22
22
  different checkout interaction.
23
23
 
24
+ ## Measurement status
25
+
26
+ Pair evidence passed in `20260510-f16-f23-f25-combined-proof`: bare `25`,
27
+ solo_claude `75`, pair `99`, margin `+24`, wall `1.65x`,
28
+ arm `l2_risk_probes`, verdict `pair_evidence_passed`.
29
+
30
+ ## Solo-headroom hypothesis
31
+
32
+ A capable solo_claude baseline is expected to miss the interaction between
33
+ duplicate aggregation, line promotions, tax base, coupon order, and shipping;
34
+ observable command `node "$BENCH_FIXTURE_DIR/verifiers/exact-success.js"` exposes the miss.
35
+
24
36
  ## Retirement
25
37
 
26
- Retire or replace this fixture if bare or solo consistently reaches ceiling, or
27
- if a later fixture covers the same promotion-order and catalog-source failure
28
- mode with cleaner full-pipeline lift.
38
+ Retire or replace this fixture if either `bare` or `solo_claude` consistently
39
+ reaches ceiling, or if a later fixture covers the same promotion-order and
40
+ catalog-source failure mode with cleaner full-pipeline lift.
@@ -62,3 +62,10 @@ and stdout must stay machine-readable.
62
62
  - The stock error object includes `sku`, `available`, and `requested`.
63
63
  - Changing `data/catalog.json` prices or rates changes command output without code changes.
64
64
  - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched (the catalog seed comes from setup, not the arm).
65
+
66
+ ## Solo-headroom hypothesis
67
+
68
+ A capable solo_claude baseline is expected to miss the interaction between
69
+ duplicate-SKU aggregation, line-promotion ordering, coupon ordering, and
70
+ tax/shipping thresholds; observable command
71
+ `node "$BENCH_FIXTURE_DIR/verifiers/exact-success.js"` exposes the miss.
@@ -15,11 +15,17 @@ adversarial ledger examples with repeated IDs, refunds, disputes, and reserves.
15
15
  ## Why existing fixtures do not cover it
16
16
 
17
17
  F16 covers quote math and F25 covers cart promotions, but neither has ledger
18
- idempotency or conflicting duplicate events. F21/F23 became oracle-control
19
- fixtures, so this adds a fresh visible-contract stateful arithmetic candidate.
18
+ idempotency or conflicting duplicate events. F21/F23 cover scheduling and
19
+ allocation ordering, not payout ledger arithmetic.
20
+
21
+ ## Measurement status
22
+
23
+ Headroom run `20260508-f26-headroom` rejected F26 as full-pipeline pair-lift
24
+ evidence: bare scored 25, but `solo_claude` scored 98 and passed all 4
25
+ verification commands, so the fixture is at solo ceiling. Keep it as a ledger
26
+ math control unless the spec is revised to expose a lower solo ceiling.
20
27
 
21
28
  ## Retirement
22
29
 
23
- Retire or replace this fixture if solo consistently reaches ceiling or if
24
- another fixture provides the same idempotent-ledger signal with cleaner
25
- full-pipeline pair lift.
30
+ Retire or replace this fixture if another fixture provides the same
31
+ idempotent-ledger signal with cleaner full-pipeline pair headroom.
@@ -25,4 +25,11 @@ update tests but forget backward-compat requirements (single-item route,
25
25
 
26
26
  ## Rotation trigger
27
27
 
28
- Retire when both arms consistently score > 95 AND produce 2+ new tests covering paging edge cases without pipeline intervention.
28
+ Headroom run `20260511-f3-http-error-headroom` rejected this fixture as
29
+ pair-lift evidence after the invalid-query HTTP error body verifier was
30
+ tightened: bare scored 97 and solo_claude scored 99. Keep it as a backend
31
+ contract control unless the visible contract is reworked to expose lower
32
+ bare/solo ceilings.
33
+
34
+ Retire when both `bare` and `solo_claude` consistently score > 95 AND produce
35
+ 2+ new tests covering paging edge cases without pipeline intervention.
@@ -26,10 +26,12 @@
26
26
  "stdout_not_contains": []
27
27
  },
28
28
  {
29
- "cmd": "node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?per_page=abc`, r => { console.log(r.statusCode); s.close(); }); });'",
29
+ "cmd": "node -e 'const { app } = require(\"./server\"); const http = require(\"http\"); const s = http.createServer(app).listen(0, () => { const { port } = s.address(); http.get(`http://127.0.0.1:${port}/items?per_page=abc`, r => { let b = \"\"; r.on(\"data\", c => b += c); r.on(\"end\", () => { const d = JSON.parse(b); console.log(JSON.stringify({ status: r.statusCode, error: d.error, field: d.field })); s.close(); }); }); });'",
30
30
  "exit_code": 0,
31
31
  "stdout_contains": [
32
- "400"
32
+ "\"status\":400",
33
+ "\"error\":\"invalid_query\"",
34
+ "\"field\":\"per_page\""
33
35
  ],
34
36
  "stdout_not_contains": []
35
37
  }
@@ -50,6 +50,6 @@ so existing assertions continue to pass alongside new paging assertions.
50
50
  - Server start: `node server/index.js` listens on port 3000 (exit via SIGINT).
51
51
  - `curl -s http://127.0.0.1:3000/items | jq '.total'` returns `2`.
52
52
  - `curl -s 'http://127.0.0.1:3000/items?per_page=1&page=2' | jq '.items[0].name'` returns `"beta"`.
53
- - `curl -s 'http://127.0.0.1:3000/items?per_page=abc' -o /dev/null -w '%{http_code}'` returns `400`.
53
+ - `curl -s 'http://127.0.0.1:3000/items?per_page=abc'` returns HTTP status `400` with JSON error body `{ "error": "invalid_query", "field": "per_page" }`.
54
54
  - `node --test tests/server.test.js` passes; must include ≥ 2 new paging tests.
55
55
  - `git diff --stat` shows only `server/index.js` and `tests/server.test.js` touched.
@@ -0,0 +1,34 @@
1
+ # F31 CLI seat rebalance
2
+
3
+ ## Failure mode
4
+
5
+ This fixture detects implementations that pass simple entitlement updates while
6
+ missing the interaction between priority ordering, transfer rollback, rejected
7
+ row ordering, and exact machine-readable error handling.
8
+
9
+ ## Pipeline phase target
10
+
11
+ PLAN must preserve the ordering distinction between processing order and
12
+ rejected-output order. IMPLEMENT must keep transfer mutations all-or-nothing.
13
+ VERIFY should build adversarial cases where a later high-priority transfer
14
+ changes the outcome of an earlier low-priority reserve, and where a failed
15
+ transfer would corrupt state if mutations are applied too early.
16
+
17
+ ## Why existing fixtures do not cover it
18
+
19
+ F21 covers scheduling priority and blocked intervals. F23 covers inventory
20
+ allocation rollback. F25 covers checkout calculation order. This fixture covers
21
+ account entitlement reconciliation with a different state shape and a duplicate
22
+ event-id hard error.
23
+
24
+ ## Retirement
25
+
26
+ Headroom run `20260512-f31-seat-rebalance-headroom` rejected this fixture as
27
+ pair-lift evidence: bare scored 33 but carried judge/result/verify
28
+ disqualifiers, and solo_claude scored 98 with all 3 verification commands
29
+ passing. It should remain a control fixture unless reworked to lower the solo
30
+ ceiling.
31
+
32
+ Retire or replace this fixture if either `bare` or `solo_claude` consistently
33
+ reaches ceiling, or if a later fixture covers priority event processing plus
34
+ all-or-nothing transfer rollback with cleaner full-pipeline lift.
@@ -0,0 +1,57 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/cli.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": ["not ok "]
8
+ },
9
+ {
10
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/priority-transfer-rollback.js\"",
11
+ "exit_code": 0,
12
+ "stdout_contains": ["\"ok\":true"],
13
+ "stdout_not_contains": [],
14
+ "contract_refs": [
15
+ "Process events globally by `priority` descending, then `effective_at` ascending, then `id` ascending.",
16
+ "`transfer` is all-or-nothing. It accepts only when both accounts exist, the source has `used >= qty`, the destination has at least `qty` free seats, and both accounts have the same `region` unless `allow_cross_region` is `true`.",
17
+ "A rejected `transfer` must not change either account. Use reason `unknown_account`, `region_mismatch`, `insufficient_used`, or `no_capacity` for the first failing transfer rule in the order listed above.",
18
+ "`rejected` is ordered in the original input event order. Each row has keys `id`, `reason`.",
19
+ "`accounts` is sorted by account id ascending. Each row has keys `id`, `region`, `seats`, `used`, `free`, where `free = seats - used`.",
20
+ "On success, write exactly one JSON object to stdout and no stderr. Keys: `applied`, `rejected`, `accounts`."
21
+ ]
22
+ },
23
+ {
24
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/duplicate-event-error.js\"",
25
+ "exit_code": 0,
26
+ "stdout_contains": ["\"ok\":true"],
27
+ "stdout_not_contains": [],
28
+ "contract_refs": [
29
+ "Before processing any event, duplicate event ids are invalid input: exit `2`, write exactly one JSON error object `{ \"error\": \"duplicate_event_id\", \"id\": string }` to stderr, and write no stdout."
30
+ ]
31
+ }
32
+ ],
33
+ "forbidden_patterns": [
34
+ {
35
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
36
+ "description": "silent catch returning fallback in seat rebalance path",
37
+ "files": ["bin/cli.js", "tests/cli.test.js"],
38
+ "severity": "disqualifier"
39
+ },
40
+ {
41
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
42
+ "description": "empty catch block",
43
+ "files": ["bin/cli.js", "tests/cli.test.js"],
44
+ "severity": "disqualifier"
45
+ },
46
+ {
47
+ "pattern": "/\\*\\s*eslint-disable",
48
+ "description": "eslint-disable without scoped justification",
49
+ "files": ["bin/cli.js"],
50
+ "severity": "disqualifier"
51
+ }
52
+ ],
53
+ "required_files": ["bin/cli.js", "tests/cli.test.js"],
54
+ "forbidden_files": [],
55
+ "max_deps_added": 0,
56
+ "spec_output_files": ["bin/cli.js", "tests/cli.test.js"]
57
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F31-cli-seat-rebalance",
3
+ "category": "high-risk",
4
+ "difficulty": "high",
5
+ "timeout_seconds": 1500,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add a bench-cli rebalance-seats command that reads account capacity and seat events from a JSON file, processes events by priority with all-or-nothing transfers, rejects invalid per-event operations without corrupting state, and prints exact applied, rejected, and final account rows."
10
+ }
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env bash
2
+ set -e # abort immediately if any command in this script exits non-zero
@@ -0,0 +1,67 @@
1
+ ---
2
+ id: "F31-cli-seat-rebalance"
3
+ title: "Seat rebalance command"
4
+ status: planned
5
+ complexity: high
6
+ depends-on: []
7
+ ---
8
+
9
+ # F31 Seat rebalance command
10
+
11
+ ## Context
12
+
13
+ `bench-cli` currently has greeting and version commands only. The task:
14
+ add a `rebalance-seats` command that reads account capacity and seat events from
15
+ a JSON file, processes events by priority with all-or-nothing transfers, rejects
16
+ invalid per-event operations without corrupting state, and prints exact applied,
17
+ rejected, and final account rows.
18
+
19
+ This is account entitlement reconciliation. Downstream billing tools parse the
20
+ output, so success and error output must be exact machine-readable JSON.
21
+
22
+ ## Requirements
23
+
24
+ - [ ] `bench-cli rebalance-seats --input <path>` reads JSON shaped as `{ "accounts": [{ "id": string, "region": string, "seats": number, "used": number }], "events": [event] }`.
25
+ - [ ] Valid event types are `reserve`, `release`, and `transfer`.
26
+ - [ ] `reserve` events have keys `id`, `type`, `account`, `qty`, `priority`, and `effective_at`.
27
+ - [ ] `release` events have keys `id`, `type`, `account`, `qty`, `priority`, and `effective_at`.
28
+ - [ ] `transfer` events have keys `id`, `type`, `from`, `to`, `qty`, `priority`, `effective_at`, and optional `allow_cross_region`.
29
+ - [ ] Before processing any event, duplicate event ids are invalid input: exit `2`, write exactly one JSON error object `{ "error": "duplicate_event_id", "id": string }` to stderr, and write no stdout.
30
+ - [ ] Before processing any event, account rows must have unique ids, non-empty string `id` and `region`, integer `seats >= 0`, and integer `used` with `0 <= used <= seats`. Invalid account input exits `2` with one JSON error object and no stdout.
31
+ - [ ] Before processing any event, every event `qty` must be a positive integer, every `priority` must be an integer, and every `effective_at` must be a non-empty string. Invalid event input exits `2` with one JSON error object and no stdout.
32
+ - [ ] Process events globally by `priority` descending, then `effective_at` ascending, then `id` ascending.
33
+ - [ ] `reserve` accepts only when the account exists and has at least `qty` free seats. On accept, increase that account's `used` by `qty`. Otherwise reject the event with reason `unknown_account` or `no_capacity`.
34
+ - [ ] `release` accepts only when the account exists and `used >= qty`. On accept, decrease that account's `used` by `qty`. Otherwise reject the event with reason `unknown_account` or `insufficient_used`.
35
+ - [ ] `transfer` is all-or-nothing. It accepts only when both accounts exist, the source has `used >= qty`, the destination has at least `qty` free seats, and both accounts have the same `region` unless `allow_cross_region` is `true`.
36
+ - [ ] A rejected `transfer` must not change either account. Use reason `unknown_account`, `region_mismatch`, `insufficient_used`, or `no_capacity` for the first failing transfer rule in the order listed above.
37
+ - [ ] On success, write exactly one JSON object to stdout and no stderr. Keys: `applied`, `rejected`, `accounts`.
38
+ - [ ] `applied` is ordered in processing order. Each row has keys `id`, `type`.
39
+ - [ ] `rejected` is ordered in the original input event order. Each row has keys `id`, `reason`.
40
+ - [ ] `accounts` is sorted by account id ascending. Each row has keys `id`, `region`, `seats`, `used`, `free`, where `free = seats - used`.
41
+ - [ ] `tests/cli.test.js` is updated. Existing tests still pass and at least two new tests cover `rebalance-seats`: one successful priority/transfer scenario and one validation failure.
42
+
43
+ ## Constraints
44
+
45
+ - **No new npm dependencies.**
46
+ - **No hidden mutable global state.** The command must derive output only from the input JSON for that invocation.
47
+ - **No silent catches.** Parse and file-read failures must emit a visible JSON error to stderr and exit `2`.
48
+ - **No extra stdout/stderr text** on the success path; downstream tooling parses stdout as JSON.
49
+
50
+ ## Out of Scope
51
+
52
+ - Persisting account state between command invocations.
53
+ - Adding billing invoices, plan catalogs, or currency calculations.
54
+ - Adding web UI or server routes.
55
+ - Touching `server/`, `web/`, or `tests/server.test.js`.
56
+
57
+ ## Verification
58
+
59
+ - `node --test tests/cli.test.js` exits 0.
60
+ - A later high-priority transfer is processed before an earlier low-priority reserve, and the low-priority reserve can lose capacity because of that ordering.
61
+ - A rejected transfer leaves both source and destination account usage unchanged.
62
+ - Region mismatch rejects a transfer unless `allow_cross_region` is `true`.
63
+ - `rejected` rows are reported in the original input event order, even though processing order is priority based.
64
+ - Duplicate event ids exit `2`, print exactly one JSON error object `{ "error": "duplicate_event_id", "id": string }` to stderr, and print no stdout.
65
+ - Final `accounts` rows are sorted by id and include exact `free` values.
66
+ - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched.
67
+ - Solo-headroom hypothesis: solo_claude is expected to miss transfer rollback or rejected-row ordering under priority processing; observable command `node "$BENCH_FIXTURE_DIR/verifiers/priority-transfer-rollback.js"` exposes the miss.
@@ -0,0 +1,7 @@
1
+ Add a bench-cli rebalance-seats command that reads account capacity and seat events from a JSON file, processes events by priority with all-or-nothing transfers, rejects invalid per-event operations without corrupting state, and prints exact applied, rejected, and final account rows.
2
+
3
+ The command should be `bench-cli rebalance-seats --input <path>`. The input JSON has account rows with id, region, seats, and used, plus event rows for reserve, release, and transfer operations. Process all events by priority descending, then effective_at ascending, then id ascending.
4
+
5
+ Transfers must be all-or-nothing: if either account is missing, the source lacks used seats, the destination lacks free seats, or the regions differ without allow_cross_region, reject the transfer and leave both accounts unchanged. Reserve and release should reject per-event failures without aborting the whole command. Duplicate event ids are invalid input and must exit 2 with exactly one JSON error object on stderr and no stdout.
6
+
7
+ On success, stdout must be exactly one JSON object with applied rows in processing order, rejected rows in original input order, and final account rows sorted by account id with free = seats - used. Update `tests/cli.test.js` so existing tests still pass and at least two new tests cover rebalance-seats, including one successful priority/transfer scenario and one validation failure. Do not add dependencies or touch server/web files.
@@ -0,0 +1,35 @@
1
+ 'use strict'; // Verifier script: runs `rebalance-seats` on input containing a repeated event id and asserts the duplicate_event_id error contract.
2
+ const assert = require('node:assert');
3
+ const { spawnSync } = require('node:child_process');
4
+ const fs = require('node:fs');
5
+ const os = require('node:os');
6
+ const path = require('node:path');
7
+
8
+ const work = process.env.BENCH_WORKDIR || process.cwd(); // directory holding the CLI under test; an unset or empty BENCH_WORKDIR falls back to the current directory
9
+ const cli = path.join(work, 'bin', 'cli.js'); // entry point of the CLI under test
10
+ const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'f31-duplicate-')); // fresh temp directory per run; NOTE(review): nothing in this script removes it afterwards
11
+ const input = path.join(tmp, 'events.json'); // input file handed to the CLI via --input
12
+
13
+ fs.writeFileSync(input, JSON.stringify({
14
+ accounts: [
15
+ { id: 'team-a', region: 'us', seats: 5, used: 1 }
16
+ ],
17
+ events: [ // both events are otherwise well-formed; the only problem is that they share the id 'dup'
18
+ { id: 'dup', type: 'reserve', account: 'team-a', qty: 1, priority: 2, effective_at: '2026-01-01T09:00:00Z' },
19
+ { id: 'dup', type: 'release', account: 'team-a', qty: 1, priority: 1, effective_at: '2026-01-01T09:01:00Z' }
20
+ ]
21
+ }), 'utf8');
22
+
23
+ const result = spawnSync('node', [cli, 'rebalance-seats', '--input', input], { // runs the CLI synchronously; `node` is resolved through PATH
24
+ cwd: work,
25
+ encoding: 'utf8' // stdout/stderr are returned as strings rather than Buffers
26
+ });
27
+
28
+ assert.strictEqual(result.status, 2); // invalid input must exit with code 2
29
+ assert.strictEqual(result.stdout, ''); // nothing may be written to stdout on this error path
30
+ assert.deepStrictEqual(JSON.parse(result.stderr), { // stderr must parse as a single JSON value with exactly these two keys
31
+ error: 'duplicate_event_id',
32
+ id: 'dup'
33
+ });
34
+
35
+ console.log(JSON.stringify({ ok: true })); // reached only when every assertion above passed (a failed assert throws first); prints {"ok":true}
@@ -0,0 +1,53 @@
1
+ 'use strict'; // Verifier script: runs `rebalance-seats` on a mixed reserve/release/transfer scenario and asserts the full parsed stdout object.
2
+ const assert = require('node:assert');
3
+ const { spawnSync } = require('node:child_process');
4
+ const fs = require('node:fs');
5
+ const os = require('node:os');
6
+ const path = require('node:path');
7
+
8
+ const work = process.env.BENCH_WORKDIR || process.cwd(); // directory holding the CLI under test; an unset or empty BENCH_WORKDIR falls back to the current directory
9
+ const cli = path.join(work, 'bin', 'cli.js'); // entry point of the CLI under test
10
+ const tmp = fs.mkdtempSync(path.join(os.tmpdir(), 'f31-rebalance-')); // fresh temp directory per run; NOTE(review): nothing in this script removes it afterwards
11
+ const input = path.join(tmp, 'events.json'); // input file handed to the CLI via --input
12
+
13
+ fs.writeFileSync(input, JSON.stringify({
14
+ accounts: [
15
+ { id: 'team-a', region: 'us', seats: 5, used: 3 },
16
+ { id: 'team-b', region: 'us', seats: 4, used: 1 },
17
+ { id: 'team-eu', region: 'eu', seats: 4, used: 0 }
18
+ ],
19
+ events: [ // input order deliberately differs from priority order
20
+ { id: 'low-reserve', type: 'reserve', account: 'team-b', qty: 3, priority: 1, effective_at: '2026-01-01T09:00:00Z' }, // first in input, lowest priority (1)
21
+ { id: 'bad-cross', type: 'transfer', from: 'team-a', to: 'team-eu', qty: 1, priority: 8, effective_at: '2026-01-01T09:02:00Z' }, // us -> eu with no allow_cross_region flag
22
+ { id: 'high-transfer', type: 'transfer', from: 'team-a', to: 'team-b', qty: 2, priority: 10, effective_at: '2026-01-01T09:05:00Z' }, // third in input, highest priority (10)
23
+ { id: 'after-release', type: 'release', account: 'team-a', qty: 1, priority: 7, effective_at: '2026-01-01T09:03:00Z' },
24
+ { id: 'after-reserve', type: 'reserve', account: 'team-a', qty: 5, priority: 6, effective_at: '2026-01-01T09:04:00Z' } // needs all 5 of team-a's seats to be free
25
+ ]
26
+ }), 'utf8');
27
+
28
+ const result = spawnSync('node', [cli, 'rebalance-seats', '--input', input], { // runs the CLI synchronously; `node` is resolved through PATH
29
+ cwd: work,
30
+ encoding: 'utf8' // stdout/stderr are returned as strings rather than Buffers
31
+ });
32
+ assert.strictEqual(result.status, 0, result.stderr || result.stdout); // on failure, the CLI's own output becomes the assertion message
33
+ assert.strictEqual(result.stderr, ''); // success path must not write anything to stderr
34
+ const parsed = JSON.parse(result.stdout); // throws if stdout is not a single JSON value
35
+
36
+ assert.deepStrictEqual(parsed, { // whole-object comparison: missing or extra keys fail, not just wrong values
37
+ applied: [ // descending priority among accepted events: 10, 7, 6
38
+ { id: 'high-transfer', type: 'transfer' },
39
+ { id: 'after-release', type: 'release' },
40
+ { id: 'after-reserve', type: 'reserve' }
41
+ ],
42
+ rejected: [ // same relative order as the events array above, not priority order
43
+ { id: 'low-reserve', reason: 'no_capacity' }, // team-b has only 1 free seat left once high-transfer has moved 2 seats in
44
+ { id: 'bad-cross', reason: 'region_mismatch' }
45
+ ],
46
+ accounts: [ // ascending by id; free equals seats - used in every row
47
+ { id: 'team-a', region: 'us', seats: 5, used: 5, free: 0 }, // 3 - 2 (transfer out) - 1 (release) + 5 (reserve)
48
+ { id: 'team-b', region: 'us', seats: 4, used: 3, free: 1 }, // 1 + 2 (transfer in); the rejected reserve adds nothing
49
+ { id: 'team-eu', region: 'eu', seats: 4, used: 0, free: 4 } // unchanged by the rejected cross-region transfer
50
+ ]
51
+ });
52
+
53
+ console.log(JSON.stringify({ ok: true })); // reached only when every assertion above passed (a failed assert throws first); prints {"ok":true}
@@ -0,0 +1,38 @@
1
+ # F32 Subscription renewal command
2
+
3
+ ## Failure mode
4
+
5
+ This fixture targets billing-style state mutation where an implementation can
6
+ look correct on isolated cases but fail the interaction between renewal
7
+ priority, tentative credit application, rollback after `payment_required`, exact
8
+ credit consumption order, and strict JSON row shapes.
9
+
10
+ ## Pipeline phase coverage
11
+
12
+ - PLAN must preserve the exact input/output field names and ordering clauses.
13
+ - RISK_PROBES should derive a compound priority + rollback + shape probe.
14
+ - IMPLEMENT must avoid input-order processing and must not leak tentative credit
15
+ consumption from rejected renewals.
16
+ - VERIFY pair mode should catch aliased keys, extra keys, and weak tests that
17
+ check only one field rather than the full parsed output.
18
+
19
+ ## Why existing fixtures do not cover it
20
+
21
+ F25 covers pricing math and output shape, F31 covers entitlement transfers and
22
+ duplicate-id errors, and F23 covers fulfillment rollback. F32 combines billing
23
+ credits with a failed high-priority renewal that must roll back before a later
24
+ renewal can consume credits, plus exact nested output key sets.
25
+
26
+ ## Retirement criteria
27
+
28
+ Retire or replace this fixture if both `bare` and `solo_claude` score above 95
29
+ on two current-model runs, or if another active fixture covers priority-ordered
30
+ tentative monetary credit rollback with exact nested output shape and duplicate
31
+ ID error contracts.
32
+
33
+ ## Pair-candidate status
34
+
35
+ Rejected as pair-lift evidence by `20260512-f32-subscription-renewal-headroom`:
36
+ bare scored 33, but solo_claude scored 98 and passed all 3 verification
37
+ commands. Keep it as a billing rollback/shape control, not as a pair arm target,
38
+ unless it is reworked and clears a fresh headroom gate.