devlyn-cli 2.3.0 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +80 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +210 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -0,0 +1,57 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/cli.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": ["not ok "]
8
+ },
9
+ {
10
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/priority-credit-rollback.js\"",
11
+ "exit_code": 0,
12
+ "stdout_contains": ["\"ok\":true"],
13
+ "stdout_not_contains": [],
14
+ "contract_refs": [
15
+ "Process renewals globally by `priority` descending, then `requested_at` ascending, then `id` ascending.",
16
+ "A rejected renewal with reason `payment_required` must not consume any credits, even if it tentatively applied credits before discovering the remaining due exceeded `max_due_cents`.",
17
+ "Usable credits are credits for the same customer with `expires_at >= as_of` and `cents > 0`, consumed by `expires_at` ascending, then `id` ascending.",
18
+ "`rejected` is ordered in the original input renewal order. Each row has keys `id` and `reason`.",
19
+ "`remaining_credits` includes only non-expired credits with positive cents after accepted renewals, sorted by `customer`, then `expires_at`, then `id`. Each row has keys `id`, `customer`, `cents`, and `expires_at`.",
20
+ "Output row key names and nested `credits` key names match the visible spec exactly, with no aliased or extra keys."
21
+ ]
22
+ },
23
+ {
24
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/duplicate-renewal-error.js\"",
25
+ "exit_code": 0,
26
+ "stdout_contains": ["\"ok\":true"],
27
+ "stdout_not_contains": [],
28
+ "contract_refs": [
29
+ "Before processing any renewal, duplicate renewal ids are invalid input: exit `2`, write exactly one JSON error object `{ \"error\": \"duplicate_renewal_id\", \"id\": string }` to stderr, and write no stdout."
30
+ ]
31
+ }
32
+ ],
33
+ "forbidden_patterns": [
34
+ {
35
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
36
+ "description": "silent catch returning fallback in subscription renewal path",
37
+ "files": ["bin/cli.js", "tests/cli.test.js"],
38
+ "severity": "disqualifier"
39
+ },
40
+ {
41
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
42
+ "description": "empty catch block",
43
+ "files": ["bin/cli.js", "tests/cli.test.js"],
44
+ "severity": "disqualifier"
45
+ },
46
+ {
47
+ "pattern": "/\\*\\s*eslint-disable",
48
+ "description": "eslint-disable without scoped justification",
49
+ "files": ["bin/cli.js"],
50
+ "severity": "disqualifier"
51
+ }
52
+ ],
53
+ "required_files": ["bin/cli.js", "tests/cli.test.js"],
54
+ "forbidden_files": [],
55
+ "max_deps_added": 0,
56
+ "spec_output_files": ["bin/cli.js", "tests/cli.test.js"]
57
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F32-cli-subscription-renewal",
3
+ "category": "high-risk",
4
+ "difficulty": "hard",
5
+ "timeout_seconds": 1800,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add a subscription renewal CLI command that applies expiring credits in priority order with rollback and exact machine-readable output."
10
+ }
@@ -0,0 +1,2 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
@@ -0,0 +1,70 @@
1
+ ---
2
+ id: "F32-cli-subscription-renewal"
3
+ title: "Subscription renewal command"
4
+ status: planned
5
+ complexity: high
6
+ depends-on: []
7
+ ---
8
+
9
+ # F32 Subscription renewal command
10
+
11
+ ## Context
12
+
13
+ `bench-cli` currently has greeting and version commands only. The task:
14
+ add a `renew-subscriptions` command that applies subscription renewal requests,
15
+ uses expiring customer credits in deterministic order, rolls back credits when a
16
+ renewal cannot be paid, and prints exact invoice, rejected, and remaining-credit
17
+ rows.
18
+
19
+ This is billing reconciliation. Downstream finance tools parse stdout as JSON,
20
+ so field names, error objects, and row shapes are part of the contract.
21
+
22
+ ## Requirements
23
+
24
+ - [ ] `bench-cli renew-subscriptions --input <path>` reads JSON shaped as `{ "as_of": string, "plans": [plan], "customers": [customer], "credits": [credit], "renewals": [renewal] }`.
25
+ - [ ] Each plan has keys `id`, `monthly_cents`, `included_seats`, and `overage_cents`.
26
+ - [ ] Each customer has keys `id`, `plan`, and `active`.
27
+ - [ ] Each credit has keys `id`, `customer`, `cents`, and `expires_at`.
28
+ - [ ] Each renewal has keys `id`, `customer`, `seats`, `months`, `priority`, `requested_at`, and `max_due_cents`.
29
+ - [ ] Before processing any renewal, duplicate renewal ids are invalid input: exit `2`, write exactly one JSON error object `{ "error": "duplicate_renewal_id", "id": string }` to stderr, and write no stdout.
30
+ - [ ] Before processing any renewal, all cents, seat, month, and priority fields must be integers; `monthly_cents`, `overage_cents`, `included_seats`, and `cents` must be non-negative, and `seats` and `months` must be positive. Invalid input exits `2` with one JSON error object and no stdout.
31
+ - [ ] Process renewals globally by `priority` descending, then `requested_at` ascending, then `id` ascending.
32
+ - [ ] A renewal rejects with reason `unknown_customer` when the customer does not exist, `inactive_customer` when the customer is inactive, and `unknown_plan` when the customer's plan does not exist.
33
+ - [ ] Renewal subtotal is `(plan.monthly_cents + max(0, seats - plan.included_seats) * plan.overage_cents) * months`.
34
+ - [ ] Usable credits are credits for the same customer with `expires_at >= as_of` and `cents > 0`, consumed by `expires_at` ascending, then `id` ascending.
35
+ - [ ] A renewal accepts only when `subtotal_cents - credit_applied_cents <= max_due_cents`.
36
+ - [ ] A rejected renewal with reason `payment_required` must not consume any credits, even if it tentatively applied credits before discovering the remaining due exceeded `max_due_cents`.
37
+ - [ ] On success, write exactly one JSON object to stdout and no stderr. Keys: `invoices`, `rejected`, `remaining_credits`.
38
+ - [ ] `invoices` is ordered in processing order. Each row has keys `id`, `customer`, `subtotal_cents`, `credit_applied_cents`, `due_cents`, and `credits`.
39
+ - [ ] Each invoice `credits` row has keys `id` and `applied_cents`, ordered by the credit consumption order.
40
+ - [ ] `rejected` is ordered in the original input renewal order. Each row has keys `id` and `reason`.
41
+ - [ ] `remaining_credits` includes only non-expired credits with positive cents after accepted renewals, sorted by `customer`, then `expires_at`, then `id`. Each row has keys `id`, `customer`, `cents`, and `expires_at`.
42
+ - [ ] `tests/cli.test.js` is updated. Existing tests still pass and at least two new tests cover `renew-subscriptions`: one successful priority/rollback scenario and one validation failure.
43
+
44
+ ## Constraints
45
+
46
+ - **No new npm dependencies.**
47
+ - **No floating money output.** All public amounts are integer cents.
48
+ - **No hidden mutable global state.** The command must derive output only from the input JSON for that invocation.
49
+ - **No silent catches.** Parse and file-read failures must emit a visible JSON error to stderr and exit `2`.
50
+ - **No extra stdout/stderr text** on the success path; downstream tooling parses stdout as JSON.
51
+
52
+ ## Out of Scope
53
+
54
+ - Persisting renewal state between command invocations.
55
+ - Adding invoices to a database or writing files.
56
+ - Adding currencies, payment gateways, or tax rules.
57
+ - Adding web UI or server routes.
58
+ - Touching `server/`, `web/`, or `tests/server.test.js`.
59
+
60
+ ## Verification
61
+
62
+ - `node --test tests/cli.test.js` exits 0.
63
+ - A later high-priority renewal is processed before an earlier low-priority renewal, and the low-priority renewal can lose usable credits because of that ordering.
64
+ - A rejected `payment_required` renewal leaves all tentatively applied credits available for later renewals.
65
+ - Credits are consumed by `expires_at` ascending, then `id` ascending, and expired or zero-cent credits are absent from `remaining_credits`.
66
+ - `rejected` rows are reported in the original input renewal order, even though processing order is priority based.
67
+ - Duplicate renewal ids exit `2`, write exactly one JSON error object `{ "error": "duplicate_renewal_id", "id": string }` to stderr, and write no stdout.
68
+ - Output row key names and nested `credits` key names match the visible spec exactly, with no aliased or extra keys.
69
+ - `git diff --stat` shows only `bin/cli.js` and `tests/cli.test.js` touched.
70
+ - Solo-headroom hypothesis: solo_claude is expected to miss payment-required credit rollback or expiring-credit consumption order; observable command `node "$BENCH_FIXTURE_DIR/verifiers/priority-credit-rollback.js"` exposes the miss.
@@ -0,0 +1,3 @@
1
+ Add a subscription renewal CLI command that applies expiring credits in priority order with rollback and exact machine-readable output.
2
+
3
+ The command should be `bench-cli renew-subscriptions --input <path>`. It reads plans, customers, credits, and renewal requests from JSON, processes renewals by priority, applies usable credits in expiration order, rejects unpaid renewals without consuming their tentative credits, and prints JSON invoices, rejected rows, and remaining credits. Keep the change scoped to the CLI and CLI tests, with no new dependencies and no extra stdout/stderr text.
@@ -0,0 +1,42 @@
1
+ #!/usr/bin/env node
2
+ const assert = require('node:assert/strict');
3
+ const { mkdtempSync, writeFileSync, rmSync } = require('node:fs');
4
+ const { tmpdir } = require('node:os');
5
+ const { join } = require('node:path');
6
+ const { spawnSync } = require('node:child_process');
7
+
8
+ const workdir = process.env.BENCH_WORKDIR || process.cwd();
9
+ const tmp = mkdtempSync(join(tmpdir(), 'f32-renewal-dup-'));
10
+
11
+ try {
12
+ const inputPath = join(tmp, 'input.json');
13
+ writeFileSync(inputPath, JSON.stringify({
14
+ as_of: '2026-05-15',
15
+ plans: [
16
+ { id: 'starter', monthly_cents: 1000, included_seats: 5, overage_cents: 200 }
17
+ ],
18
+ customers: [
19
+ { id: 'c1', plan: 'starter', active: true }
20
+ ],
21
+ credits: [],
22
+ renewals: [
23
+ { id: 'dup-renewal', customer: 'c1', seats: 5, months: 1, priority: 1, requested_at: '2026-05-01', max_due_cents: 1000 },
24
+ { id: 'dup-renewal', customer: 'missing', seats: 5, months: 1, priority: 9, requested_at: '2026-05-02', max_due_cents: 1000 }
25
+ ]
26
+ }));
27
+
28
+ const proc = spawnSync('node', ['bin/cli.js', 'renew-subscriptions', '--input', inputPath], {
29
+ cwd: workdir,
30
+ encoding: 'utf8'
31
+ });
32
+
33
+ assert.equal(proc.status, 2, proc.stderr || proc.stdout);
34
+ assert.equal(proc.stdout, '');
35
+ assert.deepEqual(JSON.parse(proc.stderr), {
36
+ error: 'duplicate_renewal_id',
37
+ id: 'dup-renewal'
38
+ });
39
+ process.stdout.write(JSON.stringify({ ok: true }) + '\n');
40
+ } finally {
41
+ rmSync(tmp, { recursive: true, force: true });
42
+ }
@@ -0,0 +1,70 @@
1
+ #!/usr/bin/env node
2
+ const assert = require('node:assert/strict');
3
+ const { mkdtempSync, writeFileSync, rmSync } = require('node:fs');
4
+ const { tmpdir } = require('node:os');
5
+ const { join } = require('node:path');
6
+ const { spawnSync } = require('node:child_process');
7
+
8
+ const workdir = process.env.BENCH_WORKDIR || process.cwd();
9
+ const tmp = mkdtempSync(join(tmpdir(), 'f32-renewal-'));
10
+
11
+ try {
12
+ const inputPath = join(tmp, 'input.json');
13
+ writeFileSync(inputPath, JSON.stringify({
14
+ as_of: '2026-05-15',
15
+ plans: [
16
+ { id: 'starter', monthly_cents: 1000, included_seats: 5, overage_cents: 200 },
17
+ { id: 'pro', monthly_cents: 3000, included_seats: 10, overage_cents: 150 }
18
+ ],
19
+ customers: [
20
+ { id: 'c1', plan: 'starter', active: true },
21
+ { id: 'c2', plan: 'pro', active: true }
22
+ ],
23
+ credits: [
24
+ { id: 'cr-late', customer: 'c1', cents: 500, expires_at: '2026-06-30' },
25
+ { id: 'cr-expired', customer: 'c1', cents: 999, expires_at: '2026-04-01' },
26
+ { id: 'cr-early', customer: 'c1', cents: 400, expires_at: '2026-05-31' },
27
+ { id: 'cr-zero', customer: 'c1', cents: 0, expires_at: '2026-05-20' },
28
+ { id: 'cr-c2', customer: 'c2', cents: 1000, expires_at: '2026-12-31' }
29
+ ],
30
+ renewals: [
31
+ { id: 'r-low', customer: 'c1', seats: 5, months: 1, priority: 1, requested_at: '2026-05-01', max_due_cents: 100 },
32
+ { id: 'r-mid', customer: 'c1', seats: 8, months: 1, priority: 10, requested_at: '2026-05-02', max_due_cents: 0 },
33
+ { id: 'r-high', customer: 'c1', seats: 8, months: 1, priority: 9, requested_at: '2026-05-03', max_due_cents: 800 }
34
+ ]
35
+ }));
36
+
37
+ const proc = spawnSync('node', ['bin/cli.js', 'renew-subscriptions', '--input', inputPath], {
38
+ cwd: workdir,
39
+ encoding: 'utf8'
40
+ });
41
+
42
+ assert.equal(proc.status, 0, proc.stderr || proc.stdout);
43
+ assert.equal(proc.stderr, '');
44
+ const output = JSON.parse(proc.stdout);
45
+ assert.deepEqual(output, {
46
+ invoices: [
47
+ {
48
+ id: 'r-high',
49
+ customer: 'c1',
50
+ subtotal_cents: 1600,
51
+ credit_applied_cents: 900,
52
+ due_cents: 700,
53
+ credits: [
54
+ { id: 'cr-early', applied_cents: 400 },
55
+ { id: 'cr-late', applied_cents: 500 }
56
+ ]
57
+ }
58
+ ],
59
+ rejected: [
60
+ { id: 'r-low', reason: 'payment_required' },
61
+ { id: 'r-mid', reason: 'payment_required' }
62
+ ],
63
+ remaining_credits: [
64
+ { id: 'cr-c2', customer: 'c2', cents: 1000, expires_at: '2026-12-31' }
65
+ ]
66
+ });
67
+ process.stdout.write(JSON.stringify({ ok: true }) + '\n');
68
+ } finally {
69
+ rmSync(tmp, { recursive: true, force: true });
70
+ }
@@ -33,8 +33,15 @@ tests won't surface.
33
33
  stricter browser-required gating; today the fixture only checks file
34
34
  presence in verification.
35
35
 
36
+ ## Current status
37
+
38
+ Rejected as pair-lift evidence. `20260512-f4-web-headroom` measured bare 70 /
39
+ solo_claude 92, with a +22 solo-over-bare margin, but failed headroom because
40
+ bare exceeded 60, solo exceeded 80, and bare carried judge/result/verify
41
+ disqualifiers. Rework the fixture or verifier before spending a pair arm on it.
42
+
36
43
  ## Rotation trigger
37
44
 
38
- When both arms consistently produce correct output AND include accessible
39
- markup without pipeline intervention, rotate to a harder UI task (e.g., a
40
- form with validation states).
45
+ When both `bare` and `solo_claude` consistently produce correct output AND
46
+ include accessible markup without pipeline intervention, rotate to a harder UI
47
+ task (e.g., a form with validation states).
@@ -31,6 +31,13 @@ calls it done. Verification catches that.
31
31
  - **Phase 2.5 FIX LOOP** runs at least once. A fixture passing with 0 fix rounds is a smoke signal that the test-trap design is too lenient; inspect.
32
32
  - **Phase 1.4 BUILD GATE** uses `node --test` which exits non-zero on any failure, forcing route to 2.5.
33
33
 
34
+ ## Current status
35
+
36
+ Rejected as pair-lift evidence. `20260512-f5-fixloop-headroom` measured bare
37
+ 99 / solo_claude 99, with bare and solo each passing 5/5 verification commands.
38
+ It fails both headroom preconditions and should remain a fix-loop control unless
39
+ reworked.
40
+
34
41
  ## Rotation trigger
35
42
 
36
43
  When fix rounds consistently = 0 across two shipped versions, the trap is too
@@ -30,6 +30,11 @@ of over-reaching. As models improve, they should take the stdlib path more
30
30
  often. Margin on this fixture is a clean signal of pipeline's ability to
31
31
  enforce repo-level no-deps policy.
32
32
 
33
+ Current status: rejected as pair-lift evidence. `20260512-f6-checksum-headroom`
34
+ measured bare 97 / solo_claude 96, with `bare` and `solo_claude` passing 6/6
35
+ verification commands. It fails both headroom preconditions and should remain a
36
+ dep-audit control unless reworked.
37
+
33
38
  ## Rotation trigger
34
39
 
35
40
  When bare arms consistently avoid dependency-adding and pipeline still
@@ -45,6 +45,13 @@ If bare somehow beats variant (variant fixes the bug = scope violation,
45
45
  bare doesn't), that's a real signal that the pipeline's scope discipline
46
46
  is weak and needs CRITIC prompt tuning.
47
47
 
48
+ ## Current status
49
+
50
+ Rejected as pair-lift evidence. `20260512-f7-scope-headroom` measured bare
51
+ 99 / solo_claude 100, with bare and solo each passing 6/6 verification commands.
52
+ It fails both headroom preconditions and should remain a scope-discipline
53
+ control unless reworked.
54
+
48
55
  ## Rotation trigger
49
56
 
50
57
  Retire when variant scope-discipline axis > 24 on two shipped versions.
@@ -12,6 +12,9 @@ Margin ∈ [-3, +3] is the expected range. Both arms should produce small,
12
12
  reasonable improvements. The judge may slightly prefer one or the other
13
13
  based on taste.
14
14
 
15
+ Pair-candidate status: rejected by design. F8 is a known-limit ambiguity
16
+ barometer whose expected margin is a tie range, not pair-lift evidence.
17
+
15
18
  Margin > +3 means the fixture is no longer a known limit — either the
16
19
  harness got notably better at ambiguous specs (improve prompt or reuse the
17
20
  pattern elsewhere), or the task is drifting from its "under-specified"
@@ -2,7 +2,7 @@
2
2
  id: "F8-known-limit-ambiguous"
3
3
  title: "Improve the CLI"
4
4
  status: planned
5
- complexity: ambiguous
5
+ complexity: medium
6
6
  depends-on: []
7
7
  ---
8
8
 
@@ -57,13 +57,15 @@ The harness refuses `--resolve-skill old` on F9 with a hard error.
57
57
  This asymmetry is INTENTIONAL — the fixture tests total-output quality,
58
58
  not per-file quality.
59
59
 
60
- ## Variant artifact check (out-of-band, NOT in expected.json)
60
+ ## Skill-driven artifact check (out-of-band, NOT in expected.json)
61
61
 
62
62
  Per Codex R0.5 §B: `expected.json.verification_commands` apply to ALL arms
63
63
  (see `run-fixture.sh:472`). A `docs/specs/**` check in expected.json would
64
- punish the bare arm (which doesn't run ideate). Variant-only artifact
64
+ punish the bare arm (which doesn't run ideate). Skill-driven artifact
65
65
  verification lives in `scripts/check-f9-artifacts.py`, which runs AFTER
66
- the per-fixture verification block and asserts variant/solo arms produced:
66
+ the per-fixture verification block and asserts every non-bare skill arm
67
+ (`variant`, `solo_claude`, `l2_gated`, `l2_risk_probes`, `l2_forced`)
68
+ produced:
67
69
 
68
70
  - `docs/specs/<id>-<slug>/spec.md` exists.
69
71
  - `docs/specs/<id>-<slug>/spec.expected.json` exists.
@@ -91,3 +93,13 @@ the per-fixture verification block and asserts variant/solo arms produced:
91
93
  F9 is the last fixture we rotate — it's the anchor. If it saturates
92
94
  (variant consistently > 95), the whole suite needs a harder novice-flow
93
95
  anchor before we retire this one.
96
+
97
+ ## Current pair-evidence status
98
+
99
+ Rejected as pair-lift evidence until reworked. `20260512-f9-e2e-headroom`
100
+ measured bare 60 / solo_claude 90 with a +30 solo-over-bare margin, and
101
+ `check-f9-artifacts.py` passed for bare (exempt) and solo_claude. The headroom
102
+ gate still failed because bare headroom was 0 < 5, solo_claude exceeded 80, and
103
+ bare carried a judge disqualifier. Keep F9 as the novice-flow anchor, but do not
104
+ spend pair arms on it as pair evidence until the fixture is reworked and clears
105
+ a fresh headroom gate.
@@ -56,4 +56,4 @@ inside `/devlyn:resolve` (no separate preflight skill in the 2-skill design).
56
56
  - `cd /tmp && node <worktree>/bin/cli.js gitstats` (from outside a repo — use the worktree's absolute path) exits 2.
57
57
  - `node --test tests/` passes.
58
58
 
59
- (Variant-only artifact checks — `docs/specs/<id>-<slug>/spec.md` + `spec.expected.json` existence, transcript fingerprint — live in `scripts/check-f9-artifacts.py`, NOT in the shared verification block above. See NOTES.md.)
59
+ (Skill-driven artifact checks — `docs/specs/<id>-<slug>/spec.md` + `spec.expected.json` existence, transcript fingerprint — live in `scripts/check-f9-artifacts.py`, NOT in the shared verification block above. Bare is exempt. See NOTES.md.)
@@ -19,16 +19,16 @@ Every fixture is a directory under `benchmark/auto-resolve/fixtures/F<N>-<slug>/
19
19
 
20
20
  - **id** — matches directory name, used across artifacts.
21
21
  - **category** — one of `trivial | medium | high-risk | stress | edge | e2e`. Drives which ship-gate rule applies.
22
- - **difficulty** — expected difficulty independent of category. Rubric uses this only for saturation detection (when both arms > 95 for two versions, flag fixture for rotation).
22
+ - **difficulty** — expected difficulty independent of category. Rubric uses this only for saturation detection (when `bare` and `solo_claude` both exceed 95 for two versions, flag fixture for rotation).
23
23
  - **timeout_seconds** — per-arm hard timeout. Runner kills the arm at this limit and marks result `TIMEOUT`.
24
24
  - **required_tools** — binaries the arm's environment must provide. Runner checks before invocation.
25
25
  - **browser** — true if arm must be able to run Playwright. Runner uses this to decide whether `test-repo`'s Playwright deps get installed before the arm starts.
26
- - **deps_change_expected** — true if the task involves modifying `package.json` / lockfiles. Variant's CRITIC security sub-pass is expected to trigger native `security-review` dep audit when true.
27
- - **intent** — **load-bearing**. A short plain-language statement shared by both arms. `spec.md` formalizes it into auto-resolve-ready form; `task.txt` renders it as a direct prompt. A CI lint ensures both derive from this field and stay in sync.
26
+ - **deps_change_expected** — true if the task involves modifying `package.json` / lockfiles. The pipeline arm's CRITIC security sub-pass is expected to trigger native `security-review` dep audit when true.
27
+ - **intent** — **load-bearing**. A short plain-language statement shared by all arms. `spec.md` formalizes it into resolve-ready form; `task.txt` renders it as a direct prompt. A CI lint ensures both derive from this field and stay in sync.
28
28
 
29
29
  ## spec.md
30
30
 
31
- Auto-resolve-ready spec for the pipeline arm. Same format `/devlyn:ideate` produces:
31
+ Resolve-ready spec for the pipeline arm. Same format `/devlyn:ideate` produces:
32
32
 
33
33
  ```markdown
34
34
  ---
@@ -52,12 +52,17 @@ depends-on: []
52
52
  - Concrete, with reasoning for each (not bare).
53
53
 
54
54
  ## Out of Scope
55
- - Explicit "must NOT build" list. Audited by preflight as anti-commitments.
55
+ - Explicit "must NOT build" list. Audited by resolve/JUDGE as anti-commitments.
56
56
 
57
57
  ## Verification
58
58
  - Concrete commands whose expected behavior is named.
59
59
  ```
60
60
 
61
+ `complexity` is the resolve spec contract enum, not the benchmark difficulty
62
+ label. Use `trivial`, `medium`, or `high` for new fixtures; `large` is accepted
63
+ only for compatibility with external/legacy specs. Keep ambiguous calibration
64
+ labels in `metadata.difficulty`, not spec frontmatter.
65
+
61
66
  ## task.txt
62
67
 
63
68
  Bare-arm input. Plain English, same intent, but framed as a user request rather than a formal spec. Intentionally lacks the structured Requirements/Constraints/Out-of-Scope sections — bare must make those calls itself. Must not leak "use the devlyn skill" hints.
@@ -97,10 +102,16 @@ Machine-readable acceptance criteria used by both `run-fixture.sh` verification
97
102
  Commands run with `BENCH_WORKDIR` (fresh arm work tree) and
98
103
  `BENCH_FIXTURE_DIR` (the fixture directory outside the arm work tree) in
99
104
  the environment. Put discriminator/oracle scripts under the fixture
100
- directory when the arm should not read the verifier source.
105
+ directory when the arm should not read the verifier source; any
106
+ `$BENCH_FIXTURE_DIR/...` file path referenced by a command must exist and
107
+ must not escape the fixture directory. Hidden oracle commands must reference
108
+ the verifier through an explicit `$BENCH_FIXTURE_DIR/...` path rather than
109
+ `cd "$BENCH_FIXTURE_DIR"` indirection.
101
110
  Any command that references `BENCH_FIXTURE_DIR` is a hidden oracle and must
102
111
  include `contract_refs`: exact substrings from `spec.md` proving the oracle
103
- tests a visible contract rather than inventing a narrower one.
112
+ tests a visible contract rather than inventing a narrower one. Hidden oracle
113
+ commands must also assert `stdout_contains: ["\"ok\":true"]` so a verifier
114
+ cannot pass silently without emitting the success sentinel.
104
115
  - **forbidden_patterns** — regexes scanned across `diff.patch`. Match at `severity: "disqualifier"` is a hard-floor fail. Match at `severity: "warning"` goes into the judge's critical-findings report.
105
116
  - **required_files** — must exist after the arm runs.
106
117
  - **forbidden_files** — must NOT appear in the arm's diff.
@@ -108,6 +119,17 @@ Machine-readable acceptance criteria used by both `run-fixture.sh` verification
108
119
  - **spec_output_files** — files or globs that define the authorized output surface for Tier B scope tracing.
109
120
  - **max_deps_added** — count of new entries under `dependencies`/`devDependencies` in `package.json`. Exceeds → hard-floor fail.
110
121
 
122
+ ## high-risk metadata
123
+
124
+ Fixtures with `metadata.json` `category: "high-risk"` must include at least
125
+ one resolve risk-trigger term in `metadata.intent` or `spec.md`, matching the
126
+ conditional pair/risk-probe triggers used by `/devlyn:resolve`: security/auth,
127
+ money/pricing/tax/ledger, persistence/data mutation, idempotency/replay,
128
+ API/webhook/signature, allocation/scheduling/inventory/rollback/transaction,
129
+ priority, or output/response-shape contracts. This keeps future pair-evidence
130
+ candidates from relying on a label that would not actually activate the pair
131
+ path.
132
+
111
133
  ## NOTES.md
112
134
 
113
135
  Human-readable explanation of why this fixture exists. Must answer:
@@ -119,6 +141,20 @@ Human-readable explanation of why this fixture exists. Must answer:
119
141
 
120
142
  Notes are read during suite design review, not during runs.
121
143
 
144
+ If `NOTES.md` records that a fixture failed a headroom gate or was rejected as
145
+ pair-lift evidence, add the fixture to
146
+ `benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh` in the same change.
147
+ Rejected controls should remain replayable, but they must not be silently
148
+ re-spent as fresh pair candidates.
149
+
150
+ ## Retired fixtures
151
+
152
+ Move fixtures that are no longer valid active golden-suite evidence to
153
+ `fixtures/retired/<fixture-id>/`. Retired fixtures are preserved for replay and
154
+ historical diagnosis, but `run-suite.sh` does not auto-discover them. Each
155
+ retired fixture must keep the six fixture files (`metadata.json`, `spec.md`, `task.txt`, `expected.json`, `NOTES.md`, `setup.sh`) plus `RETIRED.md` explaining
156
+ the run id or concrete reason it left the active suite.
157
+
122
158
  ## setup.sh
123
159
 
124
160
  Deterministic starting state. Runs against a fresh copy of `benchmark/auto-resolve/fixtures/test-repo/` before any arm starts. Common uses:
@@ -139,4 +175,14 @@ A CI lint step (`scripts/lint-fixtures.sh`) verifies:
139
175
  - `metadata.intent` substring appears in both `spec.md::Context` and `task.txt` (≥ 60% token overlap using simple tokenization).
140
176
  - `spec.md` frontmatter `id` matches directory name.
141
177
  - `expected.json` is valid JSON.
178
+ - Active high-risk fixtures include a resolve risk-trigger term in
179
+ `metadata.intent` or `spec.md`.
180
+ - Active fixtures whose `NOTES.md` records headroom-gate failure or pair-lift
181
+ rejection are covered by `pair-rejected-fixtures.sh`.
182
+ - Active fixtures whose `NOTES.md` records `pair_evidence_passed` include an
183
+ actionable solo-headroom hypothesis in `spec.md`, using the same checker as
184
+ shadow candidates, and the hypothesis observable command must match a
185
+ `verification_commands[].cmd` entry in `expected.json`.
142
186
  - `setup.sh` is executable.
187
+ - Retired fixtures under `fixtures/retired/F*/` keep `RETIRED.md`, preserve the
188
+ six fixture files, and are excluded from active suite discovery.
@@ -0,0 +1,37 @@
1
+ # F27 CLI subscription proration
2
+
3
+ ## Failure mode
4
+
5
+ This fixture detects billing implementations that look correct on one happy
6
+ path but mishandle date boundaries, per-segment rounding, or duplicate credits, or
7
+ hardcode plan and tax rules instead of reading the seeded data file.
8
+
9
+ ## Pipeline phase target
10
+
11
+ PLAN must separate input validation, period segmentation, per-segment proration,
12
+ credit de-duplication, tax calculation, and output formatting. VERIFY should
13
+ probe date boundary and data-source variants because a small example can pass
14
+ while production invoices are off by one day or one cent.
15
+
16
+ ## Why existing fixtures do not cover it
17
+
18
+ F25 covers cart promotions and F26 covers payout ledger events. This fixture
19
+ adds subscription billing proration: effective-date segmentation, period-day
20
+ denominators, credit idempotency, and tax after credits. It was intended to
21
+ cover a pair-risk-probe gap, but the first real headroom smoke showed the
22
+ visible contract was explicit enough for `solo_claude` to solve cleanly.
23
+
24
+ ## Retirement
25
+
26
+ Retire or replace if both bare and solo consistently exceed the headroom
27
+ thresholds, or if a later billing fixture provides the same proration and
28
+ idempotent-credit signal with lower wall time.
29
+
30
+ ## Measurement notes
31
+
32
+ - `20260511-f27-headroom-smoke-061401`: headroom FAIL (`solo_claude` exceeded the headroom ceiling of 80). Judge scores were
33
+ bare 33 / solo_claude 94 (`solo_over_bare` +61). Bare passed 1 of 3
34
+ verification commands; solo passed 3 of 3 with terminal `PASS`.
35
+ - Do not spend a pair arm on this fixture in its current shape. It needs either
36
+ a harder visible contract that solo misses without pair probes, or rotation
37
+ out of the pair-candidate set.
@@ -0,0 +1,13 @@
1
+ # Retired: F27 CLI subscription proration
2
+
3
+ Retired from the active golden suite after headroom smoke
4
+ `20260511-f27-headroom-smoke-061401`.
5
+
6
+ Reason: `solo_claude` scored 94, exceeding the headroom ceiling of 80, while
7
+ `bare` scored 33 and passed only 1 of 3 verification commands. The fixture is
8
+ too explicit for current solo/pair lift measurement and too expensive to keep
9
+ in the default suite.
10
+
11
+ Future use: rework the visible contract so it creates a fair pair-risk-probe
12
+ gap, or replace it with a different billing fixture. Do not count this fixture
13
+ as pair evidence in its current form.
@@ -0,0 +1,56 @@
1
+ {
2
+ "verification_commands": [
3
+ {
4
+ "cmd": "node --test tests/cli.test.js",
5
+ "exit_code": 0,
6
+ "stdout_contains": [],
7
+ "stdout_not_contains": ["not ok "]
8
+ },
9
+ {
10
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/exact-proration.js\"",
11
+ "exit_code": 0,
12
+ "stdout_contains": ["\"ok\":true"],
13
+ "stdout_not_contains": [],
14
+ "contract_refs": [
15
+ "Dates are interpreted as UTC calendar dates. The billing period start is inclusive and the end is exclusive.",
16
+ "Each segment amount is `Math.round(plan.monthly_cents * segment_days / period_days)`. Round each segment independently before summing.",
17
+ "Identical duplicate credits, where both `id` and `amount_cents` match, are idempotent and apply only once.",
18
+ "Tax is computed after credits: `tax_cents = Math.round((subtotal_cents - credit_cents) * tax_rate)`."
19
+ ]
20
+ },
21
+ {
22
+ "cmd": "node \"$BENCH_FIXTURE_DIR/verifiers/rules-source-and-conflict.js\"",
23
+ "exit_code": 0,
24
+ "stdout_contains": ["\"ok\":true"],
25
+ "stdout_not_contains": [],
26
+ "contract_refs": [
27
+ "Plan monthly prices and state tax rates come from `data/subscription-plans.json`. Do not hardcode these values in the command implementation.",
28
+ "Credits with the same `id` but different `amount_cents` are conflicting duplicates. They exit `2` with exact error shape `{ \"error\": \"conflicting_credit\", \"id\": string }`, write it to stderr, and write nothing to stdout."
29
+ ]
30
+ }
31
+ ],
32
+ "forbidden_patterns": [
33
+ {
34
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{[^}]*return\\s+(null|undefined|''|\\{\\})",
35
+ "description": "silent catch returning fallback in subscription-invoice path",
36
+ "files": ["bin/cli.js"],
37
+ "severity": "disqualifier"
38
+ },
39
+ {
40
+ "pattern": "catch\\s*\\([^)]*\\)\\s*\\{\\s*\\}",
41
+ "description": "empty catch block",
42
+ "files": ["bin/cli.js"],
43
+ "severity": "disqualifier"
44
+ },
45
+ {
46
+ "pattern": "/\\*\\s*eslint-disable",
47
+ "description": "eslint-disable without scoped justification",
48
+ "files": ["bin/cli.js"],
49
+ "severity": "disqualifier"
50
+ }
51
+ ],
52
+ "required_files": ["bin/cli.js", "tests/cli.test.js"],
53
+ "forbidden_files": [],
54
+ "max_deps_added": 0,
55
+ "spec_output_files": ["bin/cli.js", "tests/cli.test.js"]
56
+ }
@@ -0,0 +1,10 @@
1
+ {
2
+ "id": "F27-cli-subscription-proration",
3
+ "category": "high-risk",
4
+ "difficulty": "high",
5
+ "timeout_seconds": 1800,
6
+ "required_tools": ["node"],
7
+ "browser": false,
8
+ "deps_change_expected": false,
9
+ "intent": "Add a subscription-invoice command that prorates plan changes across a billing period, applies idempotent credits, reads plan and tax rules from data/subscription-plans.json, and prints exact integer-cent invoice totals."
10
+ }
@@ -0,0 +1,18 @@
1
+ #!/usr/bin/env bash
2
+ set -euo pipefail
3
+
4
+ mkdir -p data
5
+ cat > data/subscription-plans.json <<'JSON'
6
+ {
7
+ "plans": {
8
+ "starter": { "monthly_cents": 1200 },
9
+ "growth": { "monthly_cents": 3600 },
10
+ "scale": { "monthly_cents": 9600 }
11
+ },
12
+ "tax_rates": {
13
+ "CA": 0.0825,
14
+ "NY": 0.04,
15
+ "OR": 0
16
+ }
17
+ }
18
+ JSON