devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219)
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -10,7 +10,7 @@ Independent quality layer. You answer one question: did the diff deliver what th
10
10
  - `spec.md` (or `.devlyn/criteria.generated.md` for free-form mode) — the contract.
11
11
  - `spec.expected.json` — the mechanical acceptance contract per `_shared/expected.schema.json`.
12
12
  - The cumulative diff against `state.base_ref.sha`.
13
- - The spec hash (`state.source.spec_sha256`) — re-read the spec from disk and confirm the hash matches; if it does not, write `state.phases.verify.verdict: "BLOCKED"` with reason `spec_sha256_mismatch` and stop.
13
+ - The source hash (`state.source.spec_sha256` for spec and verify-only mode, `state.source.criteria_sha256` for generated free-form mode) — re-read the source contract from disk and confirm the hash matches; if it does not, write `state.phases.verify.verdict: "BLOCKED"` with reason `source_sha256_mismatch` and stop.
14
14
 
15
15
  You do NOT receive: PLAN, IMPLEMENT's reasoning, BUILD_GATE's findings, CLEANUP's allowlist negotiations. Reading those would compromise independence.
16
16
  </input>
@@ -21,10 +21,7 @@ You do NOT receive: PLAN, IMPLEMENT's reasoning, BUILD_GATE's findings, CLEANUP'
21
21
 
22
22
  Re-run the mechanical checks fresh, independent of BUILD_GATE's earlier run:
23
23
 
24
- 1. `python3 .claude/skills/_shared/spec-verify-check.py --include-risk-probes` against the post-CLEANUP code.
25
- 2. Re-scan `spec.expected.json.forbidden_patterns` against the diff (Python re.search; honor each pattern's `files` allowlist).
26
- 3. Confirm `required_files` exist post-diff; confirm `forbidden_files` do not appear in the diff.
27
- 4. Confirm `max_deps_added` is not exceeded (`git diff -- package.json` for Node; equivalent for other ecosystems).
24
+ 1. `SPEC_VERIFY_PHASE=verify_mechanical SPEC_VERIFY_FINDINGS_FILE=verify-mechanical.findings.jsonl SPEC_VERIFY_FINDING_PREFIX=VERIFY-MECH python3 .claude/skills/_shared/spec-verify-check.py --include-risk-probes` against the post-CLEANUP code. In spec mode, sibling `spec.expected.json` wins; a malformed sibling is CRITICAL, not a fallback. When `state.risk_profile.risk_probes_enabled == true`, missing `.devlyn/risk-probes.jsonl` is also CRITICAL. The script also checks `forbidden_patterns`, `required_files`, `forbidden_files`, and `max_deps_added`.
28
25
 
29
26
  Emit findings to `.devlyn/verify-mechanical.findings.jsonl`. Each match = one finding. Severity comes from the pattern's `severity` field (disqualifier → CRITICAL, warning → MEDIUM).
30
27
 
@@ -87,28 +84,40 @@ design/style concerns remain non-binding MEDIUM and produce `PASS_WITH_ISSUES`.
87
84
 
88
85
  ### Pair-mode (when triggered by orchestrator)
89
86
 
90
- Pair-mode is eligible only after MECHANICAL has no HIGH/CRITICAL findings.
91
- Deterministic blockers already decide the verdict and route to the fix loop; a
92
- second judge there duplicates evidence and wastes wall-time. If MECHANICAL has
93
- a HIGH/CRITICAL finding, record `pair_judge: null` and do not spawn the second
94
- VERIFY agent.
87
+ Pair-mode is eligible only after MECHANICAL and the primary JUDGE have no
88
+ verdict-binding findings. Deterministic blockers and primary JUDGE blockers
89
+ already decide the verdict and route to the fix loop; a second judge there
90
+ duplicates evidence and wastes wall-time. If MECHANICAL or the primary JUDGE
91
+ has a verdict-binding finding, record `pair_judge: null` and do not spawn the
92
+ second VERIFY agent.
95
93
 
96
94
  When eligible, trigger pair-mode if any of these are true:
97
- - `--pair-verify` was set.
95
+ - `state.pair_verify == true` (`--pair-verify` was set).
98
96
  - `state.mode == "verify-only"`.
99
- - The spec frontmatter has `complexity: high`.
100
- - `state.complexity` is `"high"` or `"large"`.
97
+ - The spec frontmatter has `complexity: high`; legacy/external spec
98
+ `complexity: large` is accepted for compatibility, but new specs use `high`.
99
+ - Current free-form `state.complexity` is `"large"`; legacy `"high"` state remains accepted by the merge validator only for archived run compatibility.
101
100
  - `state.risk_profile.high_risk == true`.
102
101
  - `.devlyn/risk-probes.jsonl` exists or `state.risk_profile.risk_probes_enabled == true`.
103
- - MECHANICAL emitted warning-level findings but no HIGH/CRITICAL blockers.
102
+ - The spec includes an actionable solo-headroom hypothesis.
103
+ - MECHANICAL or the primary JUDGE emitted warning-level findings but no
104
+ verdict-binding blockers.
104
105
  - `state.verify.coverage_failed == true`.
105
106
 
107
+ Malformed `state.risk_profile` is a VERIFY contract violation: it must be an
108
+ object, `high_risk` / `risk_probes_enabled` / `pair_default_enabled` must be
109
+ JSON booleans when present, and `reasons` must be a string array. Do not treat
110
+ missing or malformed risk state as low-risk; `verify-merge-findings.py` blocks
111
+ it because it can hide `risk.high` or `risk_probes.enabled` pair triggers.
112
+
106
113
  If `--no-pair` was set, do not spawn the OTHER-engine judge. Record
107
114
  `pair_trigger: { eligible: false, reasons: [], skipped_reason: "user_no_pair" }`
108
115
  and continue with solo VERIFY. This is an explicit user opt-out, not an engine
109
- availability fallback.
116
+ availability fallback. `--pair-verify` and `--no-pair` are mutually exclusive;
117
+ if both are present, stop with `BLOCKED:invalid-flags`.
110
118
 
111
- Before JUDGE spawn, compute and persist:
119
+ After MECHANICAL and the primary JUDGE finish, compute and persist this before
120
+ spawning the OTHER-engine pair judge:
112
121
 
113
122
  ```json
114
123
  "pair_trigger": {
@@ -118,9 +127,23 @@ Before JUDGE spawn, compute and persist:
118
127
  }
119
128
  ```
120
129
 
121
- If `eligible == true` and `reasons` is non-empty, the OTHER-engine judge is
122
- mandatory. Skipping it is a VERIFY contract violation. If ineligible, record the
123
- reason, e.g. `"mechanical_blocker"`.
130
+ If `eligible == true`, `reasons` must be non-empty and include every applicable canonical reason; for example, a spec with an actionable solo-headroom
131
+ hypothesis must include `spec.solo_headroom_hypothesis` even when another reason
132
+ such as `risk.high` also applies. The OTHER-engine judge is mandatory. Skipping
133
+ it is a VERIFY contract violation. If ineligible, record the
134
+ reason, e.g. `"mechanical_blocker"` or `"primary_judge_blocker"`.
135
+
136
+ `pair_trigger` is a strict contract, not advisory metadata. `eligible: true`
137
+ requires a non-empty `reasons` list and `skipped_reason: null`; `eligible: false`
138
+ requires an empty `reasons` list and a string/null `skipped_reason`. Do not emit
139
+ contradictory states such as `eligible: true` with `skipped_reason`, or
140
+ `eligible: false` with trigger reasons. `verify-merge-findings.py` blocks VERIFY
141
+ on malformed trigger state. Eligible triggers must contain only canonical
142
+ reasons and at least one reason: `mode.verify-only`, `complexity.high`, `complexity.large`,
143
+ `mode.pair-verify`, `spec.complexity.high`, `spec.complexity.large`,
144
+ `spec.solo_headroom_hypothesis`, `risk.high`, `risk_probes.enabled`,
145
+ `risk_probes.present`, `coverage.failed`, `mechanical.warning`, or
146
+ `judge.warning`.
124
147
 
125
148
  The `--engine` flag never disables this rule. Explicit `--engine claude` means
126
149
  Claude is the primary judge; if pair-mode triggers, Codex is still the mandatory
@@ -160,12 +183,22 @@ When eligible and the orchestrator spawns a second VERIFY agent with the OTHER e
160
183
  after the first verdict-binding finding and emit JSONL. If both probes pass
161
184
  and static scope/dependency checks show no blocker, emit PASS; do not continue
162
185
  exhaustive exploration.
186
+ If the spec includes a solo-headroom hypothesis, one of the two targeted
187
+ probes must exercise that hypothesis with the visible command/input shape and
188
+ compare the full externally visible result. The probe must use the
189
+ hypothesis's backticked observable command as its command anchor before adding
190
+ bounded input variations. Do not substitute a neighboring easier edge case;
191
+ the pair judge exists to test the stated expected solo miss.
163
192
  A targeted probe must compare the full externally visible result
164
193
  (stdout/stderr/exit and full parsed output object, including accepted/scheduled
165
194
  rows, rejected rows, and remaining state when present), not just a single
166
- property. For priority/stateful specs, at least one probe must include an
167
- earlier input entity that would succeed under input-order processing, a later
168
- higher-priority entity that consumes or blocks the critical resource, and a
195
+ property. When the spec names exact keys, row shapes, JSON object shape, or an
196
+ exact error body, compare parsed key sets/deep equality so aliased keys,
197
+ missing keys, and extra keys are verdict-binding failures. Use the spec's
198
+ visible input key names literally when constructing the probe input. For
199
+ priority/stateful specs, at least one probe must include an earlier input
200
+ entity that would succeed under input-order processing, a later higher-priority
201
+ entity that consumes or blocks the critical resource, and a
169
202
  failure/blocked/rollback edge that determines a later entity's state. This is
170
203
  the minimum compound shape for priority + failure/state-mutation bugs.
171
204
  Scope qualifiers are binding for the pair judge too: do not reinterpret
@@ -181,12 +214,13 @@ When eligible and the orchestrator spawns a second VERIFY agent with the OTHER e
181
214
  (or scheduled) and rejected rows.
182
215
 
183
216
  Codex pair-JUDGE is read-only. Invoke `codex-monitored.sh` directly with
184
- `-c model_reasoning_effort=medium`; this phase is a bounded two-probe review,
185
- not an unbounded implementation task. Do not pipe it to `tail`, `head`, `grep`,
186
- `sed`, or `awk`. Capture stdout/stderr by direct tool capture or file
187
- redirection. The Codex judge must return JSONL
188
- findings on stdout; the orchestrator writes `.devlyn/verify.pair.findings.jsonl`
189
- and merges verdicts. Do not ask Codex to `apply_patch` or edit `.devlyn`.
217
+ `CODEX_MONITORED_ISOLATED=1` and `-c model_reasoning_effort=medium`; this is a
218
+ bounded two-probe review, not implementation. Isolation keeps user config,
219
+ AGENTS.md, pyx-memory, hooks, and project rules from introducing hidden
220
+ context or tool side effects. Do not pipe it to `tail`, `head`, `grep`, `sed`, or `awk`.
221
+ Capture stdout/stderr directly. The Codex judge must return JSONL findings on
222
+ stdout; the orchestrator writes `.devlyn/verify.pair.findings.jsonl` and merges
223
+ verdicts. Do not ask Codex to `apply_patch` or edit `.devlyn`.
190
224
  The Codex prompt must include a bounded-output contract: no harness-doc reads,
191
225
  maximum two targeted probes before first output, stop on the first
192
226
  verdict-binding finding, and emit PASS immediately after the bounded checks pass.
@@ -11,6 +11,7 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
11
11
  "started_at": "2026-04-30T12:00:00Z",
12
12
  "engine": "claude",
13
13
  "mode": "spec",
14
+ "pair_verify": false,
14
15
  "complexity": null,
15
16
  "risk_profile": { "high_risk": false, "reasons": [], "risk_probes_enabled": false, "pair_default_enabled": true },
16
17
  "base_ref": { "branch": "main", "sha": "abc123..." },
@@ -44,16 +45,18 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
44
45
 
45
46
  - **version** — string. Bump major on a breaking schema change.
46
47
  - **mode** — `"free-form" | "spec" | "verify-only"`.
48
+ - **pair_verify** — boolean. Set true only when the user passed `--pair-verify`; otherwise false. This is the durable state evidence for the `mode.pair-verify` pair-trigger reason. It is mutually exclusive with `risk_profile.pair_default_enabled == false` from `--no-pair`; `verify-merge-findings.py` blocks the contradictory state.
47
49
  - **complexity** — `null | "trivial" | "medium" | "large"`. Free-form mode populates this; spec/verify-only mode leaves it null.
48
50
  - **engine** — `"claude" | "codex" | "auto"` initially; a required unavailable engine stops the run with `BLOCKED:<engine>-unavailable`.
49
- - **risk_profile** — PHASE 0 classification for conditional defaults. `high_risk` records durable-risk signals from the goal/spec; `risk_probes_enabled` is true for explicit `--risk-probes` or high-risk specs unless `--no-risk-probes`; `pair_default_enabled` is false only for explicit `--no-pair`.
51
+ - **source** — provenance for the contract all downstream phases read. Spec and verify-only mode set `type: "spec"`, `spec_path`, and `spec_sha256`. Free-form mode sets `type: "generated"`, leaves `spec_path`/`spec_sha256` null, and must set `criteria_path: ".devlyn/criteria.generated.md"` plus `criteria_sha256` from the generated file's raw bytes. VERIFY re-checks the matching hash before judging.
52
+ - **risk_profile** — PHASE 0 classification for conditional defaults. `high_risk` records durable-risk signals from the goal/spec; `risk_probes_enabled` is true for explicit `--risk-probes` or high-risk specs unless `--no-risk-probes`; `pair_default_enabled` is false only for explicit `--no-pair`. `risk_profile` must remain an object with boolean `high_risk`, `risk_probes_enabled`, and `pair_default_enabled` fields when present, plus `reasons` as a list of strings. Malformed `risk_profile` blocks VERIFY because pair-trigger reasons derive `risk.high` and `risk_probes.enabled` from this state.
50
53
  - **rounds.global** — incremented every fix-loop pass (BUILD_GATE → fix-loop OR VERIFY → fix-loop).
51
54
  - **phases.probe_derive** — optional PHASE 1.5 entry when `--risk-probes` is enabled. Artifacts include `.devlyn/risk-probes.jsonl`. Probe failures later surface through BUILD_GATE/VERIFY as `correctness.risk-probe-failed`.
52
55
  - **bypasses** — array of phase names from `--bypass`. Valid: `"build-gate" | "cleanup"`. PLAN, IMPLEMENT, VERIFY are non-bypassable (orchestrator rejects at parse time).
53
56
  - **implement_passed_sha** — captured at end of PHASE 2; null until then. Activates the post-implement invariant for CLEANUP and VERIFY.
54
57
  - **criteria** — generated from spec's `## Requirements` checklist (one per `- [ ]`). `status: pending → implemented` is the legal transition. `failed_by_finding_ids` populates when VERIFY surfaces a finding tied to a criterion.
55
- - **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set. Pair-mode also triggers for verify-only mode, high-risk specs, active risk probes, `complexity: high` specs, or `state.complexity` of `"high"`/`"large"` when MECHANICAL has no HIGH/CRITICAL blockers.
56
- - **verify.pair_trigger** — VERIFY's trigger decision: `{ "eligible": boolean, "reasons": string[], "skipped_reason": string|null }`. If eligible with any reason, `pair_judge` must be non-null.
58
+ - **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set. Pair-mode also triggers for `state.pair_verify == true`, verify-only mode, high-risk specs, active risk probes, actionable solo-headroom hypotheses, `complexity: high` specs, or current free-form `state.complexity` of `"large"` when MECHANICAL and the primary JUDGE have no verdict-binding blockers. Legacy/external spec `complexity: large` remains accepted for compatibility; new specs use `high`. Legacy `"high"` state remains accepted by the merge validator only for archived run compatibility.
59
+ - **verify.pair_trigger** — VERIFY's trigger decision: `{ "eligible": boolean, "reasons": string[], "skipped_reason": string|null }`. The shape is strict: `eligible: true` requires a non-empty reasons list containing every applicable canonical eligible reason and only canonical eligible reasons, plus `skipped_reason: null`; `eligible: false` requires an empty reasons list and may set only `user_no_pair`, `mechanical_blocker`, `primary_judge_blocker`, or null as the skip cause. Canonical eligible reasons are `mode.verify-only`, `mode.pair-verify`, `complexity.high`, `complexity.large`, `spec.complexity.high`, `spec.complexity.large`, `spec.solo_headroom_hypothesis`, `risk.high`, `risk_probes.enabled`, `risk_probes.present`, `coverage.failed`, `mechanical.warning`, and `judge.warning`. `user_no_pair` is valid only when `risk_profile.pair_default_enabled == false` from an explicit `--no-pair`; `mechanical_blocker` and `primary_judge_blocker` are valid only when the matching source has a verdict-binding finding. If state implies a pair decision is required but `pair_trigger` is missing, if it records `eligible:false` with no supported skip reason, if an eligible trigger omits an applicable reason such as `spec.solo_headroom_hypothesis`, or if any combination is malformed, `verify-merge-findings.py` blocks VERIFY.
57
60
 
58
61
  ## Per-phase shape
59
62
 
@@ -107,7 +110,7 @@ Per-phase summary table: `phase | verdict | duration_ms | round | triggered_by |
107
110
 
108
111
  Findings table (post-IMPLEMENT phases only — they are findings-only): each finding's `severity | rule_id | file:line | message | confidence`.
109
112
 
110
- Follow-up notes: any `--continue-on-large` assumptions, pair/risk-probe opt-out state, engine setup guidance for `BLOCKED:<engine>-unavailable`, any `state.verify.coverage_failed` axes.
113
+ Follow-up notes: any `--continue-on-large` assumptions, pair/risk-probe opt-out state, engine setup guidance for `BLOCKED:<engine>-unavailable`, `/devlyn:ideate` guidance for `BLOCKED:solo-headroom-hypothesis-required` that asks for the visible behavior `solo_claude` is expected to miss, `/devlyn:ideate` guidance for `BLOCKED:solo-ceiling-avoidance-required` that asks for the concrete difference from rejected or solo-saturated controls such as `S2`-`S6`, and any `state.verify.coverage_failed` axes.
111
114
 
112
115
  ## Archive contract
113
116
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "devlyn-cli",
3
- "version": "2.3.0",
4
- "description": "AI development toolkit for Claude Code — ideate, auto-resolve, and ship with context engineering and agent orchestration",
3
+ "version": "2.3.2",
4
+ "description": "AI development toolkit for Claude Code — ideate, resolve, and ship with context engineering and agent orchestration",
5
5
  "homepage": "https://github.com/fysoul17/devlyn-cli#readme",
6
6
  "bin": {
7
7
  "devlyn": "bin/devlyn.js"
@@ -20,13 +20,58 @@
20
20
  "agents-config",
21
21
  "optional-skills",
22
22
  "benchmark/auto-resolve/BENCHMARK-DESIGN.md",
23
+ "benchmark/auto-resolve/BENCHMARK-RESULTS.md",
23
24
  "benchmark/auto-resolve/README.md",
24
25
  "benchmark/auto-resolve/RUBRIC.md",
26
+ "benchmark/auto-resolve/run-real-benchmark.md",
25
27
  "benchmark/auto-resolve/fixtures/SCHEMA.md",
26
28
  "benchmark/auto-resolve/fixtures/F*/**",
29
+ "benchmark/auto-resolve/fixtures/retired/F*/**",
30
+ "benchmark/auto-resolve/shadow-fixtures/S*/**",
27
31
  "benchmark/auto-resolve/fixtures/test-repo/**",
28
32
  "!benchmark/auto-resolve/fixtures/test-repo/node_modules/**",
33
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md",
34
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json",
35
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md",
36
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json",
37
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.md",
38
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.json",
39
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.md",
40
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json",
41
+ "benchmark/auto-resolve/results/20260507-f10-f11-tier1-full-pipeline/headroom-gate.md",
42
+ "benchmark/auto-resolve/results/20260507-f10-f11-tier1-full-pipeline/headroom-gate.json",
43
+ "benchmark/auto-resolve/results/20260508-f22-exact-error-headroom/headroom-gate.md",
44
+ "benchmark/auto-resolve/results/20260508-f22-exact-error-headroom/headroom-gate.json",
45
+ "benchmark/auto-resolve/results/20260508-f26-headroom/headroom-gate.md",
46
+ "benchmark/auto-resolve/results/20260508-f26-headroom/headroom-gate.json",
47
+ "benchmark/auto-resolve/results/20260511-f3-http-error-headroom/headroom-gate.md",
48
+ "benchmark/auto-resolve/results/20260511-f3-http-error-headroom/headroom-gate.json",
49
+ "benchmark/auto-resolve/results/20260511-f12-webhook-headroom/headroom-gate.md",
50
+ "benchmark/auto-resolve/results/20260511-f12-webhook-headroom/headroom-gate.json",
51
+ "benchmark/auto-resolve/results/20260511-f15-concurrency-headroom/headroom-gate.md",
52
+ "benchmark/auto-resolve/results/20260511-f15-concurrency-headroom/headroom-gate.json",
53
+ "benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.md",
54
+ "benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.json",
55
+ "benchmark/auto-resolve/results/20260512-f4-web-headroom/headroom-gate.md",
56
+ "benchmark/auto-resolve/results/20260512-f4-web-headroom/headroom-gate.json",
57
+ "benchmark/auto-resolve/results/20260512-f5-fixloop-headroom/headroom-gate.md",
58
+ "benchmark/auto-resolve/results/20260512-f5-fixloop-headroom/headroom-gate.json",
59
+ "benchmark/auto-resolve/results/20260512-f6-checksum-headroom/headroom-gate.md",
60
+ "benchmark/auto-resolve/results/20260512-f6-checksum-headroom/headroom-gate.json",
61
+ "benchmark/auto-resolve/results/20260512-f7-scope-headroom/headroom-gate.md",
62
+ "benchmark/auto-resolve/results/20260512-f7-scope-headroom/headroom-gate.json",
63
+ "benchmark/auto-resolve/results/20260512-f9-e2e-headroom/headroom-gate.md",
64
+ "benchmark/auto-resolve/results/20260512-f9-e2e-headroom/headroom-gate.json",
65
+ "benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.md",
66
+ "benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.json",
67
+ "benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.md",
68
+ "benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.json",
29
69
  "benchmark/auto-resolve/scripts/**",
70
+ "!**/__pycache__",
71
+ "!**/__pycache__/**",
72
+ "!**/*.pyc",
73
+ "scripts/lint-fixtures.sh",
74
+ "scripts/lint-shadow-fixtures.sh",
30
75
  "scripts/lint-skills.sh",
31
76
  "CLAUDE.md",
32
77
  "AGENTS.md"
@@ -0,0 +1,349 @@
1
+ #!/usr/bin/env bash
2
+ # lint-fixtures.sh — schema validity + structural check for golden fixtures/.
3
+ # Env overrides: DEVLYN_FIXTURES_DIR, DEVLYN_FIXTURE_GLOB, DEVLYN_RETIRED_FIXTURE_GLOB, DEVLYN_REJECTED_FIXTURE_REGISTRY, DEVLYN_EXPECTED_SCHEMA.
4
+ set -euo pipefail
5
+ # REPO_ROOT is the parent of the directory that contains this script.
6
+ REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
7
+ FIXTURES_DIR="${DEVLYN_FIXTURES_DIR:-$REPO_ROOT/benchmark/auto-resolve/fixtures}"
8
+ FIXTURE_GLOB="${DEVLYN_FIXTURE_GLOB:-F*}"
9
+ RETIRED_FIXTURE_GLOB="${DEVLYN_RETIRED_FIXTURE_GLOB:-F*}"
10
+ REJECTED_REGISTRY="${DEVLYN_REJECTED_FIXTURE_REGISTRY:-$REPO_ROOT/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh}"
11
+ SCHEMA="${DEVLYN_EXPECTED_SCHEMA:-$REPO_ROOT/config/skills/_shared/expected.schema.json}"
12
+ SPEC_VERIFY_CHECK="$REPO_ROOT/config/skills/_shared/spec-verify-check.py"
13
+ SOLO_HEADROOM_CHECK="$REPO_ROOT/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py"
14
+ # Fail fast (exit 1) when any required directory, schema, checker, or registry is missing.
15
+ [ -d "$FIXTURES_DIR" ] || { echo "✗ $FIXTURES_DIR missing"; exit 1; }
16
+ [ -f "$SCHEMA" ] || { echo "✗ $SCHEMA missing"; exit 1; }
17
+ [ -f "$SPEC_VERIFY_CHECK" ] || { echo "✗ $SPEC_VERIFY_CHECK missing"; exit 1; }
18
+ [ -f "$SOLO_HEADROOM_CHECK" ] || { echo "✗ solo-headroom checker missing: $SOLO_HEADROOM_CHECK"; exit 1; }
19
+ [ -f "$REJECTED_REGISTRY" ] || { echo "✗ rejected fixture registry missing: $REJECTED_REGISTRY"; exit 1; }
20
+ # The registry is sourced into this shell and must define the function rejected_pair_fixture_reason.
21
+ # shellcheck source=/dev/null
22
+ source "$REJECTED_REGISTRY"
23
+ if ! declare -F rejected_pair_fixture_reason >/dev/null; then
24
+ echo "✗ rejected fixture registry must define rejected_pair_fixture_reason: $REJECTED_REGISTRY"
25
+ exit 1
26
+ fi
27
+ # Files that every fixture directory is checked for.
28
+ REQUIRED_FILES=(metadata.json spec.md task.txt expected.json setup.sh NOTES.md)
29
+ # Counters: total errors, active fixtures seen, retired fixtures seen.
30
+ ERRORS=0
31
+ COUNT=0
32
+ RETIRED_COUNT=0
33
+
34
+ for d in "$FIXTURES_DIR"/$FIXTURE_GLOB/; do
35
+ [ -d "$d" ] || continue
36
+ COUNT=$((COUNT + 1))
37
+ fid="$(basename "$d")"
38
+
39
+ for f in "${REQUIRED_FILES[@]}"; do  # report every required file absent from this fixture
40
+ if [ ! -f "$d/$f" ]; then
41
+ echo "✗ $fid: missing $f"
42
+ ERRORS=$((ERRORS + 1))  # one error per missing file
43
+ fi
44
+ done
45
+
46
+ if [ -f "$d/metadata.json" ]; then  # path is passed via argv so quotes/backslashes in it cannot alter the Python source
47
+ meta_id=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1], encoding='utf-8'))['id'])" "$d/metadata.json" 2>/dev/null || echo "")
48
+ if [ "$meta_id" != "$fid" ]; then
49
+ echo "✗ $fid: metadata.json id='$meta_id' does not match dir name"
50
+ ERRORS=$((ERRORS + 1))
51
+ fi
52
+ # A fixture whose metadata category is "high-risk" must mention a risk-trigger term in its intent or spec.md.
53
+ python3 - "$d/metadata.json" "$d/spec.md" "$fid" <<'PY' || ERRORS=$((ERRORS + 1))
54
+ import json
55
+ import re
56
+ import sys
57
+ # Exits 0 for unreadable metadata or non-high-risk fixtures; exits 1 only when no risk term is found.
58
+ metadata_path, spec_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
59
+ try:
60
+ metadata = json.load(open(metadata_path, encoding="utf-8"))
61
+ except Exception:
62
+ sys.exit(0)
63
+ if metadata.get("category") != "high-risk":
64
+ sys.exit(0)
65
+ intent = str(metadata.get("intent") or "")
66
+ try:
67
+ spec = open(spec_path, encoding="utf-8").read()
68
+ except FileNotFoundError:
69
+ spec = ""
70
+ text = f"{intent}\n{spec}".lower()
71
+ risk_pattern = re.compile(
72
+ r"\b("
73
+ r"auth|authz|permissions?|security|tokens?|sessions?|"
74
+ r"payments?|money|billing|invoices?|pricing|tax|ledger|"
75
+ r"persistence|persist\w*|data mutation|delet\w*|migrations?|"
76
+ r"idempoten\w*|replay|duplicates?|api|webhook|raw-body|signatures?|"
77
+ r"allocation|scheduling|inventory|rollback|transaction|"
78
+ r"priority|error-priority|output-shape|output shape|response-shape|response shape"
79
+ r")\b"
80
+ )
81
+ if not risk_pattern.search(text):
82
+ print(
83
+ f"✗ {fid}: high-risk fixture must include a resolve risk-trigger term "
84
+ "in metadata intent or spec.md"
85
+ )
86
+ sys.exit(1)
87
+ PY
88
+ fi
89
+
90
+ if [ -f "$d/spec.md" ]; then  # the id declared in spec.md must equal the fixture directory name
91
+ spec_id=$(python3 - "$d/spec.md" <<'PY' 2>/dev/null || true
92
+ import re, sys
93
+ text = open(sys.argv[1], encoding="utf-8").read()
94
+ m = re.search(r'^id:\s*"?([^"\n]+)"?\s*$', text, re.M)  # first line of the form `id: value`, double quotes optional
95
+ print(m.group(1) if m else "")  # empty output when no id line exists
96
+ PY
97
+ )
98
+ if [ "$spec_id" != "$fid" ]; then
99
+ echo "✗ $fid: spec.md frontmatter id='$spec_id' does not match dir name"
100
+ ERRORS=$((ERRORS + 1))
101
+ fi
102
+ fi
103
+
104
+ if [ -f "$d/expected.json" ]; then
105
+ if ! python3 - "$d/expected.json" "$fid" <<'PY'
106
+ import json
107
+ import sys
108
+ # Exit 1 when expected.json is not valid JSON or its top-level value is not an object.
109
+ expected_path, fid = sys.argv[1], sys.argv[2]
110
+ try:
111
+ data = json.load(open(expected_path, encoding="utf-8"))
112
+ except json.JSONDecodeError:
113
+ print(f"✗ {fid}: expected.json is not valid JSON")
114
+ sys.exit(1)
115
+ if not isinstance(data, dict):
116
+ print(f"✗ {fid}: expected.json must be an object")
117
+ sys.exit(1)
118
+ PY
119
+ then
120
+ ERRORS=$((ERRORS + 1))
121
+ continue  # skips every remaining check for this fixture, including setup.sh and NOTES.md
122
+ fi
123
+ # Count verification_commands; a value that is not a list counts as 0.
124
+ n_cmds=$(python3 - "$d/expected.json" <<'PY'
125
+ import json
126
+ import sys
127
+
128
+ data = json.load(open(sys.argv[1], encoding="utf-8"))
129
+ commands = data.get("verification_commands", [])
130
+ print(len(commands) if isinstance(commands, list) else 0)
131
+ PY
132
+ )
133
+ if [ "$n_cmds" -lt 1 ]; then
134
+ echo "✗ $fid: expected.json has 0 verification_commands (need ≥1)"
135
+ ERRORS=$((ERRORS + 1))
136
+ fi
137
+
138
+ schema_ok=1  # cleared below when schema validation fails; gates the later spec-verify checks
139
+ if ! python3 - "$SCHEMA" "$d/expected.json" "$fid" <<'PY'
140
+ import json, os, sys
141
+ schema_path, expected_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
142
+ schema = json.load(open(schema_path, encoding="utf-8"))  # explicit UTF-8 so decoding does not depend on the locale
143
+ data = json.load(open(expected_path, encoding="utf-8"))
144
+ # True only for a list whose items are all non-empty strings.
145
+ def is_string_list(value):
146
+ return isinstance(value, list) and all(isinstance(item, str) and item for item in value)
147
+ # Hand-written validator used when jsonschema cannot be imported or is disabled; returns a list of error strings.
148
+ def fallback_validate():
149
+ allowed = set(schema["properties"])
150
+ errors = []
151
+ if not isinstance(data, dict):
152
+ return ["expected.json must be an object"]
153
+ unknown = sorted(set(data) - allowed)
154
+ if unknown:
155
+ errors.append(f"expected.json has unknown key(s): {', '.join(unknown)}")
156
+ commands = data.get("verification_commands", [])
157
+ if not isinstance(commands, list):
158
+ errors.append("verification_commands must be an array")
159
+ else:
160
+ for idx, command in enumerate(commands):
161
+ if not isinstance(command, dict):
162
+ errors.append(f"verification_commands[{idx}] must be an object")
163
+ continue
164
+ unknown_command = sorted(set(command) - {"cmd", "exit_code", "stdout_contains", "stdout_not_contains", "contract_refs"})
165
+ if unknown_command:
166
+ errors.append(f"verification_commands[{idx}] has unknown key(s): {', '.join(unknown_command)}")
167
+ if not isinstance(command.get("cmd"), str) or not command.get("cmd"):
168
+ errors.append(f"verification_commands[{idx}].cmd must be a non-empty string")
169
+ exit_code = command.get("exit_code", 0)
170
+ if isinstance(exit_code, bool) or not isinstance(exit_code, int):
171
+ errors.append(f"verification_commands[{idx}].exit_code must be an integer")
172
+ for key in ("stdout_contains", "stdout_not_contains", "contract_refs"):
173
+ if key in command and not is_string_list(command[key]):
174
+ errors.append(f"verification_commands[{idx}].{key} must be an array of non-empty strings")
175
+ patterns = data.get("forbidden_patterns", [])
176
+ if not isinstance(patterns, list):
177
+ errors.append("forbidden_patterns must be an array")
178
+ else:
179
+ for idx, pattern in enumerate(patterns):
180
+ if not isinstance(pattern, dict):
181
+ errors.append(f"forbidden_patterns[{idx}] must be an object")
182
+ continue
183
+ unknown_pattern = sorted(set(pattern) - {"pattern", "description", "files", "severity"})
184
+ if unknown_pattern:
185
+ errors.append(f"forbidden_patterns[{idx}] has unknown key(s): {', '.join(unknown_pattern)}")
186
+ for key in ("pattern", "description"):
187
+ if not isinstance(pattern.get(key), str) or not pattern.get(key):
188
+ errors.append(f"forbidden_patterns[{idx}].{key} must be a non-empty string")
189
+ if pattern.get("severity") not in {"disqualifier", "warning"}:
190
+ errors.append(f"forbidden_patterns[{idx}].severity must be disqualifier or warning")
191
+ if "files" in pattern and not is_string_list(pattern["files"]):
192
+ errors.append(f"forbidden_patterns[{idx}].files must be an array of non-empty strings")
193
+ for key in ("required_files", "forbidden_files", "tier_a_waivers", "spec_output_files"):
194
+ if key in data and not is_string_list(data[key]):
195
+ errors.append(f"{key} must be an array of non-empty strings")
196
+ max_deps_added = data.get("max_deps_added", 0)
197
+ if isinstance(max_deps_added, bool) or not isinstance(max_deps_added, int) or max_deps_added < 0:
198
+ errors.append("max_deps_added must be an integer >= 0")
199
+ return errors
200
+ # DEVLYN_LINT_FIXTURES_NO_JSONSCHEMA=1 forces the fallback path even when jsonschema is installed.
201
+ force_fallback = os.environ.get("DEVLYN_LINT_FIXTURES_NO_JSONSCHEMA") == "1"
202
+ try:
203
+ if force_fallback:
204
+ raise ImportError
205
+ import jsonschema
206
+ except ImportError:
207
+ fallback_errors = fallback_validate()
208
+ if fallback_errors:
209
+ for error in fallback_errors:
210
+ print(f"✗ {fid}: expected.json schema violation: {error}")
211
+ sys.exit(1)
212
+ else:
213
+ try:
214
+ jsonschema.validate(data, schema)
215
+ except jsonschema.ValidationError as e:
216
+ print(f"✗ {fid}: expected.json schema violation: {e.message}")
217
+ sys.exit(1)
218
+ PY
219
+ then
220
+ ERRORS=$((ERRORS + 1))
221
+ schema_ok=0
222
+ fi
223
+
224
+ if [ "$schema_ok" -eq 1 ]; then  # these checks run only when schema validation passed
225
+ if ! python3 "$SPEC_VERIFY_CHECK" --check "$d/spec.md"; then
226
+ echo "✗ $fid: spec-verify-check --check failed"
227
+ ERRORS=$((ERRORS + 1))
228
+ fi
229
+ if ! python3 "$SPEC_VERIFY_CHECK" --check-expected "$d/expected.json"; then
230
+ echo "✗ $fid: spec-verify-check --check-expected failed"
231
+ ERRORS=$((ERRORS + 1))
232
+ fi
233
+ # Commands mentioning BENCH_FIXTURE_DIR are treated as hidden oracles: they need explicit in-fixture file refs, an '"ok":true' stdout assertion, and contract_refs present in spec.md.
234
+ python3 - "$d/spec.md" "$d/expected.json" "$fid" <<'PY' || ERRORS=$((ERRORS + 1))
235
+ import json, pathlib, re, sys
236
+ spec_path, expected_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
237
+ spec = open(spec_path, encoding="utf-8").read()
238
+ expected = json.load(open(expected_path, encoding="utf-8"))
239
+ fixture_dir = pathlib.Path(expected_path).parent
240
+ fixture_root = fixture_dir.resolve()
241
+ errors = []
242
+ for idx, command in enumerate(expected.get("verification_commands", [])):
243
+ cmd = str(command.get("cmd", ""))
244
+ if "BENCH_FIXTURE_DIR" not in cmd:
245
+ continue  # commands without BENCH_FIXTURE_DIR are not inspected here
246
+ fixture_refs = re.findall(r"(?:\$\{BENCH_FIXTURE_DIR\}|\$BENCH_FIXTURE_DIR)/([^\"'\s]+)", cmd)
247
+ if not fixture_refs:
248
+ errors.append(
249
+ f"verification_commands[{idx}] hidden oracle must reference an explicit $BENCH_FIXTURE_DIR/... file"
250
+ )
251
+ stdout_contains = command.get("stdout_contains", [])
252
+ if '"ok":true' not in stdout_contains:
253
+ errors.append(
254
+ f"verification_commands[{idx}] hidden oracle must assert stdout_contains includes '\"ok\":true'"
255
+ )
256
+ for fixture_ref in fixture_refs:
257
+ target = (fixture_dir / fixture_ref).resolve(strict=False)
258
+ try:
259
+ target.relative_to(fixture_root)  # raises ValueError when the resolved path is outside the fixture dir
260
+ except ValueError:
261
+ errors.append(
262
+ f"verification_commands[{idx}] BENCH_FIXTURE_DIR file escapes fixture dir: {fixture_ref!r}"
263
+ )
264
+ continue
265
+ if not target.is_file():
266
+ errors.append(
267
+ f"verification_commands[{idx}] BENCH_FIXTURE_DIR file not found: {fixture_ref!r}"
268
+ )
269
+ refs = command.get("contract_refs", [])
270
+ if not refs:
271
+ errors.append(f"verification_commands[{idx}] hidden oracle missing contract_refs")
272
+ continue
273
+ for ref in refs:
274
+ if ref not in spec:  # plain substring match against the full spec.md text
275
+ errors.append(
276
+ f"verification_commands[{idx}] contract_ref not found in spec.md: {ref!r}"
277
+ )
278
+ if errors:
279
+ for err in errors:
280
+ print(f"✗ {fid}: {err}")
281
+ sys.exit(1)
282
+ PY
283
+ fi
284
+ fi
285
+
286
+ if [ -f "$d/setup.sh" ] && [ ! -x "$d/setup.sh" ]; then  # setup.sh, when present, must be executable
287
+ echo "✗ $fid: setup.sh not executable (run: chmod +x $d/setup.sh)"
288
+ ERRORS=$((ERRORS + 1))
289
+ fi
290
+ # NOTES.md containing 'headroom gate' plus FAIL, or 'pair-lift evidence' plus reject/rejected, requires rejected_pair_fixture_reason to succeed for this fixture.
291
+ if [ -f "$d/NOTES.md" ] \
292
+ && { { grep -Fq 'headroom gate' "$d/NOTES.md" && grep -Eq '`?FAIL`?' "$d/NOTES.md"; } \
293
+ || { grep -Fq 'pair-lift evidence' "$d/NOTES.md" && grep -Eiq 'reject|rejected' "$d/NOTES.md"; }; } \
294
+ && ! rejected_pair_fixture_reason "$fid" >/dev/null 2>&1; then
295
+ echo "✗ $fid: NOTES.md records pair-candidate rejection but pair-rejected-fixtures.sh has no rejected reason"
296
+ ERRORS=$((ERRORS + 1))
297
+ fi
298
+ # NOTES.md mentioning pair_evidence_passed requires the solo-headroom checker to pass for spec.md and expected.json.
299
+ if [ -f "$d/NOTES.md" ] \
300
+ && grep -Fq 'pair_evidence_passed' "$d/NOTES.md" \
301
+ && ! python3 "$SOLO_HEADROOM_CHECK" --expected-json "$d/expected.json" "$d/spec.md"; then
302
+ echo "✗ $fid: pair_evidence_passed fixture spec.md must document an actionable solo-headroom hypothesis with solo_claude miss and observable command from expected.json"
303
+ ERRORS=$((ERRORS + 1))
304
+ fi
305
+ done
306
+
307
+ for d in "$FIXTURES_DIR"/retired/$RETIRED_FIXTURE_GLOB/; do  # glob is intentionally unquoted so the pattern expands
308
+ [ -d "$d" ] || continue
309
+ RETIRED_COUNT=$((RETIRED_COUNT + 1))
310
+ fid="$(basename "$d")"
311
+ # A retired fixture must carry RETIRED.md in addition to the preserved required files.
312
+ if [ ! -f "$d/RETIRED.md" ]; then
313
+ echo "✗ retired/$fid: missing RETIRED.md"
314
+ ERRORS=$((ERRORS + 1))
315
+ fi
316
+ # Same required-file list as active fixtures.
317
+ for f in "${REQUIRED_FILES[@]}"; do
318
+ if [ ! -f "$d/$f" ]; then
319
+ echo "✗ retired/$fid: missing preserved $f"
320
+ ERRORS=$((ERRORS + 1))
321
+ fi
322
+ done
323
+ # The path is passed via argv so quotes/backslashes in it cannot alter the Python source; a read failure yields an empty id.
324
+ if [ -f "$d/metadata.json" ]; then
325
+ meta_id=$(python3 -c "import json,sys; print(json.load(open(sys.argv[1], encoding='utf-8'))['id'])" "$d/metadata.json" 2>/dev/null || echo "")
326
+ if [ "$meta_id" != "$fid" ]; then
327
+ echo "✗ retired/$fid: metadata.json id='$meta_id' does not match dir name"
328
+ ERRORS=$((ERRORS + 1))
329
+ fi
330
+ fi
331
+ # setup.sh, when present, must still be executable.
332
+ if [ -f "$d/setup.sh" ] && [ ! -x "$d/setup.sh" ]; then
333
+ echo "✗ retired/$fid: setup.sh not executable (run: chmod +x $d/setup.sh)"
334
+ ERRORS=$((ERRORS + 1))
335
+ fi
336
+ done
337
+
338
+ if [ $COUNT -eq 0 ]; then  # matching no active fixture at all is itself a failure
339
+ echo "✗ no fixtures found in $FIXTURES_DIR"
340
+ exit 1
341
+ fi
342
+ # Any recorded error fails the run with a one-line summary of the counts.
343
+ if [ $ERRORS -gt 0 ]; then
344
+ echo ""
345
+ echo "✗ lint-fixtures: $ERRORS error(s) across $COUNT active fixture(s) and $RETIRED_COUNT retired fixture(s)"
346
+ exit 1
347
+ fi
348
+ # Reaching this line means no check recorded an error.
349
+ + echo "✓ lint-fixtures: $COUNT active fixture(s) passed schema + structural checks; $RETIRED_COUNT retired fixture(s) preserved"