devlyn-cli 2.2.2 → 2.3.1

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
package/config/skills/devlyn:resolve/references/state-schema.md CHANGED
@@ -11,7 +11,9 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
11
11
  "started_at": "2026-04-30T12:00:00Z",
12
12
  "engine": "claude",
13
13
  "mode": "spec",
14
+ "pair_verify": false,
14
15
  "complexity": null,
16
+ "risk_profile": { "high_risk": false, "reasons": [], "risk_probes_enabled": false, "pair_default_enabled": true },
15
17
  "base_ref": { "branch": "main", "sha": "abc123..." },
16
18
  "rounds": { "max_rounds": 4, "global": 0 },
17
19
  "bypasses": [],
@@ -43,15 +45,18 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
43
45
 
44
46
  - **version** — string. Bump major on a breaking schema change.
45
47
  - **mode** — `"free-form" | "spec" | "verify-only"`.
48
+ - **pair_verify** — boolean. Set true only when the user passed `--pair-verify`; otherwise false. This is the durable state evidence for the `mode.pair-verify` pair-trigger reason. It is mutually exclusive with `risk_profile.pair_default_enabled == false` from `--no-pair`; `verify-merge-findings.py` blocks the contradictory state.
46
49
  - **complexity** — `null | "trivial" | "medium" | "large"`. Free-form mode populates this; spec/verify-only mode leaves it null.
47
- - **engine** — `"claude" | "codex" | "auto"` initially; rewritten by engine-preflight if a downgrade fired.
50
+ - **engine** — `"claude" | "codex" | "auto"` initially; a required unavailable engine stops the run with `BLOCKED:<engine>-unavailable`.
51
+ - **source** — provenance for the contract all downstream phases read. Spec and verify-only mode set `type: "spec"`, `spec_path`, and `spec_sha256`. Free-form mode sets `type: "generated"`, leaves `spec_path`/`spec_sha256` null, and must set `criteria_path: ".devlyn/criteria.generated.md"` plus `criteria_sha256` from the generated file's raw bytes. VERIFY re-checks the matching hash before judging.
52
+ - **risk_profile** — PHASE 0 classification for conditional defaults. `high_risk` records durable-risk signals from the goal/spec; `risk_probes_enabled` is true for explicit `--risk-probes` or high-risk specs unless `--no-risk-probes`; `pair_default_enabled` is false only for explicit `--no-pair`. `risk_profile` must remain an object with boolean `high_risk`, `risk_probes_enabled`, and `pair_default_enabled` fields when present, plus `reasons` as a list of strings. Malformed `risk_profile` blocks VERIFY because pair-trigger reasons derive `risk.high` and `risk_probes.enabled` from this state.
48
53
  - **rounds.global** — incremented every fix-loop pass (BUILD_GATE → fix-loop OR VERIFY → fix-loop).
49
54
  - **phases.probe_derive** — optional PHASE 1.5 entry when `--risk-probes` is enabled. Artifacts include `.devlyn/risk-probes.jsonl`. Probe failures later surface through BUILD_GATE/VERIFY as `correctness.risk-probe-failed`.
50
55
  - **bypasses** — array of phase names from `--bypass`. Valid: `"build-gate" | "cleanup"`. PLAN, IMPLEMENT, VERIFY are non-bypassable (orchestrator rejects at parse time).
51
56
  - **implement_passed_sha** — captured at end of PHASE 2; null until then. Activates the post-implement invariant for CLEANUP and VERIFY.
52
57
  - **criteria** — generated from spec's `## Requirements` checklist (one per `- [ ]`). `status: pending → implemented` is the legal transition. `failed_by_finding_ids` populates when VERIFY surfaces a finding tied to a criterion.
53
- - **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set. Pair-mode also triggers for `complexity: high` specs or `state.complexity` of `"high"`/`"large"` when MECHANICAL has no HIGH/CRITICAL blockers.
54
- - **verify.pair_trigger** — VERIFY's trigger decision: `{ "eligible": boolean, "reasons": string[], "skipped_reason": string|null }`. If eligible with any reason, `pair_judge` must be non-null.
58
+ - **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set. Pair-mode also triggers for `state.pair_verify == true`, verify-only mode, high-risk specs, active risk probes, actionable solo-headroom hypotheses, `complexity: high` specs, or current free-form `state.complexity` of `"large"` when MECHANICAL and the primary JUDGE have no verdict-binding blockers. Legacy/external spec `complexity: large` remains accepted for compatibility; new specs use `high`. Legacy `"high"` state remains accepted by the merge validator only for archived run compatibility.
59
+ - **verify.pair_trigger** — VERIFY's trigger decision: `{ "eligible": boolean, "reasons": string[], "skipped_reason": string|null }`. The shape is strict: `eligible: true` requires a non-empty reasons list containing every applicable canonical eligible reason and only canonical eligible reasons, plus `skipped_reason: null`; `eligible: false` requires an empty reasons list and may set only `user_no_pair`, `mechanical_blocker`, `primary_judge_blocker`, or null as the skip cause. Canonical eligible reasons are `mode.verify-only`, `mode.pair-verify`, `complexity.high`, `complexity.large`, `spec.complexity.high`, `spec.complexity.large`, `spec.solo_headroom_hypothesis`, `risk.high`, `risk_probes.enabled`, `risk_probes.present`, `coverage.failed`, `mechanical.warning`, and `judge.warning`. `user_no_pair` is valid only when `risk_profile.pair_default_enabled == false` from an explicit `--no-pair`; `mechanical_blocker` and `primary_judge_blocker` are valid only when the matching source has a verdict-binding finding. If state implies a pair decision is required but `pair_trigger` is missing, if it records `eligible:false` with no supported skip reason, if an eligible trigger omits an applicable reason such as `spec.solo_headroom_hypothesis`, or if any combination is malformed, `verify-merge-findings.py` blocks VERIFY.
55
60
 
56
61
  ## Per-phase shape
57
62
 
@@ -105,7 +110,7 @@ Per-phase summary table: `phase | verdict | duration_ms | round | triggered_by |
105
110
 
106
111
  Findings table (post-IMPLEMENT phases only — they are findings-only): each finding's `severity | rule_id | file:line | message | confidence`.
107
112
 
108
- Follow-up notes: any `--continue-on-large` assumptions, any silent fallbacks (engine downgrade), any `state.verify.coverage_failed` axes.
113
+ Follow-up notes: any `--continue-on-large` assumptions, pair/risk-probe opt-out state, engine setup guidance for `BLOCKED:<engine>-unavailable`, `/devlyn:ideate` guidance for `BLOCKED:solo-headroom-hypothesis-required` that asks for the visible behavior `solo_claude` is expected to miss, `/devlyn:ideate` guidance for `BLOCKED:solo-ceiling-avoidance-required` that asks for the concrete difference from rejected or solo-saturated controls such as `S2`-`S6`, and any `state.verify.coverage_failed` axes.
109
114
 
110
115
  ## Archive contract
111
116
 
package/package.json CHANGED
@@ -1,7 +1,7 @@
1
1
  {
2
2
  "name": "devlyn-cli",
3
- "version": "2.2.2",
4
- "description": "AI development toolkit for Claude Code — ideate, auto-resolve, and ship with context engineering and agent orchestration",
3
+ "version": "2.3.1",
4
+ "description": "AI development toolkit for Claude Code — ideate, resolve, and ship with context engineering and agent orchestration",
5
5
  "homepage": "https://github.com/fysoul17/devlyn-cli#readme",
6
6
  "bin": {
7
7
  "devlyn": "bin/devlyn.js"
@@ -20,13 +20,58 @@
20
20
  "agents-config",
21
21
  "optional-skills",
22
22
  "benchmark/auto-resolve/BENCHMARK-DESIGN.md",
23
+ "benchmark/auto-resolve/BENCHMARK-RESULTS.md",
23
24
  "benchmark/auto-resolve/README.md",
24
25
  "benchmark/auto-resolve/RUBRIC.md",
26
+ "benchmark/auto-resolve/run-real-benchmark.md",
25
27
  "benchmark/auto-resolve/fixtures/SCHEMA.md",
26
28
  "benchmark/auto-resolve/fixtures/F*/**",
29
+ "benchmark/auto-resolve/fixtures/retired/F*/**",
30
+ "benchmark/auto-resolve/shadow-fixtures/S*/**",
27
31
  "benchmark/auto-resolve/fixtures/test-repo/**",
28
32
  "!benchmark/auto-resolve/fixtures/test-repo/node_modules/**",
33
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md",
34
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json",
35
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md",
36
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json",
37
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.md",
38
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.json",
39
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.md",
40
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json",
41
+ "benchmark/auto-resolve/results/20260507-f10-f11-tier1-full-pipeline/headroom-gate.md",
42
+ "benchmark/auto-resolve/results/20260507-f10-f11-tier1-full-pipeline/headroom-gate.json",
43
+ "benchmark/auto-resolve/results/20260508-f22-exact-error-headroom/headroom-gate.md",
44
+ "benchmark/auto-resolve/results/20260508-f22-exact-error-headroom/headroom-gate.json",
45
+ "benchmark/auto-resolve/results/20260508-f26-headroom/headroom-gate.md",
46
+ "benchmark/auto-resolve/results/20260508-f26-headroom/headroom-gate.json",
47
+ "benchmark/auto-resolve/results/20260511-f3-http-error-headroom/headroom-gate.md",
48
+ "benchmark/auto-resolve/results/20260511-f3-http-error-headroom/headroom-gate.json",
49
+ "benchmark/auto-resolve/results/20260511-f12-webhook-headroom/headroom-gate.md",
50
+ "benchmark/auto-resolve/results/20260511-f12-webhook-headroom/headroom-gate.json",
51
+ "benchmark/auto-resolve/results/20260511-f15-concurrency-headroom/headroom-gate.md",
52
+ "benchmark/auto-resolve/results/20260511-f15-concurrency-headroom/headroom-gate.json",
53
+ "benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.md",
54
+ "benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.json",
55
+ "benchmark/auto-resolve/results/20260512-f4-web-headroom/headroom-gate.md",
56
+ "benchmark/auto-resolve/results/20260512-f4-web-headroom/headroom-gate.json",
57
+ "benchmark/auto-resolve/results/20260512-f5-fixloop-headroom/headroom-gate.md",
58
+ "benchmark/auto-resolve/results/20260512-f5-fixloop-headroom/headroom-gate.json",
59
+ "benchmark/auto-resolve/results/20260512-f6-checksum-headroom/headroom-gate.md",
60
+ "benchmark/auto-resolve/results/20260512-f6-checksum-headroom/headroom-gate.json",
61
+ "benchmark/auto-resolve/results/20260512-f7-scope-headroom/headroom-gate.md",
62
+ "benchmark/auto-resolve/results/20260512-f7-scope-headroom/headroom-gate.json",
63
+ "benchmark/auto-resolve/results/20260512-f9-e2e-headroom/headroom-gate.md",
64
+ "benchmark/auto-resolve/results/20260512-f9-e2e-headroom/headroom-gate.json",
65
+ "benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.md",
66
+ "benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.json",
67
+ "benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.md",
68
+ "benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.json",
29
69
  "benchmark/auto-resolve/scripts/**",
70
+ "!**/__pycache__",
71
+ "!**/__pycache__/**",
72
+ "!**/*.pyc",
73
+ "scripts/lint-fixtures.sh",
74
+ "scripts/lint-shadow-fixtures.sh",
30
75
  "scripts/lint-skills.sh",
31
76
  "CLAUDE.md",
32
77
  "AGENTS.md"
@@ -0,0 +1,349 @@
1
#!/usr/bin/env bash
# lint-fixtures.sh — schema validity + structural check for golden fixtures/.
#
# Every location below can be overridden through the matching DEVLYN_*
# environment variable; otherwise it resolves relative to the repository root
# (the parent of the directory that contains this script).

set -e -u -o pipefail

REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
FIXTURES_DIR="${DEVLYN_FIXTURES_DIR:-$REPO_ROOT/benchmark/auto-resolve/fixtures}"
FIXTURE_GLOB="${DEVLYN_FIXTURE_GLOB:-F*}"
RETIRED_FIXTURE_GLOB="${DEVLYN_RETIRED_FIXTURE_GLOB:-F*}"
REJECTED_REGISTRY="${DEVLYN_REJECTED_FIXTURE_REGISTRY:-$REPO_ROOT/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh}"
SCHEMA="${DEVLYN_EXPECTED_SCHEMA:-$REPO_ROOT/config/skills/_shared/expected.schema.json}"
SPEC_VERIFY_CHECK="$REPO_ROOT/config/skills/_shared/spec-verify-check.py"
SOLO_HEADROOM_CHECK="$REPO_ROOT/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py"

# Preconditions: stop with exit status 1 as soon as one required input is absent.
if [ ! -d "$FIXTURES_DIR" ]; then
  echo "✗ $FIXTURES_DIR missing"
  exit 1
fi
if [ ! -f "$SCHEMA" ]; then
  echo "✗ $SCHEMA missing"
  exit 1
fi
if [ ! -f "$SPEC_VERIFY_CHECK" ]; then
  echo "✗ $SPEC_VERIFY_CHECK missing"
  exit 1
fi
if [ ! -f "$SOLO_HEADROOM_CHECK" ]; then
  echo "✗ solo-headroom checker missing: $SOLO_HEADROOM_CHECK"
  exit 1
fi
if [ ! -f "$REJECTED_REGISTRY" ]; then
  echo "✗ rejected fixture registry missing: $REJECTED_REGISTRY"
  exit 1
fi

# The registry is sourced into this shell and must provide the
# rejected_pair_fixture_reason function that the per-fixture checks call.
# shellcheck source=/dev/null
. "$REJECTED_REGISTRY"
if ! declare -F rejected_pair_fixture_reason >/dev/null; then
  echo "✗ rejected fixture registry must define rejected_pair_fixture_reason: $REJECTED_REGISTRY"
  exit 1
fi

# Files every fixture directory (active or retired) is expected to contain.
REQUIRED_FILES=(
  metadata.json
  spec.md
  task.txt
  expected.json
  setup.sh
  NOTES.md
)

# Running totals reported by the summary at the end of the script.
ERRORS=0
COUNT=0
RETIRED_COUNT=0
33
+
34
# Lint every active fixture directory matching $FIXTURE_GLOB under $FIXTURES_DIR.
# Each failed check prints a "✗ <fixture>: ..." line and increments ERRORS;
# COUNT records how many fixture directories were visited.
# $FIXTURE_GLOB is left unquoted on purpose so that it expands as a glob.
for d in "$FIXTURES_DIR"/$FIXTURE_GLOB/; do
  # An unmatched glob is left as a literal word, so skip non-directories.
  [ -d "$d" ] || continue
  COUNT=$((COUNT + 1))
  fid="$(basename "$d")"

  # 1. Every required file must be present.
  for f in "${REQUIRED_FILES[@]}"; do
    if [ ! -f "$d/$f" ]; then
      echo "✗ $fid: missing $f"
      ERRORS=$((ERRORS + 1))
    fi
  done

  if [ -f "$d/metadata.json" ]; then
    # 2. metadata.json "id" must equal the directory name. The path is handed
    #    to Python through sys.argv rather than spliced into the program text,
    #    so quotes or backslashes in the path cannot break or alter the code.
    #    Any failure (unreadable file, bad JSON, missing key) yields an empty id.
    meta_id=$(python3 -c 'import json,sys; print(json.load(open(sys.argv[1]))["id"])' "$d/metadata.json" 2>/dev/null || echo "")
    if [ "$meta_id" != "$fid" ]; then
      echo "✗ $fid: metadata.json id='$meta_id' does not match dir name"
      ERRORS=$((ERRORS + 1))
    fi

    # 3. A fixture whose category is "high-risk" must mention at least one
    #    risk-trigger term in its metadata intent or in spec.md. Unparseable
    #    metadata exits 0 here (the id check above has already reported it).
    python3 - "$d/metadata.json" "$d/spec.md" "$fid" <<'PY' || ERRORS=$((ERRORS + 1))
import json
import re
import sys

metadata_path, spec_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
try:
    metadata = json.load(open(metadata_path, encoding="utf-8"))
except Exception:
    sys.exit(0)
if metadata.get("category") != "high-risk":
    sys.exit(0)
intent = str(metadata.get("intent") or "")
try:
    spec = open(spec_path, encoding="utf-8").read()
except FileNotFoundError:
    spec = ""
text = f"{intent}\n{spec}".lower()
risk_pattern = re.compile(
    r"\b("
    r"auth|authz|permissions?|security|tokens?|sessions?|"
    r"payments?|money|billing|invoices?|pricing|tax|ledger|"
    r"persistence|persist\w*|data mutation|delet\w*|migrations?|"
    r"idempoten\w*|replay|duplicates?|api|webhook|raw-body|signatures?|"
    r"allocation|scheduling|inventory|rollback|transaction|"
    r"priority|error-priority|output-shape|output shape|response-shape|response shape"
    r")\b"
)
if not risk_pattern.search(text):
    print(
        f"✗ {fid}: high-risk fixture must include a resolve risk-trigger term "
        "in metadata intent or spec.md"
    )
    sys.exit(1)
PY
  fi

  # 4. The "id:" line in spec.md must equal the directory name. A failing
  #    extractor is tolerated (|| true) and leaves spec_id empty.
  if [ -f "$d/spec.md" ]; then
    spec_id=$(python3 - "$d/spec.md" <<'PY' 2>/dev/null || true
import re, sys
text = open(sys.argv[1], encoding="utf-8").read()
m = re.search(r'^id:\s*"?([^"\n]+)"?\s*$', text, re.M)
print(m.group(1) if m else "")
PY
)
    if [ "$spec_id" != "$fid" ]; then
      echo "✗ $fid: spec.md frontmatter id='$spec_id' does not match dir name"
      ERRORS=$((ERRORS + 1))
    fi
  fi

  if [ -f "$d/expected.json" ]; then
    # 5. expected.json must parse as a JSON object. If it does not, the
    #    remaining checks for this fixture are skipped via `continue`.
    if ! python3 - "$d/expected.json" "$fid" <<'PY'
import json
import sys

expected_path, fid = sys.argv[1], sys.argv[2]
try:
    data = json.load(open(expected_path, encoding="utf-8"))
except json.JSONDecodeError:
    print(f"✗ {fid}: expected.json is not valid JSON")
    sys.exit(1)
if not isinstance(data, dict):
    print(f"✗ {fid}: expected.json must be an object")
    sys.exit(1)
PY
    then
      ERRORS=$((ERRORS + 1))
      continue
    fi

    # 6. At least one verification command is required.
    n_cmds=$(python3 - "$d/expected.json" <<'PY'
import json
import sys

data = json.load(open(sys.argv[1], encoding="utf-8"))
commands = data.get("verification_commands", [])
print(len(commands) if isinstance(commands, list) else 0)
PY
)
    if [ "$n_cmds" -lt 1 ]; then
      echo "✗ $fid: expected.json has 0 verification_commands (need ≥1)"
      ERRORS=$((ERRORS + 1))
    fi

    # 7. Validate expected.json against $SCHEMA. jsonschema is used when it can
    #    be imported; otherwise (or when DEVLYN_LINT_FIXTURES_NO_JSONSCHEMA=1)
    #    the hand-written fallback_validate() below is used instead.
    schema_ok=1
    if ! python3 - "$SCHEMA" "$d/expected.json" "$fid" <<'PY'
import json, os, sys
schema_path, expected_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
schema = json.load(open(schema_path))
data = json.load(open(expected_path))

def is_string_list(value):
    return isinstance(value, list) and all(isinstance(item, str) and item for item in value)

def fallback_validate():
    allowed = set(schema["properties"])
    errors = []
    if not isinstance(data, dict):
        return ["expected.json must be an object"]
    unknown = sorted(set(data) - allowed)
    if unknown:
        errors.append(f"expected.json has unknown key(s): {', '.join(unknown)}")
    commands = data.get("verification_commands", [])
    if not isinstance(commands, list):
        errors.append("verification_commands must be an array")
    else:
        for idx, command in enumerate(commands):
            if not isinstance(command, dict):
                errors.append(f"verification_commands[{idx}] must be an object")
                continue
            unknown_command = sorted(set(command) - {"cmd", "exit_code", "stdout_contains", "stdout_not_contains", "contract_refs"})
            if unknown_command:
                errors.append(f"verification_commands[{idx}] has unknown key(s): {', '.join(unknown_command)}")
            if not isinstance(command.get("cmd"), str) or not command.get("cmd"):
                errors.append(f"verification_commands[{idx}].cmd must be a non-empty string")
            exit_code = command.get("exit_code", 0)
            if isinstance(exit_code, bool) or not isinstance(exit_code, int):
                errors.append(f"verification_commands[{idx}].exit_code must be an integer")
            for key in ("stdout_contains", "stdout_not_contains", "contract_refs"):
                if key in command and not is_string_list(command[key]):
                    errors.append(f"verification_commands[{idx}].{key} must be an array of non-empty strings")
    patterns = data.get("forbidden_patterns", [])
    if not isinstance(patterns, list):
        errors.append("forbidden_patterns must be an array")
    else:
        for idx, pattern in enumerate(patterns):
            if not isinstance(pattern, dict):
                errors.append(f"forbidden_patterns[{idx}] must be an object")
                continue
            unknown_pattern = sorted(set(pattern) - {"pattern", "description", "files", "severity"})
            if unknown_pattern:
                errors.append(f"forbidden_patterns[{idx}] has unknown key(s): {', '.join(unknown_pattern)}")
            for key in ("pattern", "description"):
                if not isinstance(pattern.get(key), str) or not pattern.get(key):
                    errors.append(f"forbidden_patterns[{idx}].{key} must be a non-empty string")
            if pattern.get("severity") not in {"disqualifier", "warning"}:
                errors.append(f"forbidden_patterns[{idx}].severity must be disqualifier or warning")
            if "files" in pattern and not is_string_list(pattern["files"]):
                errors.append(f"forbidden_patterns[{idx}].files must be an array of non-empty strings")
    for key in ("required_files", "forbidden_files", "tier_a_waivers", "spec_output_files"):
        if key in data and not is_string_list(data[key]):
            errors.append(f"{key} must be an array of non-empty strings")
    max_deps_added = data.get("max_deps_added", 0)
    if isinstance(max_deps_added, bool) or not isinstance(max_deps_added, int) or max_deps_added < 0:
        errors.append("max_deps_added must be an integer >= 0")
    return errors

force_fallback = os.environ.get("DEVLYN_LINT_FIXTURES_NO_JSONSCHEMA") == "1"
try:
    if force_fallback:
        raise ImportError
    import jsonschema
except ImportError:
    fallback_errors = fallback_validate()
    if fallback_errors:
        for error in fallback_errors:
            print(f"✗ {fid}: expected.json schema violation: {error}")
        sys.exit(1)
else:
    try:
        jsonschema.validate(data, schema)
    except jsonschema.ValidationError as e:
        print(f"✗ {fid}: expected.json schema violation: {e.message}")
        sys.exit(1)
PY
    then
      ERRORS=$((ERRORS + 1))
      schema_ok=0
    fi

    # 8. The deeper checks only run once the schema check has passed.
    if [ "$schema_ok" -eq 1 ]; then
      if ! python3 "$SPEC_VERIFY_CHECK" --check "$d/spec.md"; then
        echo "✗ $fid: spec-verify-check --check failed"
        ERRORS=$((ERRORS + 1))
      fi
      if ! python3 "$SPEC_VERIFY_CHECK" --check-expected "$d/expected.json"; then
        echo "✗ $fid: spec-verify-check --check-expected failed"
        ERRORS=$((ERRORS + 1))
      fi

      # Commands that mention BENCH_FIXTURE_DIR ("hidden oracle" commands) must
      # name an explicit file inside the fixture directory, assert '"ok":true'
      # in stdout_contains, and carry contract_refs that occur verbatim in spec.md.
      python3 - "$d/spec.md" "$d/expected.json" "$fid" <<'PY' || ERRORS=$((ERRORS + 1))
import json, pathlib, re, sys
spec_path, expected_path, fid = sys.argv[1], sys.argv[2], sys.argv[3]
spec = open(spec_path, encoding="utf-8").read()
expected = json.load(open(expected_path, encoding="utf-8"))
fixture_dir = pathlib.Path(expected_path).parent
fixture_root = fixture_dir.resolve()
errors = []
for idx, command in enumerate(expected.get("verification_commands", [])):
    cmd = str(command.get("cmd", ""))
    if "BENCH_FIXTURE_DIR" not in cmd:
        continue
    fixture_refs = re.findall(r"(?:\$\{BENCH_FIXTURE_DIR\}|\$BENCH_FIXTURE_DIR)/([^\"'\s]+)", cmd)
    if not fixture_refs:
        errors.append(
            f"verification_commands[{idx}] hidden oracle must reference an explicit $BENCH_FIXTURE_DIR/... file"
        )
    stdout_contains = command.get("stdout_contains", [])
    if '"ok":true' not in stdout_contains:
        errors.append(
            f"verification_commands[{idx}] hidden oracle must assert stdout_contains includes '\"ok\":true'"
        )
    for fixture_ref in fixture_refs:
        target = (fixture_dir / fixture_ref).resolve(strict=False)
        try:
            target.relative_to(fixture_root)
        except ValueError:
            errors.append(
                f"verification_commands[{idx}] BENCH_FIXTURE_DIR file escapes fixture dir: {fixture_ref!r}"
            )
            continue
        if not target.is_file():
            errors.append(
                f"verification_commands[{idx}] BENCH_FIXTURE_DIR file not found: {fixture_ref!r}"
            )
    refs = command.get("contract_refs", [])
    if not refs:
        errors.append(f"verification_commands[{idx}] hidden oracle missing contract_refs")
        continue
    for ref in refs:
        if ref not in spec:
            errors.append(
                f"verification_commands[{idx}] contract_ref not found in spec.md: {ref!r}"
            )
if errors:
    for err in errors:
        print(f"✗ {fid}: {err}")
    sys.exit(1)
PY
    fi
  fi

  # 9. setup.sh, when present, must carry the executable bit.
  if [ -f "$d/setup.sh" ] && [ ! -x "$d/setup.sh" ]; then
    echo "✗ $fid: setup.sh not executable (run: chmod +x $d/setup.sh)"
    ERRORS=$((ERRORS + 1))
  fi

  # 10. NOTES.md that records a failed headroom gate or rejected pair-lift
  #     evidence must be backed by a reason from the sourced registry function.
  if [ -f "$d/NOTES.md" ] \
    && { { grep -Fq 'headroom gate' "$d/NOTES.md" && grep -Eq '`?FAIL`?' "$d/NOTES.md"; } \
      || { grep -Fq 'pair-lift evidence' "$d/NOTES.md" && grep -Eiq 'reject|rejected' "$d/NOTES.md"; }; } \
    && ! rejected_pair_fixture_reason "$fid" >/dev/null 2>&1; then
    echo "✗ $fid: NOTES.md records pair-candidate rejection but pair-rejected-fixtures.sh has no rejected reason"
    ERRORS=$((ERRORS + 1))
  fi

  # 11. NOTES.md that mentions pair_evidence_passed requires the solo-headroom
  #     checker to accept spec.md together with expected.json.
  if [ -f "$d/NOTES.md" ] \
    && grep -Fq 'pair_evidence_passed' "$d/NOTES.md" \
    && ! python3 "$SOLO_HEADROOM_CHECK" --expected-json "$d/expected.json" "$d/spec.md"; then
    echo "✗ $fid: pair_evidence_passed fixture spec.md must document an actionable solo-headroom hypothesis with solo_claude miss and observable command from expected.json"
    ERRORS=$((ERRORS + 1))
  fi
done
306
+
307
# Lint retired fixtures under $FIXTURES_DIR/retired. A retired fixture must keep
# all of its original required files and additionally carry a RETIRED.md.
# $RETIRED_FIXTURE_GLOB is left unquoted on purpose so that it expands as a glob.
for d in "$FIXTURES_DIR"/retired/$RETIRED_FIXTURE_GLOB/; do
  # An unmatched glob is left as a literal word, so skip non-directories.
  [ -d "$d" ] || continue
  RETIRED_COUNT=$((RETIRED_COUNT + 1))
  fid="$(basename "$d")"

  if [ ! -f "$d/RETIRED.md" ]; then
    echo "✗ retired/$fid: missing RETIRED.md"
    ERRORS=$((ERRORS + 1))
  fi

  for f in "${REQUIRED_FILES[@]}"; do
    if [ ! -f "$d/$f" ]; then
      echo "✗ retired/$fid: missing preserved $f"
      ERRORS=$((ERRORS + 1))
    fi
  done

  if [ -f "$d/metadata.json" ]; then
    # The path is passed to Python through sys.argv instead of being spliced
    # into the program text, so quotes or backslashes in the path cannot break
    # or alter the code. Any failure yields an empty id.
    meta_id=$(python3 -c 'import json,sys; print(json.load(open(sys.argv[1]))["id"])' "$d/metadata.json" 2>/dev/null || echo "")
    if [ "$meta_id" != "$fid" ]; then
      echo "✗ retired/$fid: metadata.json id='$meta_id' does not match dir name"
      ERRORS=$((ERRORS + 1))
    fi
  fi

  # setup.sh, when present, must carry the executable bit.
  if [ -f "$d/setup.sh" ] && [ ! -x "$d/setup.sh" ]; then
    echo "✗ retired/$fid: setup.sh not executable (run: chmod +x $d/setup.sh)"
    ERRORS=$((ERRORS + 1))
  fi
done
337
+
338
# Final verdict. Finding no active fixture at all is treated as a failure.
if [ "$COUNT" -eq 0 ]; then
  echo "✗ no fixtures found in $FIXTURES_DIR"
  exit 1
fi

# Any recorded error fails the run; otherwise print the success summary.
if [ "$ERRORS" -gt 0 ]; then
  echo ""
  echo "✗ lint-fixtures: $ERRORS error(s) across $COUNT active fixture(s) and $RETIRED_COUNT retired fixture(s)"
  exit 1
else
  echo "✓ lint-fixtures: $COUNT active fixture(s) passed schema + structural checks; $RETIRED_COUNT retired fixture(s) preserved"
fi
@@ -0,0 +1,58 @@
1
#!/usr/bin/env bash
# lint-shadow-fixtures.sh — run the standard fixture lint over shadow-fixtures/.

set -e -u -o pipefail

# Repository root is the parent of the directory that contains this script.
REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
SHADOW_FIXTURES_DIR="${DEVLYN_SHADOW_FIXTURES_DIR:-$REPO_ROOT/benchmark/auto-resolve/shadow-fixtures}"

# Re-use lint-fixtures.sh, pointing it at the shadow directory and the S* naming
# scheme. With `set -e` in effect, a failing lint run stops this script here.
DEVLYN_FIXTURES_DIR="$SHADOW_FIXTURES_DIR" DEVLYN_FIXTURE_GLOB="S*" DEVLYN_RETIRED_FIXTURE_GLOB="S*" \
  bash "$REPO_ROOT/scripts/lint-fixtures.sh"
13
+
14
# Run the solo-headroom hypothesis checker under $REPO_ROOT.
# Arguments: forwarded unchanged to the checker script.
# Returns:   the exit status of the python3 invocation.
has_actionable_solo_headroom_hypothesis() {
  local checker="$REPO_ROOT/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py"
  python3 "$checker" "$@"
}
17
+
18
# Run the solo-ceiling-avoidance checker under $REPO_ROOT on one notes file.
# Arguments: $1 - path handed to the checker (only this argument is forwarded).
# Returns:   the exit status of the python3 invocation.
has_solo_ceiling_avoidance_note() {
  local notes_path="$1"
  local checker="$REPO_ROOT/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py"
  python3 "$checker" "$notes_path"
}
22
+
23
# Extra shadow-only checks: a high-risk shadow fixture whose NOTES.md does not
# record a failed headroom run must document both a solo-headroom hypothesis
# and a solo-ceiling-avoidance note.
errors=0
for d in "$SHADOW_FIXTURES_DIR"/S*/; do
  # An unmatched glob is left as a literal word, so skip non-directories.
  if [ ! -d "$d" ]; then
    continue
  fi
  fid="$(basename "$d")"
  meta="$d/metadata.json"
  spec="$d/spec.md"
  notes="$d/NOTES.md"

  # Flag fixtures whose notes mention "headroom" together with FAIL.
  has_failed_headroom=0
  if [ -f "$notes" ]; then
    if grep -Fq 'headroom' "$notes" && grep -Eq '`?FAIL`?' "$notes"; then
      has_failed_headroom=1
    fi
  fi

  # Read the metadata category; a missing metadata.json yields an empty string.
  category="$(
    python3 - "$meta" <<'PY'
import json
import sys

try:
    with open(sys.argv[1], encoding="utf-8") as handle:
        print(json.load(handle).get("category", ""))
except FileNotFoundError:
    print("")
PY
  )"

  # Only high-risk fixtures without a recorded headroom failure are checked further.
  if [ "$category" != "high-risk" ] || [ "$has_failed_headroom" -ne 0 ]; then
    continue
  fi

  if ! has_actionable_solo_headroom_hypothesis --expected-json "$d/expected.json" "$spec"; then
    echo "✗ $fid: unmeasured high-risk shadow fixture spec.md must document a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend"
    errors=$((errors + 1))
  fi
  if ! has_solo_ceiling_avoidance_note "$notes"; then
    echo "✗ $fid: unmeasured high-risk shadow fixture NOTES.md must include ## Solo ceiling avoidance naming how it differs from solo-saturated controls before provider spend"
    errors=$((errors + 1))
  fi
done

# Exit non-zero when any shadow-specific check failed.
if [ "$errors" -ne 0 ]; then
  exit 1
fi