devlyn-cli 2.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138)
  1. package/CLAUDE.md +1 -1
  2. package/README.md +1 -1
  3. package/benchmark/auto-resolve/README.md +318 -2
  4. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  12. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  18. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  25. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  31. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  40. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  48. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  56. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  64. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  73. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  82. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  91. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  100. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  101. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  102. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  103. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  104. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  105. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  106. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  107. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  110. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  111. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  112. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  113. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  114. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  116. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  117. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  118. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  119. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  120. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  121. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  122. package/bin/devlyn.js +56 -10
  123. package/config/skills/_shared/archive_run.py +3 -0
  124. package/config/skills/_shared/codex-config.md +2 -2
  125. package/config/skills/_shared/codex-monitored.sh +72 -7
  126. package/config/skills/_shared/collect-codex-findings.py +125 -0
  127. package/config/skills/_shared/engine-preflight.md +1 -1
  128. package/config/skills/_shared/expected.schema.json +18 -0
  129. package/config/skills/_shared/spec-verify-check.py +312 -10
  130. package/config/skills/_shared/verify-merge-findings.py +327 -0
  131. package/config/skills/devlyn:ideate/SKILL.md +1 -1
  132. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  133. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  134. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  135. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  136. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  137. package/package.json +1 -1
  138. package/scripts/lint-skills.sh +32 -0
@@ -21,7 +21,7 @@ You do NOT receive: PLAN, IMPLEMENT's reasoning, BUILD_GATE's findings, CLEANUP'
21
21
 
22
22
  Re-run the mechanical checks fresh, independent of BUILD_GATE's earlier run:
23
23
 
24
- 1. `python3 .claude/skills/_shared/spec-verify-check.py` against the post-CLEANUP code.
24
+ 1. `python3 .claude/skills/_shared/spec-verify-check.py --include-risk-probes` against the post-CLEANUP code.
25
25
  2. Re-scan `spec.expected.json.forbidden_patterns` against the diff (Python re.search; honor each pattern's `files` allowlist).
26
26
  3. Confirm `required_files` exist post-diff; confirm `forbidden_files` do not appear in the diff.
27
27
  4. Confirm `max_deps_added` is not exceeded (`git diff -- package.json` for Node; equivalent for other ecosystems).
@@ -39,23 +39,175 @@ Grade the diff against the spec on rubric axes:
39
39
 
40
40
  For each finding, write file:line evidence. Do not paraphrase code; quote it.
41
41
 
42
+ **Clause-level check**: split each Requirement into its binding clauses before
43
+ you pass it. Words like `before`, `after`, `once`, `always`, `never`,
44
+ `regardless`, `irrelevant`, `permanent`, `idempotent`, `duplicate`, `raw`, and
45
+ `signature` usually encode a separate invariant. A passing verification command
46
+ proves only the case it actually exercises; it does not prove neighboring
47
+ clauses. For stateful, auth, parsing, idempotency, rollback, and error-priority
48
+ flows, construct at least one counterexample in your head and trace the code
49
+ order, including failed-operation rollback and the next entity's state. If the
50
+ code order can return the wrong status/body/output for a binding clause, emit a
51
+ HIGH spec-compliance finding even when the provided verifier passes.
52
+
53
+ Respect each clause's scope qualifiers. Do not widen an invariant beyond the
54
+ words in the spec: `inside a warehouse`, `per resource`, `for this line`,
55
+ `after validation`, and similar qualifiers are binding. When two ordering rules
56
+ coexist, compose them in the stated order instead of inventing a stronger global
57
+ ordering. A finding based on a widened invariant is a false positive and must
58
+ not drive the fix loop.
59
+
60
+ **Interaction check**: for high-complexity specs, one-axis examples are not
61
+ enough. Construct at least one adversarial scenario that combines two or more
62
+ explicit verification bullets. Prioritize combinations such as
63
+ ordering/priority + blocked interval/failure, ordering/priority +
64
+ all-or-nothing rollback + later entity state, validation/error-priority +
65
+ stdout/stderr contract, or auth/idempotency + duplicate/replay ordering. If the
66
+ implementation only passes isolated examples but fails the combined scenario,
67
+ emit a HIGH finding tied to all relevant spec clauses.
68
+
69
+ For high-complexity specs, execute at least one combined adversarial check with
70
+ the repo's existing CLI/API/test runner before declaring PASS. Use a temporary
71
+ script or inline command that leaves no tracked files behind. The check must
72
+ cross two or more explicit verification bullets, not merely repeat the visible
73
+ acceptance command. If the command exposes a mismatch, emit a HIGH finding with
74
+ the command, expected output/state, and actual output/state.
75
+
42
76
  **Coverage check**: before declaring done, confirm you have evidence for every spec axis. If you could not exercise an axis (the spec asks for behavior X but the diff does not touch the code that produces X), set `state.verify.coverage_failed: true` and surface the missing-evidence finding rather than passing on assumption.
43
77
 
78
+ **Verdict-binding severity check**: HIGH/CRITICAL findings are always
79
+ verdict-binding. A MEDIUM finding is also verdict-binding when it identifies a
80
+ concrete behavioral regression against the visible spec, an existing public
81
+ contract, or an existing test contract. Examples: a previously valid input now
82
+ errors, duplicate/idempotent handling regresses, warning/error semantics change
83
+ for a real API path, or a focused existing regression test would fail. Advisory
84
+ design/style concerns remain non-binding MEDIUM and produce `PASS_WITH_ISSUES`.
85
+
44
86
  **Anti-self-filter rule**: report every finding you observe, including ones you consider low-severity or low-confidence. Tag each with `confidence: high|medium|low` and let the harness's downstream filter rank them. Filtering at this stage suppresses recall.
45
87
 
46
88
  ### Pair-mode (when triggered by orchestrator)
47
89
 
48
- When the orchestrator spawns a second VERIFY agent with the OTHER engine's adapter, both judgments are merged:
90
+ Pair-mode is eligible only when MECHANICAL reports no HIGH/CRITICAL findings.
91
+ Deterministic blockers already decide the verdict and route to the fix loop; a
92
+ second judge there duplicates evidence and wastes wall-time. If MECHANICAL has
93
+ a HIGH/CRITICAL finding, record `pair_judge: null` and do not spawn the second
94
+ VERIFY agent.
95
+
96
+ When eligible, trigger pair-mode if any of these are true:
97
+ - `--pair-verify` was set.
98
+ - The spec frontmatter has `complexity: high`.
99
+ - `state.complexity` is `"high"` or `"large"`.
100
+ - MECHANICAL emitted warning-level findings but no HIGH/CRITICAL blockers.
101
+ - `state.verify.coverage_failed == true`.
102
+
103
+ Before JUDGE spawn, compute and persist:
104
+
105
+ ```json
106
+ "pair_trigger": {
107
+ "eligible": true,
108
+ "reasons": ["complexity.high"],
109
+ "skipped_reason": null
110
+ }
111
+ ```
112
+
113
+ If `eligible == true` and `reasons` is non-empty, the OTHER-engine judge is
114
+ mandatory. Skipping it is a VERIFY contract violation. If ineligible, record the
115
+ reason in `skipped_reason`, e.g. `"mechanical_blocker"`.
116
+
117
+ The `--engine` flag never disables this rule. Explicit `--engine claude` means
118
+ Claude is the primary judge; if pair-mode triggers, Codex is still the mandatory
119
+ OTHER-engine judge. Do not record "explicit --engine claude" as a skip reason.
120
+ The only valid skip reasons after a non-empty eligible trigger are deterministic
121
+ MECHANICAL HIGH/CRITICAL blockers or Codex unavailability proven by the
122
+ invocation layer.
123
+
124
+ When eligible and the orchestrator spawns a second VERIFY agent with the OTHER engine's adapter, both judgments are merged:
49
125
  - Any HIGH/CRITICAL finding either model surfaces is verdict-binding.
50
- - Lower-severity disagreements are logged but do not change the verdict.
126
+ - Any high-confidence MEDIUM finding either model surfaces is also
127
+ verdict-binding when it identifies a concrete behavioral regression against
128
+ the spec, public contract, or existing test contract. This includes
129
+ duplicate/idempotent/order-preservation regressions and real warning/error
130
+ behavior changes. Do not downgrade these to advisory simply because they are
131
+ not HIGH.
132
+ - Other lower-severity disagreements are logged but do not change the verdict.
51
133
  - The orchestrator handles merge; you only emit your own findings.
134
+ - The second judge's job is to act as an adversarial complement, not to produce a duplicate summary:
135
+ prioritize the two highest-risk explicit `## Verification` bullets that cross
136
+ state mutation, all-or-nothing rollback, ordering, idempotency, auth, or
137
+ error-priority clauses. The primary judge owns broad coverage; the pair judge
138
+ is a bounded adversarial complement. Do not read `.claude/skills`,
139
+ `.codex/skills`, `CLAUDE.md`, `AGENTS.md`, or other harness docs unless the
140
+ orchestrator pasted a specific excerpt into the prompt. Use only the spec,
141
+ diff, implementation files, tests, and the repo's CLI/API/test runner.
142
+ Execute at most two targeted probes before first output. Stop immediately
143
+ after the first verdict-binding finding and emit JSONL. If both probes pass
144
+ and static scope/dependency checks show no blocker, emit PASS; do not continue
145
+ exhaustive exploration.
146
+ A targeted probe must compare the full externally visible result
147
+ (stdout/stderr/exit and full parsed output object, including accepted/scheduled
148
+ rows, rejected rows, and remaining state when present), not just a single
149
+ property. For priority/stateful specs, at least one probe must include an
150
+ earlier input entity that would succeed under input-order processing, a later
151
+ higher-priority entity that consumes or blocks the critical resource, and a
152
+ failure/blocked/rollback edge that determines a later entity's state. This is
153
+ the minimum compound shape for priority + failure/state-mutation bugs.
154
+ Scope qualifiers are binding for the pair judge too: do not reinterpret
155
+ `inside a warehouse`, `per resource`, or line-scoped rules as global rules.
156
+ If a candidate finding depends on that widening, emit PASS for that probe and
157
+ use the second bounded probe for a different explicit clause.
158
+ When both priority ordering and rollback/blocked-interval behavior appear in
159
+ the spec, this dominance-loss probe is mandatory and comes before any other
160
+ probe: an earlier lower-priority entity that would succeed alone or under
161
+ input-order processing must lose because a later higher-priority entity is
162
+ processed first; a failed/blocked middle entity must not corrupt later state;
163
+ and the assertion must cover the complete output ordering for both accepted
164
+ (or scheduled) and rejected rows.
165
+
166
+ Codex pair-JUDGE is read-only. Invoke `codex-monitored.sh` directly with
167
+ `-c model_reasoning_effort=medium`; this phase is a bounded two-probe review,
168
+ not an unbounded implementation task. Do not pipe it to `tail`, `head`, `grep`,
169
+ `sed`, or `awk`. Capture stdout/stderr by direct tool capture or file
170
+ redirection. The Codex judge must return JSONL
171
+ findings on stdout; the orchestrator writes `.devlyn/verify.pair.findings.jsonl`
172
+ and merges verdicts. Do not ask Codex to `apply_patch` or edit `.devlyn`.
173
+ The Codex prompt must include a bounded-output contract: no harness-doc reads,
174
+ maximum two targeted probes before first output, stop on the first
175
+ verdict-binding finding, and emit PASS immediately after the bounded checks pass.
176
+ If stdout is first captured to `.devlyn/codex-judge.stdout`, run
177
+ `python3 .claude/skills/_shared/collect-codex-findings.py` before merge. That
178
+ script is the deterministic boundary writer for
179
+ `.devlyn/verify.pair.findings.jsonl`.
180
+ If raw Codex stdout is captured as `.devlyn/codex-judge.stdout`,
181
+ `verify-merge-findings.py` treats it as a diagnostic only. If stdout contains
182
+ findings or a non-PASS summary while `.devlyn/verify.pair.findings.jsonl` is
183
+ empty, VERIFY is `BLOCKED` for `verify.pair.emission-contract`; do not pass or
184
+ silently recover from a broken capture contract.
185
+
186
+ After all VERIFY findings files are written, run:
187
+
188
+ ```bash
189
+ python3 .claude/skills/_shared/verify-merge-findings.py --write-state
190
+ ```
191
+
192
+ This deterministic merge is the routing source of truth for VERIFY. It writes
193
+ `.devlyn/verify-merged.findings.jsonl`, `.devlyn/verify-merge.summary.json`, and
194
+ updates `state.phases.verify.{verdict,sub_verdicts,merged}`. Branch on the
195
+ merged state verdict, not on either model's prose verdict. Any HIGH/CRITICAL
196
+ finding from either judge is `NEEDS_WORK`; a high-confidence MEDIUM must set
197
+ `verdict_binding: true` to become `NEEDS_WORK`.
198
+
199
+ Do not create, edit, truncate, or summarize `.devlyn/verify-merged.findings.jsonl`
200
+ or `.devlyn/verify-merge.summary.json` by hand. Those files have exactly one
201
+ writer: `verify-merge-findings.py`. If that command fails, preserve stderr and set
202
+ VERIFY to `BLOCKED`; do not synthesize merge artifacts in prose.
52
203
 
53
204
  </sub_phases>
54
205
 
55
206
  <output>
56
207
  - `.devlyn/verify-mechanical.findings.jsonl` — MECHANICAL findings.
57
208
  - `.devlyn/verify.findings.jsonl` — JUDGE findings.
58
- - `state.phases.verify.{verdict, completed_at, duration_ms, sub_verdicts: {mechanical, judge}, artifacts}`. Verdict: WORSE of the two sub-verdicts. `PASS` requires zero CRITICAL/HIGH findings AND coverage met.
209
+ - `.devlyn/verify-merged.findings.jsonl` and `.devlyn/verify-merge.summary.json` — deterministic merge artifacts.
210
+ - `state.phases.verify.{verdict, completed_at, duration_ms, sub_verdicts: {mechanical, judge, pair_judge?}, merged, artifacts}`. Verdict: result of `verify-merge-findings.py`. `PASS` requires zero CRITICAL/HIGH findings, zero verdict-binding MEDIUM regressions, and coverage met.
59
211
  </output>
60
212
 
61
213
  <quality_bar>
@@ -28,13 +28,14 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
28
28
  ],
29
29
  "phases": {
30
30
  "plan": null,
31
+ "probe_derive": null,
31
32
  "implement": null,
32
33
  "build_gate": null,
33
34
  "cleanup": null,
34
35
  "verify": null,
35
36
  "final_report": null
36
37
  },
37
- "verify": { "coverage_failed": false }
38
+ "verify": { "coverage_failed": false, "pair_trigger": null }
38
39
  }
39
40
  ```
40
41
 
@@ -45,14 +46,16 @@ Single authoritative verdict source for `/devlyn:resolve`. The orchestrator bran
45
46
  - **complexity** — `null | "trivial" | "medium" | "large"`. Free-form mode populates this; spec/verify-only mode leaves it null.
46
47
  - **engine** — `"claude" | "codex" | "auto"` initially; rewritten by engine-preflight if a downgrade fired.
47
48
  - **rounds.global** — incremented every fix-loop pass (BUILD_GATE → fix-loop OR VERIFY → fix-loop).
49
+ - **phases.probe_derive** — optional PHASE 1.5 entry when `--risk-probes` is enabled. Artifacts include `.devlyn/risk-probes.jsonl`. Probe failures later surface through BUILD_GATE/VERIFY as `correctness.risk-probe-failed`.
48
50
  - **bypasses** — array of phase names from `--bypass`. Valid: `"build-gate" | "cleanup"`. PLAN, IMPLEMENT, VERIFY are non-bypassable (orchestrator rejects at parse time).
49
51
  - **implement_passed_sha** — captured at end of PHASE 2; null until then. Activates the post-implement invariant for CLEANUP and VERIFY.
50
52
  - **criteria** — generated from spec's `## Requirements` checklist (one per `- [ ]`). `status: pending → implemented` is the legal transition. `failed_by_finding_ids` populates when VERIFY surfaces a finding tied to a criterion.
51
- - **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set.
53
+ - **verify.coverage_failed** — set by VERIFY's JUDGE sub-phase when a spec axis could not be exercised against the diff. Triggers pair-mode escalation when set. Pair-mode also triggers for `complexity: high` specs or `state.complexity` of `"high"`/`"large"` when MECHANICAL has no HIGH/CRITICAL blockers.
54
+ - **verify.pair_trigger** — VERIFY's trigger decision: `{ "eligible": boolean, "reasons": string[], "skipped_reason": string|null }`. If eligible with any reason, `pair_judge` must be non-null.
52
55
 
53
56
  ## Per-phase shape
54
57
 
55
- Each entry under `phases.<name>` (for `plan`, `implement`, `build_gate`, `cleanup`, `verify`, `final_report`):
58
+ Each entry under `phases.<name>` (for `plan`, `probe_derive`, `implement`, `build_gate`, `cleanup`, `verify`, `final_report`):
56
59
 
57
60
  ```json
58
61
  {
@@ -73,7 +76,10 @@ Each entry under `phases.<name>` (for `plan`, `implement`, `build_gate`, `cleanu
73
76
  - `verdict` — `"PASS" | "PASS_WITH_ISSUES" | "FAIL" | "NEEDS_WORK" | "BLOCKED"`. PHASE 6 (FINAL_REPORT) writes its own verdict per the terminal-verdict precedence.
74
77
  - `triggered_by` — null on first run; one of `"build_gate" | "verify"` when the phase is a fix-loop respawn.
75
78
  - `pre_sha` — captured by orchestrator before CLEANUP and (if needed) other allowlist-enforced phases. Used to validate the post-spawn diff.
76
- - `sub_verdicts` — only populated for VERIFY: `{ "mechanical": "PASS|FAIL", "judge": "PASS|...", "pair_judge": "PASS|..." | null }`.
79
+ - `sub_verdicts` — only populated for VERIFY: `{ "mechanical": "PASS|FAIL", "judge": "PASS|...", "pair_judge": "PASS|..." | null }`. Values are normalized by `verify-merge-findings.py`; model prose verdicts cannot upgrade or downgrade the deterministic findings-derived verdict.
80
+ - `merged` — only populated for VERIFY after `verify-merge-findings.py --write-state`: `{ "verdict": "...", "findings_file": ".devlyn/verify-merged.findings.jsonl", "summary_file": ".devlyn/verify-merge.summary.json" }`.
81
+ - `pair_trigger` — only populated for VERIFY; same shape as top-level `verify.pair_trigger` when the phase stores it locally.
82
+ - `correctness.risk-probe-failed` — emitted by `spec-verify-check.py --include-risk-probes` when an executable probe derived from the visible `## Verification` section fails.
77
83
 
78
84
  ## Write protocol
79
85
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "devlyn-cli",
3
- "version": "2.0.0",
3
+ "version": "2.2.0",
4
4
  "description": "AI development toolkit for Claude Code — ideate, auto-resolve, and ship with context engineering and agent orchestration",
5
5
  "homepage": "https://github.com/fysoul17/devlyn-cli#readme",
6
6
  "bin": {
@@ -35,6 +35,7 @@ offenders=$(grep -RIln 'mcp__codex-cli__' \
35
35
  | grep -v 'config/skills/devlyn:auto-resolve-workspace/' \
36
36
  | grep -v 'config/skills/devlyn:ideate-workspace/' \
37
37
  | grep -v 'config/skills/preflight-workspace/' \
38
+ | grep -v 'benchmark/auto-resolve/external/' \
38
39
  | grep -v 'benchmark/auto-resolve/PILOT-RESULTS' \
39
40
  || true)
40
41
  if [ -z "$offenders" ]; then
@@ -53,6 +54,7 @@ offenders=$(grep -RIln 'Requires Codex MCP\|Codex MCP server\|Codex MCP availabl
53
54
  | grep -v 'config/skills/devlyn:auto-resolve-workspace/' \
54
55
  | grep -v 'config/skills/devlyn:ideate-workspace/' \
55
56
  | grep -v 'config/skills/preflight-workspace/' \
57
+ | grep -v 'benchmark/auto-resolve/external/' \
56
58
  | grep -v 'benchmark/auto-resolve/PILOT-RESULTS' \
57
59
  || true)
58
60
  if [ -z "$offenders" ]; then
@@ -127,6 +129,8 @@ else
127
129
  # plus the `_shared/` kernel.
128
130
  for rel in \
129
131
  _shared/spec-verify-check.py \
132
+ _shared/collect-codex-findings.py \
133
+ _shared/verify-merge-findings.py \
130
134
  devlyn:ideate/SKILL.md \
131
135
  devlyn:ideate/references/spec-template.md \
132
136
  devlyn:ideate/references/elicitation.md \
@@ -136,6 +140,7 @@ else
136
140
  devlyn:resolve/references/state-schema.md \
137
141
  devlyn:resolve/references/free-form-mode.md \
138
142
  devlyn:resolve/references/phases/plan.md \
143
+ devlyn:resolve/references/phases/probe-derive.md \
139
144
  devlyn:resolve/references/phases/implement.md \
140
145
  devlyn:resolve/references/phases/build-gate.md \
141
146
  devlyn:resolve/references/phases/cleanup.md \
@@ -171,6 +176,33 @@ else
171
176
  fi
172
177
  fi
173
178
 
179
+ # ---------------------------------------------------------------------------
180
+ # 6b. VERIFY merge verdict binding self-test.
181
+ # F23 full-pipeline prompt-fix rerun exposed a real failure where Codex
182
+ # pair-JUDGE emitted HIGH findings but state kept pair_judge as
183
+ # PASS_WITH_ISSUES. Routing severity must be deterministic, not prose.
184
+ # ---------------------------------------------------------------------------
185
+ section "Check 6b: VERIFY merge makes pair HIGH verdict-binding"
186
+ if python3 config/skills/_shared/verify-merge-findings.py --self-test >/dev/null 2>&1; then
187
+ ok "verify-merge-findings.py self-test passed"
188
+ else
189
+ bad "verify-merge-findings.py self-test failed"
190
+ fi
191
+
192
+ section "Check 6c: Codex stdout collection writes canonical pair findings"
193
+ if python3 config/skills/_shared/collect-codex-findings.py --self-test >/dev/null 2>&1; then
194
+ ok "collect-codex-findings.py self-test passed"
195
+ else
196
+ bad "collect-codex-findings.py self-test failed"
197
+ fi
198
+
199
+ section "Check 6d: Spec verification executes hidden-blind risk probes"
200
+ if python3 config/skills/_shared/spec-verify-check.py --self-test >/dev/null 2>&1; then
201
+ ok "spec-verify-check.py risk-probe self-test passed"
202
+ else
203
+ bad "spec-verify-check.py risk-probe self-test failed"
204
+ fi
205
+
174
206
  # ---------------------------------------------------------------------------
175
207
  # 8. CRITIC security sub-pass must be native, not Dual.
176
208
  # Catches the specific drift where a section updates but a cross-reference doesn't.