devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -14,15 +14,18 @@ Default mode (BUILD_GATE invocation, no args):
14
14
  benchmark truth is `commits`/`authors`) silently overwrote the
15
15
  authoritative benchmark contract. For benchmarks, expected.json is
16
16
  canonical.
17
- (2) Otherwise, source markdown extract read `pipeline.state.json:
18
- source.{spec_path | criteria_path}` and extract a `## Verification`
17
+ (2) Otherwise, real-user spec mode first reads sibling `spec.expected.json`
18
+ next to `spec.md`; if it exists, validate it and stage its
19
+ `verification_commands`. A malformed sibling fails closed. If absent,
20
+ fall back to source markdown extract.
21
+ (3) For generated criteria and legacy handwritten specs without a sibling,
22
+ source markdown extract reads `pipeline.state.json:
23
+ source.{spec_path | criteria_path}` and extracts a `## Verification`
19
24
  ```json``` block. If present, overwrite `.devlyn/spec-verify.json`.
20
- This is the real-user carrier path; a pre-existing file from a
21
- killed prior run is stale and must not be trusted in real-user mode.
22
- (3) If no json block in source AND source.type=="generated": emit
25
+ (4) If no json block in source AND source.type=="generated": emit
23
26
  CRITICAL `correctness.spec-verify-malformed` so the fix-loop reruns
24
27
  BUILD.
25
- (4) If no json block in source AND source.type=="spec": benchmark mode
28
+ (5) If no sibling/json block in source AND source.type=="spec": benchmark mode
26
29
  with a pre-staged file would have hit branch (1). Without the
27
30
  pre-staged file, benchmark falls through to no-op (rare — fixture
28
31
  mis-config). Real-user mode silent no-op + drops any stale
@@ -35,11 +38,28 @@ Default mode (BUILD_GATE invocation, no args):
35
38
 
36
39
  Check mode (`--check <markdown_path>`):
37
40
  - Used by /devlyn:ideate after writing each item spec to validate that the
38
- generated `## Verification` ```json``` block parses + matches the schema.
41
+ generated `## Verification` ```json``` block parses + matches the schema,
42
+ and that present `complexity` frontmatter has a supported value.
39
43
  - Exits 0 if the block is well-formed (or absent — ideate's check applies
40
44
  to both new specs that include the block and pre-carrier handwritten
41
45
  specs that omit it; absence is not failure here, only malformed JSON or
42
- shape error is). Exits 2 on malformed json or shape error.
46
+ shape error is). Exits 2 on malformed json, shape error, or unsupported
47
+ `complexity` value.
48
+
49
+ Expected-contract check mode (`--check-expected <json_path>`):
50
+ - Used by /devlyn:ideate after writing sibling `spec.expected.json`.
51
+ - Exits 0 if the file is valid JSON and matches `_shared/expected.schema.json`
52
+ shape, and if sibling `spec.md` has supported `complexity` frontmatter.
53
+ Exits 2 on unreadable, malformed, unsupported fields, or unsupported sibling
54
+ spec complexity.
55
+
56
+ Output routing:
57
+ - Default BUILD_GATE output writes `.devlyn/spec-verify-findings.jsonl` with
58
+ `phase: build_gate` and `BGATE-*` ids.
59
+ - VERIFY may set `SPEC_VERIFY_PHASE=verify_mechanical`,
60
+ `SPEC_VERIFY_FINDINGS_FILE=verify-mechanical.findings.jsonl`, and
61
+ `SPEC_VERIFY_FINDING_PREFIX=VERIFY-MECH` so `verify-merge-findings.py` consumes
62
+ deterministic blockers directly.
43
63
 
44
64
  Why: iter-0018.5's prompt-only contract enforcement was empirically dead
45
65
  (F9 verify=0.4 across all engines in iter-0019). Same lesson as iter-0008
@@ -50,11 +70,15 @@ markdown directly — closes NORTH-STAR test #14.
50
70
 
51
71
  Exit codes:
52
72
  - 0: silent no-op (no source carrier, real-user mode) OR --check passed
53
- OR all commands passed.
54
- - 1: at least one command failed OR carrier malformed (generated source
73
+ OR all commands passed. Non-blocking expected-contract findings may be
74
+ written with exit 0.
75
+ - 1: at least one command failed, carrier malformed (generated source
55
76
  required carrier, generated source had invalid json/shape, or pre-staged
56
- file failed shape validation). All paths emit a CRITICAL finding to
57
- `.devlyn/spec-verify-findings.jsonl`.
77
+ file failed shape validation), or a blocking expected-contract finding
78
+ was emitted. Findings are written to the routed `.devlyn/` findings file:
79
+ `.devlyn/spec-verify-findings.jsonl` by default, or the file selected by
80
+ `SPEC_VERIFY_FINDINGS_FILE` (for example, VERIFY uses
81
+ `.devlyn/verify-mechanical.findings.jsonl`).
58
82
  - 2: invocation error (unreadable spec-verify.json, missing markdown in
59
83
  --check mode, etc.)
60
84
  """
@@ -62,6 +86,7 @@ Exit codes:
62
86
  from __future__ import annotations
63
87
 
64
88
  import json
89
+ import hashlib
65
90
  import os
66
91
  import re
67
92
  import subprocess
@@ -70,6 +95,26 @@ import tempfile
70
95
  from pathlib import Path
71
96
 
72
97
 
98
+ def reject_json_constant(token: str) -> None:
99
+ raise ValueError(f"invalid JSON numeric constant: {token}")
100
+
101
+
102
+ def loads_strict_json(text: str):
103
+ return json.loads(text, parse_constant=reject_json_constant)
104
+
105
+
106
+ def output_phase() -> str:
107
+ return os.environ.get("SPEC_VERIFY_PHASE", "build_gate")
108
+
109
+
110
+ def output_findings_name() -> str:
111
+ return os.environ.get("SPEC_VERIFY_FINDINGS_FILE", "spec-verify-findings.jsonl")
112
+
113
+
114
+ def output_finding_prefix() -> str:
115
+ return os.environ.get("SPEC_VERIFY_FINDING_PREFIX", "BGATE")
116
+
117
+
73
118
  VERIFICATION_SECTION_RE = re.compile(
74
119
  r'(?ms)^##[ \t]+Verification\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)'
75
120
  )
@@ -78,6 +123,39 @@ FORBIDDEN_RISK_PROBE_CMD_RE = re.compile(
78
123
  r'BENCH_FIXTURE_DIR|benchmark/auto-resolve/fixtures|/verifiers/|verifiers/'
79
124
  )
80
125
  EXTERNAL_URL_RE = re.compile(r"https?://([^/\s\"']+)", re.IGNORECASE)
126
+ INLINE_JSON_OBJECT_RE = re.compile(r'`?\{\s*"[^"\n]+"\s*:', re.IGNORECASE)
127
+ BACKTICKED_TEXT_RE = re.compile(r"`[^`\n]+`")
128
+ OBSERVABLE_COMMAND_MARKERS = ("command", "observable", "expose")
129
+ RESERVED_BACKTICK_TERMS = {"solo-headroom hypothesis", "solo_claude", "miss"}
130
+ SOLO_CEILING_CONTROL_RE = re.compile(
131
+ r'\bS[2-6]\b|S2-S6|solo-saturated|rejected controls?|solo ceiling',
132
+ re.IGNORECASE,
133
+ )
134
+ SOLO_CEILING_DIFFERENCE_RE = re.compile(
135
+ r'\bdiffer(?:s|ent|ence)?\b|\bunlike\b|\bbecause\b|\bpreserve\b|\bheadroom\b',
136
+ re.IGNORECASE,
137
+ )
138
+ COMMAND_PREFIXES = {
139
+ "bash",
140
+ "bun",
141
+ "cargo",
142
+ "git",
143
+ "go",
144
+ "jest",
145
+ "make",
146
+ "node",
147
+ "npm",
148
+ "pnpm",
149
+ "printf",
150
+ "pytest",
151
+ "python",
152
+ "python3",
153
+ "ruff",
154
+ "sh",
155
+ "uv",
156
+ "vitest",
157
+ "yarn",
158
+ }
81
159
  LOCAL_URL_HOSTS = {
82
160
  'localhost',
83
161
  '127.0.0.1',
@@ -93,6 +171,11 @@ RISK_PROBE_TAGS = {
93
171
  "positive_remaining",
94
172
  "stdout_stderr_contract",
95
173
  "error_contract",
174
+ "http_error_contract",
175
+ "auth_signature_contract",
176
+ "idempotency_replay",
177
+ "concurrent_state_consistency",
178
+ "atomic_batch_state",
96
179
  "shape_contract",
97
180
  }
98
181
  RISK_PROBE_REQUIRED_EVIDENCE = {
@@ -117,7 +200,59 @@ RISK_PROBE_REQUIRED_EVIDENCE = {
117
200
  "asserts_full_remaining_state",
118
201
  "zero_quantity_rows_absent",
119
202
  },
203
+ "stdout_stderr_contract": {
204
+ "asserts_named_stream_output",
205
+ },
206
+ "error_contract": {
207
+ "asserts_error_payload_or_stderr",
208
+ "asserts_nonzero_or_exit_2",
209
+ },
210
+ "http_error_contract": {
211
+ "asserts_http_error_status",
212
+ "asserts_error_payload_body",
213
+ },
214
+ "auth_signature_contract": {
215
+ "asserts_signature_over_exact_bytes",
216
+ "asserts_tampered_or_missing_signature_rejected",
217
+ },
218
+ "idempotency_replay": {
219
+ "first_delivery_then_duplicate",
220
+ "duplicate_id_rejected_regardless_of_body",
221
+ },
222
+ "concurrent_state_consistency": {
223
+ "overlapping_mutations_exercised",
224
+ "all_successful_responses_reflected",
225
+ "distinct_identifiers_asserted",
226
+ },
227
+ "atomic_batch_state": {
228
+ "mixed_valid_invalid_batch",
229
+ "asserts_store_unchanged_after_failure",
230
+ "asserts_success_order_and_distinct_ids",
231
+ },
120
232
  }
233
+ SHAPE_CONTRACT_REQUIRED_EVIDENCE = {
234
+ "uses_visible_input_key_names",
235
+ "asserts_visible_output_key_names",
236
+ "asserts_no_unexpected_output_keys",
237
+ }
238
+ EXPECTED_TOP_LEVEL_KEYS = {
239
+ "verification_commands",
240
+ "forbidden_patterns",
241
+ "required_files",
242
+ "forbidden_files",
243
+ "tier_a_waivers",
244
+ "spec_output_files",
245
+ "max_deps_added",
246
+ }
247
+ EXPECTED_VERIFICATION_COMMAND_KEYS = {
248
+ "cmd",
249
+ "exit_code",
250
+ "stdout_contains",
251
+ "stdout_not_contains",
252
+ "contract_refs",
253
+ }
254
+ PURE_DESIGN_ESCAPE = "all Requirements are pure-design"
255
+ SPEC_COMPLEXITY_VALUES = {"trivial", "medium", "high", "large"}
121
256
 
122
257
 
123
258
  def extract_verification_block(text: str) -> str | None:
@@ -139,6 +274,157 @@ def extract_verification_text(text: str) -> str:
139
274
  return section.group(1) if section else ""
140
275
 
141
276
 
277
+ def extract_frontmatter_field(text: str, field: str) -> str | None:
278
+ if not text.startswith("---"):
279
+ return None
280
+ end = text.find("\n---", 3)
281
+ if end == -1:
282
+ return None
283
+ pattern = re.compile(rf"\s*{re.escape(field)}\s*:\s*[\"']?([^\"'\n#]+)")
284
+ for line in text[3:end].splitlines():
285
+ match = pattern.match(line)
286
+ if match:
287
+ return match.group(1).strip().lower()
288
+ return None
289
+
290
+
291
+ def validate_present_spec_complexity(text: str) -> str | None:
292
+ complexity = extract_frontmatter_field(text, "complexity")
293
+ if complexity is None or complexity in SPEC_COMPLEXITY_VALUES:
294
+ return None
295
+ values = ", ".join(sorted(SPEC_COMPLEXITY_VALUES))
296
+ return f"frontmatter complexity must be one of: {values}"
297
+
298
+
299
+ def backticked_observable_miss_commands(text: str) -> list[str]:
300
+ commands: list[str] = []
301
+ for line in text.splitlines():
302
+ lower = line.lower()
303
+ if "miss" not in lower or not any(marker in lower for marker in OBSERVABLE_COMMAND_MARKERS):
304
+ continue
305
+ for match in BACKTICKED_TEXT_RE.finditer(line):
306
+ value = match.group(0).strip("`")
307
+ if is_command_like_backtick(value):
308
+ commands.append(value)
309
+ return commands
310
+
311
+
312
+ def is_command_like_backtick(value: str) -> bool:
313
+ stripped = value.strip()
314
+ lower = stripped.lower()
315
+ if not stripped or lower in RESERVED_BACKTICK_TERMS:
316
+ return False
317
+ first = lower.split(maxsplit=1)[0]
318
+ return (
319
+ first in COMMAND_PREFIXES
320
+ or any(marker in stripped for marker in ("/", "$", "=", "|", "&&", ";"))
321
+ or stripped.endswith((".js", ".py", ".sh"))
322
+ )
323
+
324
+
325
+ def has_backticked_observable_miss_command(text: str) -> bool:
326
+ return bool(backticked_observable_miss_commands(text))
327
+
328
+
329
+ def validate_present_solo_headroom_hypothesis(text: str) -> str | None:
330
+ lower = text.lower()
331
+ if "solo-headroom hypothesis" not in lower and not ("solo_claude" in lower and "miss" in lower):
332
+ return None
333
+ if (
334
+ "solo-headroom hypothesis" in lower
335
+ and "solo_claude" in lower
336
+ and "miss" in lower
337
+ and has_backticked_observable_miss_command(text)
338
+ ):
339
+ return None
340
+ return (
341
+ "solo-headroom hypothesis must include `solo-headroom hypothesis`, "
342
+ "`solo_claude`, `miss`, and a backticked command/observable line "
343
+ "that exposes the miss"
344
+ )
345
+
346
+
347
+ def validate_present_solo_ceiling_avoidance(text: str) -> str | None:
348
+ lower = text.lower()
349
+ if "solo ceiling avoidance" not in lower:
350
+ return None
351
+ if (
352
+ "solo_claude" in lower
353
+ and SOLO_CEILING_CONTROL_RE.search(text)
354
+ and SOLO_CEILING_DIFFERENCE_RE.search(text)
355
+ ):
356
+ return None
357
+ return (
358
+ "solo ceiling avoidance must include `solo ceiling avoidance`, "
359
+ "`solo_claude`, and a concrete difference from rejected or "
360
+ "solo-saturated controls such as `S2`-`S6`"
361
+ )
362
+
363
+
364
+ def validate_solo_headroom_commands_against_expected(
365
+ spec_text: str,
366
+ commands: object,
367
+ expected_label: str,
368
+ ) -> str | None:
369
+ lower = spec_text.lower()
370
+ if "solo-headroom hypothesis" not in lower and not ("solo_claude" in lower and "miss" in lower):
371
+ return None
372
+ expected_cmds = {
373
+ command.get("cmd")
374
+ for command in commands
375
+ if isinstance(command, dict) and isinstance(command.get("cmd"), str)
376
+ } if isinstance(commands, list) else set()
377
+ hypothesis_cmds = backticked_observable_miss_commands(spec_text)
378
+ if any(command in expected_cmds for command in hypothesis_cmds):
379
+ return None
380
+ return (
381
+ "solo-headroom hypothesis observable command must match "
382
+ f"{expected_label} verification_commands[].cmd"
383
+ )
384
+
385
+
386
+ def command_contains_expected(actual: str, expected: str) -> bool:
387
+ normalized_actual = " ".join(actual.split())
388
+ normalized_expected = " ".join(expected.split())
389
+ if not normalized_expected:
390
+ return False
391
+ pattern = re.compile(
392
+ rf"(?<![A-Za-z0-9_.:/=-]){re.escape(normalized_expected)}(?![A-Za-z0-9_.:/=-])"
393
+ )
394
+ return bool(pattern.search(normalized_actual))
395
+
396
+
397
+ def validate_risk_probes_cover_solo_headroom_hypothesis(
398
+ probes: list[dict],
399
+ verification_text: str,
400
+ ) -> str | None:
401
+ hypothesis_cmds = backticked_observable_miss_commands(verification_text)
402
+ if not hypothesis_cmds:
403
+ return None
404
+ if not probes:
405
+ return None
406
+ derived_from = probes[0].get("derived_from")
407
+ if not (
408
+ isinstance(derived_from, str)
409
+ and "solo-headroom hypothesis" in derived_from.lower()
410
+ and any(command_contains_expected(derived_from, hypothesis_cmd) for hypothesis_cmd in hypothesis_cmds)
411
+ ):
412
+ return (
413
+ "risk-probes[0].derived_from must reference the solo-headroom "
414
+ "hypothesis bullet and observable command"
415
+ )
416
+ cmd = probes[0].get("cmd")
417
+ if isinstance(cmd, str) and any(
418
+ command_contains_expected(cmd, hypothesis_cmd)
419
+ for hypothesis_cmd in hypothesis_cmds
420
+ ):
421
+ return None
422
+ return (
423
+ "risk-probes[0].cmd must contain a "
424
+ "solo-headroom hypothesis observable command"
425
+ )
426
+
427
+
142
428
  def external_url_hosts(text: str) -> list[str]:
143
429
  hosts: list[str] = []
144
430
  for match in EXTERNAL_URL_RE.finditer(text or ''):
@@ -183,6 +469,120 @@ def validate_shape(data) -> str | None:
183
469
  return None
184
470
 
185
471
 
472
+ def validate_string_list(data: object, key: str) -> str | None:
473
+ value = data.get(key, []) if isinstance(data, dict) else None
474
+ if not isinstance(value, list) or not all(isinstance(item, str) and item for item in value):
475
+ return f"{key} must be a list of non-empty strings"
476
+ return None
477
+
478
+
479
+ def validate_expected_shape(data) -> str | None:
480
+ """Return None if shape matches the sibling spec.expected.json schema.
481
+
482
+ Keep this dependency-free: it mirrors `_shared/expected.schema.json` enough
483
+ to catch malformed ideate output before /devlyn:resolve consumes it.
484
+ """
485
+ if not isinstance(data, dict):
486
+ return "top-level must be a JSON object"
487
+ unknown = sorted(set(data) - EXPECTED_TOP_LEVEL_KEYS)
488
+ if unknown:
489
+ return f"unknown top-level key(s): {', '.join(unknown)}"
490
+ if "verification_commands" in data:
491
+ commands = data["verification_commands"]
492
+ if not isinstance(commands, list):
493
+ return "verification_commands must be a list"
494
+ if commands:
495
+ err = validate_shape({"verification_commands": commands})
496
+ if err:
497
+ return err
498
+ for i, command in enumerate(commands):
499
+ unknown_command_keys = sorted(set(command) - EXPECTED_VERIFICATION_COMMAND_KEYS)
500
+ if unknown_command_keys:
501
+ return (
502
+ f"verification_commands[{i}] unknown key(s): "
503
+ f"{', '.join(unknown_command_keys)}"
504
+ )
505
+ contract_refs = command.get("contract_refs", [])
506
+ if not isinstance(contract_refs, list) or not all(
507
+ isinstance(item, str) and item for item in contract_refs
508
+ ):
509
+ return f"verification_commands[{i}].contract_refs must be a list of non-empty strings"
510
+ for key in ("required_files", "forbidden_files", "tier_a_waivers", "spec_output_files"):
511
+ err = validate_string_list(data, key)
512
+ if err:
513
+ return err
514
+ max_deps = data.get("max_deps_added", 0)
515
+ if isinstance(max_deps, bool) or not isinstance(max_deps, int) or max_deps < 0:
516
+ return "max_deps_added must be a non-negative integer"
517
+ patterns = data.get("forbidden_patterns", [])
518
+ if not isinstance(patterns, list):
519
+ return "forbidden_patterns must be a list"
520
+ for i, pattern in enumerate(patterns):
521
+ if not isinstance(pattern, dict):
522
+ return f"forbidden_patterns[{i}] must be an object"
523
+ unknown_pattern_keys = sorted(set(pattern) - {"pattern", "description", "files", "severity"})
524
+ if unknown_pattern_keys:
525
+ return (
526
+ f"forbidden_patterns[{i}] unknown key(s): "
527
+ f"{', '.join(unknown_pattern_keys)}"
528
+ )
529
+ for key in ("pattern", "description", "severity"):
530
+ value = pattern.get(key)
531
+ if not isinstance(value, str) or not value:
532
+ return f"forbidden_patterns[{i}].{key} must be a non-empty string"
533
+ if pattern["severity"] not in {"disqualifier", "warning"}:
534
+ return f"forbidden_patterns[{i}].severity must be disqualifier or warning"
535
+ files = pattern.get("files", [])
536
+ if not isinstance(files, list) or not all(isinstance(item, str) and item for item in files):
537
+ return f"forbidden_patterns[{i}].files must be a list of non-empty strings"
538
+ return None
539
+
540
+
541
+ def validate_expected_against_sibling_spec(expected_path: Path, data: object) -> str | None:
542
+ if not isinstance(data, dict):
543
+ return None
544
+ spec_path = expected_path.with_name("spec.md")
545
+ if not spec_path.is_file():
546
+ return None
547
+ try:
548
+ spec_text = spec_path.read_text(encoding="utf-8")
549
+ except OSError:
550
+ return None
551
+ solo_headroom_err = validate_present_solo_headroom_hypothesis(spec_text)
552
+ if solo_headroom_err:
553
+ return solo_headroom_err
554
+ solo_ceiling_err = validate_present_solo_ceiling_avoidance(spec_text)
555
+ if solo_ceiling_err:
556
+ return solo_ceiling_err
557
+ commands = data.get("verification_commands", [])
558
+ solo_headroom_command_err = validate_solo_headroom_commands_against_expected(
559
+ spec_text,
560
+ commands,
561
+ "spec.expected.json",
562
+ )
563
+ if solo_headroom_command_err:
564
+ return solo_headroom_command_err
565
+ if commands:
566
+ return None
567
+ if PURE_DESIGN_ESCAPE in spec_text:
568
+ return None
569
+ return (
570
+ "verification_commands must contain at least one entry unless sibling "
571
+ "spec.md declares all Requirements are pure-design"
572
+ )
573
+
574
+
575
+ def validate_sibling_spec_complexity(expected_path: Path) -> str | None:
576
+ spec_path = expected_path.with_name("spec.md")
577
+ if not spec_path.is_file():
578
+ return None
579
+ try:
580
+ spec_text = spec_path.read_text(encoding="utf-8")
581
+ except OSError:
582
+ return None
583
+ return validate_present_spec_complexity(spec_text)
584
+
585
+
186
586
  def validate_risk_probe(probe: object, index: int, verification_text: str) -> str | None:
187
587
  if not isinstance(probe, dict):
188
588
  return f"risk-probes[{index}] must be a JSON object"
@@ -221,13 +621,22 @@ def validate_risk_probe(probe: object, index: int, verification_text: str) -> st
221
621
  if unknown_tags:
222
622
  return f"risk-probes[{index}].tags contains unknown tag(s): {', '.join(unknown_tags)}"
223
623
  if "error_contract" in tags and not re.search(
224
- r'invalid|stderr|json[ -]?error|error object|exit[ `]*2',
624
+ r'stderr|exit[ `]*2|(?:prints?|writes?).{0,40}json[ -]?error|'
625
+ r'json[ -]?error.{0,40}(?:stderr|exit)',
225
626
  derived_from,
226
627
  re.IGNORECASE,
227
628
  ):
228
629
  return (
229
630
  f"risk-probes[{index}].derived_from for error_contract must name "
230
- "an invalid-input, stderr, JSON-error, or exit-2 verification bullet"
631
+ "a stderr, JSON-error stream, or exit-2 verification bullet"
632
+ )
633
+ if "http_error_contract" in tags and not (
634
+ re.search(r'\b(?:400|401|403|404|409|422|5[0-9][0-9])\b|http|status', derived_from, re.IGNORECASE)
635
+ and re.search(r'error|invalid', derived_from, re.IGNORECASE)
636
+ ):
637
+ return (
638
+ f"risk-probes[{index}].derived_from for http_error_contract must name "
639
+ "an HTTP/status error response verification bullet"
231
640
  )
232
641
  evidence = probe.get("tag_evidence")
233
642
  if not isinstance(evidence, dict):
@@ -245,22 +654,93 @@ def validate_risk_probe(probe: object, index: int, verification_text: str) -> st
245
654
  f"risk-probes[{index}].tag_evidence.{tag} missing required "
246
655
  f"item(s): {', '.join(missing_evidence)}"
247
656
  )
657
+ if "shape_contract" in tags and shape_contract_requires_evidence(derived_from):
658
+ actual = evidence.get("shape_contract")
659
+ if not isinstance(actual, list) or not all(isinstance(item, str) for item in actual):
660
+ return f"risk-probes[{index}].tag_evidence.shape_contract must be a list of strings"
661
+ required_shape = set(SHAPE_CONTRACT_REQUIRED_EVIDENCE)
662
+ if re.search(r'error object|error body', derived_from, re.IGNORECASE) or (
663
+ INLINE_JSON_OBJECT_RE.search(derived_from)
664
+ and re.search(r'error|invalid', derived_from, re.IGNORECASE)
665
+ ):
666
+ required_shape.add("asserts_exact_error_object")
667
+ missing_shape = sorted(required_shape - set(actual))
668
+ if missing_shape:
669
+ return (
670
+ f"risk-probes[{index}].tag_evidence.shape_contract missing required "
671
+ f"item(s): {', '.join(missing_shape)}"
672
+ )
248
673
  return None
249
674
 
250
675
 
676
def shape_contract_requires_evidence(text: str) -> bool:
    """True when *text* talks about output shape strongly enough to demand
    shape-contract tag evidence (keyword families or an inline JSON object)."""
    keyword_hit = re.search(
        r'\b(?:keys?|fields?|rows?|shape|object|json[ -]?object|'
        r'error body|stdout|stderr|response body)\b|'
        r'\b(?:applied|rejected|accounts|scheduled|accepted|remaining)\b',
        text,
        re.IGNORECASE,
    )
    # The inline-JSON heuristic only runs when no keyword already matched.
    return bool(keyword_hit or INLINE_JSON_OBJECT_RE.search(text))
684
+
685
+
251
686
def required_risk_probe_tags(verification_text: str) -> set[str]:
    """Derive the probe tags a spec's verification text makes mandatory.

    Each keyword family in the lower-cased text switches on one tag; the
    inline-JSON heuristic additionally inspects the original-cased text.
    """
    lowered = verification_text.lower()

    def hit(pattern: str) -> bool:
        # All patterns below are matched case-insensitively via the lowering.
        return re.search(pattern, lowered) is not None

    mandatory: set[str] = set()
    if hit(r'priority|higher-priority|ordered by|ordering|appears first|input order'):
        mandatory.add("ordering_inversion")
    if hit(r'blocked|overlap|forbidden[ -]+window'):
        mandatory.add("boundary_overlap")
    if hit(r'rolls? back|rollback|all-or-nothing|tentative'):
        mandatory.add("rollback_state")
    if hit(
        r'rolls? back|rollback|all-or-nothing|tentative|reduce[s]? stock|'
        r'available to later|later orders|remaining|'
        r'(?:stock|inventory|balance|availability).{0,80}(?:later|remaining|after failures)'
    ):
        mandatory.add("prior_consumption")
    if "remaining" in lowered:
        mandatory.add("positive_remaining")
    if hit(
        r'stderr|exit[ `]*2|(?:prints?|writes?).{0,40}json[ -]?error|'
        r'json[ -]?error.{0,40}(?:stderr|exit)'
    ):
        mandatory.add("error_contract")
    if hit(
        r'\b(?:400|401|403|404|409|422|5[0-9][0-9])\b|http status|status code'
    ) and hit(r'json error|error object|error body|invalid_query|error.*field'):
        mandatory.add("http_error_contract")
    if hit(
        r'\b(?:keys?|fields?|rows?|shape|json[ -]?object|'
        r'error object|error body|response body)\b|'
        r'\b(?:applied|rejected|accounts|scheduled|accepted|remaining)\b'
    ) or INLINE_JSON_OBJECT_RE.search(verification_text):
        mandatory.add("shape_contract")
    if hit(r'stderr|stdout|exit `?2`?'):
        mandatory.add("stdout_stderr_contract")
    if hit(r'signature|signing|signed|x-signature|hmac|raw[ -]?body|timingsafeequal'):
        mandatory.add("auth_signature_contract")
    if hit(
        r'replay|same.{0,40}`?id`?|already-seen|idempotent|re-delivery|'
        r'duplicate[ -]+(?:delivery|event|id)'
    ):
        mandatory.add("idempotency_replay")
    if hit(
        r'concurrent|close together|simultaneous|parallel|race|lost update|'
        r'many at once|several .{0,40}requests'
    ):
        mandatory.add("concurrent_state_consistency")
    if hit(
        r'one valid \+ one invalid|valid \+ one invalid|all-or-nothing|'
        r'same list as before|0 inserts|no partial updates'
    ):
        mandatory.add("atomic_batch_state")
    return mandatory
265
745
 
266
746
 
@@ -287,8 +767,8 @@ def load_risk_probes(
287
767
  if not line.strip():
288
768
  continue
289
769
  try:
290
- probe = json.loads(line)
291
- except json.JSONDecodeError as e:
770
+ probe = loads_strict_json(line)
771
+ except ValueError as e:
292
772
  return ([], f"risk-probes[{index}] invalid JSON: {e}")
293
773
  err = validate_risk_probe(probe, index, verification_text)
294
774
  if err:
@@ -306,6 +786,12 @@ def load_risk_probes(
306
786
  missing_tags = sorted(required_risk_probe_tags(verification_text) - present_tags)
307
787
  if missing_tags:
308
788
  return ([], f"risk-probes.jsonl missing required probe tag(s): {', '.join(missing_tags)}")
789
+ solo_headroom_probe_err = validate_risk_probes_cover_solo_headroom_hypothesis(
790
+ probes,
791
+ verification_text,
792
+ )
793
+ if solo_headroom_probe_err:
794
+ return ([], solo_headroom_probe_err)
309
795
  return (probes, None)
310
796
 
311
797
 
@@ -318,8 +804,8 @@ def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
318
804
  if not state_path.is_file():
319
805
  return (None, None)
320
806
  try:
321
- state = json.loads(state_path.read_text())
322
- except (json.JSONDecodeError, OSError):
807
+ state = loads_strict_json(state_path.read_text())
808
+ except (ValueError, OSError):
323
809
  return (None, None)
324
810
  src = state.get("source") or {}
325
811
  src_type = src.get("type")
@@ -337,6 +823,75 @@ def read_source(work: Path, devlyn_dir: Path) -> tuple[str | None, Path | None]:
337
823
  return (src_type, md if md.is_file() else None)
338
824
 
339
825
 
826
def read_state(devlyn_dir: Path) -> dict:
    """Load .devlyn/pipeline.state.json, returning {} on any failure.

    Missing file, unreadable file, invalid JSON, and non-object payloads all
    degrade to an empty dict so callers never branch on load errors.
    """
    state_path = devlyn_dir / "pipeline.state.json"
    if not state_path.is_file():
        return {}
    try:
        parsed = loads_strict_json(state_path.read_text())
    except (ValueError, OSError):
        return {}
    return parsed if isinstance(parsed, dict) else {}
835
+
836
+
837
def state_requires_risk_probes(state: dict) -> bool:
    """True only when risk_profile.risk_probes_enabled is literally True."""
    profile = state.get("risk_profile")
    if not isinstance(profile, dict):
        return False
    return profile.get("risk_probes_enabled") is True
840
+
841
+
842
+ def risk_probes_state_error(state: dict) -> str | None:
843
+ if "risk_profile" not in state:
844
+ return None
845
+ risk_profile = state.get("risk_profile")
846
+ if not isinstance(risk_profile, dict):
847
+ return "pipeline.state.json risk_profile must be an object"
848
+ if "risk_probes_enabled" not in risk_profile:
849
+ return None
850
+ if not isinstance(risk_profile.get("risk_probes_enabled"), bool):
851
+ return "pipeline.state.json risk_profile.risk_probes_enabled must be boolean"
852
+ return None
853
+
854
+
855
+ def source_integrity_error(src_type: str | None, state: dict, source_md: Path | None) -> str | None:
856
+ if source_md is None:
857
+ return None
858
+ src = state.get("source") if isinstance(state.get("source"), dict) else {}
859
+ if src_type == "generated":
860
+ field = "criteria_sha256"
861
+ required = True
862
+ elif src_type == "spec":
863
+ field = "spec_sha256"
864
+ required = False
865
+ else:
866
+ return None
867
+ expected = src.get(field)
868
+ qualified = f"source.{field}"
869
+ if not isinstance(expected, str) or not expected:
870
+ if required:
871
+ return f"{qualified} is required for generated criteria source integrity."
872
+ return None
873
+ try:
874
+ actual = hashlib.sha256(source_md.read_bytes()).hexdigest()
875
+ except OSError as exc:
876
+ return f"could not read {source_md} for source integrity check: {exc}"
877
+ if expected != actual:
878
+ return f"{qualified} mismatch for {source_md}: expected {expected}, actual {actual}."
879
+ return None
880
+
881
+
882
+ def load_expected_contract(expected_path: Path) -> tuple[dict | None, str | None]:
883
+ try:
884
+ data = loads_strict_json(expected_path.read_text())
885
+ except ValueError as e:
886
+ return (None, f"{expected_path} has invalid JSON: {e}")
887
+ except OSError as e:
888
+ return (None, f"{expected_path} is unreadable: {e}")
889
+ err = validate_expected_shape(data)
890
+ if err:
891
+ return (None, f"{expected_path}: {err}")
892
+ return (data, None)
893
+
894
+
340
895
  def stage_from_source(md: Path, devlyn_dir: Path) -> tuple[bool, str | None]:
341
896
  """Materialize .devlyn/spec-verify.json from the json block in `md`.
342
897
 
@@ -349,8 +904,8 @@ def stage_from_source(md: Path, devlyn_dir: Path) -> tuple[bool, str | None]:
349
904
  if block is None:
350
905
  return (False, None)
351
906
  try:
352
- data = json.loads(block)
353
- except json.JSONDecodeError as e:
907
+ data = loads_strict_json(block)
908
+ except ValueError as e:
354
909
  return (False, f"`## Verification` ```json``` block in {md} has invalid JSON: {e}")
355
910
  err = validate_shape(data)
356
911
  if err:
@@ -361,13 +916,44 @@ def stage_from_source(md: Path, devlyn_dir: Path) -> tuple[bool, str | None]:
361
916
  return (True, None)
362
917
 
363
918
 
919
+ def stage_from_expected(
920
+ md: Path,
921
+ devlyn_dir: Path,
922
+ ) -> tuple[bool, bool, str | None, Path, dict | None]:
923
+ """Materialize .devlyn/spec-verify.json from sibling spec.expected.json.
924
+
925
+ Returns (found, staged, error, expected_path, expected_data).
926
+ - found=False: no sibling file; caller may fall back to legacy inline carrier.
927
+ - found=True, error: sibling exists but is malformed; caller must fail closed.
928
+ - found=True, staged=False: valid pure-design contract with no commands.
929
+ - found=True, staged=True: wrote verification_commands into spec-verify.json.
930
+ """
931
+ expected_path = md.with_name("spec.expected.json")
932
+ if not expected_path.is_file():
933
+ return (False, False, None, expected_path, None)
934
+ data, err = load_expected_contract(expected_path)
935
+ if err:
936
+ return (True, False, err, expected_path, None)
937
+ assert data is not None
938
+ commands = data.get("verification_commands")
939
+ if not commands:
940
+ spec_path = devlyn_dir / "spec-verify.json"
941
+ if spec_path.exists():
942
+ spec_path.unlink()
943
+ return (True, False, None, expected_path, data)
944
+ normalized = {"verification_commands": commands}
945
+ devlyn_dir.mkdir(parents=True, exist_ok=True)
946
+ (devlyn_dir / "spec-verify.json").write_text(json.dumps(normalized, indent=2) + "\n")
947
+ return (True, True, None, expected_path, data)
948
+
949
+
364
950
  def write_malformed_finding(devlyn_dir: Path, error: str, source_path: Path | None) -> None:
365
951
  """Emit a single CRITICAL finding for a malformed verification carrier."""
366
952
  devlyn_dir.mkdir(parents=True, exist_ok=True)
367
- findings_path = devlyn_dir / "spec-verify-findings.jsonl"
953
+ findings_path = devlyn_dir / output_findings_name()
368
954
  file_ref = str(source_path) if source_path else ".devlyn/pipeline.state.json"
369
955
  finding = {
370
- "id": "BGATE-0001",
956
+ "id": f"{output_finding_prefix()}-0001",
371
957
  "rule_id": "correctness.spec-verify-malformed",
372
958
  "level": "error",
373
959
  "severity": "CRITICAL",
@@ -375,11 +961,11 @@ def write_malformed_finding(devlyn_dir: Path, error: str, source_path: Path | No
375
961
  "message": f"Verification contract carrier is malformed: {error}",
376
962
  "file": file_ref,
377
963
  "line": 1,
378
- "phase": "build_gate",
964
+ "phase": output_phase(),
379
965
  "criterion_ref": "spec-verify://carrier",
380
966
  "fix_hint": (
381
- "Fix the `## Verification` ```json``` block: a JSON object with "
382
- "a non-empty `verification_commands` array of "
967
+ "Fix the sibling `spec.expected.json` file or the `## Verification` "
968
+ "```json``` block: a JSON object with a non-empty `verification_commands` array of "
383
969
  "{cmd, exit_code?, stdout_contains?, stdout_not_contains?} "
384
970
  "entries. See references/build-gate.md § 'Spec literal check'."
385
971
  ),
@@ -390,6 +976,200 @@ def write_malformed_finding(devlyn_dir: Path, error: str, source_path: Path | No
390
976
  fh.write(json.dumps(finding) + "\n")
391
977
 
392
978
 
979
def slice_diff_to_files(diff_text: str, files: list[str]) -> str:
    """Keep only the per-file sections of *diff_text* whose `diff --git`
    header mentions any entry of *files*; an empty filter returns the diff
    unchanged."""
    if not files:
        return diff_text
    kept: list[str] = []
    include = False
    for raw in diff_text.splitlines(keepends=True):
        if raw.startswith("diff --git "):
            # Each header decides whether the following section is retained.
            include = any(name in raw for name in files)
        if include:
            kept.append(raw)
    return "".join(kept)
990
+
991
+
992
+ def diff_text_for_expected(work: Path, devlyn_dir: Path, state: dict) -> tuple[str, str | None]:
993
+ external_diff = devlyn_dir / "external-diff.patch"
994
+ if external_diff.is_file():
995
+ try:
996
+ return (external_diff.read_text(), None)
997
+ except OSError as e:
998
+ return ("", f"cannot read {external_diff}: {e}")
999
+ base_sha = ((state.get("base_ref") or {}).get("sha") or "").strip()
1000
+ cmd = ["git", "diff"]
1001
+ if base_sha:
1002
+ cmd.append(base_sha)
1003
+ proc = subprocess.run(cmd, cwd=str(work), capture_output=True, text=True)
1004
+ if proc.returncode != 0:
1005
+ return ("", (proc.stderr or proc.stdout or "git diff failed").strip())
1006
+ return (proc.stdout or "", None)
1007
+
1008
+
1009
def count_deps_added(work: Path, state: dict) -> int:
    """Count dependency entries added to package.json since base_ref.sha.

    Best effort: returns 0 when git fails. A "dependency entry" is an added
    `"name": "version"` pair inside a dependencies/devDependencies block of
    the diff.
    """
    base_sha = ((state.get("base_ref") or {}).get("sha") or "").strip()
    args = ["git", "diff"]
    if base_sha:
        args.append(base_sha)
    args.extend(["--", "package.json"])
    result = subprocess.run(args, cwd=str(work), capture_output=True, text=True)
    if result.returncode != 0:
        return 0
    added = 0
    inside_deps = False
    for raw in (result.stdout or "").splitlines():
        if raw.startswith(("diff ", "index ", "---", "+++", "@@")):
            continue
        sign = raw[:1]
        body = raw[1:] if sign in {"+", "-", " "} else raw
        if '"dependencies"' in body or '"devDependencies"' in body:
            inside_deps = True
        elif body.strip().startswith("}"):
            # A closing brace ends the (dev)dependencies object.
            inside_deps = False
        elif inside_deps and sign == "+":
            if re.search(r'"[^"]+"\s*:\s*"[^"]+"', body):
                added += 1
    return added
1033
+
1034
+
1035
def changed_files(work: Path, state: dict, devlyn_dir: Path) -> list[str]:
    """List changed paths, from external-diff.patch when present, else git."""
    patch_file = devlyn_dir / "external-diff.patch"
    if patch_file.is_file():
        try:
            patch_text = patch_file.read_text()
        except OSError:
            return []
        paths: list[str] = []
        for raw in patch_text.splitlines():
            if not raw.startswith("diff --git "):
                continue
            fields = raw.split()
            if len(fields) >= 4:
                # Fourth token is the "b/<path>" side of the header.
                paths.append(fields[3].removeprefix("b/"))
        return paths
    base_sha = ((state.get("base_ref") or {}).get("sha") or "").strip()
    args = ["git", "diff", "--name-only"]
    if base_sha:
        args.append(base_sha)
    result = subprocess.run(args, cwd=str(work), capture_output=True, text=True)
    if result.returncode != 0:
        return []
    return [entry.strip() for entry in (result.stdout or "").splitlines() if entry.strip()]
+
1058
+
1059
def expected_contract_findings(
    expected_data: dict | None,
    expected_path: Path | None,
    work: Path,
    devlyn_dir: Path,
    state: dict,
    finding_start: int,
) -> tuple[list[dict], int]:
    """Evaluate a spec.expected.json contract against the working-tree diff.

    Returns (findings, next_sequence_number). Sequence numbers continue from
    *finding_start* so finding IDs stay unique across the caller's other
    findings. With no contract data, nothing is checked.
    """
    if not expected_data:
        return ([], finding_start)
    findings: list[dict] = []
    seq = finding_start
    diff_text, diff_error = diff_text_for_expected(work, devlyn_dir, state)
    # Fail closed: diff-based rules cannot be verified without a diff.
    if diff_error and (
        expected_data.get("forbidden_patterns") or expected_data.get("forbidden_files")
    ):
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "correctness.expected-contract-unverifiable",
            "level": "error",
            "severity": "CRITICAL",
            "confidence": 1.0,
            "message": f"Cannot compute diff for forbidden_patterns: {diff_error}",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": "spec.expected.json/forbidden_patterns",
            "fix_hint": "Ensure pipeline.state.json has base_ref.sha or provide .devlyn/external-diff.patch.",
            "blocking": True,
            "status": "open",
        })
        seq += 1
    # Regex rules run against the diff, optionally restricted to listed files.
    # NOTE(review): on diff_error above, diff_text is "" so no pattern matches.
    for i, pattern in enumerate(expected_data.get("forbidden_patterns", []) or []):
        scope = slice_diff_to_files(diff_text, pattern.get("files") or [])
        if not re.search(pattern["pattern"], scope):
            continue
        # "disqualifier" severity blocks; anything else is a warning.
        is_disqualifier = pattern.get("severity") == "disqualifier"
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "correctness.forbidden-pattern",
            "level": "error" if is_disqualifier else "warning",
            "severity": "CRITICAL" if is_disqualifier else "MEDIUM",
            "confidence": 1.0,
            "message": pattern.get("description") or f"Forbidden pattern matched: {pattern['pattern']}",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": f"spec.expected.json/forbidden_patterns/{i}",
            "fix_hint": "Remove the forbidden diff pattern or change the spec.expected.json contract explicitly.",
            "blocking": is_disqualifier,
            "status": "open",
        })
        seq += 1
    changed = set(changed_files(work, state, devlyn_dir))
    # Required files must exist in the working tree (not merely the diff).
    for i, required in enumerate(expected_data.get("required_files", []) or []):
        if (work / required).exists():
            continue
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "correctness.required-file-missing",
            "level": "error",
            "severity": "CRITICAL",
            "confidence": 1.0,
            "message": f"Required file is missing: {required}",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": f"spec.expected.json/required_files/{i}",
            "fix_hint": "Create the required file or remove it from the expected contract.",
            "blocking": True,
            "status": "open",
        })
        seq += 1
    # Forbidden files may exist, but must not appear among the changed paths.
    for i, forbidden in enumerate(expected_data.get("forbidden_files", []) or []):
        if forbidden not in changed:
            continue
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "scope.forbidden-file-touched",
            "level": "error",
            "severity": "CRITICAL",
            "confidence": 1.0,
            "message": f"Forbidden file appears in the diff: {forbidden}",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": f"spec.expected.json/forbidden_files/{i}",
            "fix_hint": "Remove that file from the diff or update the expected contract.",
            "blocking": True,
            "status": "open",
        })
        seq += 1
    # Dependency budget: absent key defaults to zero new deps allowed.
    max_deps = expected_data.get("max_deps_added", 0)
    deps_added = count_deps_added(work, state)
    if deps_added > max_deps:
        findings.append({
            "id": f"{output_finding_prefix()}-{seq:04d}",
            "rule_id": "scope.max-deps-added-exceeded",
            "level": "error",
            "severity": "CRITICAL",
            "confidence": 1.0,
            "message": f"Added {deps_added} package dependencies; max_deps_added is {max_deps}.",
            "file": str(expected_path or "spec.expected.json"),
            "line": 1,
            "phase": output_phase(),
            "criterion_ref": "spec.expected.json/max_deps_added",
            "fix_hint": "Remove the new dependency or explicitly license it in spec.expected.json.",
            "blocking": True,
            "status": "open",
        })
        seq += 1
    return (findings, seq)
1171
+
1172
+
393
1173
  def run_check_mode(md_path: Path) -> int:
394
1174
  """`--check <markdown>` — validate the verification carrier without
395
1175
  running any commands. Used by /devlyn:ideate after item-spec write.
@@ -400,15 +1180,28 @@ def run_check_mode(md_path: Path) -> int:
400
1180
  if not md_path.is_file():
401
1181
  print(f"[spec-verify --check] error: {md_path} not found", file=sys.stderr)
402
1182
  return 2
403
- block = extract_verification_block(md_path.read_text())
1183
+ text = md_path.read_text()
1184
+ frontmatter_err = validate_present_spec_complexity(text)
1185
+ if frontmatter_err:
1186
+ print(f"[spec-verify --check] {md_path}: {frontmatter_err}", file=sys.stderr)
1187
+ return 2
1188
+ solo_headroom_err = validate_present_solo_headroom_hypothesis(text)
1189
+ if solo_headroom_err:
1190
+ print(f"[spec-verify --check] {md_path}: {solo_headroom_err}", file=sys.stderr)
1191
+ return 2
1192
+ solo_ceiling_err = validate_present_solo_ceiling_avoidance(text)
1193
+ if solo_ceiling_err:
1194
+ print(f"[spec-verify --check] {md_path}: {solo_ceiling_err}", file=sys.stderr)
1195
+ return 2
1196
+ block = extract_verification_block(text)
404
1197
  if block is None:
405
1198
  # Section absent or no json block — opt-in nature preserved for
406
1199
  # ideate (a spec without machine verification is still valid; it
407
1200
  # just won't activate the BUILD_GATE gate).
408
1201
  return 0
409
1202
  try:
410
- data = json.loads(block)
411
- except json.JSONDecodeError as e:
1203
+ data = loads_strict_json(block)
1204
+ except ValueError as e:
412
1205
  print(
413
1206
  f"[spec-verify --check] {md_path}: invalid JSON in `## Verification` "
414
1207
  f"```json``` block: {e}",
@@ -419,6 +1212,33 @@ def run_check_mode(md_path: Path) -> int:
419
1212
  if err:
420
1213
  print(f"[spec-verify --check] {md_path}: shape error: {err}", file=sys.stderr)
421
1214
  return 2
1215
+ solo_headroom_command_err = validate_solo_headroom_commands_against_expected(
1216
+ text,
1217
+ data.get("verification_commands", []),
1218
+ "`## Verification` JSON carrier",
1219
+ )
1220
+ if solo_headroom_command_err:
1221
+ print(f"[spec-verify --check] {md_path}: {solo_headroom_command_err}", file=sys.stderr)
1222
+ return 2
1223
+ return 0
1224
+
1225
+
1226
def run_check_expected_mode(expected_path: Path) -> int:
    """`--check-expected <path>` — validate a spec.expected.json contract.

    Prints a diagnostic to stderr and returns 2 on the first failure; returns
    0 when every check passes.
    """
    if not expected_path.is_file():
        print(f"[spec-verify --check-expected] error: {expected_path} not found", file=sys.stderr)
        return 2
    label = f"[spec-verify --check-expected] {expected_path}"
    data, load_err = load_expected_contract(expected_path)
    if load_err:
        print(f"{label}: shape error: {load_err}", file=sys.stderr)
        return 2
    complexity_err = validate_sibling_spec_complexity(expected_path)
    if complexity_err:
        print(f"{label}: shape error: {complexity_err}", file=sys.stderr)
        return 2
    sibling_err = validate_expected_against_sibling_spec(expected_path, data)
    if sibling_err:
        print(f"{label}: shape error: {sibling_err}", file=sys.stderr)
        return 2
    return 0
423
1243
 
424
1244
 
@@ -461,70 +1281,1756 @@ def run_self_test() -> int:
461
1281
  print(good.stderr, file=sys.stderr)
462
1282
  return 1
463
1283
 
464
- (devlyn / "risk-probes.jsonl").write_text(json.dumps({
465
- "id": "P2",
466
- "derived_from": "probe must pass visible marker.",
467
- "cmd": "node $BENCH_FIXTURE_DIR/verifiers/hidden.js",
468
- "exit_code": 0,
469
- }) + "\n")
470
- bad = subprocess.run(
471
- [sys.executable, script_path, "--validate-risk-probes"],
1284
+ (devlyn / "risk-probes.jsonl").unlink()
1285
+ (devlyn / "pipeline.state.json").write_text(json.dumps({
1286
+ "source": {"type": "spec", "spec_path": str(spec_md)},
1287
+ "risk_profile": {"risk_probes_enabled": True},
1288
+ }))
1289
+ missing_required_probe = subprocess.run(
1290
+ [sys.executable, script_path, "--include-risk-probes"],
472
1291
  cwd=work,
473
1292
  env=env,
474
1293
  capture_output=True,
475
1294
  text=True,
476
1295
  )
477
- if bad.returncode == 0:
478
- print("hidden verifier path was accepted", file=sys.stderr)
1296
+ if missing_required_probe.returncode == 0:
1297
+ print("--include-risk-probes accepted missing required risk-probes.jsonl", file=sys.stderr)
1298
+ return 1
1299
+ if "risk-probes.jsonl is required when --risk-probes is enabled" not in missing_required_probe.stderr:
1300
+ print("--include-risk-probes missing required probe had the wrong error", file=sys.stderr)
1301
+ print(missing_required_probe.stderr, file=sys.stderr)
479
1302
  return 1
480
1303
 
481
- (devlyn / "risk-probes.jsonl").write_text(json.dumps({
482
- "id": "P3",
483
- "derived_from": "probe must pass visible marker.",
484
- "cmd": "printf bad-error-derived-from",
485
- "exit_code": 0,
486
- "tags": ["error_contract"],
487
- "tag_evidence": {"error_contract": []},
488
- }) + "\n")
489
- bad_error_ref = subprocess.run(
490
- [sys.executable, script_path, "--validate-risk-probes"],
1304
+ (devlyn / "pipeline.state.json").write_text(json.dumps({
1305
+ "source": {"type": "spec", "spec_path": str(spec_md)},
1306
+ "risk_profile": {"risk_probes_enabled": False},
1307
+ }))
1308
+ missing_optional_probe = subprocess.run(
1309
+ [sys.executable, script_path, "--include-risk-probes"],
491
1310
  cwd=work,
492
1311
  env=env,
493
1312
  capture_output=True,
494
1313
  text=True,
495
1314
  )
496
- if bad_error_ref.returncode == 0:
497
- print("error_contract with unrelated derived_from was accepted", file=sys.stderr)
1315
+ if missing_optional_probe.returncode != 0:
1316
+ print("--include-risk-probes rejected optional missing risk-probes.jsonl", file=sys.stderr)
1317
+ print(missing_optional_probe.stderr, file=sys.stderr)
1318
+ return 1
1319
+
1320
+ (devlyn / "pipeline.state.json").write_text(json.dumps({
1321
+ "source": {"type": "spec", "spec_path": str(spec_md)},
1322
+ "risk_profile": {"risk_probes_enabled": "true"},
1323
+ }))
1324
+ malformed_risk_probe_state = subprocess.run(
1325
+ [sys.executable, script_path, "--include-risk-probes"],
1326
+ cwd=work,
1327
+ env=env,
1328
+ capture_output=True,
1329
+ text=True,
1330
+ )
1331
+ if malformed_risk_probe_state.returncode == 0:
1332
+ print("--include-risk-probes accepted non-boolean risk_probes_enabled", file=sys.stderr)
1333
+ return 1
1334
+ if "risk_profile.risk_probes_enabled must be boolean" not in malformed_risk_probe_state.stderr:
1335
+ print("--include-risk-probes malformed risk_probes_enabled had the wrong error", file=sys.stderr)
1336
+ print(malformed_risk_probe_state.stderr, file=sys.stderr)
1337
+ return 1
1338
+
1339
+ (devlyn / "pipeline.state.json").write_text(json.dumps({
1340
+ "source": {"type": "spec", "spec_path": str(spec_md)},
1341
+ "risk_profile": "enabled",
1342
+ }))
1343
+ malformed_risk_profile = subprocess.run(
1344
+ [sys.executable, script_path, "--include-risk-probes"],
1345
+ cwd=work,
1346
+ env=env,
1347
+ capture_output=True,
1348
+ text=True,
1349
+ )
1350
+ if malformed_risk_profile.returncode == 0:
1351
+ print("--include-risk-probes accepted non-object risk_profile", file=sys.stderr)
1352
+ return 1
1353
+ if "risk_profile must be an object" not in malformed_risk_profile.stderr:
1354
+ print("--include-risk-probes malformed risk_profile had the wrong error", file=sys.stderr)
1355
+ print(malformed_risk_profile.stderr, file=sys.stderr)
498
1356
  return 1
499
1357
 
1358
+ (devlyn / "pipeline.state.json").write_text(json.dumps({
1359
+ "source": {"type": "spec", "spec_path": str(spec_md)}
1360
+ }))
500
1361
  (devlyn / "risk-probes.jsonl").write_text(json.dumps({
501
- "id": "P4",
1362
+ "id": "P1",
502
1363
  "derived_from": "probe must pass visible marker.",
503
- "cmd": "printf weak-boundary",
1364
+ "cmd": "printf probe-ok",
504
1365
  "exit_code": 0,
505
- "tags": ["boundary_overlap"],
506
- "tag_evidence": {"boundary_overlap": ["one_minute_overlap"]},
1366
+ "stdout_contains": ["probe-ok"],
1367
+ "stdout_not_contains": [],
1368
+ "tags": ["shape_contract"],
1369
+ "tag_evidence": {},
507
1370
  }) + "\n")
508
- weak = subprocess.run(
509
- [sys.executable, script_path, "--validate-risk-probes"],
1371
+
1372
+ good_complexity = work / "good-complexity.md"
1373
+ good_complexity.write_text(
1374
+ "---\nid: good\ncomplexity: large\n---\n\n# Good\n\n## Verification\n\n- ok\n",
1375
+ encoding="utf-8",
1376
+ )
1377
+ good_complexity_check = subprocess.run(
1378
+ [sys.executable, script_path, "--check", str(good_complexity)],
510
1379
  cwd=work,
511
- env=env,
512
1380
  capture_output=True,
513
1381
  text=True,
514
1382
  )
515
- if weak.returncode == 0:
516
- print("incomplete boundary_overlap evidence was accepted", file=sys.stderr)
1383
+ if good_complexity_check.returncode != 0:
1384
+ print(good_complexity_check.stderr, file=sys.stderr)
517
1385
  return 1
518
- return 0
519
1386
 
1387
+ bad_complexity = work / "bad-complexity.md"
1388
+ bad_complexity.write_text(
1389
+ "---\nid: bad\ncomplexity: hihg\n---\n\n# Bad\n\n## Verification\n\n- ok\n",
1390
+ encoding="utf-8",
1391
+ )
1392
+ bad_complexity_check = subprocess.run(
1393
+ [sys.executable, script_path, "--check", str(bad_complexity)],
1394
+ cwd=work,
1395
+ capture_output=True,
1396
+ text=True,
1397
+ )
1398
+ if bad_complexity_check.returncode == 0:
1399
+ print("unsupported spec complexity was accepted", file=sys.stderr)
1400
+ return 1
1401
+ if "frontmatter complexity must be one of" not in bad_complexity_check.stderr:
1402
+ print("unsupported spec complexity did not report the allowed values", file=sys.stderr)
1403
+ print(bad_complexity_check.stderr, file=sys.stderr)
1404
+ return 1
520
1405
 
521
- def main() -> int:
522
- include_risk_probes = False
523
- validate_risk_probes_only = False
524
- if "--include-risk-probes" in sys.argv[1:]:
525
- include_risk_probes = True
526
- sys.argv = [arg for arg in sys.argv if arg != "--include-risk-probes"]
527
- if "--validate-risk-probes" in sys.argv[1:]:
1406
+ weak_solo_headroom = work / "weak-solo-headroom.md"
1407
+ weak_solo_headroom.write_text(
1408
+ "# Weak\n\n## Verification\n\n"
1409
+ "- solo-headroom hypothesis: solo_claude should miss duplicate handling.\n"
1410
+ "- Observable command: `node check.js` exposes behavior.\n",
1411
+ encoding="utf-8",
1412
+ )
1413
+ weak_solo_check = subprocess.run(
1414
+ [sys.executable, script_path, "--check", str(weak_solo_headroom)],
1415
+ cwd=work,
1416
+ capture_output=True,
1417
+ text=True,
1418
+ )
1419
+ if weak_solo_check.returncode == 0:
1420
+ print("weak solo-headroom hypothesis was accepted by --check", file=sys.stderr)
1421
+ return 1
1422
+ if "backticked command/observable line that exposes the miss" not in weak_solo_check.stderr:
1423
+ print("--check did not report weak solo-headroom hypothesis", file=sys.stderr)
1424
+ print(weak_solo_check.stderr, file=sys.stderr)
1425
+ return 1
1426
+
1427
+ weak_descriptive_backtick = work / "weak-descriptive-backtick.md"
1428
+ weak_descriptive_backtick.write_text(
1429
+ "# Weak descriptive backtick\n\n## Verification\n\n"
1430
+ "- solo-headroom hypothesis: solo_claude should miss behavior where observable `priority rollback` exposes the miss.\n",
1431
+ encoding="utf-8",
1432
+ )
1433
+ weak_descriptive_check = subprocess.run(
1434
+ [sys.executable, script_path, "--check", str(weak_descriptive_backtick)],
1435
+ cwd=work,
1436
+ capture_output=True,
1437
+ text=True,
1438
+ )
1439
+ if weak_descriptive_check.returncode == 0:
1440
+ print("descriptive backtick solo-headroom hypothesis was accepted by --check", file=sys.stderr)
1441
+ return 1
1442
+
1443
+ strong_solo_headroom = work / "strong-solo-headroom.md"
1444
+ strong_solo_headroom.write_text(
1445
+ "# Strong\n\n## Verification\n\n"
1446
+ "- solo-headroom hypothesis: solo_claude should miss duplicate handling exposed by `node check.js`.\n",
1447
+ encoding="utf-8",
1448
+ )
1449
+ strong_solo_check = subprocess.run(
1450
+ [sys.executable, script_path, "--check", str(strong_solo_headroom)],
1451
+ cwd=work,
1452
+ capture_output=True,
1453
+ text=True,
1454
+ )
1455
+ if strong_solo_check.returncode != 0:
1456
+ print("actionable solo-headroom hypothesis was rejected by --check", file=sys.stderr)
1457
+ print(strong_solo_check.stderr, file=sys.stderr)
1458
+ return 1
1459
+
1460
+ docs_style_solo_headroom = work / "docs-style-solo-headroom.md"
1461
+ docs_style_solo_headroom.write_text(
1462
+ "# Docs style\n\n## Verification\n\n"
1463
+ "- Solo-headroom hypothesis: the spec must literally contain `solo_claude`, `miss`, and an observable command; "
1464
+ "`node check.js` exposes the miss.\n",
1465
+ encoding="utf-8",
1466
+ )
1467
+ docs_style_solo_check = subprocess.run(
1468
+ [sys.executable, script_path, "--check", str(docs_style_solo_headroom)],
1469
+ cwd=work,
1470
+ capture_output=True,
1471
+ text=True,
1472
+ )
1473
+ if docs_style_solo_check.returncode != 0:
1474
+ print("docs-style solo-headroom hypothesis was rejected by --check", file=sys.stderr)
1475
+ print(docs_style_solo_check.stderr, file=sys.stderr)
1476
+ return 1
1477
+
1478
+ weak_solo_ceiling = work / "weak-solo-ceiling.md"
1479
+ weak_solo_ceiling.write_text(
1480
+ "# Weak ceiling\n\n## Verification\n\n"
1481
+ "- solo ceiling avoidance: this is not like the previous ones.\n",
1482
+ encoding="utf-8",
1483
+ )
1484
+ weak_solo_ceiling_check = subprocess.run(
1485
+ [sys.executable, script_path, "--check", str(weak_solo_ceiling)],
1486
+ cwd=work,
1487
+ capture_output=True,
1488
+ text=True,
1489
+ )
1490
+ if weak_solo_ceiling_check.returncode == 0:
1491
+ print("weak solo ceiling avoidance was accepted by --check", file=sys.stderr)
1492
+ return 1
1493
+ if "concrete difference from rejected or solo-saturated controls" not in weak_solo_ceiling_check.stderr:
1494
+ print("--check did not report weak solo ceiling avoidance", file=sys.stderr)
1495
+ print(weak_solo_ceiling_check.stderr, file=sys.stderr)
1496
+ return 1
1497
+
1498
+ strong_solo_ceiling = work / "strong-solo-ceiling.md"
1499
+ strong_solo_ceiling.write_text(
1500
+ "# Strong ceiling\n\n## Verification\n\n"
1501
+ "- solo ceiling avoidance: unlike solo-saturated `S2`-`S6`, this uses a cross-run "
1502
+ "state leak because solo_claude headroom should be preserved.\n",
1503
+ encoding="utf-8",
1504
+ )
1505
+ strong_solo_ceiling_check = subprocess.run(
1506
+ [sys.executable, script_path, "--check", str(strong_solo_ceiling)],
1507
+ cwd=work,
1508
+ capture_output=True,
1509
+ text=True,
1510
+ )
1511
+ if strong_solo_ceiling_check.returncode != 0:
1512
+ print("actionable solo ceiling avoidance was rejected by --check", file=sys.stderr)
1513
+ print(strong_solo_ceiling_check.stderr, file=sys.stderr)
1514
+ return 1
1515
+
1516
+ inline_mismatched_solo = work / "inline-mismatched-solo.md"
1517
+ inline_mismatched_solo.write_text(
1518
+ "# Inline mismatch\n\n## Verification\n\n"
1519
+ "- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
1520
+ "`node check.js` exposes the miss.\n\n"
1521
+ "```json\n"
1522
+ + json.dumps({"verification_commands": [{"cmd": "printf ok"}]})
1523
+ + "\n```\n",
1524
+ encoding="utf-8",
1525
+ )
1526
+ inline_mismatched_check = subprocess.run(
1527
+ [sys.executable, script_path, "--check", str(inline_mismatched_solo)],
1528
+ cwd=work,
1529
+ capture_output=True,
1530
+ text=True,
1531
+ )
1532
+ if inline_mismatched_check.returncode == 0:
1533
+ print("mismatched inline solo-headroom command was accepted by --check", file=sys.stderr)
1534
+ return 1
1535
+ if "observable command must match `## Verification` JSON carrier" not in inline_mismatched_check.stderr:
1536
+ print("--check did not report mismatched inline solo-headroom command", file=sys.stderr)
1537
+ print(inline_mismatched_check.stderr, file=sys.stderr)
1538
+ return 1
1539
+
1540
+ inline_matched_solo = work / "inline-matched-solo.md"
1541
+ inline_matched_solo.write_text(
1542
+ "# Inline match\n\n## Verification\n\n"
1543
+ "- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
1544
+ "`printf ok` exposes the miss.\n\n"
1545
+ "```json\n"
1546
+ + json.dumps({"verification_commands": [{"cmd": "printf ok"}]})
1547
+ + "\n```\n",
1548
+ encoding="utf-8",
1549
+ )
1550
+ inline_matched_check = subprocess.run(
1551
+ [sys.executable, script_path, "--check", str(inline_matched_solo)],
1552
+ cwd=work,
1553
+ capture_output=True,
1554
+ text=True,
1555
+ )
1556
+ if inline_matched_check.returncode != 0:
1557
+ print("matched inline solo-headroom command was rejected by --check", file=sys.stderr)
1558
+ print(inline_matched_check.stderr, file=sys.stderr)
1559
+ return 1
1560
+
1561
+ (devlyn / "risk-probes.jsonl").write_text(json.dumps({
1562
+ "id": "P2",
1563
+ "derived_from": "probe must pass visible marker.",
1564
+ "cmd": "node $BENCH_FIXTURE_DIR/verifiers/hidden.js",
1565
+ "exit_code": 0,
1566
+ }) + "\n")
1567
+ bad = subprocess.run(
1568
+ [sys.executable, script_path, "--validate-risk-probes"],
1569
+ cwd=work,
1570
+ env=env,
1571
+ capture_output=True,
1572
+ text=True,
1573
+ )
1574
+ if bad.returncode == 0:
1575
+ print("hidden verifier path was accepted", file=sys.stderr)
1576
+ return 1
1577
+
1578
+ (devlyn / "risk-probes.jsonl").write_text('{"id":NaN}\n')
1579
+ bad_probe_nan = subprocess.run(
1580
+ [sys.executable, script_path, "--validate-risk-probes"],
1581
+ cwd=work,
1582
+ env=env,
1583
+ capture_output=True,
1584
+ text=True,
1585
+ )
1586
+ if bad_probe_nan.returncode == 0:
1587
+ print("NaN risk-probes JSONL was accepted", file=sys.stderr)
1588
+ return 1
1589
+ if "invalid JSON numeric constant: NaN" not in bad_probe_nan.stderr:
1590
+ print("NaN risk-probes JSONL did not report invalid numeric constant", file=sys.stderr)
1591
+ print(bad_probe_nan.stderr, file=sys.stderr)
1592
+ return 1
1593
+
1594
+ (devlyn / "risk-probes.jsonl").write_text(json.dumps({
1595
+ "id": "P3",
1596
+ "derived_from": "probe must pass visible marker.",
1597
+ "cmd": "printf bad-error-derived-from",
1598
+ "exit_code": 0,
1599
+ "tags": ["error_contract"],
1600
+ "tag_evidence": {"error_contract": []},
1601
+ }) + "\n")
1602
+ bad_error_ref = subprocess.run(
1603
+ [sys.executable, script_path, "--validate-risk-probes"],
1604
+ cwd=work,
1605
+ env=env,
1606
+ capture_output=True,
1607
+ text=True,
1608
+ )
1609
+ if bad_error_ref.returncode == 0:
1610
+ print("error_contract with unrelated derived_from was accepted", file=sys.stderr)
1611
+ return 1
1612
+
1613
+ spec_md.write_text(
1614
+ "# Spec\n\n## Verification\n\n"
1615
+ "- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
1616
+ "`printf ok` exposes the miss.\n"
1617
+ "- probe must pass visible marker.\n",
1618
+ encoding="utf-8",
1619
+ )
1620
+ (devlyn / "risk-probes.jsonl").write_text(json.dumps({
1621
+ "id": "P4",
1622
+ "derived_from": "solo-headroom hypothesis: solo_claude should miss duplicate handling; `printf ok` exposes the miss.",
1623
+ "cmd": "printf unrelated",
1624
+ "exit_code": 0,
1625
+ "stdout_contains": ["unrelated"],
1626
+ "stdout_not_contains": [],
1627
+ "tags": ["shape_contract"],
1628
+ "tag_evidence": {},
1629
+ }) + "\n")
1630
+ bad_solo_headroom_probe = subprocess.run(
1631
+ [sys.executable, script_path, "--validate-risk-probes"],
1632
+ cwd=work,
1633
+ env=env,
1634
+ capture_output=True,
1635
+ text=True,
1636
+ )
1637
+ if bad_solo_headroom_probe.returncode == 0:
1638
+ print("risk probe missing solo-headroom command coverage was accepted", file=sys.stderr)
1639
+ return 1
1640
+ if "risk-probes[0].cmd must contain a solo-headroom hypothesis observable command" not in bad_solo_headroom_probe.stderr:
1641
+ print("solo-headroom risk-probe coverage failure had the wrong error", file=sys.stderr)
1642
+ print(bad_solo_headroom_probe.stderr, file=sys.stderr)
1643
+ return 1
1644
+
1645
+ (devlyn / "risk-probes.jsonl").write_text(json.dumps({
1646
+ "id": "P4a",
1647
+ "derived_from": "probe must pass visible marker.",
1648
+ "cmd": "bash -lc 'printf ok'",
1649
+ "exit_code": 0,
1650
+ "stdout_contains": ["ok"],
1651
+ "stdout_not_contains": [],
1652
+ "tags": ["shape_contract"],
1653
+ "tag_evidence": {},
1654
+ }) + "\n")
1655
+ bad_solo_headroom_derived_from = subprocess.run(
1656
+ [sys.executable, script_path, "--validate-risk-probes"],
1657
+ cwd=work,
1658
+ env=env,
1659
+ capture_output=True,
1660
+ text=True,
1661
+ )
1662
+ if bad_solo_headroom_derived_from.returncode == 0:
1663
+ print("risk probe with unrelated solo-headroom derived_from was accepted", file=sys.stderr)
1664
+ return 1
1665
+ if "risk-probes[0].derived_from must reference the solo-headroom hypothesis bullet" not in bad_solo_headroom_derived_from.stderr:
1666
+ print("solo-headroom risk-probe derived_from failure had the wrong error", file=sys.stderr)
1667
+ print(bad_solo_headroom_derived_from.stderr, file=sys.stderr)
1668
+ return 1
1669
+
1670
+ (devlyn / "risk-probes.jsonl").write_text(
1671
+ json.dumps({
1672
+ "id": "P5a",
1673
+ "derived_from": "solo-headroom hypothesis: solo_claude should miss duplicate handling; `printf ok` exposes the miss.",
1674
+ "cmd": "printf first-unrelated",
1675
+ "exit_code": 0,
1676
+ "stdout_contains": ["first-unrelated"],
1677
+ "stdout_not_contains": [],
1678
+ "tags": ["shape_contract"],
1679
+ "tag_evidence": {},
1680
+ }) + "\n" + json.dumps({
1681
+ "id": "P5b",
1682
+ "derived_from": "probe must pass visible marker.",
1683
+ "cmd": "bash -lc 'printf ok'",
1684
+ "exit_code": 0,
1685
+ "stdout_contains": ["ok"],
1686
+ "stdout_not_contains": [],
1687
+ "tags": ["shape_contract"],
1688
+ "tag_evidence": {},
1689
+ }) + "\n"
1690
+ )
1691
+ late_solo_headroom_probe = subprocess.run(
1692
+ [sys.executable, script_path, "--validate-risk-probes"],
1693
+ cwd=work,
1694
+ env=env,
1695
+ capture_output=True,
1696
+ text=True,
1697
+ )
1698
+ if late_solo_headroom_probe.returncode == 0:
1699
+ print("solo-headroom command in a later risk probe was accepted", file=sys.stderr)
1700
+ return 1
1701
+
1702
+ (devlyn / "risk-probes.jsonl").write_text(json.dumps({
1703
+ "id": "P5c",
1704
+ "derived_from": "solo-headroom hypothesis: solo_claude should miss duplicate handling; `printf ok` exposes the miss.",
1705
+ "cmd": "printf ok2",
1706
+ "exit_code": 0,
1707
+ "stdout_contains": ["ok2"],
1708
+ "stdout_not_contains": [],
1709
+ "tags": ["shape_contract"],
1710
+ "tag_evidence": {},
1711
+ }) + "\n")
1712
+ prefix_solo_headroom_probe = subprocess.run(
1713
+ [sys.executable, script_path, "--validate-risk-probes"],
1714
+ cwd=work,
1715
+ env=env,
1716
+ capture_output=True,
1717
+ text=True,
1718
+ )
1719
+ if prefix_solo_headroom_probe.returncode == 0:
1720
+ print("solo-headroom command prefix match was accepted", file=sys.stderr)
1721
+ return 1
1722
+
1723
+ (devlyn / "risk-probes.jsonl").write_text(json.dumps({
1724
+ "id": "P5",
1725
+ "derived_from": "solo-headroom hypothesis: solo_claude should miss duplicate handling; `printf ok` exposes the miss.",
1726
+ "cmd": "bash -lc 'printf ok'",
1727
+ "exit_code": 0,
1728
+ "stdout_contains": ["ok"],
1729
+ "stdout_not_contains": [],
1730
+ "tags": ["shape_contract"],
1731
+ "tag_evidence": {},
1732
+ }) + "\n")
1733
+ good_solo_headroom_probe = subprocess.run(
1734
+ [sys.executable, script_path, "--validate-risk-probes"],
1735
+ cwd=work,
1736
+ env=env,
1737
+ capture_output=True,
1738
+ text=True,
1739
+ )
1740
+ if good_solo_headroom_probe.returncode != 0:
1741
+ print("risk probe covering solo-headroom command was rejected", file=sys.stderr)
1742
+ print(good_solo_headroom_probe.stderr, file=sys.stderr)
1743
+ return 1
1744
+
1745
+ expected_json = work / "spec.expected.json"
1746
+ expected_json.write_text(json.dumps({
1747
+ "verification_commands": [
1748
+ {"cmd": "printf ok", "exit_code": 0, "stdout_contains": ["ok"]}
1749
+ ],
1750
+ "forbidden_patterns": [
1751
+ {
1752
+ "pattern": "catch\\s*\\{\\s*\\}",
1753
+ "description": "silent catch hides failures",
1754
+ "severity": "disqualifier",
1755
+ }
1756
+ ],
1757
+ "required_files": ["bin/cli.js"],
1758
+ "forbidden_files": [],
1759
+ "max_deps_added": 0,
1760
+ }) + "\n")
1761
+ spec_md.write_text(
1762
+ "# Spec\n\n## Verification\n\n"
1763
+ "- solo-headroom hypothesis: solo_claude should miss duplicate handling.\n"
1764
+ "- Observable command: `node check.js` exposes behavior.\n"
1765
+ )
1766
+ weak_sibling_solo = subprocess.run(
1767
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1768
+ cwd=work,
1769
+ capture_output=True,
1770
+ text=True,
1771
+ )
1772
+ if weak_sibling_solo.returncode == 0:
1773
+ print("weak sibling solo-headroom hypothesis was accepted by --check-expected", file=sys.stderr)
1774
+ return 1
1775
+ if "backticked command/observable line that exposes the miss" not in weak_sibling_solo.stderr:
1776
+ print("--check-expected did not report weak sibling solo-headroom hypothesis", file=sys.stderr)
1777
+ print(weak_sibling_solo.stderr, file=sys.stderr)
1778
+ return 1
1779
+
1780
+ spec_md.write_text(
1781
+ "# Spec\n\n## Verification\n\n"
1782
+ "- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
1783
+ "`node check.js` exposes the miss.\n",
1784
+ encoding="utf-8",
1785
+ )
1786
+ mismatched_sibling_solo = subprocess.run(
1787
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1788
+ cwd=work,
1789
+ capture_output=True,
1790
+ text=True,
1791
+ )
1792
+ if mismatched_sibling_solo.returncode == 0:
1793
+ print("mismatched sibling solo-headroom command was accepted by --check-expected", file=sys.stderr)
1794
+ return 1
1795
+ if "observable command must match spec.expected.json" not in mismatched_sibling_solo.stderr:
1796
+ print("--check-expected did not report mismatched sibling solo-headroom command", file=sys.stderr)
1797
+ print(mismatched_sibling_solo.stderr, file=sys.stderr)
1798
+ return 1
1799
+
1800
+ spec_md.write_text(
1801
+ "# Spec\n\n## Verification\n\n"
1802
+ "- solo-headroom hypothesis: solo_claude should miss duplicate handling; "
1803
+ "`printf ok` exposes the miss.\n",
1804
+ encoding="utf-8",
1805
+ )
1806
+ matched_sibling_solo = subprocess.run(
1807
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1808
+ cwd=work,
1809
+ capture_output=True,
1810
+ text=True,
1811
+ )
1812
+ if matched_sibling_solo.returncode != 0:
1813
+ print("matched sibling solo-headroom command was rejected by --check-expected", file=sys.stderr)
1814
+ print(matched_sibling_solo.stderr, file=sys.stderr)
1815
+ return 1
1816
+
1817
+ spec_md.write_text(
1818
+ "# Spec\n\n## Verification\n\n"
1819
+ "- Solo-headroom hypothesis: the spec must literally contain `solo_claude`, `miss`, and an observable command; "
1820
+ "`printf ok` exposes the miss.\n",
1821
+ encoding="utf-8",
1822
+ )
1823
+ docs_style_sibling_solo = subprocess.run(
1824
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1825
+ cwd=work,
1826
+ capture_output=True,
1827
+ text=True,
1828
+ )
1829
+ if docs_style_sibling_solo.returncode != 0:
1830
+ print("docs-style sibling solo-headroom command was rejected by --check-expected", file=sys.stderr)
1831
+ print(docs_style_sibling_solo.stderr, file=sys.stderr)
1832
+ return 1
1833
+
1834
+ spec_md.write_text(
1835
+ "# Spec\n\n## Verification\n\n"
1836
+ "- solo ceiling avoidance: this differs from controls but omits the required baseline.\n",
1837
+ encoding="utf-8",
1838
+ )
1839
+ weak_sibling_solo_ceiling = subprocess.run(
1840
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1841
+ cwd=work,
1842
+ capture_output=True,
1843
+ text=True,
1844
+ )
1845
+ if weak_sibling_solo_ceiling.returncode == 0:
1846
+ print("weak sibling solo ceiling avoidance was accepted by --check-expected", file=sys.stderr)
1847
+ return 1
1848
+ if "concrete difference from rejected or solo-saturated controls" not in weak_sibling_solo_ceiling.stderr:
1849
+ print("--check-expected did not report weak sibling solo ceiling avoidance", file=sys.stderr)
1850
+ print(weak_sibling_solo_ceiling.stderr, file=sys.stderr)
1851
+ return 1
1852
+
1853
+ spec_md.write_text(
1854
+ "# Spec\n\n## Verification\n\n"
1855
+ "- solo ceiling avoidance: unlike solo-saturated `S2`-`S6`, this includes a "
1856
+ "multi-run temporal dependency because solo_claude headroom should remain.\n",
1857
+ encoding="utf-8",
1858
+ )
1859
+ strong_sibling_solo_ceiling = subprocess.run(
1860
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1861
+ cwd=work,
1862
+ capture_output=True,
1863
+ text=True,
1864
+ )
1865
+ if strong_sibling_solo_ceiling.returncode != 0:
1866
+ print("actionable sibling solo ceiling avoidance was rejected by --check-expected", file=sys.stderr)
1867
+ print(strong_sibling_solo_ceiling.stderr, file=sys.stderr)
1868
+ return 1
1869
+
1870
+ spec_md.write_text("# Spec\n\n## Verification\n\n- probe must pass visible marker.\n")
1871
+ expected_good = subprocess.run(
1872
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1873
+ cwd=work,
1874
+ capture_output=True,
1875
+ text=True,
1876
+ )
1877
+ if expected_good.returncode != 0:
1878
+ print(expected_good.stderr, file=sys.stderr)
1879
+ return 1
1880
+
1881
+ spec_md.write_text(
1882
+ "---\nid: bad-sibling\ncomplexity: hihg\n---\n\n# Bad sibling\n\n## Verification\n\n- ok\n",
1883
+ encoding="utf-8",
1884
+ )
1885
+ expected_bad_sibling_complexity = subprocess.run(
1886
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1887
+ cwd=work,
1888
+ capture_output=True,
1889
+ text=True,
1890
+ )
1891
+ if expected_bad_sibling_complexity.returncode == 0:
1892
+ print("unsupported sibling spec complexity was accepted by --check-expected", file=sys.stderr)
1893
+ return 1
1894
+ if "frontmatter complexity must be one of" not in expected_bad_sibling_complexity.stderr:
1895
+ print("--check-expected did not report unsupported sibling spec complexity", file=sys.stderr)
1896
+ print(expected_bad_sibling_complexity.stderr, file=sys.stderr)
1897
+ return 1
1898
+ spec_md.write_text("# Spec\n\n## Verification\n\n- probe must pass visible marker.\n")
1899
+
1900
+ expected_json.write_text(json.dumps({"verification_commands": []}) + "\n")
1901
+ expected_empty_runtime = subprocess.run(
1902
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1903
+ cwd=work,
1904
+ capture_output=True,
1905
+ text=True,
1906
+ )
1907
+ if expected_empty_runtime.returncode == 0:
1908
+ print("empty verification_commands should fail for runtime specs", file=sys.stderr)
1909
+ return 1
1910
+
1911
+ pure_root = work / "pure-design"
1912
+ pure_root.mkdir()
1913
+ pure_spec = pure_root / "spec.md"
1914
+ pure_spec.write_text(
1915
+ "# Pure design\n\n## Verification\n\n- (all Requirements are pure-design; no runtime verification commands)\n",
1916
+ encoding="utf-8",
1917
+ )
1918
+ pure_expected = pure_root / "spec.expected.json"
1919
+ pure_expected.write_text(json.dumps({"verification_commands": []}) + "\n", encoding="utf-8")
1920
+ expected_empty_design = subprocess.run(
1921
+ [sys.executable, script_path, "--check-expected", str(pure_expected)],
1922
+ cwd=work,
1923
+ capture_output=True,
1924
+ text=True,
1925
+ )
1926
+ if expected_empty_design.returncode != 0:
1927
+ print("empty verification_commands should be valid for pure-design specs", file=sys.stderr)
1928
+ print(expected_empty_design.stderr, file=sys.stderr)
1929
+ return 1
1930
+
1931
+ expected_json.write_text(json.dumps({"unknown": True}) + "\n")
1932
+ expected_bad = subprocess.run(
1933
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1934
+ cwd=work,
1935
+ capture_output=True,
1936
+ text=True,
1937
+ )
1938
+ if expected_bad.returncode == 0:
1939
+ print("spec.expected.json with unknown key was accepted", file=sys.stderr)
1940
+ return 1
1941
+
1942
+ expected_json.write_text(json.dumps({
1943
+ "verification_commands": [{"cmd": "printf ok", "stdout_contians": ["ok"]}]
1944
+ }) + "\n")
1945
+ expected_bad_command = subprocess.run(
1946
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1947
+ cwd=work,
1948
+ capture_output=True,
1949
+ text=True,
1950
+ )
1951
+ if expected_bad_command.returncode == 0:
1952
+ print("spec.expected.json command with unknown key was accepted", file=sys.stderr)
1953
+ return 1
1954
+
1955
+ expected_json.write_text("[1]\n")
1956
+ expected_non_object = subprocess.run(
1957
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1958
+ cwd=work,
1959
+ capture_output=True,
1960
+ text=True,
1961
+ )
1962
+ if expected_non_object.returncode == 0:
1963
+ print("spec.expected.json top-level array was accepted", file=sys.stderr)
1964
+ return 1
1965
+ if "top-level must be a JSON object" not in expected_non_object.stderr:
1966
+ print("spec.expected.json top-level array did not report object shape error", file=sys.stderr)
1967
+ print(expected_non_object.stderr, file=sys.stderr)
1968
+ return 1
1969
+ if "Traceback" in expected_non_object.stderr:
1970
+ print("spec.expected.json top-level array produced a traceback", file=sys.stderr)
1971
+ return 1
1972
+
1973
+ expected_json.write_text("{broken\n")
1974
+ expected_invalid_json = subprocess.run(
1975
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1976
+ cwd=work,
1977
+ capture_output=True,
1978
+ text=True,
1979
+ )
1980
+ if expected_invalid_json.returncode == 0:
1981
+ print("invalid spec.expected.json was accepted", file=sys.stderr)
1982
+ return 1
1983
+ if "has invalid JSON" not in expected_invalid_json.stderr:
1984
+ print("invalid spec.expected.json did not report JSON parse error", file=sys.stderr)
1985
+ print(expected_invalid_json.stderr, file=sys.stderr)
1986
+ return 1
1987
+ if "Traceback" in expected_invalid_json.stderr:
1988
+ print("invalid spec.expected.json produced a traceback", file=sys.stderr)
1989
+ return 1
1990
+
1991
+ expected_json.write_text('{"verification_commands": NaN}\n')
1992
+ expected_nan_json = subprocess.run(
1993
+ [sys.executable, script_path, "--check-expected", str(expected_json)],
1994
+ cwd=work,
1995
+ capture_output=True,
1996
+ text=True,
1997
+ )
1998
+ if expected_nan_json.returncode == 0:
1999
+ print("NaN spec.expected.json was accepted", file=sys.stderr)
2000
+ return 1
2001
+ if "invalid JSON numeric constant: NaN" not in expected_nan_json.stderr:
2002
+ print("NaN spec.expected.json did not report invalid numeric constant", file=sys.stderr)
2003
+ print(expected_nan_json.stderr, file=sys.stderr)
2004
+ return 1
2005
+
2006
+ spec_integrity = work / "spec-integrity"
2007
+ spec_integrity.mkdir()
2008
+ spec_integrity_devlyn = spec_integrity / ".devlyn"
2009
+ spec_integrity_devlyn.mkdir()
2010
+ integrity_spec = spec_integrity / "spec.md"
2011
+ integrity_spec.write_text(
2012
+ "# Spec\n\n## Verification\n\n```json\n"
2013
+ "{\"verification_commands\":[{\"cmd\":\"printf spec-hash-ok\",\"stdout_contains\":[\"spec-hash-ok\"]}]}\n"
2014
+ "```\n",
2015
+ encoding="utf-8",
2016
+ )
2017
+ (spec_integrity_devlyn / "pipeline.state.json").write_text(json.dumps({
2018
+ "source": {
2019
+ "type": "spec",
2020
+ "spec_path": str(integrity_spec),
2021
+ "spec_sha256": "0" * 64,
2022
+ }
2023
+ }))
2024
+ spec_bad_hash_run = subprocess.run(
2025
+ [sys.executable, script_path],
2026
+ cwd=spec_integrity,
2027
+ capture_output=True,
2028
+ text=True,
2029
+ )
2030
+ if spec_bad_hash_run.returncode == 0:
2031
+ print("spec source with mismatched source.spec_sha256 was accepted", file=sys.stderr)
2032
+ return 1
2033
+ if "source.spec_sha256 mismatch" not in spec_bad_hash_run.stderr:
2034
+ print("spec source hash mismatch did not report source integrity", file=sys.stderr)
2035
+ print(spec_bad_hash_run.stderr, file=sys.stderr)
2036
+ return 1
2037
+
2038
+ spec_hash = hashlib.sha256(integrity_spec.read_bytes()).hexdigest()
2039
+ (spec_integrity_devlyn / "pipeline.state.json").write_text(json.dumps({
2040
+ "source": {
2041
+ "type": "spec",
2042
+ "spec_path": str(integrity_spec),
2043
+ "spec_sha256": spec_hash,
2044
+ }
2045
+ }))
2046
+ spec_hash_run = subprocess.run(
2047
+ [sys.executable, script_path],
2048
+ cwd=spec_integrity,
2049
+ capture_output=True,
2050
+ text=True,
2051
+ )
2052
+ if spec_hash_run.returncode != 0:
2053
+ print(spec_hash_run.stderr, file=sys.stderr)
2054
+ return 1
2055
+ staged_spec_hash = loads_strict_json((spec_integrity_devlyn / "spec-verify.json").read_text())
2056
+ if staged_spec_hash.get("verification_commands", [{}])[0].get("cmd") != "printf spec-hash-ok":
2057
+ print("spec source with matching source.spec_sha256 was not staged", file=sys.stderr)
2058
+ return 1
2059
+
2060
+ generated_user = work / "generated-user"
2061
+ generated_user.mkdir()
2062
+ generated_devlyn = generated_user / ".devlyn"
2063
+ generated_devlyn.mkdir()
2064
+ generated_criteria = generated_user / ".devlyn" / "criteria.generated.md"
2065
+ generated_criteria.write_text(
2066
+ "# Criteria\n\n## Verification\n\n```json\n"
2067
+ "{\"verification_commands\":[{\"cmd\":\"printf generated-ok\",\"stdout_contains\":[\"generated-ok\"]}]}\n"
2068
+ "```\n",
2069
+ encoding="utf-8",
2070
+ )
2071
+ (generated_devlyn / "pipeline.state.json").write_text(json.dumps({
2072
+ "source": {"type": "generated", "criteria_path": str(generated_criteria)}
2073
+ }))
2074
+ generated_missing_hash_run = subprocess.run(
2075
+ [sys.executable, script_path],
2076
+ cwd=generated_user,
2077
+ capture_output=True,
2078
+ text=True,
2079
+ )
2080
+ if generated_missing_hash_run.returncode == 0:
2081
+ print("generated criteria without source.criteria_sha256 was accepted", file=sys.stderr)
2082
+ return 1
2083
+ if "source.criteria_sha256 is required" not in generated_missing_hash_run.stderr:
2084
+ print("generated criteria without source.criteria_sha256 did not report source integrity", file=sys.stderr)
2085
+ print(generated_missing_hash_run.stderr, file=sys.stderr)
2086
+ return 1
2087
+
2088
+ (generated_devlyn / "pipeline.state.json").write_text(json.dumps({
2089
+ "source": {
2090
+ "type": "generated",
2091
+ "criteria_path": str(generated_criteria),
2092
+ "criteria_sha256": "0" * 64,
2093
+ }
2094
+ }))
2095
+ generated_bad_hash_run = subprocess.run(
2096
+ [sys.executable, script_path],
2097
+ cwd=generated_user,
2098
+ capture_output=True,
2099
+ text=True,
2100
+ )
2101
+ if generated_bad_hash_run.returncode == 0:
2102
+ print("generated criteria with mismatched source.criteria_sha256 was accepted", file=sys.stderr)
2103
+ return 1
2104
+ if "source.criteria_sha256 mismatch" not in generated_bad_hash_run.stderr:
2105
+ print("generated criteria hash mismatch did not report source integrity", file=sys.stderr)
2106
+ print(generated_bad_hash_run.stderr, file=sys.stderr)
2107
+ return 1
2108
+
2109
+ generated_hash = hashlib.sha256(generated_criteria.read_bytes()).hexdigest()
2110
+ (generated_devlyn / "pipeline.state.json").write_text(json.dumps({
2111
+ "source": {
2112
+ "type": "generated",
2113
+ "criteria_path": str(generated_criteria),
2114
+ "criteria_sha256": generated_hash,
2115
+ }
2116
+ }))
2117
+ generated_run = subprocess.run(
2118
+ [sys.executable, script_path],
2119
+ cwd=generated_user,
2120
+ capture_output=True,
2121
+ text=True,
2122
+ )
2123
+ if generated_run.returncode != 0:
2124
+ print(generated_run.stderr, file=sys.stderr)
2125
+ return 1
2126
+ staged_generated = loads_strict_json((generated_devlyn / "spec-verify.json").read_text())
2127
+ if staged_generated.get("verification_commands", [{}])[0].get("cmd") != "printf generated-ok":
2128
+ print("generated criteria carrier was not staged into .devlyn/spec-verify.json", file=sys.stderr)
2129
+ return 1
2130
+
2131
+ generated_criteria.write_text(
2132
+ "# Criteria\n\n## Verification\n\n- generated criteria omitted its machine-readable carrier.\n",
2133
+ encoding="utf-8",
2134
+ )
2135
+ malformed_generated_hash = hashlib.sha256(generated_criteria.read_bytes()).hexdigest()
2136
+ (generated_devlyn / "pipeline.state.json").write_text(json.dumps({
2137
+ "source": {
2138
+ "type": "generated",
2139
+ "criteria_path": str(generated_criteria),
2140
+ "criteria_sha256": malformed_generated_hash,
2141
+ }
2142
+ }))
2143
+ malformed_generated_run = subprocess.run(
2144
+ [sys.executable, script_path],
2145
+ cwd=generated_user,
2146
+ capture_output=True,
2147
+ text=True,
2148
+ )
2149
+ if malformed_generated_run.returncode == 0:
2150
+ print("generated criteria without a JSON carrier was accepted", file=sys.stderr)
2151
+ return 1
2152
+ if "Generated criteria were written without one" not in malformed_generated_run.stderr:
2153
+ print("generated criteria without a JSON carrier did not report the generated-source contract", file=sys.stderr)
2154
+ print(malformed_generated_run.stderr, file=sys.stderr)
2155
+ return 1
2156
+
2157
+ real_user = work / "real-user"
2158
+ real_user.mkdir()
2159
+ real_devlyn = real_user / ".devlyn"
2160
+ real_devlyn.mkdir()
2161
+ real_spec = real_user / "spec.md"
2162
+ real_spec.write_text(
2163
+ "# Spec\n\n## Verification\n\n- sibling command must print sibling-ok.\n"
2164
+ )
2165
+ (real_user / "spec.expected.json").write_text(json.dumps({
2166
+ "verification_commands": [
2167
+ {"cmd": "printf sibling-ok", "stdout_contains": ["sibling-ok"]}
2168
+ ]
2169
+ }) + "\n")
2170
+ (real_devlyn / "pipeline.state.json").write_text(json.dumps({
2171
+ "source": {"type": "spec", "spec_path": str(real_spec)}
2172
+ }))
2173
+ sibling_run = subprocess.run(
2174
+ [sys.executable, script_path],
2175
+ cwd=real_user,
2176
+ capture_output=True,
2177
+ text=True,
2178
+ )
2179
+ if sibling_run.returncode != 0:
2180
+ print(sibling_run.stderr, file=sys.stderr)
2181
+ return 1
2182
+ staged = loads_strict_json((real_devlyn / "spec-verify.json").read_text())
2183
+ if staged.get("verification_commands", [{}])[0].get("cmd") != "printf sibling-ok":
2184
+ print("sibling spec.expected.json was not staged into .devlyn/spec-verify.json", file=sys.stderr)
2185
+ return 1
2186
+
2187
+ malformed = work / "malformed-sibling"
2188
+ malformed.mkdir()
2189
+ malformed_devlyn = malformed / ".devlyn"
2190
+ malformed_devlyn.mkdir()
2191
+ malformed_spec = malformed / "spec.md"
2192
+ malformed_spec.write_text(
2193
+ "# Spec\n\n## Verification\n\n```json\n"
2194
+ "{\"verification_commands\":[{\"cmd\":\"printf inline-ok\"}]}\n"
2195
+ "```\n"
2196
+ )
2197
+ (malformed / "spec.expected.json").write_text(json.dumps({"unknown": True}) + "\n")
2198
+ (malformed_devlyn / "pipeline.state.json").write_text(json.dumps({
2199
+ "source": {"type": "spec", "spec_path": str(malformed_spec)}
2200
+ }))
2201
+ malformed_run = subprocess.run(
2202
+ [sys.executable, script_path],
2203
+ cwd=malformed,
2204
+ capture_output=True,
2205
+ text=True,
2206
+ )
2207
+ if malformed_run.returncode == 0:
2208
+ print("malformed sibling spec.expected.json fell back to inline carrier", file=sys.stderr)
2209
+ return 1
2210
+
2211
+ bench_spec = work / "bench-spec.md"
2212
+ bench_spec.write_text("# Spec\n\n## Verification\n\n- benchmark pre-staged wins.\n")
2213
+ (work / "spec.expected.json").write_text(json.dumps({
2214
+ "verification_commands": [
2215
+ {"cmd": "printf wrong", "stdout_contains": ["wrong"]}
2216
+ ]
2217
+ }) + "\n")
2218
+ (devlyn / "pipeline.state.json").write_text(json.dumps({
2219
+ "source": {"type": "spec", "spec_path": str(bench_spec)}
2220
+ }))
2221
+ (devlyn / "spec-verify.json").write_text(json.dumps({
2222
+ "verification_commands": [
2223
+ {"cmd": "printf bench-staged", "stdout_contains": ["bench-staged"]}
2224
+ ]
2225
+ }) + "\n")
2226
+ bench_pre_staged = subprocess.run(
2227
+ [sys.executable, script_path],
2228
+ cwd=work,
2229
+ env=env,
2230
+ capture_output=True,
2231
+ text=True,
2232
+ )
2233
+ if bench_pre_staged.returncode != 0:
2234
+ print(bench_pre_staged.stderr, file=sys.stderr)
2235
+ return 1
2236
+ staged_bench = loads_strict_json((devlyn / "spec-verify.json").read_text())
2237
+ if staged_bench.get("verification_commands", [{}])[0].get("cmd") != "printf bench-staged":
2238
+ print("benchmark pre-staged contract was overwritten", file=sys.stderr)
2239
+ return 1
2240
+
2241
+ verify_output = work / "verify-output"
2242
+ verify_output.mkdir()
2243
+ verify_devlyn = verify_output / ".devlyn"
2244
+ verify_devlyn.mkdir()
2245
+ verify_spec = verify_output / "spec.md"
2246
+ verify_spec.write_text("# Spec\n\n## Verification\n\n- verify mechanical output.\n")
2247
+ (verify_output / "spec.expected.json").write_text(json.dumps({
2248
+ "verification_commands": [
2249
+ {"cmd": "printf wrong", "stdout_contains": ["expected"]}
2250
+ ]
2251
+ }) + "\n")
2252
+ (verify_devlyn / "pipeline.state.json").write_text(json.dumps({
2253
+ "source": {"type": "spec", "spec_path": str(verify_spec)}
2254
+ }))
2255
+ verify_env = os.environ.copy()
2256
+ verify_env.update({
2257
+ "SPEC_VERIFY_PHASE": "verify_mechanical",
2258
+ "SPEC_VERIFY_FINDINGS_FILE": "verify-mechanical.findings.jsonl",
2259
+ "SPEC_VERIFY_FINDING_PREFIX": "VERIFY-MECH",
2260
+ })
2261
+ verify_output_run = subprocess.run(
2262
+ [sys.executable, script_path],
2263
+ cwd=verify_output,
2264
+ env=verify_env,
2265
+ capture_output=True,
2266
+ text=True,
2267
+ )
2268
+ if verify_output_run.returncode == 0:
2269
+ print("VERIFY output-mode failing command was accepted", file=sys.stderr)
2270
+ return 1
2271
+ verify_findings = (verify_devlyn / "verify-mechanical.findings.jsonl").read_text()
2272
+ if '"phase": "verify_mechanical"' not in verify_findings or "VERIFY-MECH-" not in verify_findings:
2273
+ print("VERIFY output-mode did not route findings to verify-mechanical", file=sys.stderr)
2274
+ return 1
2275
+
2276
+ contract_root = work / "expected-contract"
2277
+ contract_root.mkdir()
2278
+ contract_devlyn = contract_root / ".devlyn"
2279
+ contract_devlyn.mkdir()
2280
+ (contract_root / "package.json").write_text(
2281
+ '{\n "dependencies": {},\n "devDependencies": {}\n}\n'
2282
+ )
2283
+ subprocess.run(["git", "init", "-q"], cwd=contract_root, check=True)
2284
+ subprocess.run(["git", "add", "-A"], cwd=contract_root, check=True)
2285
+ subprocess.run(
2286
+ ["git", "-c", "user.email=t@t", "-c", "user.name=t", "commit", "-q", "-m", "base"],
2287
+ cwd=contract_root,
2288
+ check=True,
2289
+ )
2290
+ base_sha = subprocess.check_output(
2291
+ ["git", "rev-parse", "HEAD"],
2292
+ cwd=contract_root,
2293
+ text=True,
2294
+ ).strip()
2295
+ contract_spec = contract_root / "spec.md"
2296
+ contract_spec.write_text("# Spec\n\n## Verification\n\n- expected contract checks.\n")
2297
+ (contract_root / "app.js").write_text("try { work(); } catch { return null; }\n")
2298
+ (contract_root / "forbidden.txt").write_text("forbidden\n")
2299
+ (contract_root / "package.json").write_text(
2300
+ '{\n "dependencies": {\n "left-pad": "1.3.0"\n },\n'
2301
+ ' "devDependencies": {}\n}\n'
2302
+ )
2303
+ (contract_root / "spec.expected.json").write_text(json.dumps({
2304
+ "verification_commands": [{"cmd": "printf ok", "stdout_contains": ["ok"]}],
2305
+ "forbidden_patterns": [{
2306
+ "pattern": "catch\\s*\\{\\s*return null",
2307
+ "description": "silent catch fallback",
2308
+ "severity": "disqualifier",
2309
+ }],
2310
+ "required_files": ["required.txt"],
2311
+ "forbidden_files": ["forbidden.txt"],
2312
+ "max_deps_added": 0,
2313
+ }) + "\n")
2314
+ subprocess.run(["git", "add", "-A"], cwd=contract_root, check=True)
2315
+ (contract_devlyn / "pipeline.state.json").write_text(json.dumps({
2316
+ "source": {"type": "spec", "spec_path": str(contract_spec)},
2317
+ "base_ref": {"sha": base_sha},
2318
+ }))
2319
+ contract_run = subprocess.run(
2320
+ [sys.executable, script_path],
2321
+ cwd=contract_root,
2322
+ capture_output=True,
2323
+ text=True,
2324
+ )
2325
+ if contract_run.returncode == 0:
2326
+ print("expected contract violations were accepted", file=sys.stderr)
2327
+ return 1
2328
+ findings_text = (contract_devlyn / "spec-verify-findings.jsonl").read_text()
2329
+ for rule_id in (
2330
+ "correctness.forbidden-pattern",
2331
+ "correctness.required-file-missing",
2332
+ "scope.forbidden-file-touched",
2333
+ "scope.max-deps-added-exceeded",
2334
+ ):
2335
+ if rule_id not in findings_text:
2336
+ print(f"expected contract finding missing: {rule_id}", file=sys.stderr)
2337
+ return 1
2338
+
2339
+ (devlyn / "risk-probes.jsonl").write_text(json.dumps({
2340
+ "id": "P4",
2341
+ "derived_from": "probe must pass visible marker.",
2342
+ "cmd": "printf weak-boundary",
2343
+ "exit_code": 0,
2344
+ "tags": ["boundary_overlap"],
2345
+ "tag_evidence": {"boundary_overlap": ["one_minute_overlap"]},
2346
+ }) + "\n")
2347
+ weak = subprocess.run(
2348
+ [sys.executable, script_path, "--validate-risk-probes"],
2349
+ cwd=work,
2350
+ env=env,
2351
+ capture_output=True,
2352
+ text=True,
2353
+ )
2354
+ if weak.returncode == 0:
2355
+ print("incomplete boundary_overlap evidence was accepted", file=sys.stderr)
2356
+ return 1
2357
+
2358
+ rollback_root = work / "rollback-risk-probe"
2359
+ rollback_root.mkdir()
2360
+ rollback_devlyn = rollback_root / ".devlyn"
2361
+ rollback_devlyn.mkdir()
2362
+ rollback_spec = rollback_root / "spec.md"
2363
+ rollback_spec.write_text(
2364
+ "# Spec\n\n## Verification\n\n"
2365
+ "- A failed all-or-nothing operation must roll back tentative state "
2366
+ "so later orders can use the released stock.\n"
2367
+ )
2368
+ (rollback_devlyn / "pipeline.state.json").write_text(json.dumps({
2369
+ "source": {"type": "spec", "spec_path": str(rollback_spec)}
2370
+ }))
2371
+ (rollback_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2372
+ "id": "P5",
2373
+ "derived_from": (
2374
+ "A failed all-or-nothing operation must roll back tentative "
2375
+ "state so later orders can use the released stock."
2376
+ ),
2377
+ "cmd": "printf weak-rollback",
2378
+ "exit_code": 0,
2379
+ "tags": ["prior_consumption"],
2380
+ "tag_evidence": {
2381
+ "prior_consumption": [
2382
+ "same_resource_consumed_first",
2383
+ "later_entity_fails_or_reroutes",
2384
+ ],
2385
+ },
2386
+ }) + "\n")
2387
+ weak_rollback = subprocess.run(
2388
+ [sys.executable, script_path, "--validate-risk-probes"],
2389
+ cwd=rollback_root,
2390
+ capture_output=True,
2391
+ text=True,
2392
+ )
2393
+ if weak_rollback.returncode == 0:
2394
+ print("rollback verification text did not require rollback_state probe tag", file=sys.stderr)
2395
+ return 1
2396
+
2397
+ error_root = work / "error-contract-risk-probe"
2398
+ error_root.mkdir()
2399
+ error_devlyn = error_root / ".devlyn"
2400
+ error_devlyn.mkdir()
2401
+ error_spec = error_root / "spec.md"
2402
+ error_spec.write_text(
2403
+ "# Spec\n\n## Verification\n\n"
2404
+ "- Invalid input must print a JSON error object to stderr and exit 2.\n"
2405
+ )
2406
+ (error_devlyn / "pipeline.state.json").write_text(json.dumps({
2407
+ "source": {"type": "spec", "spec_path": str(error_spec)}
2408
+ }))
2409
+ (error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2410
+ "id": "P6",
2411
+ "derived_from": "Invalid input must print a JSON error object to stderr and exit 2.",
2412
+ "cmd": "printf weak-error-contract",
2413
+ "exit_code": 0,
2414
+ "tags": ["stdout_stderr_contract", "error_contract"],
2415
+ "tag_evidence": {
2416
+ "stdout_stderr_contract": ["asserts_named_stream_output"],
2417
+ "error_contract": ["asserts_error_payload_or_stderr"],
2418
+ },
2419
+ }) + "\n")
2420
+ weak_error_contract = subprocess.run(
2421
+ [sys.executable, script_path, "--validate-risk-probes"],
2422
+ cwd=error_root,
2423
+ capture_output=True,
2424
+ text=True,
2425
+ )
2426
+ if weak_error_contract.returncode == 0:
2427
+ print("error_contract without exit-code evidence was accepted", file=sys.stderr)
2428
+ return 1
2429
+
2430
+ (error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2431
+ "id": "P7",
2432
+ "derived_from": "Invalid input must print a JSON error object to stderr and exit 2.",
2433
+ "cmd": "printf weak-stdio-contract",
2434
+ "exit_code": 2,
2435
+ "tags": ["stdout_stderr_contract", "error_contract"],
2436
+ "tag_evidence": {
2437
+ "stdout_stderr_contract": [],
2438
+ "error_contract": [
2439
+ "asserts_error_payload_or_stderr",
2440
+ "asserts_nonzero_or_exit_2",
2441
+ ],
2442
+ },
2443
+ }) + "\n")
2444
+ weak_stdio_contract = subprocess.run(
2445
+ [sys.executable, script_path, "--validate-risk-probes"],
2446
+ cwd=error_root,
2447
+ capture_output=True,
2448
+ text=True,
2449
+ )
2450
+ if weak_stdio_contract.returncode == 0:
2451
+ print("stdout_stderr_contract without stream evidence was accepted", file=sys.stderr)
2452
+ return 1
2453
+
2454
+ (error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2455
+ "id": "P7b",
2456
+ "derived_from": "Invalid input must print a JSON error object to stderr and exit 2.",
2457
+ "cmd": "printf weak-json-error-shape-contract",
2458
+ "exit_code": 2,
2459
+ "tags": ["stdout_stderr_contract", "error_contract"],
2460
+ "tag_evidence": {
2461
+ "stdout_stderr_contract": ["asserts_named_stream_output"],
2462
+ "error_contract": [
2463
+ "asserts_error_payload_or_stderr",
2464
+ "asserts_nonzero_or_exit_2",
2465
+ ],
2466
+ },
2467
+ }) + "\n")
2468
+ weak_json_error_shape_contract = subprocess.run(
2469
+ [sys.executable, script_path, "--validate-risk-probes"],
2470
+ cwd=error_root,
2471
+ capture_output=True,
2472
+ text=True,
2473
+ )
2474
+ if weak_json_error_shape_contract.returncode == 0:
2475
+ print("JSON error object text did not require shape_contract tag", file=sys.stderr)
2476
+ return 1
2477
+
2478
+ (error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2479
+ "id": "P7c",
2480
+ "derived_from": "Invalid input must print a JSON error object to stderr and exit 2.",
2481
+ "cmd": "printf json-error-shape-contract",
2482
+ "exit_code": 2,
2483
+ "tags": ["stdout_stderr_contract", "error_contract", "shape_contract"],
2484
+ "tag_evidence": {
2485
+ "stdout_stderr_contract": ["asserts_named_stream_output"],
2486
+ "error_contract": [
2487
+ "asserts_error_payload_or_stderr",
2488
+ "asserts_nonzero_or_exit_2",
2489
+ ],
2490
+ "shape_contract": [
2491
+ "uses_visible_input_key_names",
2492
+ "asserts_visible_output_key_names",
2493
+ "asserts_no_unexpected_output_keys",
2494
+ "asserts_exact_error_object",
2495
+ ],
2496
+ },
2497
+ }) + "\n")
2498
+ strong_json_error_shape_contract = subprocess.run(
2499
+ [sys.executable, script_path, "--validate-risk-probes"],
2500
+ cwd=error_root,
2501
+ capture_output=True,
2502
+ text=True,
2503
+ )
2504
+ if strong_json_error_shape_contract.returncode != 0:
2505
+ print("JSON error object shape_contract with exact object evidence was rejected", file=sys.stderr)
2506
+ print(strong_json_error_shape_contract.stderr, file=sys.stderr)
2507
+ return 1
2508
+
2509
+ inline_json_error = (
2510
+ 'Invalid input prints JSON error `{ "error": "invalid" }` to stderr and exits 2.'
2511
+ )
2512
+ error_spec.write_text("# Spec\n\n## Verification\n\n- " + inline_json_error + "\n")
2513
+ (error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2514
+ "id": "P7d",
2515
+ "derived_from": inline_json_error,
2516
+ "cmd": "printf weak-inline-json-error-shape-contract",
2517
+ "exit_code": 2,
2518
+ "tags": ["stdout_stderr_contract", "error_contract"],
2519
+ "tag_evidence": {
2520
+ "stdout_stderr_contract": ["asserts_named_stream_output"],
2521
+ "error_contract": [
2522
+ "asserts_error_payload_or_stderr",
2523
+ "asserts_nonzero_or_exit_2",
2524
+ ],
2525
+ },
2526
+ }) + "\n")
2527
+ weak_inline_json_error_shape_contract = subprocess.run(
2528
+ [sys.executable, script_path, "--validate-risk-probes"],
2529
+ cwd=error_root,
2530
+ capture_output=True,
2531
+ text=True,
2532
+ )
2533
+ if weak_inline_json_error_shape_contract.returncode == 0:
2534
+ print("inline JSON error text did not require shape_contract tag", file=sys.stderr)
2535
+ return 1
2536
+
2537
+ (error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2538
+ "id": "P7e",
2539
+ "derived_from": inline_json_error,
2540
+ "cmd": "printf inline-json-error-shape-contract",
2541
+ "exit_code": 2,
2542
+ "tags": ["stdout_stderr_contract", "error_contract", "shape_contract"],
2543
+ "tag_evidence": {
2544
+ "stdout_stderr_contract": ["asserts_named_stream_output"],
2545
+ "error_contract": [
2546
+ "asserts_error_payload_or_stderr",
2547
+ "asserts_nonzero_or_exit_2",
2548
+ ],
2549
+ "shape_contract": [
2550
+ "uses_visible_input_key_names",
2551
+ "asserts_visible_output_key_names",
2552
+ "asserts_no_unexpected_output_keys",
2553
+ "asserts_exact_error_object",
2554
+ ],
2555
+ },
2556
+ }) + "\n")
2557
+ strong_inline_json_error_shape_contract = subprocess.run(
2558
+ [sys.executable, script_path, "--validate-risk-probes"],
2559
+ cwd=error_root,
2560
+ capture_output=True,
2561
+ text=True,
2562
+ )
2563
+ if strong_inline_json_error_shape_contract.returncode != 0:
2564
+ print("inline JSON error shape_contract with exact object evidence was rejected", file=sys.stderr)
2565
+ print(strong_inline_json_error_shape_contract.stderr, file=sys.stderr)
2566
+ return 1
2567
+
2568
+ http_error_root = work / "http-error-contract-risk-probe"
2569
+ http_error_root.mkdir()
2570
+ http_error_devlyn = http_error_root / ".devlyn"
2571
+ http_error_devlyn.mkdir()
2572
+ http_error_spec = http_error_root / "spec.md"
2573
+ http_error_spec.write_text(
2574
+ "# Spec\n\n## Verification\n\n"
2575
+ "- An invalid query returns HTTP 400 with JSON error body `{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`.\n"
2576
+ )
2577
+ (http_error_devlyn / "pipeline.state.json").write_text(json.dumps({
2578
+ "source": {"type": "spec", "spec_path": str(http_error_spec)}
2579
+ }))
2580
+ (http_error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2581
+ "id": "P8",
2582
+ "derived_from": (
2583
+ "An invalid query returns HTTP 400 with JSON error body "
2584
+ "`{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`."
2585
+ ),
2586
+ "cmd": "printf weak-http-error-contract",
2587
+ "exit_code": 0,
2588
+ "tags": ["shape_contract"],
2589
+ "tag_evidence": {},
2590
+ }) + "\n")
2591
+ weak_http_error_contract = subprocess.run(
2592
+ [sys.executable, script_path, "--validate-risk-probes"],
2593
+ cwd=http_error_root,
2594
+ capture_output=True,
2595
+ text=True,
2596
+ )
2597
+ if weak_http_error_contract.returncode == 0:
2598
+ print("http error text did not require http_error_contract tag", file=sys.stderr)
2599
+ return 1
2600
+
2601
+ (http_error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2602
+ "id": "P8b",
2603
+ "derived_from": (
2604
+ "An invalid query returns HTTP 400 with JSON error body "
2605
+ "`{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`."
2606
+ ),
2607
+ "cmd": "printf http-error-contract",
2608
+ "exit_code": 0,
2609
+ "tags": ["http_error_contract"],
2610
+ "tag_evidence": {
2611
+ "http_error_contract": ["asserts_http_error_status"],
2612
+ },
2613
+ }) + "\n")
2614
+ incomplete_http_error_contract = subprocess.run(
2615
+ [sys.executable, script_path, "--validate-risk-probes"],
2616
+ cwd=http_error_root,
2617
+ capture_output=True,
2618
+ text=True,
2619
+ )
2620
+ if incomplete_http_error_contract.returncode == 0:
2621
+ print("http_error_contract without payload evidence was accepted", file=sys.stderr)
2622
+ return 1
2623
+
2624
+ (http_error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2625
+ "id": "P8c",
2626
+ "derived_from": (
2627
+ "An invalid query returns HTTP 400 with JSON error body "
2628
+ "`{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`."
2629
+ ),
2630
+ "cmd": "printf weak-exact-error-shape-contract",
2631
+ "exit_code": 0,
2632
+ "tags": ["http_error_contract", "shape_contract"],
2633
+ "tag_evidence": {
2634
+ "http_error_contract": [
2635
+ "asserts_http_error_status",
2636
+ "asserts_error_payload_body",
2637
+ ],
2638
+ "shape_contract": [
2639
+ "uses_visible_input_key_names",
2640
+ "asserts_visible_output_key_names",
2641
+ "asserts_no_unexpected_output_keys",
2642
+ ],
2643
+ },
2644
+ }) + "\n")
2645
+ weak_exact_error_shape_contract = subprocess.run(
2646
+ [sys.executable, script_path, "--validate-risk-probes"],
2647
+ cwd=http_error_root,
2648
+ capture_output=True,
2649
+ text=True,
2650
+ )
2651
+ if weak_exact_error_shape_contract.returncode == 0:
2652
+ print("exact error body shape_contract without exact object evidence was accepted", file=sys.stderr)
2653
+ return 1
2654
+
2655
+ (http_error_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2656
+ "id": "P8d",
2657
+ "derived_from": (
2658
+ "An invalid query returns HTTP 400 with JSON error body "
2659
+ "`{ \"error\": \"invalid_query\", \"field\": \"per_page\" }`."
2660
+ ),
2661
+ "cmd": "printf exact-error-shape-contract",
2662
+ "exit_code": 0,
2663
+ "tags": ["http_error_contract", "shape_contract"],
2664
+ "tag_evidence": {
2665
+ "http_error_contract": [
2666
+ "asserts_http_error_status",
2667
+ "asserts_error_payload_body",
2668
+ ],
2669
+ "shape_contract": [
2670
+ "uses_visible_input_key_names",
2671
+ "asserts_visible_output_key_names",
2672
+ "asserts_no_unexpected_output_keys",
2673
+ "asserts_exact_error_object",
2674
+ ],
2675
+ },
2676
+ }) + "\n")
2677
+ strong_exact_error_shape_contract = subprocess.run(
2678
+ [sys.executable, script_path, "--validate-risk-probes"],
2679
+ cwd=http_error_root,
2680
+ capture_output=True,
2681
+ text=True,
2682
+ )
2683
+ if strong_exact_error_shape_contract.returncode != 0:
2684
+ print("exact error body shape_contract with exact object evidence was rejected", file=sys.stderr)
2685
+ print(strong_exact_error_shape_contract.stderr, file=sys.stderr)
2686
+ return 1
2687
+
2688
+ shape_root = work / "exact-shape-risk-probe"
2689
+ shape_root.mkdir()
2690
+ shape_devlyn = shape_root / ".devlyn"
2691
+ shape_devlyn.mkdir()
2692
+ shape_spec = shape_root / "spec.md"
2693
+ shape_spec.write_text(
2694
+ "# Spec\n\n## Verification\n\n"
2695
+ "- On success, output is one JSON object with keys `applied`, `rejected`, and `accounts`; "
2696
+ "`rejected` rows have keys `id` and `reason`.\n"
2697
+ )
2698
+ (shape_devlyn / "pipeline.state.json").write_text(json.dumps({
2699
+ "source": {"type": "spec", "spec_path": str(shape_spec)}
2700
+ }))
2701
+ (shape_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2702
+ "id": "P8c",
2703
+ "derived_from": (
2704
+ "On success, output is one JSON object with keys `applied`, `rejected`, and `accounts`; "
2705
+ "`rejected` rows have keys `id` and `reason`."
2706
+ ),
2707
+ "cmd": "printf weak-shape-contract",
2708
+ "exit_code": 0,
2709
+ "tags": ["shape_contract"],
2710
+ "tag_evidence": {},
2711
+ }) + "\n")
2712
+ weak_shape_contract = subprocess.run(
2713
+ [sys.executable, script_path, "--validate-risk-probes"],
2714
+ cwd=shape_root,
2715
+ capture_output=True,
2716
+ text=True,
2717
+ )
2718
+ if weak_shape_contract.returncode == 0:
2719
+ print("shape_contract without exact key evidence was accepted", file=sys.stderr)
2720
+ return 1
2721
+
2722
+ (shape_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2723
+ "id": "P8d",
2724
+ "derived_from": (
2725
+ "On success, output is one JSON object with keys `applied`, `rejected`, and `accounts`; "
2726
+ "`rejected` rows have keys `id` and `reason`."
2727
+ ),
2728
+ "cmd": "printf shape-contract",
2729
+ "exit_code": 0,
2730
+ "tags": ["shape_contract"],
2731
+ "tag_evidence": {
2732
+ "shape_contract": [
2733
+ "uses_visible_input_key_names",
2734
+ "asserts_visible_output_key_names",
2735
+ "asserts_no_unexpected_output_keys",
2736
+ ],
2737
+ },
2738
+ }) + "\n")
2739
+ strong_shape_contract = subprocess.run(
2740
+ [sys.executable, script_path, "--validate-risk-probes"],
2741
+ cwd=shape_root,
2742
+ capture_output=True,
2743
+ text=True,
2744
+ )
2745
+ if strong_shape_contract.returncode != 0:
2746
+ print("shape_contract with exact key evidence was rejected", file=sys.stderr)
2747
+ print(strong_shape_contract.stderr, file=sys.stderr)
2748
+ return 1
2749
+
2750
+ inline_json_success = 'On success, stdout is `{ "id": "acct_1", "status": "accepted" }`.'
2751
+ shape_spec.write_text("# Spec\n\n## Verification\n\n- " + inline_json_success + "\n")
2752
+ (shape_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2753
+ "id": "P8e",
2754
+ "derived_from": inline_json_success,
2755
+ "cmd": "printf weak-inline-json-shape-contract",
2756
+ "exit_code": 0,
2757
+ "tags": ["stdout_stderr_contract"],
2758
+ "tag_evidence": {
2759
+ "stdout_stderr_contract": ["asserts_named_stream_output"],
2760
+ },
2761
+ }) + "\n")
2762
+ weak_inline_json_shape_contract = subprocess.run(
2763
+ [sys.executable, script_path, "--validate-risk-probes"],
2764
+ cwd=shape_root,
2765
+ capture_output=True,
2766
+ text=True,
2767
+ )
2768
+ if weak_inline_json_shape_contract.returncode == 0:
2769
+ print("inline JSON object text did not require shape_contract tag", file=sys.stderr)
2770
+ return 1
2771
+
2772
+ (shape_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2773
+ "id": "P8f",
2774
+ "derived_from": inline_json_success,
2775
+ "cmd": "printf inline-json-shape-contract",
2776
+ "exit_code": 0,
2777
+ "tags": ["stdout_stderr_contract", "shape_contract"],
2778
+ "tag_evidence": {
2779
+ "stdout_stderr_contract": ["asserts_named_stream_output"],
2780
+ "shape_contract": [
2781
+ "uses_visible_input_key_names",
2782
+ "asserts_visible_output_key_names",
2783
+ "asserts_no_unexpected_output_keys",
2784
+ ],
2785
+ },
2786
+ }) + "\n")
2787
+ strong_inline_json_shape_contract = subprocess.run(
2788
+ [sys.executable, script_path, "--validate-risk-probes"],
2789
+ cwd=shape_root,
2790
+ capture_output=True,
2791
+ text=True,
2792
+ )
2793
+ if strong_inline_json_shape_contract.returncode != 0:
2794
+ print("inline JSON object shape_contract with key evidence was rejected", file=sys.stderr)
2795
+ print(strong_inline_json_shape_contract.stderr, file=sys.stderr)
2796
+ return 1
2797
+
2798
+ forbidden_text_root = work / "forbidden-pattern-risk-probe"
2799
+ forbidden_text_root.mkdir()
2800
+ forbidden_text_devlyn = forbidden_text_root / ".devlyn"
2801
+ forbidden_text_devlyn.mkdir()
2802
+ forbidden_text_spec = forbidden_text_root / "spec.md"
2803
+ forbidden_text_spec.write_text(
2804
+ "# Spec\n\n## Verification\n\n"
2805
+ "- The diff must not add forbidden fallback patterns.\n"
2806
+ )
2807
+ (forbidden_text_devlyn / "pipeline.state.json").write_text(json.dumps({
2808
+ "source": {"type": "spec", "spec_path": str(forbidden_text_spec)}
2809
+ }))
2810
+ (forbidden_text_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2811
+ "id": "P8",
2812
+ "derived_from": "The diff must not add forbidden fallback patterns.",
2813
+ "cmd": "printf forbidden-pattern-static-check",
2814
+ "exit_code": 0,
2815
+ "tags": ["shape_contract"],
2816
+ "tag_evidence": {},
2817
+ }) + "\n")
2818
+ forbidden_pattern_probe = subprocess.run(
2819
+ [sys.executable, script_path, "--validate-risk-probes"],
2820
+ cwd=forbidden_text_root,
2821
+ capture_output=True,
2822
+ text=True,
2823
+ )
2824
+ if forbidden_pattern_probe.returncode != 0:
2825
+ print("generic forbidden-pattern verification text incorrectly required boundary_overlap", file=sys.stderr)
2826
+ print(forbidden_pattern_probe.stderr, file=sys.stderr)
2827
+ return 1
2828
+
2829
+ stock_validation_root = work / "stock-validation-risk-probe"
2830
+ stock_validation_root.mkdir()
2831
+ stock_validation_devlyn = stock_validation_root / ".devlyn"
2832
+ stock_validation_devlyn.mkdir()
2833
+ stock_validation_spec = stock_validation_root / "spec.md"
2834
+ stock_validation_spec.write_text(
2835
+ "# Spec\n\n## Verification\n\n"
2836
+ "- A quote over combined stock exits `2`, prints one JSON error to stderr, and prints no stdout.\n"
2837
+ )
2838
+ (stock_validation_devlyn / "pipeline.state.json").write_text(json.dumps({
2839
+ "source": {"type": "spec", "spec_path": str(stock_validation_spec)}
2840
+ }))
2841
+ (stock_validation_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2842
+ "id": "P9",
2843
+ "derived_from": (
2844
+ "A quote over combined stock exits `2`, prints one JSON error "
2845
+ "to stderr, and prints no stdout."
2846
+ ),
2847
+ "cmd": "printf stock-validation-error",
2848
+ "exit_code": 2,
2849
+ "tags": ["stdout_stderr_contract", "error_contract"],
2850
+ "tag_evidence": {
2851
+ "stdout_stderr_contract": ["asserts_named_stream_output"],
2852
+ "error_contract": [
2853
+ "asserts_error_payload_or_stderr",
2854
+ "asserts_nonzero_or_exit_2",
2855
+ ],
2856
+ },
2857
+ }) + "\n")
2858
+ stock_validation_probe = subprocess.run(
2859
+ [sys.executable, script_path, "--validate-risk-probes"],
2860
+ cwd=stock_validation_root,
2861
+ capture_output=True,
2862
+ text=True,
2863
+ )
2864
+ if stock_validation_probe.returncode != 0:
2865
+ print("stock validation error text incorrectly required prior_consumption", file=sys.stderr)
2866
+ print(stock_validation_probe.stderr, file=sys.stderr)
2867
+ return 1
2868
+
2869
+ webhook_root = work / "webhook-risk-probe"
2870
+ webhook_root.mkdir()
2871
+ webhook_devlyn = webhook_root / ".devlyn"
2872
+ webhook_devlyn.mkdir()
2873
+ webhook_spec = webhook_root / "spec.md"
2874
+ webhook_spec.write_text(
2875
+ "# Spec\n\n## Verification\n\n"
2876
+ "- A POST whose body has been modified after signing returns 401.\n"
2877
+ "- A second POST with the same accepted `id` returns 409 even if the duplicate body would otherwise fail shape validation.\n"
2878
+ )
2879
+ (webhook_devlyn / "pipeline.state.json").write_text(json.dumps({
2880
+ "source": {"type": "spec", "spec_path": str(webhook_spec)}
2881
+ }))
2882
+ (webhook_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2883
+ "id": "P10",
2884
+ "derived_from": "A POST whose body has been modified after signing returns 401.",
2885
+ "cmd": "printf weak-webhook",
2886
+ "exit_code": 0,
2887
+ "tags": ["shape_contract"],
2888
+ "tag_evidence": {},
2889
+ }) + "\n")
2890
+ weak_webhook_probe = subprocess.run(
2891
+ [sys.executable, script_path, "--validate-risk-probes"],
2892
+ cwd=webhook_root,
2893
+ capture_output=True,
2894
+ text=True,
2895
+ )
2896
+ if weak_webhook_probe.returncode == 0:
2897
+ print("webhook signature/replay text did not require auth/idempotency probe tags", file=sys.stderr)
2898
+ return 1
2899
+
2900
+ duplicate_sku_root = work / "duplicate-sku-risk-probe"
2901
+ duplicate_sku_root.mkdir()
2902
+ duplicate_sku_devlyn = duplicate_sku_root / ".devlyn"
2903
+ duplicate_sku_devlyn.mkdir()
2904
+ duplicate_sku_spec = duplicate_sku_root / "spec.md"
2905
+ duplicate_sku_spec.write_text(
2906
+ "# Spec\n\n## Verification\n\n"
2907
+ "- A cart with duplicate SKUs combines quantities before stock validation.\n"
2908
+ )
2909
+ (duplicate_sku_devlyn / "pipeline.state.json").write_text(json.dumps({
2910
+ "source": {"type": "spec", "spec_path": str(duplicate_sku_spec)}
2911
+ }))
2912
+ (duplicate_sku_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2913
+ "id": "P11",
2914
+ "derived_from": "A cart with duplicate SKUs combines quantities before stock validation.",
2915
+ "cmd": "printf duplicate-sku-shape",
2916
+ "exit_code": 0,
2917
+ "tags": ["shape_contract"],
2918
+ "tag_evidence": {},
2919
+ }) + "\n")
2920
+ duplicate_sku_probe = subprocess.run(
2921
+ [sys.executable, script_path, "--validate-risk-probes"],
2922
+ cwd=duplicate_sku_root,
2923
+ capture_output=True,
2924
+ text=True,
2925
+ )
2926
+ if duplicate_sku_probe.returncode != 0:
2927
+ print("duplicate SKU verification text incorrectly required idempotency_replay", file=sys.stderr)
2928
+ print(duplicate_sku_probe.stderr, file=sys.stderr)
2929
+ return 1
2930
+
2931
+ concurrent_root = work / "concurrent-state-risk-probe"
2932
+ concurrent_root.mkdir()
2933
+ concurrent_devlyn = concurrent_root / ".devlyn"
2934
+ concurrent_devlyn.mkdir()
2935
+ concurrent_spec = concurrent_root / "spec.md"
2936
+ concurrent_spec.write_text(
2937
+ "# Spec\n\n## Verification\n\n"
2938
+ "- Several POST requests close together must all appear exactly once "
2939
+ "in GET output with distinct ids.\n"
2940
+ )
2941
+ (concurrent_devlyn / "pipeline.state.json").write_text(json.dumps({
2942
+ "source": {"type": "spec", "spec_path": str(concurrent_spec)}
2943
+ }))
2944
+ (concurrent_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2945
+ "id": "P12",
2946
+ "derived_from": (
2947
+ "Several POST requests close together must all appear exactly "
2948
+ "once in GET output with distinct ids."
2949
+ ),
2950
+ "cmd": "printf weak-concurrent-state",
2951
+ "exit_code": 0,
2952
+ "tags": ["shape_contract"],
2953
+ "tag_evidence": {},
2954
+ }) + "\n")
2955
+ weak_concurrent_probe = subprocess.run(
2956
+ [sys.executable, script_path, "--validate-risk-probes"],
2957
+ cwd=concurrent_root,
2958
+ capture_output=True,
2959
+ text=True,
2960
+ )
2961
+ if weak_concurrent_probe.returncode == 0:
2962
+ print("concurrent state text did not require concurrent_state_consistency tag", file=sys.stderr)
2963
+ return 1
2964
+
2965
+ atomic_batch_root = work / "atomic-batch-risk-probe"
2966
+ atomic_batch_root.mkdir()
2967
+ atomic_batch_devlyn = atomic_batch_root / ".devlyn"
2968
+ atomic_batch_devlyn.mkdir()
2969
+ atomic_batch_spec = atomic_batch_root / "spec.md"
2970
+ atomic_batch_spec.write_text(
2971
+ "# Spec\n\n## Verification\n\n"
2972
+ "- A POST with one valid + one invalid item returns `400`, AND a subsequent GET returns the same list as before the import.\n"
2973
+ "- A POST with all-valid items returns `201`, and the items appear in GET output in order with distinct ids.\n"
2974
+ )
2975
+ (atomic_batch_devlyn / "pipeline.state.json").write_text(json.dumps({
2976
+ "source": {"type": "spec", "spec_path": str(atomic_batch_spec)}
2977
+ }))
2978
+ (atomic_batch_devlyn / "risk-probes.jsonl").write_text(json.dumps({
2979
+ "id": "P13",
2980
+ "derived_from": (
2981
+ "A POST with one valid + one invalid item returns `400`, AND "
2982
+ "a subsequent GET returns the same list as before the import."
2983
+ ),
2984
+ "cmd": "printf weak-atomic-batch",
2985
+ "exit_code": 0,
2986
+ "tags": ["shape_contract"],
2987
+ "tag_evidence": {},
2988
+ }) + "\n")
2989
+ weak_atomic_batch_probe = subprocess.run(
2990
+ [sys.executable, script_path, "--validate-risk-probes"],
2991
+ cwd=atomic_batch_root,
2992
+ capture_output=True,
2993
+ text=True,
2994
+ )
2995
+ if weak_atomic_batch_probe.returncode == 0:
2996
+ print("atomic batch text did not require atomic_batch_state tag", file=sys.stderr)
2997
+ return 1
2998
+
2999
+ (atomic_batch_devlyn / "risk-probes.jsonl").write_text(json.dumps({
3000
+ "id": "P13b",
3001
+ "derived_from": (
3002
+ "A POST with one valid + one invalid item returns `400`, AND "
3003
+ "a subsequent GET returns the same list as before the import."
3004
+ ),
3005
+ "cmd": "printf incomplete-atomic-batch",
3006
+ "exit_code": 0,
3007
+ "tags": ["atomic_batch_state"],
3008
+ "tag_evidence": {
3009
+ "atomic_batch_state": [
3010
+ "mixed_valid_invalid_batch",
3011
+ "asserts_store_unchanged_after_failure",
3012
+ ],
3013
+ },
3014
+ }) + "\n")
3015
+ incomplete_atomic_batch_probe = subprocess.run(
3016
+ [sys.executable, script_path, "--validate-risk-probes"],
3017
+ cwd=atomic_batch_root,
3018
+ capture_output=True,
3019
+ text=True,
3020
+ )
3021
+ if incomplete_atomic_batch_probe.returncode == 0:
3022
+ print("atomic_batch_state without success-order evidence was accepted", file=sys.stderr)
3023
+ return 1
3024
+ return 0
3025
+
3026
+
3027
+ def main() -> int:
3028
+ include_risk_probes = False
3029
+ validate_risk_probes_only = False
3030
+ if "--include-risk-probes" in sys.argv[1:]:
3031
+ include_risk_probes = True
3032
+ sys.argv = [arg for arg in sys.argv if arg != "--include-risk-probes"]
3033
+ if "--validate-risk-probes" in sys.argv[1:]:
528
3034
  validate_risk_probes_only = True
529
3035
  sys.argv = [arg for arg in sys.argv if arg != "--validate-risk-probes"]
530
3036
 
@@ -537,6 +3043,12 @@ def main() -> int:
537
3043
  return 2
538
3044
  return run_check_mode(Path(sys.argv[2]))
539
3045
 
3046
+ if len(sys.argv) >= 2 and sys.argv[1] == "--check-expected":
3047
+ if len(sys.argv) != 3:
3048
+ print("usage: spec-verify-check.py --check-expected <json-path>", file=sys.stderr)
3049
+ return 2
3050
+ return run_check_expected_mode(Path(sys.argv[2]))
3051
+
540
3052
  bench_mode = "BENCH_WORKDIR" in os.environ
541
3053
  work = Path(os.environ.get("BENCH_WORKDIR") or os.getcwd())
542
3054
  devlyn_dir = work / ".devlyn"
@@ -550,15 +3062,17 @@ def main() -> int:
550
3062
  # source-extract entirely. iter-0019.9 closes the F9 regression where
551
3063
  # source-extract from an ideate-generated spec overwrote the
552
3064
  # benchmark contract — for benchmarks, expected.json is canonical.
553
- # 2. Otherwise, attempt source-extract from
3065
+ # 2. Otherwise, real-user source.type=="spec" first attempts the sibling
3066
+ # spec.expected.json next to spec.md. If present, validate it and stage
3067
+ # its verification_commands. If malformed, fail closed. If absent,
3068
+ # continue to legacy source-extract.
3069
+ # 3. Source-extract reads
554
3070
  # `pipeline.state.json:source.{spec_path | criteria_path}`. If it has
555
- # a json block, overwrite .devlyn/spec-verify.json with it. This is
556
- # the real-user carrier path; in real-user mode a pre-existing file
557
- # is stale (from a killed prior run) and must NOT be trusted.
558
- # 3. If source has no json block AND source.type=="generated":
3071
+ # a json block, overwrite .devlyn/spec-verify.json with it.
3072
+ # 4. If source has no json block AND source.type=="generated":
559
3073
  # CRITICAL spec-verify-malformed — generated criteria must ship a
560
- # verifiable contract per phase-1-build.md <output_contract>.
561
- # 4. If source has no json block AND source.type=="spec":
3074
+ # verifiable contract per the generated-criteria output contract.
3075
+ # 5. If source has no sibling/json block AND source.type=="spec":
562
3076
  # - Real-user mode: silent no-op (preserves iter-0019.6 backward
563
3077
  # compat for handwritten specs without the carrier). Drop any
564
3078
  # stale pre-staged file.
@@ -568,6 +3082,14 @@ def main() -> int:
568
3082
  pre_staged = spec_path.is_file() # captured BEFORE any potential write
569
3083
  trust_bench_staged = bench_mode and pre_staged
570
3084
  src_type, source_md = read_source(work, devlyn_dir)
3085
+ state = read_state(devlyn_dir)
3086
+ integrity_error = source_integrity_error(src_type, state, source_md)
3087
+ if integrity_error:
3088
+ print(f"[spec-verify] carrier malformed: {integrity_error}", file=sys.stderr)
3089
+ write_malformed_finding(devlyn_dir, integrity_error, source_md)
3090
+ return 1
3091
+ expected_data: dict | None = None
3092
+ expected_path: Path | None = None
571
3093
  if validate_risk_probes_only:
572
3094
  _risk_probes, risk_error = load_risk_probes(
573
3095
  devlyn_dir, source_md, require_present=True
@@ -579,7 +3101,20 @@ def main() -> int:
579
3101
  print("[spec-verify] risk probes valid", file=sys.stderr)
580
3102
  return 0
581
3103
  if source_md is not None and not trust_bench_staged:
582
- staged, error = stage_from_source(source_md, devlyn_dir)
3104
+ if src_type == "spec":
3105
+ expected_found, expected_staged, expected_error, expected_path, expected_data = stage_from_expected(
3106
+ source_md, devlyn_dir
3107
+ )
3108
+ if expected_error is not None:
3109
+ print(f"[spec-verify] carrier malformed: {expected_error}", file=sys.stderr)
3110
+ write_malformed_finding(devlyn_dir, expected_error, expected_path)
3111
+ return 1
3112
+ if expected_staged:
3113
+ staged, error = (True, None)
3114
+ else:
3115
+ staged, error = stage_from_source(source_md, devlyn_dir)
3116
+ else:
3117
+ staged, error = stage_from_source(source_md, devlyn_dir)
583
3118
  if error is not None:
584
3119
  print(f"[spec-verify] carrier malformed: {error}", file=sys.stderr)
585
3120
  write_malformed_finding(devlyn_dir, error, source_md)
@@ -589,13 +3124,13 @@ def main() -> int:
589
3124
  msg = (
590
3125
  f"generated {source_md.name} must include a "
591
3126
  "`## Verification` ```json``` block (verification_commands "
592
- "array). PHASE 1 BUILD generated criteria without one."
3127
+ "array). Generated criteria were written without one."
593
3128
  )
594
3129
  print(f"[spec-verify] {msg}", file=sys.stderr)
595
3130
  write_malformed_finding(devlyn_dir, msg, source_md)
596
3131
  return 1
597
3132
  # source.type=="spec", no block in spec markdown.
598
- if not bench_mode:
3133
+ if not bench_mode and expected_data is None:
599
3134
  # Real-user handwritten spec: silent no-op. Drop any stale
600
3135
  # pre-staged file so a killed prior run cannot poison this
601
3136
  # run's gate.
@@ -615,31 +3150,42 @@ def main() -> int:
615
3150
  spec_path.unlink()
616
3151
  return 0
617
3152
 
3153
+ commands: list[dict] = []
618
3154
  if not spec_path.exists():
619
3155
  # No source markdown carrier AND no pre-staged file. Silent no-op
620
3156
  # for benchmark misconfigurations (no fixture to gate against) and
621
3157
  # for real-user runs without spec/criteria. Generated source case
622
3158
  # is handled above.
623
- return 0
624
-
625
- try:
626
- spec = json.loads(spec_path.read_text())
627
- except (json.JSONDecodeError, OSError) as e:
628
- print(f"[spec-verify] error: cannot parse {spec_path}: {e}", file=sys.stderr)
629
- return 2
3159
+ if expected_data is None:
3160
+ return 0
3161
+ else:
3162
+ try:
3163
+ spec = loads_strict_json(spec_path.read_text())
3164
+ except (ValueError, OSError) as e:
3165
+ print(f"[spec-verify] error: cannot parse {spec_path}: {e}", file=sys.stderr)
3166
+ return 2
630
3167
 
631
- # iter-0019.8 (Codex R2 #2): apply full shape validation to pre-staged
632
- # carriers too — bool exit_code, empty list, whitespace-only cmd were
633
- # silently accepted on the benchmark path. Empty list is rejected
634
- # because "all 0 commands passed" is vacuously true.
635
- shape_err = validate_shape(spec)
636
- if shape_err:
637
- print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
638
- write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
639
- return 1
640
- commands = list(spec["verification_commands"])
3168
+ # iter-0019.8 (Codex R2 #2): apply full shape validation to pre-staged
3169
+ # carriers too — bool exit_code, empty list, whitespace-only cmd were
3170
+ # silently accepted on the benchmark path. Empty list is rejected
3171
+ # because "all 0 commands passed" is vacuously true.
3172
+ shape_err = validate_shape(spec)
3173
+ if shape_err:
3174
+ print(f"[spec-verify] error: {spec_path}: {shape_err}", file=sys.stderr)
3175
+ write_malformed_finding(devlyn_dir, f"{spec_path}: {shape_err}", None)
3176
+ return 1
3177
+ commands = list(spec["verification_commands"])
641
3178
  if include_risk_probes:
642
- risk_probes, risk_error = load_risk_probes(devlyn_dir, source_md)
3179
+ risk_state_error = risk_probes_state_error(state)
3180
+ if risk_state_error:
3181
+ print(f"[spec-verify] risk probes malformed: {risk_state_error}", file=sys.stderr)
3182
+ write_malformed_finding(devlyn_dir, risk_state_error, Path("pipeline.state.json"))
3183
+ return 1
3184
+ risk_probes, risk_error = load_risk_probes(
3185
+ devlyn_dir,
3186
+ source_md,
3187
+ require_present=state_requires_risk_probes(state),
3188
+ )
643
3189
  if risk_error:
644
3190
  print(f"[spec-verify] risk probes malformed: {risk_error}", file=sys.stderr)
645
3191
  write_malformed_finding(devlyn_dir, risk_error, devlyn_dir / "risk-probes.jsonl")
@@ -648,7 +3194,7 @@ def main() -> int:
648
3194
 
649
3195
  devlyn_dir.mkdir(parents=True, exist_ok=True)
650
3196
  results_path = devlyn_dir / "spec-verify.results.json"
651
- findings_path = devlyn_dir / "spec-verify-findings.jsonl"
3197
+ findings_path = devlyn_dir / output_findings_name()
652
3198
 
653
3199
  verify_env = os.environ.copy()
654
3200
  verify_env["BENCH_WORKDIR"] = str(work)
@@ -759,7 +3305,7 @@ def main() -> int:
759
3305
  )
760
3306
 
761
3307
  findings.append({
762
- "id": f"BGATE-{finding_seq:04d}",
3308
+ "id": f"{output_finding_prefix()}-{finding_seq:04d}",
763
3309
  "rule_id": rule_id,
764
3310
  "level": "error",
765
3311
  "severity": "CRITICAL",
@@ -767,7 +3313,7 @@ def main() -> int:
767
3313
  "message": msg,
768
3314
  "file": file_ref,
769
3315
  "line": 1,
770
- "phase": "build_gate",
3316
+ "phase": output_phase(),
771
3317
  "criterion_ref": criterion_ref,
772
3318
  "fix_hint": fix_hint,
773
3319
  "blocking": True,
@@ -784,7 +3330,7 @@ def main() -> int:
784
3330
  else "correctness.spec-literal-mismatch"
785
3331
  )
786
3332
  findings.append({
787
- "id": f"BGATE-{finding_seq:04d}",
3333
+ "id": f"{output_finding_prefix()}-{finding_seq:04d}",
788
3334
  "rule_id": rule_id,
789
3335
  "level": "error",
790
3336
  "severity": "CRITICAL",
@@ -794,7 +3340,7 @@ def main() -> int:
794
3340
  ),
795
3341
  "file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
796
3342
  "line": 1,
797
- "phase": "build_gate",
3343
+ "phase": output_phase(),
798
3344
  "criterion_ref": (
799
3345
  f"risk-probe:{vc.get('id')}"
800
3346
  if vc.get("_risk_probe")
@@ -817,7 +3363,7 @@ def main() -> int:
817
3363
  else "correctness.spec-literal-mismatch"
818
3364
  )
819
3365
  findings.append({
820
- "id": f"BGATE-{finding_seq:04d}",
3366
+ "id": f"{output_finding_prefix()}-{finding_seq:04d}",
821
3367
  "rule_id": rule_id,
822
3368
  "level": "error",
823
3369
  "severity": "CRITICAL",
@@ -828,7 +3374,7 @@ def main() -> int:
828
3374
  ),
829
3375
  "file": ".devlyn/risk-probes.jsonl" if vc.get("_risk_probe") else ".devlyn/spec-verify.json",
830
3376
  "line": 1,
831
- "phase": "build_gate",
3377
+ "phase": output_phase(),
832
3378
  "criterion_ref": (
833
3379
  f"risk-probe:{vc.get('id')}"
834
3380
  if vc.get("_risk_probe")
@@ -843,6 +3389,16 @@ def main() -> int:
843
3389
  })
844
3390
  finding_seq += 1
845
3391
 
3392
+ expected_findings, finding_seq = expected_contract_findings(
3393
+ expected_data,
3394
+ expected_path,
3395
+ work,
3396
+ devlyn_dir,
3397
+ state,
3398
+ finding_seq,
3399
+ )
3400
+ findings.extend(expected_findings)
3401
+
846
3402
  results_path.write_text(json.dumps({"commands": results}, indent=2) + "\n")
847
3403
 
848
3404
  # Append findings (jsonl). BUILD_GATE merge step concatenates this onto
@@ -853,10 +3409,11 @@ def main() -> int:
853
3409
  fh.write(json.dumps(f) + "\n")
854
3410
 
855
3411
  failed = [r for r in results if r.get("pass") is False]
856
- if failed:
3412
+ blocking_findings = [f for f in findings if f.get("severity") in {"CRITICAL", "HIGH"}]
3413
+ if failed or blocking_findings:
857
3414
  print(
858
3415
  f"[spec-verify] {len(failed)}/{len(results)} command(s) failed; "
859
- f"{len(findings)} CRITICAL finding(s) written to {findings_path}",
3416
+ f"{len(findings)} finding(s) written to {findings_path}",
860
3417
  file=sys.stderr,
861
3418
  )
862
3419
  return 1