devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -18,6 +18,113 @@ section() { printf '\n%s=== %s ===%s\n' "$dim" "$1" "$reset"; }
18
18
  ok() { printf ' %s✓%s %s\n' "$green" "$reset" "$1"; }
19
19
  bad() { printf ' %s✗%s %s\n' "$red" "$reset" "$1"; fail=1; }
20
20
 
21
+ make_temp_file() {
22
+ local __var="$1"
23
+ shift || true
24
+ local path
25
+ if ! path=$(command mktemp "$@"); then
26
+ bad "mktemp failed: ${*:-<default>}"
27
+ return 1
28
+ fi
29
+ printf -v "$__var" '%s' "$path"
30
+ }
31
+
32
+ make_temp_dir() {
33
+ local __var="$1"
34
+ shift || true
35
+ local path
36
+ if ! path=$(command mktemp -d "$@"); then
37
+ bad "mktemp -d failed: ${*:-<default>}"
38
+ return 1
39
+ fi
40
+ printf -v "$__var" '%s' "$path"
41
+ }
42
+
43
+ section "Check 0a: Temp allocation fails closed"
44
+ direct_mktemp=$(grep -nE '(^|[ =])mktemp( |$)|\$\([[:space:]]*mktemp' scripts/lint-skills.sh \
45
+ | grep -v 'command mktemp' \
46
+ | grep -v 'make_temp_' \
47
+ | grep -v 'direct_mktemp=' || true)
48
+ if [ -z "$direct_mktemp" ]; then
49
+ ok "lint-skills.sh uses temp allocation helpers instead of direct mktemp"
50
+ else
51
+ while IFS= read -r f; do bad "$f"; done <<< "$direct_mktemp"
52
+ fi
53
+
54
+ # iter-0034 Phase 4 cutover (2026-05-03): legacy skill paths dropped.
55
+ # Surface is the 2-skill product (`/devlyn:resolve` + `/devlyn:ideate`)
56
+ # plus the `_shared/` kernel. Keep this list single-source so all installed
57
+ # mirror parity checks cover the same files.
58
+ critical_path_files=$(cat <<'EOF'
59
+ _shared/spec-verify-check.py
60
+ _shared/collect-codex-findings.py
61
+ _shared/verify-merge-findings.py
62
+ devlyn:ideate/SKILL.md
63
+ devlyn:ideate/references/spec-template.md
64
+ devlyn:ideate/references/elicitation.md
65
+ devlyn:ideate/references/project-mode.md
66
+ devlyn:ideate/references/from-spec-mode.md
67
+ devlyn:resolve/SKILL.md
68
+ devlyn:resolve/references/state-schema.md
69
+ devlyn:resolve/references/free-form-mode.md
70
+ devlyn:resolve/references/phases/plan.md
71
+ devlyn:resolve/references/phases/probe-derive.md
72
+ devlyn:resolve/references/phases/implement.md
73
+ devlyn:resolve/references/phases/build-gate.md
74
+ devlyn:resolve/references/phases/cleanup.md
75
+ devlyn:resolve/references/phases/verify.md
76
+ _shared/expected.schema.json
77
+ _shared/adapters/README.md
78
+ _shared/adapters/opus-4-7.md
79
+ _shared/adapters/gpt-5-5.md
80
+ _shared/codex-config.md
81
+ _shared/codex-monitored.sh
82
+ _shared/engine-preflight.md
83
+ _shared/pair-plan-schema.md
84
+ _shared/runtime-principles.md
85
+ EOF
86
+ )
87
+
88
+ check_skill_mirror_parity() {
89
+ local target_dir="$1"
90
+ local skip_msg="$2"
91
+ local missing_prefix="$3"
92
+ local differ_suffix="$4"
93
+ local ok_msg="$5"
94
+ local drift=0
95
+ local rel src dst
96
+
97
+ if [ ! -d "$target_dir" ]; then
98
+ ok "$skip_msg"
99
+ return
100
+ fi
101
+
102
+ while IFS= read -r rel; do
103
+ [ -n "$rel" ] || continue
104
+ src="config/skills/$rel"
105
+ dst="$target_dir/$rel"
106
+ if [ ! -f "$src" ] || [ ! -f "$dst" ]; then
107
+ bad "$missing_prefix: $rel"; drift=1; continue
108
+ fi
109
+ if ! diff -q "$src" "$dst" >/dev/null 2>&1; then
110
+ bad "$rel — $differ_suffix"
111
+ drift=1
112
+ fi
113
+ done <<< "$critical_path_files"
114
+
115
+ # iter-0009: codex-monitored.sh must be executable in installed mirrors
116
+ # (skills trees get copied into work dirs for variant arms; bash refuses to
117
+ # run a non-executable wrapper).
118
+ if [ -f "$target_dir/_shared/codex-monitored.sh" ] \
119
+ && [ ! -x "$target_dir/_shared/codex-monitored.sh" ]; then
120
+ bad "_shared/codex-monitored.sh — not executable in ${target_dir} mirror"
121
+ drift=1
122
+ fi
123
+ if [ $drift -eq 0 ]; then
124
+ ok "$ok_msg"
125
+ fi
126
+ }
127
+
21
128
  # ---------------------------------------------------------------------------
22
129
  # 1. No MCP references in managed source or user-facing docs.
23
130
  # ---------------------------------------------------------------------------
@@ -69,6 +176,18 @@ else
69
176
  while IFS= read -r f; do bad "$f"; done <<< "$offenders"
70
177
  fi
71
178
 
179
+ # ---------------------------------------------------------------------------
180
+ # 2a. Packaged root instruction files must not contain pyx-memory secrets.
181
+ # ---------------------------------------------------------------------------
182
+ section "Check 2a: No pyx-memory secrets in packaged root instructions"
183
+ offenders=$(grep -RInE 'memory\.api\.pyxmate\.com|Authorization: Bearer pyx_[A-Za-z0-9]{12,}|X-API-Key: pyx_[A-Za-z0-9]{12,}|pyx_[A-Za-z0-9]{16,}' \
184
+ AGENTS.md CLAUDE.md 2>/dev/null || true)
185
+ if [ -z "$offenders" ]; then
186
+ ok "AGENTS.md and CLAUDE.md contain no pyx-memory secret material"
187
+ else
188
+ while IFS= read -r f; do bad "$f"; done <<< "$offenders"
189
+ fi
190
+
72
191
  # ---------------------------------------------------------------------------
73
192
  # 3. No stale model strings (gpt-5.0..5.4 hardcoded outside config).
74
193
  # ---------------------------------------------------------------------------
@@ -121,66 +240,54 @@ if [ $missing -eq 0 ]; then
121
240
  ok "all devlyn:* skills have name: field"
122
241
  fi
123
242
 
243
+ # ---------------------------------------------------------------------------
244
+ # 5a. devlyn:design-ui is a required skill, not an optional addon.
245
+ # ---------------------------------------------------------------------------
246
+ section "Check 5a: devlyn:design-ui is required"
247
+ if [ -f "config/skills/devlyn:design-ui/SKILL.md" ]; then
248
+ ok "devlyn:design-ui source lives in config/skills"
249
+ else
250
+ bad "devlyn:design-ui must be a required skill under config/skills"
251
+ fi
252
+ if [ ! -e "optional-skills/devlyn:design-ui" ]; then
253
+ ok "devlyn:design-ui is not in optional-skills"
254
+ else
255
+ bad "devlyn:design-ui must not be installed as an optional addon"
256
+ fi
257
+ if grep -Fq "skillsToInstall: ['devlyn:resolve', 'devlyn:ideate', 'devlyn:design-ui', '_shared']" bin/devlyn.js; then
258
+ ok "Codex install includes devlyn:design-ui"
259
+ else
260
+ bad "Codex skillsToInstall must include devlyn:design-ui"
261
+ fi
262
+ if ! grep -F "name: 'devlyn:design-ui'" bin/devlyn.js >/dev/null 2>&1; then
263
+ ok "devlyn:design-ui is absent from OPTIONAL_ADDONS"
264
+ else
265
+ bad "devlyn:design-ui must not be listed in OPTIONAL_ADDONS"
266
+ fi
267
+
124
268
  # ---------------------------------------------------------------------------
125
269
  # 6. Source ↔ installed mirror parity on critical path.
126
270
  # Only runs if .claude/skills exists (i.e. installer has been run).
127
271
  # ---------------------------------------------------------------------------
128
272
  section "Check 6: Source ↔ installed mirror parity (critical path)"
129
- if [ ! -d .claude/skills ]; then
130
- ok "no .claude/skills (fresh checkout) — skipping parity check"
131
- else
132
- drift=0
133
- # iter-0034 Phase 4 cutover (2026-05-03): legacy skill paths dropped.
134
- # Surface is the 2-skill product (`/devlyn:resolve` + `/devlyn:ideate`)
135
- # plus the `_shared/` kernel.
136
- for rel in \
137
- _shared/spec-verify-check.py \
138
- _shared/collect-codex-findings.py \
139
- _shared/verify-merge-findings.py \
140
- devlyn:ideate/SKILL.md \
141
- devlyn:ideate/references/spec-template.md \
142
- devlyn:ideate/references/elicitation.md \
143
- devlyn:ideate/references/project-mode.md \
144
- devlyn:ideate/references/from-spec-mode.md \
145
- devlyn:resolve/SKILL.md \
146
- devlyn:resolve/references/state-schema.md \
147
- devlyn:resolve/references/free-form-mode.md \
148
- devlyn:resolve/references/phases/plan.md \
149
- devlyn:resolve/references/phases/probe-derive.md \
150
- devlyn:resolve/references/phases/implement.md \
151
- devlyn:resolve/references/phases/build-gate.md \
152
- devlyn:resolve/references/phases/cleanup.md \
153
- devlyn:resolve/references/phases/verify.md \
154
- _shared/expected.schema.json \
155
- _shared/adapters/README.md \
156
- _shared/adapters/opus-4-7.md \
157
- _shared/adapters/gpt-5-5.md \
158
- _shared/codex-config.md \
159
- _shared/codex-monitored.sh \
160
- _shared/pair-plan-schema.md \
161
- _shared/runtime-principles.md; do
162
- src="config/skills/$rel"
163
- dst=".claude/skills/$rel"
164
- if [ ! -f "$src" ] || [ ! -f "$dst" ]; then
165
- bad "missing file on critical path: $rel"; drift=1; continue
166
- fi
167
- if ! diff -q "$src" "$dst" >/dev/null 2>&1; then
168
- bad "$rel — source and installed differ"
169
- drift=1
170
- fi
171
- done
172
- # iter-0009: codex-monitored.sh must be executable in the installed mirror
173
- # (skills tree gets cp -R'd into $WORK_DIR for the variant arm; bash will
174
- # refuse to run a non-executable wrapper).
175
- if [ -f ".claude/skills/_shared/codex-monitored.sh" ] \
176
- && [ ! -x ".claude/skills/_shared/codex-monitored.sh" ]; then
177
- bad "_shared/codex-monitored.sh — not executable in installed mirror"
178
- drift=1
179
- fi
180
- if [ $drift -eq 0 ]; then
181
- ok "critical path parity clean"
182
- fi
183
- fi
273
+ check_skill_mirror_parity \
274
+ ".claude/skills" \
275
+ "no .claude/skills (fresh checkout) — skipping parity check" \
276
+ "missing file on critical path" \
277
+ "source and installed differ" \
278
+ "critical path parity clean"
279
+
280
+ # Codex / agent runtimes in this repo can also expose a project-local
281
+ # `.agents/skills` mirror. If it exists, keep the same critical path in parity;
282
+ # otherwise a session can read stale pair/risk-probe contracts even while the
283
+ # source and `.claude/skills` mirrors are clean.
284
+ section "Check 6a: Source ↔ .agents mirror parity (critical path)"
285
+ check_skill_mirror_parity \
286
+ ".agents/skills" \
287
+ "no .agents/skills — skipping parity check" \
288
+ "missing .agents critical-path file" \
289
+ "source and .agents mirror differ" \
290
+ ".agents critical path parity clean"
184
291
 
185
292
  # ---------------------------------------------------------------------------
186
293
  # 6b. VERIFY merge verdict binding self-test.
@@ -194,6 +301,83 @@ if python3 config/skills/_shared/verify-merge-findings.py --self-test >/dev/null
194
301
  else
195
302
  bad "verify-merge-findings.py self-test failed"
196
303
  fi
304
+ if ! grep -Fq 'def pair_trigger_skip_contract_violation' config/skills/_shared/verify-merge-findings.py \
305
+ || ! grep -Fq 'def pair_trigger_missing_contract_violation' config/skills/_shared/verify-merge-findings.py \
306
+ || ! grep -Fq 'def pair_trigger_reason_completeness_violation' config/skills/_shared/verify-merge-findings.py \
307
+ || ! grep -Fq 'def pair_trigger_present' config/skills/_shared/verify-merge-findings.py \
308
+ || ! grep -Fq 'KNOWN_PAIR_TRIGGER_REASONS = {' config/skills/_shared/verify-merge-findings.py \
309
+ || ! grep -Fq 'mode.pair-verify' config/skills/_shared/verify-merge-findings.py \
310
+ || ! grep -Fq 'state.get("pair_verify") is True' config/skills/_shared/verify-merge-findings.py \
311
+ || ! grep -Fq 'def pair_flag_contract_violation' config/skills/_shared/verify-merge-findings.py \
312
+ || ! grep -Fq 'verify-pair-trigger-conflicting-pair-flags' config/skills/_shared/verify-merge-findings.py \
313
+ || ! grep -Fq -- '--pair-verify and --no-pair are mutually exclusive' config/skills/_shared/verify-merge-findings.py \
314
+ || ! grep -Fq 'def has_known_pair_trigger_reason' config/skills/_shared/verify-merge-findings.py \
315
+ || ! grep -Fq 'def all_known_pair_trigger_reasons' config/skills/_shared/verify-merge-findings.py \
316
+ || ! grep -Fq 'return reason in KNOWN_PAIR_TRIGGER_REASONS' config/skills/_shared/verify-merge-findings.py \
317
+ || ! grep -Fq '"reasons": ["risk.high", 3]' config/skills/_shared/verify-merge-findings.py \
318
+ || ! grep -Fq '"reasons": ["risk.high", "looks-hard"]' config/skills/_shared/verify-merge-findings.py \
319
+ || ! grep -Fq '"reasons": ["risk high"]' config/skills/_shared/verify-merge-findings.py \
320
+ || ! grep -Fq '"reasons": ["risk_profile.high_risk", "risk_probes_enabled"]' config/skills/_shared/verify-merge-findings.py \
321
+ || ! grep -Fq 'verify-pair-trigger-reasons-unknown' config/skills/_shared/verify-merge-findings.py \
322
+ || ! grep -Fq 'verify-pair-trigger-reasons-incomplete' config/skills/_shared/verify-merge-findings.py \
323
+ || ! grep -Fq 'pair_trigger.reasons is missing applicable canonical reason(s)' config/skills/_shared/verify-merge-findings.py \
324
+ || ! grep -Fq 'pair_trigger.reasons must include a known pair-trigger reason' config/skills/_shared/verify-merge-findings.py \
325
+ || ! grep -Fq 'pair_trigger.reasons must only include known pair-trigger reasons' config/skills/_shared/verify-merge-findings.py \
326
+ || ! grep -Fq 'verify-pair-trigger-ineligible-unjustified' config/skills/_shared/verify-merge-findings.py \
327
+ || ! grep -Fq 'VERIFY state requires a pair decision' config/skills/_shared/verify-merge-findings.py \
328
+ || ! grep -Fq 'verify-pair-trigger-user-no-pair-unsupported' config/skills/_shared/verify-merge-findings.py \
329
+ || ! grep -Fq 'risk_profile.pair_default_enabled false from an explicit --no-pair opt-out' config/skills/_shared/verify-merge-findings.py \
330
+ || ! grep -Fq 'def risk_profile_contract_violation' config/skills/_shared/verify-merge-findings.py \
331
+ || ! grep -Fq 'verify-risk-profile-malformed' config/skills/_shared/verify-merge-findings.py \
332
+ || ! grep -Fq 'risk_profile.risk_probes_enabled must be a boolean' config/skills/_shared/verify-merge-findings.py \
333
+ || ! grep -Fq 'risk_profile.reasons must be a list of strings' config/skills/_shared/verify-merge-findings.py \
334
+ || ! grep -Fq 'def spec_frontmatter_complexity' config/skills/_shared/verify-merge-findings.py \
335
+ || ! grep -Fq '"complexity": "large"' config/skills/_shared/verify-merge-findings.py \
336
+ || ! grep -Fq 'complexity.large' config/skills/_shared/verify-merge-findings.py \
337
+ || ! grep -Fq 'spec.complexity.high' config/skills/_shared/verify-merge-findings.py \
338
+ || ! grep -Fq 'spec.complexity.large' config/skills/_shared/verify-merge-findings.py \
339
+ || ! grep -Fq 'verify-pair-trigger-required-missing' config/skills/_shared/verify-merge-findings.py \
340
+ || ! grep -Fq 'verify-pair-trigger-skipped-reason-unsupported' config/skills/_shared/verify-merge-findings.py \
341
+ || ! grep -Fq 'verify-pair-trigger-mechanical-blocker-unsupported' config/skills/_shared/verify-merge-findings.py \
342
+ || ! grep -Fq 'verify-pair-trigger-primary-judge-blocker-unsupported' config/skills/_shared/verify-merge-findings.py \
343
+ || ! grep -Fq 'eligible:false` with no supported skip reason' config/skills/devlyn:resolve/references/state-schema.md \
344
+ || ! grep -Fq 'Canonical eligible reasons are `mode.verify-only`' config/skills/devlyn:resolve/references/state-schema.md \
345
+ || ! grep -Fq '`mode.pair-verify`' config/skills/devlyn:resolve/references/state-schema.md \
346
+ || ! grep -Fq 'Eligible triggers must contain only canonical' config/skills/devlyn:resolve/references/phases/verify.md \
347
+ || ! grep -Fq 'include every applicable canonical reason' config/skills/devlyn:resolve/references/phases/verify.md \
348
+ || ! grep -Fq '`mode.pair-verify`' config/skills/devlyn:resolve/references/phases/verify.md \
349
+ || ! grep -Fq '"pair_verify": false' config/skills/devlyn:resolve/references/state-schema.md \
350
+ || ! grep -Fq 'requires a non-empty reasons list containing every applicable canonical eligible reason' config/skills/devlyn:resolve/references/state-schema.md \
351
+ || ! grep -Fq 'containing every applicable canonical eligible reason' config/skills/devlyn:resolve/references/state-schema.md \
352
+ || ! grep -Fq 'state.pair_verify == true' config/skills/devlyn:resolve/references/state-schema.md \
353
+ || ! grep -Fq 'pair_verify: true` only when `--pair-verify` was passed' config/skills/devlyn:resolve/SKILL.md \
354
+ || ! grep -Fq 'include every applicable canonical reason' config/skills/devlyn:resolve/SKILL.md \
355
+ || ! grep -Fq '`--pair-verify` and `--no-pair` are mutually exclusive' config/skills/devlyn:resolve/SKILL.md \
356
+ || ! grep -Fq 'mutually exclusive with `risk_profile.pair_default_enabled == false`' config/skills/devlyn:resolve/references/state-schema.md \
357
+ || ! grep -Fq 'if both are present, stop with `BLOCKED:invalid-flags`' config/skills/devlyn:resolve/references/phases/verify.md \
358
+ || ! grep -Fq 'Contradictory, incomplete, or unknown trigger state is a VERIFY contract violation' config/skills/devlyn:resolve/SKILL.md \
359
+ || ! grep -Fq 'user_no_pair` is valid only when `risk_profile.pair_default_enabled == false`' config/skills/devlyn:resolve/references/state-schema.md \
360
+ || ! grep -Fq 'def reject_json_constant' config/skills/_shared/verify-merge-findings.py \
361
+ || ! grep -Fq 'loads_strict_json(raw)' config/skills/_shared/verify-merge-findings.py \
362
+ || ! grep -Fq 'invalid JSON numeric constant: NaN' config/skills/_shared/verify-merge-findings.py; then
363
+ bad "verify-merge-findings.py must block missing pair_trigger and unsupported skip reasons"
364
+ fi
365
+ verify_merge_risk_profile_guard_missing=0
366
+ for file in \
367
+ config/skills/_shared/verify-merge-findings.py \
368
+ .claude/skills/_shared/verify-merge-findings.py \
369
+ .agents/skills/_shared/verify-merge-findings.py; do
370
+ if ! grep -Fq 'def risk_profile_contract_violation' "$file" \
371
+ || ! grep -Fq 'verify-risk-profile-malformed' "$file" \
372
+ || ! grep -Fq 'risk_profile.risk_probes_enabled must be a boolean' "$file" \
373
+ || ! grep -Fq 'risk_profile.reasons must be a list of strings' "$file"; then
374
+ bad "$file — verify-merge-findings.py must fail closed on malformed risk_profile"
375
+ verify_merge_risk_profile_guard_missing=1
376
+ fi
377
+ done
378
+ if [ $verify_merge_risk_profile_guard_missing -eq 0 ]; then
379
+ ok "verify-merge-findings.py risk_profile shape guard is mirrored"
380
+ fi
197
381
 
198
382
  section "Check 6c: Codex stdout collection writes canonical pair findings"
199
383
  if python3 config/skills/_shared/collect-codex-findings.py --self-test >/dev/null 2>&1; then
@@ -201,12 +385,1001 @@ if python3 config/skills/_shared/collect-codex-findings.py --self-test >/dev/nul
201
385
  else
202
386
  bad "collect-codex-findings.py self-test failed"
203
387
  fi
388
+ if ! grep -Fq 'def reject_json_constant' config/skills/_shared/collect-codex-findings.py \
389
+ || ! grep -Fq 'loads_strict_json(raw)' config/skills/_shared/collect-codex-findings.py \
390
+ || ! grep -Fq 'NaN Codex stdout finding must not normalize' config/skills/_shared/collect-codex-findings.py; then
391
+ bad "collect-codex-findings.py must reject non-standard JSON constants in pair-JUDGE stdout"
392
+ fi
393
+
394
+ section "Check 6c1: Archive preserves pair/risk-probe artifacts safely"
395
+ if python3 config/skills/_shared/archive_run.py --self-test >/dev/null 2>&1; then
396
+ ok "archive_run.py self-test passed"
397
+ else
398
+ bad "archive_run.py self-test failed"
399
+ fi
400
+ if ! grep -Fq 'SAFE_RUN_ID_RE' config/skills/_shared/archive_run.py \
401
+ || ! grep -Fq 'run_id must match [A-Za-z0-9_.-]+' config/skills/_shared/archive_run.py \
402
+ || ! grep -Fq 'Archive devlyn:resolve run artifacts' config/skills/_shared/archive_run.py \
403
+ || grep -Fq 'Archive auto-resolve run artifacts' config/skills/_shared/archive_run.py \
404
+ || ! grep -Fq 'invalid JSON numeric constant: NaN' config/skills/_shared/archive_run.py \
405
+ || ! grep -Fq '"verify.pair.findings.jsonl"' config/skills/_shared/archive_run.py \
406
+ || ! grep -Fq '"verify-merge.summary.json"' config/skills/_shared/archive_run.py \
407
+ || ! grep -Fq '"codex-judge.*"' config/skills/_shared/archive_run.py; then
408
+ bad "archive_run.py must safely archive pair/risk-probe evidence and reject unsafe run ids"
409
+ fi
204
410
 
205
411
  section "Check 6d: Spec verification executes hidden-blind risk probes"
206
412
  if python3 config/skills/_shared/spec-verify-check.py --self-test >/dev/null 2>&1; then
207
- ok "spec-verify-check.py risk-probe self-test passed"
413
+ ok "spec-verify-check.py risk-probe and expected-contract self-test passed"
414
+ else
415
+ bad "spec-verify-check.py risk-probe / expected-contract self-test failed"
416
+ fi
417
+ if ! grep -Fq 'def validate_present_spec_complexity' config/skills/_shared/spec-verify-check.py \
418
+ || ! grep -Fq 'SPEC_COMPLEXITY_VALUES = {"trivial", "medium", "high", "large"}' config/skills/_shared/spec-verify-check.py \
419
+ || ! grep -Fq 'frontmatter complexity must be one of' config/skills/_shared/spec-verify-check.py \
420
+ || ! grep -Fq 'def validate_sibling_spec_complexity' config/skills/_shared/spec-verify-check.py \
421
+ || ! grep -Fq 'unsupported spec complexity was accepted' config/skills/_shared/spec-verify-check.py \
422
+ || ! grep -Fq 'unsupported sibling spec complexity was accepted by --check-expected' config/skills/_shared/spec-verify-check.py; then
423
+ bad "spec-verify-check.py checks must reject unsupported spec complexity values"
424
+ else
425
+ ok "spec-verify-check.py checks reject unsupported spec complexity values"
426
+ fi
427
+ if ! grep -Fq 'generated criteria carrier was not staged into .devlyn/spec-verify.json' config/skills/_shared/spec-verify-check.py \
428
+ || ! grep -Fq 'spec source with mismatched source.spec_sha256 was accepted' config/skills/_shared/spec-verify-check.py \
429
+ || ! grep -Fq 'spec source with matching source.spec_sha256 was not staged' config/skills/_shared/spec-verify-check.py \
430
+ || ! grep -Fq 'source.spec_sha256 mismatch' config/skills/_shared/spec-verify-check.py \
431
+ || ! grep -Fq 'generated criteria without a JSON carrier was accepted' config/skills/_shared/spec-verify-check.py \
432
+ || ! grep -Fq 'generated criteria without source.criteria_sha256 was accepted' config/skills/_shared/spec-verify-check.py \
433
+ || ! grep -Fq 'generated criteria with mismatched source.criteria_sha256 was accepted' config/skills/_shared/spec-verify-check.py \
434
+ || ! grep -Fq 'def source_integrity_error' config/skills/_shared/spec-verify-check.py \
435
+ || ! grep -Fq 'source.criteria_sha256 mismatch' config/skills/_shared/spec-verify-check.py \
436
+ || ! grep -Fq 'Generated criteria were written without one' config/skills/_shared/spec-verify-check.py \
437
+ || ! grep -Fq 'generated criteria carrier was not staged into .devlyn/spec-verify.json' .claude/skills/_shared/spec-verify-check.py \
438
+ || ! grep -Fq 'spec source with mismatched source.spec_sha256 was accepted' .claude/skills/_shared/spec-verify-check.py \
439
+ || ! grep -Fq 'spec source with matching source.spec_sha256 was not staged' .claude/skills/_shared/spec-verify-check.py \
440
+ || ! grep -Fq 'source.spec_sha256 mismatch' .claude/skills/_shared/spec-verify-check.py \
441
+ || ! grep -Fq 'generated criteria without a JSON carrier was accepted' .claude/skills/_shared/spec-verify-check.py \
442
+ || ! grep -Fq 'generated criteria without source.criteria_sha256 was accepted' .claude/skills/_shared/spec-verify-check.py \
443
+ || ! grep -Fq 'generated criteria with mismatched source.criteria_sha256 was accepted' .claude/skills/_shared/spec-verify-check.py \
444
+ || ! grep -Fq 'def source_integrity_error' .claude/skills/_shared/spec-verify-check.py \
445
+ || ! grep -Fq 'source.criteria_sha256 mismatch' .claude/skills/_shared/spec-verify-check.py \
446
+ || ! grep -Fq 'Generated criteria were written without one' .claude/skills/_shared/spec-verify-check.py \
447
+ || ! grep -Fq 'generated criteria carrier was not staged into .devlyn/spec-verify.json' .agents/skills/_shared/spec-verify-check.py \
448
+ || ! grep -Fq 'spec source with mismatched source.spec_sha256 was accepted' .agents/skills/_shared/spec-verify-check.py \
449
+ || ! grep -Fq 'spec source with matching source.spec_sha256 was not staged' .agents/skills/_shared/spec-verify-check.py \
450
+ || ! grep -Fq 'source.spec_sha256 mismatch' .agents/skills/_shared/spec-verify-check.py \
451
+ || ! grep -Fq 'generated criteria without a JSON carrier was accepted' .agents/skills/_shared/spec-verify-check.py \
452
+ || ! grep -Fq 'generated criteria without source.criteria_sha256 was accepted' .agents/skills/_shared/spec-verify-check.py \
453
+ || ! grep -Fq 'generated criteria with mismatched source.criteria_sha256 was accepted' .agents/skills/_shared/spec-verify-check.py \
454
+ || ! grep -Fq 'def source_integrity_error' .agents/skills/_shared/spec-verify-check.py \
455
+ || ! grep -Fq 'source.criteria_sha256 mismatch' .agents/skills/_shared/spec-verify-check.py \
456
+ || ! grep -Fq 'Generated criteria were written without one' .agents/skills/_shared/spec-verify-check.py \
457
+ || ! grep -Fq '"criteria_sha256": generated_hash' .agents/skills/_shared/spec-verify-check.py; then
458
+ bad "spec-verify-check.py self-test must cover generated criteria source extraction"
459
+ else
460
+ ok "spec-verify-check.py covers generated criteria source extraction"
461
+ fi
462
+ if ! grep -Fq 'def validate_present_solo_headroom_hypothesis' config/skills/_shared/spec-verify-check.py \
463
+ || ! grep -Fq 'def state_requires_risk_probes' config/skills/_shared/spec-verify-check.py \
464
+ || ! grep -Fq 'def risk_probes_state_error' config/skills/_shared/spec-verify-check.py \
465
+ || ! grep -Fq -- '--include-risk-probes accepted missing required risk-probes.jsonl' config/skills/_shared/spec-verify-check.py \
466
+ || ! grep -Fq -- '--include-risk-probes accepted non-boolean risk_probes_enabled' config/skills/_shared/spec-verify-check.py \
467
+ || ! grep -Fq -- '--include-risk-probes accepted non-object risk_profile' config/skills/_shared/spec-verify-check.py \
468
+ || ! grep -Fq 'def validate_risk_probes_cover_solo_headroom_hypothesis' config/skills/_shared/spec-verify-check.py \
469
+ || ! grep -Fq 'def has_backticked_observable_miss_command' config/skills/_shared/spec-verify-check.py \
470
+ || ! grep -Fq 'backticked command/observable line that exposes the miss' config/skills/_shared/spec-verify-check.py \
471
+ || ! grep -Fq 'weak solo-headroom hypothesis was accepted by --check' config/skills/_shared/spec-verify-check.py \
472
+ || ! grep -Fq 'descriptive backtick solo-headroom hypothesis was accepted by --check' config/skills/_shared/spec-verify-check.py \
473
+ || ! grep -Fq 'risk probe missing solo-headroom command coverage was accepted' config/skills/_shared/spec-verify-check.py \
474
+ || ! grep -Fq 'risk probe with unrelated solo-headroom derived_from was accepted' config/skills/_shared/spec-verify-check.py \
475
+ || ! grep -Fq 'risk-probes[0].derived_from must reference the solo-headroom hypothesis bullet' config/skills/_shared/spec-verify-check.py \
476
+ || ! grep -Fq 'solo-headroom command in a later risk probe was accepted' config/skills/_shared/spec-verify-check.py \
477
+ || ! grep -Fq 'solo-headroom command prefix match was accepted' config/skills/_shared/spec-verify-check.py \
478
+ || ! grep -Fq '(?<![A-Za-z0-9_.:/=-])' config/skills/_shared/spec-verify-check.py \
479
+ || ! grep -Fq 'risk-probes[0].cmd must contain a solo-headroom hypothesis observable command' config/skills/_shared/spec-verify-check.py \
480
+ || ! grep -Fq 'weak sibling solo-headroom hypothesis was accepted by --check-expected' config/skills/_shared/spec-verify-check.py \
481
+ || ! grep -Fq 'docs-style solo-headroom hypothesis was rejected by --check' config/skills/_shared/spec-verify-check.py \
482
+ || ! grep -Fq 'docs-style sibling solo-headroom command was rejected by --check-expected' config/skills/_shared/spec-verify-check.py \
483
+ || ! grep -Fq 'def validate_present_solo_headroom_hypothesis' .claude/skills/_shared/spec-verify-check.py \
484
+ || ! grep -Fq 'def state_requires_risk_probes' .claude/skills/_shared/spec-verify-check.py \
485
+ || ! grep -Fq 'def risk_probes_state_error' .claude/skills/_shared/spec-verify-check.py \
486
+ || ! grep -Fq -- '--include-risk-probes accepted missing required risk-probes.jsonl' .claude/skills/_shared/spec-verify-check.py \
487
+ || ! grep -Fq -- '--include-risk-probes accepted non-boolean risk_probes_enabled' .claude/skills/_shared/spec-verify-check.py \
488
+ || ! grep -Fq -- '--include-risk-probes accepted non-object risk_profile' .claude/skills/_shared/spec-verify-check.py \
489
+ || ! grep -Fq 'def validate_risk_probes_cover_solo_headroom_hypothesis' .claude/skills/_shared/spec-verify-check.py \
490
+ || ! grep -Fq 'def has_backticked_observable_miss_command' .claude/skills/_shared/spec-verify-check.py \
491
+ || ! grep -Fq 'backticked command/observable line that exposes the miss' .claude/skills/_shared/spec-verify-check.py \
492
+ || ! grep -Fq 'weak solo-headroom hypothesis was accepted by --check' .claude/skills/_shared/spec-verify-check.py \
493
+ || ! grep -Fq 'descriptive backtick solo-headroom hypothesis was accepted by --check' .claude/skills/_shared/spec-verify-check.py \
494
+ || ! grep -Fq 'risk probe missing solo-headroom command coverage was accepted' .claude/skills/_shared/spec-verify-check.py \
495
+ || ! grep -Fq 'risk probe with unrelated solo-headroom derived_from was accepted' .claude/skills/_shared/spec-verify-check.py \
496
+ || ! grep -Fq 'risk-probes[0].derived_from must reference the solo-headroom hypothesis bullet' .claude/skills/_shared/spec-verify-check.py \
497
+ || ! grep -Fq 'solo-headroom command in a later risk probe was accepted' .claude/skills/_shared/spec-verify-check.py \
498
+ || ! grep -Fq 'solo-headroom command prefix match was accepted' .claude/skills/_shared/spec-verify-check.py \
499
+ || ! grep -Fq '(?<![A-Za-z0-9_.:/=-])' .claude/skills/_shared/spec-verify-check.py \
500
+ || ! grep -Fq 'risk-probes[0].cmd must contain a solo-headroom hypothesis observable command' .claude/skills/_shared/spec-verify-check.py \
501
+ || ! grep -Fq 'weak sibling solo-headroom hypothesis was accepted by --check-expected' .claude/skills/_shared/spec-verify-check.py \
502
+ || ! grep -Fq 'docs-style solo-headroom hypothesis was rejected by --check' .claude/skills/_shared/spec-verify-check.py \
503
+ || ! grep -Fq 'docs-style sibling solo-headroom command was rejected by --check-expected' .claude/skills/_shared/spec-verify-check.py \
504
+ || ! grep -Fq 'def validate_present_solo_headroom_hypothesis' .agents/skills/_shared/spec-verify-check.py \
505
+ || ! grep -Fq 'def state_requires_risk_probes' .agents/skills/_shared/spec-verify-check.py \
506
+ || ! grep -Fq 'def risk_probes_state_error' .agents/skills/_shared/spec-verify-check.py \
507
+ || ! grep -Fq -- '--include-risk-probes accepted missing required risk-probes.jsonl' .agents/skills/_shared/spec-verify-check.py \
508
+ || ! grep -Fq -- '--include-risk-probes accepted non-boolean risk_probes_enabled' .agents/skills/_shared/spec-verify-check.py \
509
+ || ! grep -Fq -- '--include-risk-probes accepted non-object risk_profile' .agents/skills/_shared/spec-verify-check.py \
510
+ || ! grep -Fq 'def validate_risk_probes_cover_solo_headroom_hypothesis' .agents/skills/_shared/spec-verify-check.py \
511
+ || ! grep -Fq 'def has_backticked_observable_miss_command' .agents/skills/_shared/spec-verify-check.py \
512
+ || ! grep -Fq 'backticked command/observable line that exposes the miss' .agents/skills/_shared/spec-verify-check.py \
513
+ || ! grep -Fq 'weak solo-headroom hypothesis was accepted by --check' .agents/skills/_shared/spec-verify-check.py \
514
+ || ! grep -Fq 'descriptive backtick solo-headroom hypothesis was accepted by --check' .agents/skills/_shared/spec-verify-check.py \
515
+ || ! grep -Fq 'risk probe missing solo-headroom command coverage was accepted' .agents/skills/_shared/spec-verify-check.py \
516
+ || ! grep -Fq 'risk probe with unrelated solo-headroom derived_from was accepted' .agents/skills/_shared/spec-verify-check.py \
517
+ || ! grep -Fq 'risk-probes[0].derived_from must reference the solo-headroom hypothesis bullet' .agents/skills/_shared/spec-verify-check.py \
518
+ || ! grep -Fq 'solo-headroom command in a later risk probe was accepted' .agents/skills/_shared/spec-verify-check.py \
519
+ || ! grep -Fq 'solo-headroom command prefix match was accepted' .agents/skills/_shared/spec-verify-check.py \
520
+ || ! grep -Fq '(?<![A-Za-z0-9_.:/=-])' .agents/skills/_shared/spec-verify-check.py \
521
+ || ! grep -Fq 'risk-probes[0].cmd must contain a solo-headroom hypothesis observable command' .agents/skills/_shared/spec-verify-check.py \
522
+ || ! grep -Fq 'weak sibling solo-headroom hypothesis was accepted by --check-expected' .agents/skills/_shared/spec-verify-check.py \
523
+ || ! grep -Fq 'docs-style solo-headroom hypothesis was rejected by --check' .agents/skills/_shared/spec-verify-check.py \
524
+ || ! grep -Fq 'docs-style sibling solo-headroom command was rejected by --check-expected' .agents/skills/_shared/spec-verify-check.py; then
525
+ bad "spec-verify-check.py --check and --check-expected must reject weak solo-headroom hypotheses"
526
+ else
527
+ ok "spec-verify-check.py rejects weak solo-headroom hypotheses"
528
+ fi
529
+ if ! grep -Fq 'requires `.devlyn/risk-probes.jsonl`' config/skills/devlyn:resolve/SKILL.md \
530
+ || ! grep -Fq 'requires `.devlyn/risk-probes.jsonl`' .claude/skills/devlyn:resolve/SKILL.md \
531
+ || ! grep -Fq 'requires `.devlyn/risk-probes.jsonl`' .agents/skills/devlyn:resolve/SKILL.md \
532
+ || ! grep -Fq 'missing `.devlyn/risk-probes.jsonl` is a CRITICAL mechanical blocker' config/skills/devlyn:resolve/SKILL.md \
533
+ || ! grep -Fq 'missing `.devlyn/risk-probes.jsonl` is a CRITICAL mechanical blocker' .claude/skills/devlyn:resolve/SKILL.md \
534
+ || ! grep -Fq 'missing `.devlyn/risk-probes.jsonl` is a CRITICAL mechanical blocker' .agents/skills/devlyn:resolve/SKILL.md \
535
+ || ! grep -Fq 'requires that file when `state.risk_profile.risk_probes_enabled == true`' config/skills/devlyn:resolve/references/phases/build-gate.md \
536
+ || ! grep -Fq 'requires that file when `state.risk_profile.risk_probes_enabled == true`' .claude/skills/devlyn:resolve/references/phases/build-gate.md \
537
+ || ! grep -Fq 'requires that file when `state.risk_profile.risk_probes_enabled == true`' .agents/skills/devlyn:resolve/references/phases/build-gate.md \
538
+ || ! grep -Fq 'Malformed `state.risk_profile` is also CRITICAL because it can hide enabled risk probes' config/skills/devlyn:resolve/references/phases/build-gate.md \
539
+ || ! grep -Fq 'Malformed `state.risk_profile` is also CRITICAL because it can hide enabled risk probes' .claude/skills/devlyn:resolve/references/phases/build-gate.md \
540
+ || ! grep -Fq 'Malformed `state.risk_profile` is also CRITICAL because it can hide enabled risk probes' .agents/skills/devlyn:resolve/references/phases/build-gate.md \
541
+ || ! grep -Fq 'When `state.risk_profile.risk_probes_enabled == true`, missing `.devlyn/risk-probes.jsonl` is also CRITICAL' config/skills/devlyn:resolve/references/phases/verify.md \
542
+ || ! grep -Fq 'When `state.risk_profile.risk_probes_enabled == true`, missing `.devlyn/risk-probes.jsonl` is also CRITICAL' .claude/skills/devlyn:resolve/references/phases/verify.md \
543
+ || ! grep -Fq 'When `state.risk_profile.risk_probes_enabled == true`, missing `.devlyn/risk-probes.jsonl` is also CRITICAL' .agents/skills/devlyn:resolve/references/phases/verify.md; then
544
+ bad "BUILD_GATE and VERIFY must fail closed when enabled risk probes are missing"
545
+ else
546
+ ok "BUILD_GATE and VERIFY require enabled risk probes"
547
+ fi
548
+ if grep -Fq 'or any(char.isspace() for char in stripped)' config/skills/_shared/spec-verify-check.py \
549
+ || grep -Fq 'or any(char.isspace() for char in stripped)' .claude/skills/_shared/spec-verify-check.py \
550
+ || grep -Fq 'or any(char.isspace() for char in stripped)' .agents/skills/_shared/spec-verify-check.py \
551
+ || grep -Fq 'or any(char.isspace() for char in stripped)' config/skills/_shared/verify-merge-findings.py \
552
+ || grep -Fq 'or any(char.isspace() for char in stripped)' .claude/skills/_shared/verify-merge-findings.py \
553
+ || grep -Fq 'or any(char.isspace() for char in stripped)' .agents/skills/_shared/verify-merge-findings.py \
554
+ || grep -Fq 'or any(char.isspace() for char in stripped)' benchmark/auto-resolve/scripts/pair_evidence_contract.py; then
555
+ bad "solo-headroom command detection must not treat descriptive whitespace as a command"
556
+ else
557
+ ok "solo-headroom command detection rejects descriptive whitespace"
558
+ fi
559
+ if ! grep -Fq '"printf",' config/skills/_shared/spec-verify-check.py \
560
+ || ! grep -Fq '"printf",' .claude/skills/_shared/spec-verify-check.py \
561
+ || ! grep -Fq '"printf",' .agents/skills/_shared/spec-verify-check.py \
562
+ || ! grep -Fq '"printf",' config/skills/_shared/verify-merge-findings.py \
563
+ || ! grep -Fq '"printf",' .claude/skills/_shared/verify-merge-findings.py \
564
+ || ! grep -Fq '"printf",' .agents/skills/_shared/verify-merge-findings.py \
565
+ || ! grep -Fq '"printf",' benchmark/auto-resolve/scripts/pair_evidence_contract.py; then
566
+ bad "solo-headroom command detection must keep explicit printf command support"
567
+ else
568
+ ok "solo-headroom command detection keeps explicit printf command support"
569
+ fi
570
+ if python3 - <<'PY'
571
+ import ast
572
+ import pathlib
573
+ import sys
574
+
575
+ files = [
576
+ pathlib.Path("benchmark/auto-resolve/scripts/pair_evidence_contract.py"),
577
+ pathlib.Path("config/skills/_shared/spec-verify-check.py"),
578
+ pathlib.Path(".claude/skills/_shared/spec-verify-check.py"),
579
+ pathlib.Path(".agents/skills/_shared/spec-verify-check.py"),
580
+ pathlib.Path("config/skills/_shared/verify-merge-findings.py"),
581
+ pathlib.Path(".claude/skills/_shared/verify-merge-findings.py"),
582
+ pathlib.Path(".agents/skills/_shared/verify-merge-findings.py"),
583
+ ]
584
+ names = [
585
+ "COMMAND_PREFIXES",
586
+ "RESERVED_BACKTICK_TERMS",
587
+ "OBSERVABLE_COMMAND_MARKERS",
588
+ ]
589
+
590
+ def extract(path: pathlib.Path, name: str) -> tuple[str, ...]:
591
+ tree = ast.parse(path.read_text(encoding="utf-8"))
592
+ for node in tree.body:
593
+ if isinstance(node, ast.Assign):
594
+ if any(isinstance(target, ast.Name) and target.id == name for target in node.targets):
595
+ value = ast.literal_eval(node.value)
596
+ return tuple(sorted(value))
597
+ raise AssertionError(f"{path}: missing {name}")
598
+
599
+ baseline = {name: extract(files[0], name) for name in names}
600
+ for path in files[1:]:
601
+ for name in names:
602
+ value = extract(path, name)
603
+ if value != baseline[name]:
604
+ print(f"{path}: {name} drifted from {files[0]}", file=sys.stderr)
605
+ print(f"expected={baseline[name]!r}", file=sys.stderr)
606
+ print(f"actual={value!r}", file=sys.stderr)
607
+ sys.exit(1)
608
+ PY
609
+ then
610
+ ok "solo-headroom command detection constants stay in parity"
611
+ else
612
+ bad "solo-headroom command detection constants must stay in parity"
613
+ fi
614
+ if ! grep -Fq 'required.add("rollback_state")' config/skills/_shared/spec-verify-check.py \
615
+ || ! grep -Fq 'rollback verification text did not require rollback_state probe tag' config/skills/_shared/spec-verify-check.py; then
616
+ bad "spec-verify-check.py must require rollback_state risk probes for rollback/all-or-nothing verification text"
617
+ fi
618
+ if ! grep -Fq 'spec.expected.json top-level array produced a traceback' config/skills/_shared/spec-verify-check.py \
619
+ || ! grep -Fq 'invalid spec.expected.json produced a traceback' config/skills/_shared/spec-verify-check.py \
620
+ || ! grep -Fq 'NaN spec.expected.json did not report invalid numeric constant' config/skills/_shared/spec-verify-check.py \
621
+ || ! grep -Fq 'NaN risk-probes JSONL did not report invalid numeric constant' config/skills/_shared/spec-verify-check.py \
622
+ || ! grep -Fq 'def reject_json_constant' config/skills/_shared/spec-verify-check.py \
623
+ || ! grep -Fq 'loads_strict_json(line)' config/skills/_shared/spec-verify-check.py \
624
+ || ! grep -Fq 'loads_strict_json(expected_path.read_text())' config/skills/_shared/spec-verify-check.py \
625
+ || ! grep -Fq 'top-level must be a JSON object' config/skills/_shared/spec-verify-check.py \
626
+ || ! grep -Fq 'has invalid JSON' config/skills/_shared/spec-verify-check.py; then
627
+ bad "spec-verify-check.py self-test must fail malformed spec.expected.json cleanly without traceback"
628
+ fi
629
+ if ! grep -Fq 'required.add("error_contract")' config/skills/_shared/spec-verify-check.py \
630
+ || ! grep -Fq 'error_contract without exit-code evidence was accepted' config/skills/_shared/spec-verify-check.py; then
631
+ bad "spec-verify-check.py must require error_contract risk probes for invalid/stderr/JSON-error/exit-2 verification text"
632
+ fi
633
+ if ! grep -Fq '"asserts_named_stream_output"' config/skills/_shared/spec-verify-check.py \
634
+ || ! grep -Fq '"asserts_error_payload_or_stderr"' config/skills/_shared/spec-verify-check.py \
635
+ || ! grep -Fq '"asserts_nonzero_or_exit_2"' config/skills/_shared/spec-verify-check.py \
636
+ || ! grep -Fq 'error_contract without exit-code evidence was accepted' config/skills/_shared/spec-verify-check.py \
637
+ || ! grep -Fq 'stdout_stderr_contract without stream evidence was accepted' config/skills/_shared/spec-verify-check.py \
638
+ || ! grep -Fq '`stdout_stderr_contract`: `asserts_named_stream_output`' config/skills/devlyn:resolve/references/phases/probe-derive.md \
639
+ || ! grep -Fq '`error_contract`: `asserts_error_payload_or_stderr`' config/skills/devlyn:resolve/references/phases/probe-derive.md \
640
+ || ! grep -Fq '`asserts_nonzero_or_exit_2`' config/skills/devlyn:resolve/SKILL.md; then
641
+ bad "risk-probe error/stdout-stderr tags must require concrete tag_evidence markers in validator and prompt contract"
642
+ fi
643
+ if ! grep -Fq '"http_error_contract"' config/skills/_shared/spec-verify-check.py \
644
+ || ! grep -Fq 'asserts_http_error_status' config/skills/_shared/spec-verify-check.py \
645
+ || ! grep -Fq 'http error text did not require http_error_contract tag' config/skills/_shared/spec-verify-check.py \
646
+ || ! grep -Fq 'http_error_contract without payload evidence was accepted' config/skills/_shared/spec-verify-check.py \
647
+ || ! grep -Fq 'exact error body shape_contract without exact object evidence was accepted' config/skills/_shared/spec-verify-check.py \
648
+ || ! grep -Fq 'exact error body shape_contract with exact object evidence was rejected' config/skills/_shared/spec-verify-check.py \
649
+ || ! grep -Fq '`http_error_contract`: `asserts_http_error_status`' config/skills/devlyn:resolve/references/phases/probe-derive.md \
650
+ || ! grep -Fq '`http_error_contract` must include `asserts_http_error_status`' config/skills/devlyn:resolve/SKILL.md; then
651
+ bad "risk-probe HTTP error contracts must require concrete status and payload markers"
652
+ fi
653
+ if ! grep -Fq '"uses_visible_input_key_names"' config/skills/_shared/spec-verify-check.py \
654
+ || ! grep -Fq '"asserts_visible_output_key_names"' config/skills/_shared/spec-verify-check.py \
655
+ || ! grep -Fq '"asserts_no_unexpected_output_keys"' config/skills/_shared/spec-verify-check.py \
656
+ || ! grep -Fq 'JSON error object text did not require shape_contract tag' config/skills/_shared/spec-verify-check.py \
657
+ || ! grep -Fq 'JSON error object shape_contract with exact object evidence was rejected' config/skills/_shared/spec-verify-check.py \
658
+ || ! grep -Fq 'INLINE_JSON_OBJECT_RE' config/skills/_shared/spec-verify-check.py \
659
+ || ! grep -Fq 'inline JSON object text did not require shape_contract tag' config/skills/_shared/spec-verify-check.py \
660
+ || ! grep -Fq 'inline JSON object shape_contract with key evidence was rejected' config/skills/_shared/spec-verify-check.py \
661
+ || ! grep -Fq 'inline JSON error text did not require shape_contract tag' config/skills/_shared/spec-verify-check.py \
662
+ || ! grep -Fq 'inline JSON error shape_contract with exact object evidence was rejected' config/skills/_shared/spec-verify-check.py \
663
+ || ! grep -Fq 'shape_contract without exact key evidence was accepted' config/skills/_shared/spec-verify-check.py \
664
+ || ! grep -Fq '`shape_contract` when the visible text names exact keys' config/skills/devlyn:resolve/references/phases/probe-derive.md \
665
+ || ! grep -Fq '`shape_contract` must' config/skills/devlyn:resolve/SKILL.md; then
666
+ bad "risk-probe shape contracts must require exact visible input/output key evidence when visible text names shape"
667
+ fi
668
+ if ! grep -Fq "forbidden[ -]+window" config/skills/_shared/spec-verify-check.py \
669
+ || grep -Fq "r'blocked|overlap|forbidden|window'" config/skills/_shared/spec-verify-check.py \
670
+ || ! grep -Fq 'generic forbidden-pattern verification text incorrectly required boundary_overlap' config/skills/_shared/spec-verify-check.py; then
671
+ bad "risk-probe boundary_overlap must trigger for forbidden windows / blocked overlap, not generic forbidden pattern text"
672
+ fi
673
+ if ! grep -Fq '(?:stock|inventory|balance|availability).{0,80}(?:later|remaining|after failures)' config/skills/_shared/spec-verify-check.py \
674
+ || ! grep -Fq 'stock validation error text incorrectly required prior_consumption' config/skills/_shared/spec-verify-check.py; then
675
+ bad "risk-probe prior_consumption must trigger on later/remaining state consumption, not plain stock validation errors"
676
+ fi
677
+ if ! grep -Fq '"auth_signature_contract"' config/skills/_shared/spec-verify-check.py \
678
+ || ! grep -Fq '"idempotency_replay"' config/skills/_shared/spec-verify-check.py \
679
+ || ! grep -Fq 'asserts_signature_over_exact_bytes' config/skills/_shared/spec-verify-check.py \
680
+ || ! grep -Fq 'duplicate_id_rejected_regardless_of_body' config/skills/_shared/spec-verify-check.py \
681
+ || ! grep -Fq 'webhook signature/replay text did not require auth/idempotency probe tags' config/skills/_shared/spec-verify-check.py \
682
+ || ! grep -Fq '`auth_signature_contract`: `asserts_signature_over_exact_bytes`' config/skills/devlyn:resolve/references/phases/probe-derive.md \
683
+ || ! grep -Fq '`idempotency_replay`: `first_delivery_then_duplicate`' config/skills/devlyn:resolve/references/phases/probe-derive.md \
684
+ || ! grep -Fq '`auth_signature_contract` must include `asserts_signature_over_exact_bytes`' config/skills/devlyn:resolve/SKILL.md; then
685
+ bad "risk-probe webhook/signature/replay contracts must require concrete auth_signature_contract and idempotency_replay tags"
686
+ fi
687
+ if ! grep -Fq 'signing|signed' config/skills/_shared/spec-verify-check.py \
688
+ || ! grep -Fq 'same.{0,40}`?id`?' config/skills/_shared/spec-verify-check.py; then
689
+ bad "risk-probe webhook signature/replay trigger must catch signing/signed and same accepted id wording"
690
+ fi
691
+ if ! grep -Fq 'duplicate[ -]+(?:delivery|event|id)' config/skills/_shared/spec-verify-check.py \
692
+ || ! grep -Fq 'duplicate SKU verification text incorrectly required idempotency_replay' config/skills/_shared/spec-verify-check.py; then
693
+ bad "risk-probe idempotency_replay must trigger on duplicate delivery/event/id, not duplicate SKU aggregation"
694
+ fi
695
+ if ! grep -Fq '"concurrent_state_consistency"' config/skills/_shared/spec-verify-check.py \
696
+ || ! grep -Fq 'overlapping_mutations_exercised' config/skills/_shared/spec-verify-check.py \
697
+ || ! grep -Fq 'concurrent state text did not require concurrent_state_consistency tag' config/skills/_shared/spec-verify-check.py \
698
+ || ! grep -Fq '`concurrent_state_consistency`: `overlapping_mutations_exercised`' config/skills/devlyn:resolve/references/phases/probe-derive.md \
699
+ || ! grep -Fq '`concurrent_state_consistency` must' config/skills/devlyn:resolve/SKILL.md; then
700
+ bad "risk-probe concurrent state contracts must require concrete concurrent_state_consistency markers"
701
+ fi
702
+ if ! grep -Fq '"atomic_batch_state"' config/skills/_shared/spec-verify-check.py \
703
+ || ! grep -Fq 'mixed_valid_invalid_batch' config/skills/_shared/spec-verify-check.py \
704
+ || ! grep -Fq 'atomic batch text did not require atomic_batch_state tag' config/skills/_shared/spec-verify-check.py \
705
+ || ! grep -Fq 'atomic_batch_state without success-order evidence was accepted' config/skills/_shared/spec-verify-check.py \
706
+ || ! grep -Fq '`atomic_batch_state`: `mixed_valid_invalid_batch`' config/skills/devlyn:resolve/references/phases/probe-derive.md \
707
+ || ! grep -Fq '`atomic_batch_state` must include `mixed_valid_invalid_batch`' config/skills/devlyn:resolve/SKILL.md; then
708
+ bad "risk-probe atomic batch contracts must require concrete mixed-failure and success-order markers"
709
+ fi
710
+
711
+ section "Check 6f: ideate validates sibling spec.expected.json"
712
+ expected_check_missing=0
713
+ for file in \
714
+ config/skills/devlyn:ideate/SKILL.md \
715
+ config/skills/devlyn:ideate/references/elicitation.md \
716
+ config/skills/devlyn:ideate/references/from-spec-mode.md \
717
+ config/skills/devlyn:ideate/references/project-mode.md \
718
+ config/skills/devlyn:ideate/references/spec-template.md
719
+ do
720
+ if ! grep -Fq -- '--check-expected <expected-path>' "$file"; then
721
+ bad "$file — missing spec.expected.json mechanical validation command"
722
+ expected_check_missing=1
723
+ fi
724
+ done
725
+ if [ $expected_check_missing -eq 0 ]; then
726
+ ok "ideate docs require --check-expected for sibling expected contracts"
727
+ fi
728
+ if ! grep -Fq 'any present actionable solo-headroom hypothesis' config/skills/devlyn:ideate/SKILL.md \
729
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' .claude/skills/devlyn:ideate/SKILL.md \
730
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' .agents/skills/devlyn:ideate/SKILL.md \
731
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' config/skills/devlyn:ideate/references/elicitation.md \
732
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' .claude/skills/devlyn:ideate/references/elicitation.md \
733
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' .agents/skills/devlyn:ideate/references/elicitation.md \
734
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' config/skills/devlyn:ideate/references/project-mode.md \
735
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' .claude/skills/devlyn:ideate/references/project-mode.md \
736
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' .agents/skills/devlyn:ideate/references/project-mode.md \
737
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' config/skills/devlyn:resolve/SKILL.md \
738
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' .claude/skills/devlyn:resolve/SKILL.md \
739
+ || ! grep -Fq 'any present actionable solo-headroom hypothesis' .agents/skills/devlyn:resolve/SKILL.md \
740
+ || ! grep -Fq "legacy inline \`## Verification\` JSON carrier" config/skills/devlyn:ideate/SKILL.md \
741
+ || ! grep -Fq "legacy inline \`## Verification\` JSON carrier" .claude/skills/devlyn:ideate/SKILL.md \
742
+ || ! grep -Fq "legacy inline \`## Verification\` JSON carrier" .agents/skills/devlyn:ideate/SKILL.md \
743
+ || ! grep -Fq "legacy inline \`## Verification\` JSON carrier" config/skills/devlyn:ideate/references/elicitation.md \
744
+ || ! grep -Fq "legacy inline \`## Verification\` JSON carrier" .claude/skills/devlyn:ideate/references/elicitation.md \
745
+ || ! grep -Fq "legacy inline \`## Verification\` JSON carrier" .agents/skills/devlyn:ideate/references/elicitation.md \
746
+ || ! grep -Fq "legacy inline \`## Verification\` JSON carrier" config/skills/devlyn:ideate/references/from-spec-mode.md \
747
+ || ! grep -Fq "legacy inline \`## Verification\` JSON carrier" .claude/skills/devlyn:ideate/references/from-spec-mode.md \
748
+ || ! grep -Fq "legacy inline \`## Verification\` JSON carrier" .agents/skills/devlyn:ideate/references/from-spec-mode.md \
749
+ || ! grep -Fq "inline \`## Verification\` JSON carrier" config/skills/devlyn:resolve/SKILL.md \
750
+ || ! grep -Fq "inline \`## Verification\` JSON carrier" .claude/skills/devlyn:resolve/SKILL.md \
751
+ || ! grep -Fq "inline \`## Verification\` JSON carrier" .agents/skills/devlyn:resolve/SKILL.md \
752
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' config/skills/devlyn:ideate/SKILL.md \
753
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .claude/skills/devlyn:ideate/SKILL.md \
754
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .agents/skills/devlyn:ideate/SKILL.md \
755
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' config/skills/devlyn:ideate/references/elicitation.md \
756
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .claude/skills/devlyn:ideate/references/elicitation.md \
757
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .agents/skills/devlyn:ideate/references/elicitation.md \
758
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' config/skills/devlyn:ideate/references/project-mode.md \
759
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .claude/skills/devlyn:ideate/references/project-mode.md \
760
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .agents/skills/devlyn:ideate/references/project-mode.md \
761
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' config/skills/devlyn:resolve/SKILL.md \
762
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .claude/skills/devlyn:resolve/SKILL.md \
763
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .agents/skills/devlyn:resolve/SKILL.md; then
764
+ bad "ideate/resolve docs must describe mechanical solo-headroom validation"
765
+ else
766
+ ok "ideate/resolve docs describe mechanical solo-headroom validation"
767
+ fi
768
+ if ! grep -Fq 'def validate_expected_against_sibling_spec' config/skills/_shared/spec-verify-check.py \
769
+ || ! grep -Fq 'empty verification_commands should fail for runtime specs' config/skills/_shared/spec-verify-check.py \
770
+ || ! grep -Fq 'empty verification_commands should be valid for pure-design specs' config/skills/_shared/spec-verify-check.py \
771
+ || ! grep -Fq 'backticked_observable_miss_commands(spec_text)' config/skills/_shared/spec-verify-check.py \
772
+ || ! grep -Fq 'observable command must match spec.expected.json' config/skills/_shared/spec-verify-check.py \
773
+ || ! grep -Fq 'observable command must match `## Verification` JSON carrier' config/skills/_shared/spec-verify-check.py \
774
+ || ! grep -Fq 'mismatched inline solo-headroom command was accepted by --check' config/skills/_shared/spec-verify-check.py \
775
+ || ! grep -Fq 'matched inline solo-headroom command was rejected by --check' config/skills/_shared/spec-verify-check.py \
776
+ || ! grep -Fq 'mismatched sibling solo-headroom command was accepted by --check-expected' config/skills/_shared/spec-verify-check.py \
777
+ || ! grep -Fq 'matched sibling solo-headroom command was rejected by --check-expected' config/skills/_shared/spec-verify-check.py \
778
+ || ! grep -Fq 'def validate_present_solo_ceiling_avoidance' config/skills/_shared/spec-verify-check.py \
779
+ || ! grep -Fq 'weak solo ceiling avoidance was accepted by --check' config/skills/_shared/spec-verify-check.py \
780
+ || ! grep -Fq 'actionable solo ceiling avoidance was rejected by --check' config/skills/_shared/spec-verify-check.py \
781
+ || ! grep -Fq 'weak sibling solo ceiling avoidance was accepted by --check-expected' config/skills/_shared/spec-verify-check.py \
782
+ || ! grep -Fq 'actionable sibling solo ceiling avoidance was rejected by --check-expected' config/skills/_shared/spec-verify-check.py \
783
+ || ! grep -Fq 'backticked_observable_miss_commands(spec_text)' .claude/skills/_shared/spec-verify-check.py \
784
+ || ! grep -Fq 'observable command must match spec.expected.json' .claude/skills/_shared/spec-verify-check.py \
785
+ || ! grep -Fq 'observable command must match `## Verification` JSON carrier' .claude/skills/_shared/spec-verify-check.py \
786
+ || ! grep -Fq 'mismatched inline solo-headroom command was accepted by --check' .claude/skills/_shared/spec-verify-check.py \
787
+ || ! grep -Fq 'matched inline solo-headroom command was rejected by --check' .claude/skills/_shared/spec-verify-check.py \
788
+ || ! grep -Fq 'mismatched sibling solo-headroom command was accepted by --check-expected' .claude/skills/_shared/spec-verify-check.py \
789
+ || ! grep -Fq 'matched sibling solo-headroom command was rejected by --check-expected' .claude/skills/_shared/spec-verify-check.py \
790
+ || ! grep -Fq 'def validate_present_solo_ceiling_avoidance' .claude/skills/_shared/spec-verify-check.py \
791
+ || ! grep -Fq 'weak solo ceiling avoidance was accepted by --check' .claude/skills/_shared/spec-verify-check.py \
792
+ || ! grep -Fq 'actionable solo ceiling avoidance was rejected by --check' .claude/skills/_shared/spec-verify-check.py \
793
+ || ! grep -Fq 'weak sibling solo ceiling avoidance was accepted by --check-expected' .claude/skills/_shared/spec-verify-check.py \
794
+ || ! grep -Fq 'actionable sibling solo ceiling avoidance was rejected by --check-expected' .claude/skills/_shared/spec-verify-check.py \
795
+ || ! grep -Fq 'backticked_observable_miss_commands(spec_text)' .agents/skills/_shared/spec-verify-check.py \
796
+ || ! grep -Fq 'observable command must match spec.expected.json' .agents/skills/_shared/spec-verify-check.py \
797
+ || ! grep -Fq 'observable command must match `## Verification` JSON carrier' .agents/skills/_shared/spec-verify-check.py \
798
+ || ! grep -Fq 'mismatched inline solo-headroom command was accepted by --check' .agents/skills/_shared/spec-verify-check.py \
799
+ || ! grep -Fq 'matched inline solo-headroom command was rejected by --check' .agents/skills/_shared/spec-verify-check.py \
800
+ || ! grep -Fq 'mismatched sibling solo-headroom command was accepted by --check-expected' .agents/skills/_shared/spec-verify-check.py \
801
+ || ! grep -Fq 'matched sibling solo-headroom command was rejected by --check-expected' .agents/skills/_shared/spec-verify-check.py \
802
+ || ! grep -Fq 'def validate_present_solo_ceiling_avoidance' .agents/skills/_shared/spec-verify-check.py \
803
+ || ! grep -Fq 'weak solo ceiling avoidance was accepted by --check' .agents/skills/_shared/spec-verify-check.py \
804
+ || ! grep -Fq 'actionable solo ceiling avoidance was rejected by --check' .agents/skills/_shared/spec-verify-check.py \
805
+ || ! grep -Fq 'weak sibling solo ceiling avoidance was accepted by --check-expected' .agents/skills/_shared/spec-verify-check.py \
806
+ || ! grep -Fq 'actionable sibling solo ceiling avoidance was rejected by --check-expected' .agents/skills/_shared/spec-verify-check.py; then
807
+ bad "spec-verify-check.py must reject empty expected runtime contracts, weak solo ceiling avoidance, and preserve pure-design escape"
808
+ fi
809
+ if ! grep -Fq 'Verification includes at least one compound scenario that exercises the interaction end-to-end' \
810
+ config/skills/devlyn:ideate/references/spec-template.md \
811
+ || ! grep -Fq 'Verification includes at least one compound scenario that exercises the interaction end-to-end' \
812
+ .claude/skills/devlyn:ideate/references/spec-template.md \
813
+ || ! grep -Fq 'Verification includes at least one compound scenario that exercises the interaction end-to-end' \
814
+ .agents/skills/devlyn:ideate/references/spec-template.md; then
815
+ bad "ideate spec template must require compound interaction verification for pair-relevant high-risk specs"
816
+ else
817
+ ok "ideate spec template requires compound interaction verification for pair-relevant specs"
818
+ fi
819
+ if ! grep -Fq 'ask for one concrete compound' config/skills/devlyn:ideate/references/elicitation.md \
820
+ || ! grep -Fq 'ask for one concrete compound' .claude/skills/devlyn:ideate/references/elicitation.md \
821
+ || ! grep -Fq 'ask for one concrete compound' .agents/skills/devlyn:ideate/references/elicitation.md; then
822
+ bad "ideate elicitation must ask for compound interaction scenarios when pair-relevant risks appear"
823
+ else
824
+ ok "ideate elicitation asks for compound interaction scenarios when pair-relevant risks appear"
825
+ fi
826
+ if ! grep -Fq 'solo-headroom hypothesis inside `## Verification`' config/skills/devlyn:ideate/references/spec-template.md \
827
+ || ! grep -Fq 'solo-headroom hypothesis inside `## Verification`' .claude/skills/devlyn:ideate/references/spec-template.md \
828
+ || ! grep -Fq 'solo-headroom hypothesis inside `## Verification`' .agents/skills/devlyn:ideate/references/spec-template.md \
829
+ || ! grep -Fq 'ask for one solo-headroom hypothesis' config/skills/devlyn:ideate/references/elicitation.md \
830
+ || ! grep -Fq 'ask for one solo-headroom hypothesis' .claude/skills/devlyn:ideate/references/elicitation.md \
831
+ || ! grep -Fq 'ask for one solo-headroom hypothesis' .agents/skills/devlyn:ideate/references/elicitation.md; then
832
+ bad "ideate must require a visible solo-headroom hypothesis for benchmark and pair-evidence specs"
833
+ else
834
+ ok "ideate requires solo-headroom hypothesis for benchmark and pair-evidence specs"
835
+ fi
836
+ if ! grep -Fq 'must literally contain `solo-headroom hypothesis`' config/skills/devlyn:ideate/references/spec-template.md \
837
+ || ! grep -Fq 'must literally contain `solo-headroom hypothesis`' .claude/skills/devlyn:ideate/references/spec-template.md \
838
+ || ! grep -Fq 'must literally contain `solo-headroom hypothesis`' .agents/skills/devlyn:ideate/references/spec-template.md \
839
+ || ! grep -Fq 'backticked line itself must contain `miss`' config/skills/devlyn:ideate/references/spec-template.md \
840
+ || ! grep -Fq 'backticked line itself must contain `miss`' .claude/skills/devlyn:ideate/references/spec-template.md \
841
+ || ! grep -Fq 'backticked line itself must contain `miss`' .agents/skills/devlyn:ideate/references/spec-template.md \
842
+ || ! grep -Fq '`solo_claude`, `miss`, and a backticked observable command' config/skills/devlyn:ideate/references/spec-template.md \
843
+ || ! grep -Fq '`solo_claude`, `miss`, and a backticked observable command' .claude/skills/devlyn:ideate/references/spec-template.md \
844
+ || ! grep -Fq '`solo_claude`, `miss`, and a backticked observable command' .agents/skills/devlyn:ideate/references/spec-template.md \
845
+ || ! grep -Fq 'command/observable' config/skills/devlyn:ideate/references/spec-template.md \
846
+ || ! grep -Fq 'command/observable' .claude/skills/devlyn:ideate/references/spec-template.md \
847
+ || ! grep -Fq 'command/observable' .agents/skills/devlyn:ideate/references/spec-template.md \
848
+ || ! grep -Fq 'must literally contain `solo-headroom hypothesis`' config/skills/devlyn:ideate/references/elicitation.md \
849
+ || ! grep -Fq 'must literally contain `solo-headroom hypothesis`' .claude/skills/devlyn:ideate/references/elicitation.md \
850
+ || ! grep -Fq 'must literally contain `solo-headroom hypothesis`' .agents/skills/devlyn:ideate/references/elicitation.md \
851
+ || ! grep -Fq 'line itself must contain `miss`' config/skills/devlyn:ideate/references/elicitation.md \
852
+ || ! grep -Fq 'line itself must contain `miss`' .claude/skills/devlyn:ideate/references/elicitation.md \
853
+ || ! grep -Fq 'line itself must contain `miss`' .agents/skills/devlyn:ideate/references/elicitation.md \
854
+ || ! grep -Fq 'Do not write a benchmark/risk-probe/pair-evidence spec until this' config/skills/devlyn:ideate/references/elicitation.md \
855
+ || ! grep -Fq 'Do not write a benchmark/risk-probe/pair-evidence spec until this' .claude/skills/devlyn:ideate/references/elicitation.md \
856
+ || ! grep -Fq 'Do not write a benchmark/risk-probe/pair-evidence spec until this' .agents/skills/devlyn:ideate/references/elicitation.md \
857
+ || ! grep -Fq 'spec not ready — solo-headroom hypothesis required' config/skills/devlyn:ideate/references/elicitation.md \
858
+ || ! grep -Fq 'spec not ready — solo-headroom hypothesis required' .claude/skills/devlyn:ideate/references/elicitation.md \
859
+ || ! grep -Fq 'spec not ready — solo-headroom hypothesis required' .agents/skills/devlyn:ideate/references/elicitation.md \
860
+ || ! grep -Fq '`solo_claude`, `miss`, and a backticked observable command' config/skills/devlyn:ideate/references/elicitation.md \
861
+ || ! grep -Fq '`solo_claude`, `miss`, and a backticked observable command' .claude/skills/devlyn:ideate/references/elicitation.md \
862
+ || ! grep -Fq '`solo_claude`, `miss`, and a backticked observable command' .agents/skills/devlyn:ideate/references/elicitation.md \
863
+ || ! grep -Fq 'command/observable' config/skills/devlyn:ideate/references/elicitation.md \
864
+ || ! grep -Fq 'command/observable' .claude/skills/devlyn:ideate/references/elicitation.md \
865
+ || ! grep -Fq 'command/observable' .agents/skills/devlyn:ideate/references/elicitation.md \
866
+ || ! grep -Fq 'Verification literally contains `solo-headroom hypothesis`, `solo_claude`' config/skills/devlyn:ideate/references/from-spec-mode.md \
867
+ || ! grep -Fq 'Verification literally contains `solo-headroom hypothesis`, `solo_claude`' .claude/skills/devlyn:ideate/references/from-spec-mode.md \
868
+ || ! grep -Fq 'Verification literally contains `solo-headroom hypothesis`, `solo_claude`' .agents/skills/devlyn:ideate/references/from-spec-mode.md \
869
+ || ! grep -Fq 'backticked line itself must contain `miss`' config/skills/devlyn:ideate/references/from-spec-mode.md \
870
+ || ! grep -Fq 'backticked line itself must contain `miss`' .claude/skills/devlyn:ideate/references/from-spec-mode.md \
871
+ || ! grep -Fq 'backticked line itself must contain `miss`' .agents/skills/devlyn:ideate/references/from-spec-mode.md \
872
+ || ! grep -Fq 'command/observable' config/skills/devlyn:ideate/references/from-spec-mode.md \
873
+ || ! grep -Fq 'command/observable' .claude/skills/devlyn:ideate/references/from-spec-mode.md \
874
+ || ! grep -Fq 'command/observable' .agents/skills/devlyn:ideate/references/from-spec-mode.md \
875
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' config/skills/devlyn:ideate/references/from-spec-mode.md \
876
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .claude/skills/devlyn:ideate/references/from-spec-mode.md \
877
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .agents/skills/devlyn:ideate/references/from-spec-mode.md \
878
+ || ! grep -Fq "source for VERIFY's canonical \`spec.solo_headroom_hypothesis\` trigger reason" config/skills/devlyn:ideate/references/spec-template.md \
879
+ || ! grep -Fq "source for VERIFY's canonical \`spec.solo_headroom_hypothesis\` trigger reason" .claude/skills/devlyn:ideate/references/spec-template.md \
880
+ || ! grep -Fq "source for VERIFY's canonical \`spec.solo_headroom_hypothesis\` trigger reason" .agents/skills/devlyn:ideate/references/spec-template.md \
881
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' config/skills/devlyn:ideate/references/spec-template.md \
882
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .claude/skills/devlyn:ideate/references/spec-template.md \
883
+ || ! grep -Fq 'spec.expected.json.verification_commands[].cmd' .agents/skills/devlyn:ideate/references/spec-template.md; then
884
+ bad "ideate solo-headroom hypothesis prompt must match the actionable checker contract"
885
+ else
886
+ ok "ideate solo-headroom hypothesis prompt matches checker contract"
887
+ fi
888
+ if ! grep -Fq 'quick mode must not infer a solo-headroom hypothesis' config/skills/devlyn:ideate/references/elicitation.md \
889
+ || ! grep -Fq 'quick mode must not infer a solo-headroom hypothesis' .claude/skills/devlyn:ideate/references/elicitation.md \
890
+ || ! grep -Fq 'quick mode must not infer a solo-headroom hypothesis' .agents/skills/devlyn:ideate/references/elicitation.md \
891
+ || ! grep -Fq 'do not infer a solo-headroom hypothesis' config/skills/devlyn:ideate/SKILL.md \
892
+ || ! grep -Fq 'do not infer a solo-headroom hypothesis' .claude/skills/devlyn:ideate/SKILL.md \
893
+ || ! grep -Fq 'do not infer a solo-headroom hypothesis' .agents/skills/devlyn:ideate/SKILL.md; then
894
+ bad "ideate quick mode must not invent solo-headroom hypotheses"
895
+ else
896
+ ok "ideate quick mode does not invent solo-headroom hypotheses"
897
+ fi
898
+ if ! grep -Fq 'solo ceiling avoidance' config/skills/devlyn:ideate/references/spec-template.md \
899
+ || ! grep -Fq 'solo ceiling avoidance' .claude/skills/devlyn:ideate/references/spec-template.md \
900
+ || ! grep -Fq 'solo ceiling avoidance' .agents/skills/devlyn:ideate/references/spec-template.md \
901
+ || ! grep -Fq 'rejected or solo-saturated controls such as `S2`-`S6`' config/skills/devlyn:ideate/references/spec-template.md \
902
+ || ! grep -Fq 'rejected or solo-saturated controls such as `S2`-`S6`' .claude/skills/devlyn:ideate/references/spec-template.md \
903
+ || ! grep -Fq 'rejected or solo-saturated controls such as `S2`-`S6`' .agents/skills/devlyn:ideate/references/spec-template.md \
904
+ || ! grep -Fq 'Solo ceiling avoidance' config/skills/devlyn:ideate/references/elicitation.md \
905
+ || ! grep -Fq 'Solo ceiling avoidance' .claude/skills/devlyn:ideate/references/elicitation.md \
906
+ || ! grep -Fq 'Solo ceiling avoidance' .agents/skills/devlyn:ideate/references/elicitation.md \
907
+ || ! grep -Fq 'spec not ready — solo ceiling avoidance required' config/skills/devlyn:ideate/references/elicitation.md \
908
+ || ! grep -Fq 'spec not ready — solo ceiling avoidance required' .claude/skills/devlyn:ideate/references/elicitation.md \
909
+ || ! grep -Fq 'spec not ready — solo ceiling avoidance required' .agents/skills/devlyn:ideate/references/elicitation.md \
910
+ || ! grep -Fq 'pair-evidence not ready — Pair-candidate headroom is unproven until the spec states solo ceiling avoidance' config/skills/devlyn:ideate/references/from-spec-mode.md \
911
+ || ! grep -Fq 'pair-evidence not ready — Pair-candidate headroom is unproven until the spec states solo ceiling avoidance' .claude/skills/devlyn:ideate/references/from-spec-mode.md \
912
+ || ! grep -Fq 'pair-evidence not ready — Pair-candidate headroom is unproven until the spec states solo ceiling avoidance' .agents/skills/devlyn:ideate/references/from-spec-mode.md \
913
+ || ! grep -Fq 'also do not infer solo ceiling avoidance' config/skills/devlyn:ideate/SKILL.md \
914
+ || ! grep -Fq 'also do not infer solo ceiling avoidance' .claude/skills/devlyn:ideate/SKILL.md \
915
+ || ! grep -Fq 'also do not infer solo ceiling avoidance' .agents/skills/devlyn:ideate/SKILL.md \
916
+ || ! grep -Fq 'per-feature Verification must also include a solo ceiling avoidance note' config/skills/devlyn:ideate/references/project-mode.md \
917
+ || ! grep -Fq 'per-feature Verification must also include a solo ceiling avoidance note' .claude/skills/devlyn:ideate/references/project-mode.md \
918
+ || ! grep -Fq 'per-feature Verification must also include a solo ceiling avoidance note' .agents/skills/devlyn:ideate/references/project-mode.md; then
919
+ bad "ideate must require solo ceiling avoidance for new unmeasured pair candidates"
920
+ else
921
+ ok "ideate requires solo ceiling avoidance for new unmeasured pair candidates"
922
+ fi
923
+ if ! grep -Fq 'complexity: medium' config/skills/devlyn:ideate/references/spec-template.md \
924
+ || ! grep -Fq 'complexity: medium' .claude/skills/devlyn:ideate/references/spec-template.md \
925
+ || ! grep -Fq 'complexity: medium' .agents/skills/devlyn:ideate/references/spec-template.md \
926
+ || ! grep -Fq 'Complexity signal' config/skills/devlyn:ideate/references/elicitation.md \
927
+ || ! grep -Fq 'Complexity signal' .claude/skills/devlyn:ideate/references/elicitation.md \
928
+ || ! grep -Fq 'Complexity signal' .agents/skills/devlyn:ideate/references/elicitation.md \
929
+ || ! grep -Fq 'downstream VERIFY pair-trigger signal' config/skills/devlyn:ideate/references/elicitation.md \
930
+ || ! grep -Fq 'downstream VERIFY pair-trigger signal' .claude/skills/devlyn:ideate/references/elicitation.md \
931
+ || ! grep -Fq 'downstream VERIFY pair-trigger signal' .agents/skills/devlyn:ideate/references/elicitation.md \
932
+ || ! grep -Fq 'complexity=medium default' config/skills/devlyn:ideate/references/from-spec-mode.md \
933
+ || ! grep -Fq 'complexity=medium default' .claude/skills/devlyn:ideate/references/from-spec-mode.md \
934
+ || ! grep -Fq 'complexity=medium default' .agents/skills/devlyn:ideate/references/from-spec-mode.md \
935
+ || ! grep -Fq 'supported `complexity` frontmatter' config/skills/devlyn:ideate/SKILL.md \
936
+ || ! grep -Fq 'supported `complexity` frontmatter' .claude/skills/devlyn:ideate/SKILL.md \
937
+ || ! grep -Fq 'supported `complexity` frontmatter' .agents/skills/devlyn:ideate/SKILL.md \
938
+ || ! grep -Fq 'supported `complexity` frontmatter' config/skills/devlyn:ideate/references/elicitation.md \
939
+ || ! grep -Fq 'supported `complexity` frontmatter' .claude/skills/devlyn:ideate/references/elicitation.md \
940
+ || ! grep -Fq 'supported `complexity` frontmatter' .agents/skills/devlyn:ideate/references/elicitation.md \
941
+ || ! grep -Fq 'supported `complexity` frontmatter' config/skills/devlyn:resolve/SKILL.md \
942
+ || ! grep -Fq 'supported `complexity` frontmatter' .claude/skills/devlyn:resolve/SKILL.md \
943
+ || ! grep -Fq 'supported `complexity` frontmatter' .agents/skills/devlyn:resolve/SKILL.md \
944
+ || ! grep -Fq 'sibling spec `complexity` frontmatter' config/skills/devlyn:ideate/SKILL.md \
945
+ || ! grep -Fq 'sibling spec `complexity` frontmatter' .claude/skills/devlyn:ideate/SKILL.md \
946
+ || ! grep -Fq 'sibling spec `complexity` frontmatter' .agents/skills/devlyn:ideate/SKILL.md \
947
+ || ! grep -Fq 'sibling spec `complexity` frontmatter' config/skills/devlyn:resolve/SKILL.md \
948
+ || ! grep -Fq 'sibling spec `complexity` frontmatter' .claude/skills/devlyn:resolve/SKILL.md \
949
+ || ! grep -Fq 'sibling spec `complexity` frontmatter' .agents/skills/devlyn:resolve/SKILL.md \
950
+ || ! grep -Fq 'Frontmatter has `id`, `title`, `kind`, `status: planned`, `complexity`' config/skills/devlyn:ideate/SKILL.md \
951
+ || ! grep -Fq 'Frontmatter has `id`, `title`, `kind`, `status: planned`, `complexity`' .claude/skills/devlyn:ideate/SKILL.md \
952
+ || ! grep -Fq 'Frontmatter has `id`, `title`, `kind`, `status: planned`, `complexity`' .agents/skills/devlyn:ideate/SKILL.md; then
953
+ bad "ideate specs must emit complexity frontmatter for resolve pair triggers"
954
+ else
955
+ ok "ideate specs emit complexity frontmatter for resolve pair triggers"
956
+ fi
957
+ if ! grep -Fq 'warning: Verification may need one compound end-to-end scenario before pair-relevant risks are measurable' \
958
+ config/skills/devlyn:ideate/references/from-spec-mode.md \
959
+ || ! grep -Fq 'warning: Verification may need one compound end-to-end scenario before pair-relevant risks are measurable' \
960
+ .claude/skills/devlyn:ideate/references/from-spec-mode.md \
961
+ || ! grep -Fq 'warning: Verification may need one compound end-to-end scenario before pair-relevant risks are measurable' \
962
+ .agents/skills/devlyn:ideate/references/from-spec-mode.md; then
963
+ bad "ideate from-spec mode must warn when preserved high-risk specs lack compound verification"
964
+ else
965
+ ok "ideate from-spec mode warns on pair-relevant specs with weak verification"
966
+ fi
967
+ if ! grep -Fq 'pair-evidence not ready — Pair-candidate headroom is unproven until the spec states a solo-headroom hypothesis' \
968
+ config/skills/devlyn:ideate/references/from-spec-mode.md \
969
+ || ! grep -Fq 'pair-evidence not ready — Pair-candidate headroom is unproven until the spec states a solo-headroom hypothesis' \
970
+ .claude/skills/devlyn:ideate/references/from-spec-mode.md \
971
+ || ! grep -Fq 'pair-evidence not ready — Pair-candidate headroom is unproven until the spec states a solo-headroom hypothesis' \
972
+ .agents/skills/devlyn:ideate/references/from-spec-mode.md \
973
+ || ! grep -Fq 'Do not call' config/skills/devlyn:ideate/references/from-spec-mode.md \
974
+ || ! grep -Fq 'Do not call' .claude/skills/devlyn:ideate/references/from-spec-mode.md \
975
+ || ! grep -Fq 'Do not call' .agents/skills/devlyn:ideate/references/from-spec-mode.md \
976
+ || ! grep -Fq 'announcement must say `pair-evidence not ready`' config/skills/devlyn:ideate/SKILL.md \
977
+ || ! grep -Fq 'announcement must say `pair-evidence not ready`' .claude/skills/devlyn:ideate/SKILL.md \
978
+ || ! grep -Fq 'announcement must say `pair-evidence not ready`' .agents/skills/devlyn:ideate/SKILL.md; then
979
+ bad "ideate from-spec mode must warn when pair-candidate specs lack solo-headroom hypothesis"
980
+ else
981
+ ok "ideate from-spec mode warns on missing solo-headroom hypothesis"
982
+ fi
983
+ if ! grep -Fq 'per-feature Verification must' config/skills/devlyn:ideate/references/project-mode.md \
984
+ || ! grep -Fq 'per-feature Verification must' .claude/skills/devlyn:ideate/references/project-mode.md \
985
+ || ! grep -Fq 'per-feature Verification must' .agents/skills/devlyn:ideate/references/project-mode.md; then
986
+ bad "ideate project mode must require compound verification inside each pair-relevant feature spec"
987
+ else
988
+ ok "ideate project mode keeps compound verification inside pair-relevant feature specs"
989
+ fi
990
+ if ! grep -Fq 'per-feature Verification must include a solo-headroom' config/skills/devlyn:ideate/references/project-mode.md \
991
+ || ! grep -Fq 'per-feature Verification must include a solo-headroom' .claude/skills/devlyn:ideate/references/project-mode.md \
992
+ || ! grep -Fq 'per-feature Verification must include a solo-headroom' .agents/skills/devlyn:ideate/references/project-mode.md \
993
+ || rg -q 'Context or Verification' config/skills/devlyn:ideate/references .claude/skills/devlyn:ideate/references .agents/skills/devlyn:ideate/references; then
994
+ bad "ideate project mode must keep solo-headroom hypothesis inside each pair-candidate feature spec"
995
+ else
996
+ ok "ideate project mode keeps solo-headroom hypothesis inside each pair-candidate feature spec"
997
+ fi
998
+ if ! grep -Fq 'feature spec must literally contain' \
999
+ config/skills/devlyn:ideate/references/project-mode.md \
1000
+ || ! grep -Fq 'feature spec must literally contain' \
1001
+ .claude/skills/devlyn:ideate/references/project-mode.md \
1002
+ || ! grep -Fq 'feature spec must literally contain' \
1003
+ .agents/skills/devlyn:ideate/references/project-mode.md \
1004
+ || ! grep -Fq '`solo-headroom hypothesis`, `solo_claude`, `miss`, and a backticked' \
1005
+ config/skills/devlyn:ideate/references/project-mode.md \
1006
+ || ! grep -Fq '`solo-headroom hypothesis`, `solo_claude`, `miss`, and a backticked' \
1007
+ .claude/skills/devlyn:ideate/references/project-mode.md \
1008
+ || ! grep -Fq '`solo-headroom hypothesis`, `solo_claude`, `miss`, and a backticked' \
1009
+ .agents/skills/devlyn:ideate/references/project-mode.md \
1010
+ || ! grep -Fq 'behavior a capable' config/skills/devlyn:ideate/references/project-mode.md \
1011
+ || ! grep -Fq 'behavior a capable' .claude/skills/devlyn:ideate/references/project-mode.md \
1012
+ || ! grep -Fq 'behavior a capable' .agents/skills/devlyn:ideate/references/project-mode.md \
1013
+ || ! grep -Fq 'backticked line itself must' config/skills/devlyn:ideate/references/project-mode.md \
1014
+ || ! grep -Fq 'backticked line itself must' .claude/skills/devlyn:ideate/references/project-mode.md \
1015
+ || ! grep -Fq 'backticked line itself must' .agents/skills/devlyn:ideate/references/project-mode.md \
1016
+ || ! grep -Fq 'command/observable' \
1017
+ config/skills/devlyn:ideate/references/project-mode.md \
1018
+ || ! grep -Fq 'command/observable' \
1019
+ .claude/skills/devlyn:ideate/references/project-mode.md \
1020
+ || ! grep -Fq 'command/observable' \
1021
+ .agents/skills/devlyn:ideate/references/project-mode.md; then
1022
+ bad "ideate project mode solo-headroom prompt must keep the actionable checker contract"
1023
+ else
1024
+ ok "ideate project mode solo-headroom prompt keeps checker contract"
1025
+ fi
1026
+
1027
+ if ! grep -Fq 'If the visible spec includes a solo-headroom hypothesis, the first probe must' \
1028
+ config/skills/devlyn:resolve/references/phases/probe-derive.md \
1029
+ || ! grep -Fq 'If the visible spec includes a solo-headroom hypothesis, the first probe must' \
1030
+ .claude/skills/devlyn:resolve/references/phases/probe-derive.md \
1031
+ || ! grep -Fq 'If the visible spec includes a solo-headroom hypothesis, the first probe must' \
1032
+ .agents/skills/devlyn:resolve/references/phases/probe-derive.md \
1033
+ || ! grep -Fq 'When the visible spec includes a solo-headroom hypothesis, the first probe must' \
1034
+ config/skills/devlyn:resolve/SKILL.md \
1035
+ || ! grep -Fq 'When the visible spec includes a solo-headroom hypothesis, the first probe must' \
1036
+ .claude/skills/devlyn:resolve/SKILL.md \
1037
+ || ! grep -Fq 'When the visible spec includes a solo-headroom hypothesis, the first probe must' \
1038
+ .agents/skills/devlyn:resolve/SKILL.md \
1039
+ || ! grep -Fq 'its `cmd` must contain the hypothesis'\''s backticked' \
1040
+ config/skills/devlyn:resolve/SKILL.md \
1041
+ || ! grep -Fq 'its `cmd` must contain the hypothesis'\''s backticked' \
1042
+ .claude/skills/devlyn:resolve/SKILL.md \
1043
+ || ! grep -Fq 'its `cmd` must contain the hypothesis'\''s backticked' \
1044
+ .agents/skills/devlyn:resolve/SKILL.md \
1045
+ || ! grep -Fq 'its `derived_from` must reference the hypothesis bullet' \
1046
+ config/skills/devlyn:resolve/SKILL.md \
1047
+ || ! grep -Fq 'its `derived_from` must reference the hypothesis bullet' \
1048
+ .claude/skills/devlyn:resolve/SKILL.md \
1049
+ || ! grep -Fq 'its `derived_from` must reference the hypothesis bullet' \
1050
+ .agents/skills/devlyn:resolve/SKILL.md; then
1051
+ bad "resolve risk-probe prompts must consume solo-headroom hypothesis before pair-evidence work"
1052
+ else
1053
+ ok "resolve risk-probe prompts consume solo-headroom hypothesis"
1054
+ fi
1055
+ if ! grep -Fq 'the behavior the spec says `solo_claude` is expected to miss' \
1056
+ config/skills/devlyn:resolve/references/phases/probe-derive.md \
1057
+ || ! grep -Fq 'the behavior the spec says `solo_claude` is expected to miss' \
1058
+ .claude/skills/devlyn:resolve/references/phases/probe-derive.md \
1059
+ || ! grep -Fq 'the behavior the spec says `solo_claude` is expected to miss' \
1060
+ .agents/skills/devlyn:resolve/references/phases/probe-derive.md \
1061
+ || ! grep -Fq 'exercise the stated `solo_claude` miss' \
1062
+ config/skills/devlyn:resolve/references/phases/probe-derive.md \
1063
+ || ! grep -Fq 'exercise the stated `solo_claude` miss' \
1064
+ .claude/skills/devlyn:resolve/references/phases/probe-derive.md \
1065
+ || ! grep -Fq 'exercise the stated `solo_claude` miss' \
1066
+ .agents/skills/devlyn:resolve/references/phases/probe-derive.md \
1067
+ || ! grep -Fq 'with a `cmd` containing the hypothesis'\''s' \
1068
+ config/skills/devlyn:resolve/references/phases/probe-derive.md \
1069
+ || ! grep -Fq 'with a `cmd` containing the hypothesis'\''s' \
1070
+ .claude/skills/devlyn:resolve/references/phases/probe-derive.md \
1071
+ || ! grep -Fq 'with a `cmd` containing the hypothesis'\''s' \
1072
+ .agents/skills/devlyn:resolve/references/phases/probe-derive.md \
1073
+ || ! grep -Fq '`derived_from` pointing at the hypothesis' \
1074
+ config/skills/devlyn:resolve/references/phases/probe-derive.md \
1075
+ || ! grep -Fq '`derived_from` pointing at the hypothesis' \
1076
+ .claude/skills/devlyn:resolve/references/phases/probe-derive.md \
1077
+ || ! grep -Fq '`derived_from` pointing at the hypothesis' \
1078
+ .agents/skills/devlyn:resolve/references/phases/probe-derive.md; then
1079
+ bad "resolve risk-probe solo-headroom prompt must target the stated solo_claude miss"
1080
+ else
1081
+ ok "resolve risk-probe solo-headroom prompt targets the stated solo_claude miss"
1082
+ fi
1083
+ if ! grep -Fq 'If the spec includes a solo-headroom hypothesis, one of the two targeted' \
1084
+ config/skills/devlyn:resolve/references/phases/verify.md \
1085
+ || ! grep -Fq 'If the spec includes a solo-headroom hypothesis, one of the two targeted' \
1086
+ .claude/skills/devlyn:resolve/references/phases/verify.md \
1087
+ || ! grep -Fq 'If the spec includes a solo-headroom hypothesis, one of the two targeted' \
1088
+ .agents/skills/devlyn:resolve/references/phases/verify.md \
1089
+ || ! grep -Fq 'If the spec includes a solo-headroom hypothesis, one of those targeted probes must' \
1090
+ config/skills/devlyn:resolve/SKILL.md \
1091
+ || ! grep -Fq 'If the spec includes a solo-headroom hypothesis, one of those targeted probes must' \
1092
+ .claude/skills/devlyn:resolve/SKILL.md \
1093
+ || ! grep -Fq 'If the spec includes a solo-headroom hypothesis, one of those targeted probes must' \
1094
+ .agents/skills/devlyn:resolve/SKILL.md \
1095
+ || ! grep -Fq 'using the hypothesis'\''s backticked observable command as its command anchor' \
1096
+ config/skills/devlyn:resolve/SKILL.md \
1097
+ || ! grep -Fq 'using the hypothesis'\''s backticked observable command as its command anchor' \
1098
+ .claude/skills/devlyn:resolve/SKILL.md \
1099
+ || ! grep -Fq 'using the hypothesis'\''s backticked observable command as its command anchor' \
1100
+ .agents/skills/devlyn:resolve/SKILL.md \
1101
+ || ! grep -Fq 'hypothesis'\''s backticked observable command as its command anchor' \
1102
+ config/skills/devlyn:resolve/references/phases/verify.md \
1103
+ || ! grep -Fq 'hypothesis'\''s backticked observable command as its command anchor' \
1104
+ .claude/skills/devlyn:resolve/references/phases/verify.md \
1105
+ || ! grep -Fq 'hypothesis'\''s backticked observable command as its command anchor' \
1106
+ .agents/skills/devlyn:resolve/references/phases/verify.md; then
1107
+ bad "resolve pair-JUDGE prompts must prioritize solo-headroom hypothesis"
208
1108
  else
209
- bad "spec-verify-check.py risk-probe self-test failed"
1109
+ ok "resolve pair-JUDGE prompts prioritize solo-headroom hypothesis"
1110
+ fi
1111
+ if ! grep -Fq 'current free-form `state.complexity` is `"large"`' config/skills/devlyn:resolve/SKILL.md \
1112
+ || ! grep -Fq 'current free-form `state.complexity` is `"large"`' .claude/skills/devlyn:resolve/SKILL.md \
1113
+ || ! grep -Fq 'current free-form `state.complexity` is `"large"`' .agents/skills/devlyn:resolve/SKILL.md \
1114
+ || ! grep -Fq 'legacy/external spec `complexity: large` is accepted for compatibility' config/skills/devlyn:resolve/SKILL.md \
1115
+ || ! grep -Fq 'legacy/external spec `complexity: large` is accepted for compatibility' .claude/skills/devlyn:resolve/SKILL.md \
1116
+ || ! grep -Fq 'legacy/external spec `complexity: large` is accepted for compatibility' .agents/skills/devlyn:resolve/SKILL.md \
1117
+ || ! grep -Fq 'current free-form `state.complexity` of `"large"`' config/skills/devlyn:resolve/references/state-schema.md \
1118
+ || ! grep -Fq 'current free-form `state.complexity` of `"large"`' .claude/skills/devlyn:resolve/references/state-schema.md \
1119
+ || ! grep -Fq 'current free-form `state.complexity` of `"large"`' .agents/skills/devlyn:resolve/references/state-schema.md \
1120
+ || ! grep -Fq 'Legacy/external spec `complexity: large` remains accepted for compatibility' config/skills/devlyn:resolve/references/state-schema.md \
1121
+ || ! grep -Fq 'Legacy/external spec `complexity: large` remains accepted for compatibility' .claude/skills/devlyn:resolve/references/state-schema.md \
1122
+ || ! grep -Fq 'Legacy/external spec `complexity: large` remains accepted for compatibility' .agents/skills/devlyn:resolve/references/state-schema.md \
1123
+ || ! grep -Fq 'Current free-form `state.complexity` is `"large"`' config/skills/devlyn:resolve/references/phases/verify.md \
1124
+ || ! grep -Fq 'Current free-form `state.complexity` is `"large"`' .claude/skills/devlyn:resolve/references/phases/verify.md \
1125
+ || ! grep -Fq 'Current free-form `state.complexity` is `"large"`' .agents/skills/devlyn:resolve/references/phases/verify.md \
1126
+ || ! grep -Fq 'legacy/external spec' config/skills/devlyn:resolve/references/phases/verify.md \
1127
+ || ! grep -Fq 'legacy/external spec' .claude/skills/devlyn:resolve/references/phases/verify.md \
1128
+ || ! grep -Fq 'legacy/external spec' .agents/skills/devlyn:resolve/references/phases/verify.md; then
1129
+ bad "resolve VERIFY docs must distinguish current large complexity, legacy high state, and legacy large spec compatibility"
1130
+ else
1131
+ ok "resolve VERIFY docs distinguish current large complexity, legacy high state, and legacy large spec compatibility"
1132
+ fi
1133
+ if ! grep -Fq 'def spec_has_solo_headroom_hypothesis' config/skills/_shared/verify-merge-findings.py \
1134
+ || ! grep -Fq 'spec.solo_headroom_hypothesis' config/skills/_shared/verify-merge-findings.py \
1135
+ || ! grep -Fq 'and "solo_claude" in lower' config/skills/_shared/verify-merge-findings.py \
1136
+ || ! grep -Fq 'and "miss" in lower' config/skills/_shared/verify-merge-findings.py \
1137
+ || ! grep -Fq 'OBSERVABLE_COMMAND_MARKERS = ("command", "observable", "expose")' config/skills/_shared/verify-merge-findings.py \
1138
+ || ! grep -Fq 'def is_command_like_backtick' config/skills/_shared/verify-merge-findings.py \
1139
+ || ! grep -Fq 'for key in ("spec_path", "criteria_path")' config/skills/_shared/verify-merge-findings.py \
1140
+ || ! grep -Fq 'has_backticked_observable_command(text)' config/skills/_shared/verify-merge-findings.py \
1141
+ || ! grep -Fq 'Observable command: `node check.js` exposes behavior' config/skills/_shared/verify-merge-findings.py \
1142
+ || ! grep -Fq 'observable `SOLO_CLAUDE` exposes the miss' config/skills/_shared/verify-merge-findings.py \
1143
+ || ! grep -Fq 'observable `priority rollback` exposes the miss' config/skills/_shared/verify-merge-findings.py \
1144
+ || ! grep -Fq 'implementation token `rollback`' config/skills/_shared/verify-merge-findings.py \
1145
+ || ! grep -Fq '`SOLO_CLAUDE` should miss' config/skills/_shared/verify-merge-findings.py \
1146
+ || ! grep -Fq '{"source": {"criteria_path": str(criteria_path)}}' config/skills/_shared/verify-merge-findings.py \
1147
+ || ! grep -Fq 'spec_has_solo_headroom_hypothesis(' config/skills/_shared/verify-merge-findings.py \
1148
+ || ! grep -Fq ') is False' config/skills/_shared/verify-merge-findings.py \
1149
+ || ! grep -Fq ') is True' config/skills/_shared/verify-merge-findings.py \
1150
+ || ! grep -Fq 'def spec_has_solo_headroom_hypothesis' .claude/skills/_shared/verify-merge-findings.py \
1151
+ || ! grep -Fq 'spec.solo_headroom_hypothesis' .claude/skills/_shared/verify-merge-findings.py \
1152
+ || ! grep -Fq 'and "solo_claude" in lower' .claude/skills/_shared/verify-merge-findings.py \
1153
+ || ! grep -Fq 'and "miss" in lower' .claude/skills/_shared/verify-merge-findings.py \
1154
+ || ! grep -Fq 'OBSERVABLE_COMMAND_MARKERS = ("command", "observable", "expose")' .claude/skills/_shared/verify-merge-findings.py \
1155
+ || ! grep -Fq 'def is_command_like_backtick' .claude/skills/_shared/verify-merge-findings.py \
1156
+ || ! grep -Fq 'for key in ("spec_path", "criteria_path")' .claude/skills/_shared/verify-merge-findings.py \
1157
+ || ! grep -Fq 'has_backticked_observable_command(text)' .claude/skills/_shared/verify-merge-findings.py \
1158
+ || ! grep -Fq 'Observable command: `node check.js` exposes behavior' .claude/skills/_shared/verify-merge-findings.py \
1159
+ || ! grep -Fq 'observable `SOLO_CLAUDE` exposes the miss' .claude/skills/_shared/verify-merge-findings.py \
1160
+ || ! grep -Fq 'observable `priority rollback` exposes the miss' .claude/skills/_shared/verify-merge-findings.py \
1161
+ || ! grep -Fq 'implementation token `rollback`' .claude/skills/_shared/verify-merge-findings.py \
1162
+ || ! grep -Fq '`SOLO_CLAUDE` should miss' .claude/skills/_shared/verify-merge-findings.py \
1163
+ || ! grep -Fq '{"source": {"criteria_path": str(criteria_path)}}' .claude/skills/_shared/verify-merge-findings.py \
1164
+ || ! grep -Fq 'spec_has_solo_headroom_hypothesis(' .claude/skills/_shared/verify-merge-findings.py \
1165
+ || ! grep -Fq ') is False' .claude/skills/_shared/verify-merge-findings.py \
1166
+ || ! grep -Fq ') is True' .claude/skills/_shared/verify-merge-findings.py \
1167
+ || ! grep -Fq 'def spec_has_solo_headroom_hypothesis' .agents/skills/_shared/verify-merge-findings.py \
1168
+ || ! grep -Fq 'spec.solo_headroom_hypothesis' .agents/skills/_shared/verify-merge-findings.py \
1169
+ || ! grep -Fq 'and "solo_claude" in lower' .agents/skills/_shared/verify-merge-findings.py \
1170
+ || ! grep -Fq 'and "miss" in lower' .agents/skills/_shared/verify-merge-findings.py \
1171
+ || ! grep -Fq 'OBSERVABLE_COMMAND_MARKERS = ("command", "observable", "expose")' .agents/skills/_shared/verify-merge-findings.py \
1172
+ || ! grep -Fq 'def is_command_like_backtick' .agents/skills/_shared/verify-merge-findings.py \
1173
+ || ! grep -Fq 'for key in ("spec_path", "criteria_path")' .agents/skills/_shared/verify-merge-findings.py \
1174
+ || ! grep -Fq 'has_backticked_observable_command(text)' .agents/skills/_shared/verify-merge-findings.py \
1175
+ || ! grep -Fq 'Observable command: `node check.js` exposes behavior' .agents/skills/_shared/verify-merge-findings.py \
1176
+ || ! grep -Fq 'observable `SOLO_CLAUDE` exposes the miss' .agents/skills/_shared/verify-merge-findings.py \
1177
+ || ! grep -Fq 'observable `priority rollback` exposes the miss' .agents/skills/_shared/verify-merge-findings.py \
1178
+ || ! grep -Fq 'implementation token `rollback`' .agents/skills/_shared/verify-merge-findings.py \
1179
+ || ! grep -Fq '`SOLO_CLAUDE` should miss' .agents/skills/_shared/verify-merge-findings.py \
1180
+ || ! grep -Fq '{"source": {"criteria_path": str(criteria_path)}}' .agents/skills/_shared/verify-merge-findings.py \
1181
+ || ! grep -Fq 'spec_has_solo_headroom_hypothesis(' .agents/skills/_shared/verify-merge-findings.py \
1182
+ || ! grep -Fq ') is False' .agents/skills/_shared/verify-merge-findings.py \
1183
+ || ! grep -Fq ') is True' .agents/skills/_shared/verify-merge-findings.py \
1184
+ || ! grep -Fq 'The spec includes an actionable solo-headroom hypothesis.' \
1185
+ config/skills/devlyn:resolve/references/phases/verify.md \
1186
+ || ! grep -Fq 'The spec includes an actionable solo-headroom hypothesis.' \
1187
+ .claude/skills/devlyn:resolve/references/phases/verify.md \
1188
+ || ! grep -Fq 'The spec includes an actionable solo-headroom hypothesis.' \
1189
+ .agents/skills/devlyn:resolve/references/phases/verify.md \
1190
+ || ! grep -Fq 'actionable solo-headroom hypotheses' config/skills/devlyn:resolve/references/state-schema.md \
1191
+ || ! grep -Fq 'actionable solo-headroom hypotheses' .claude/skills/devlyn:resolve/references/state-schema.md \
1192
+ || ! grep -Fq 'actionable solo-headroom hypotheses' .agents/skills/devlyn:resolve/references/state-schema.md \
1193
+ || ! grep -Fq 'same actionable solo-headroom hypothesis is a VERIFY pair-trigger reason' config/skills/devlyn:resolve/SKILL.md \
1194
+ || ! grep -Fq 'same actionable solo-headroom hypothesis is a VERIFY pair-trigger reason' .claude/skills/devlyn:resolve/SKILL.md \
1195
+ || ! grep -Fq 'same actionable solo-headroom hypothesis is a VERIFY pair-trigger reason' .agents/skills/devlyn:resolve/SKILL.md; then
1196
+ bad "resolve VERIFY pair trigger must include actionable solo-headroom hypothesis specs"
1197
+ else
1198
+ ok "resolve VERIFY pair trigger includes actionable solo-headroom hypothesis specs"
1199
+ fi
1200
+ if ! grep -Fq 'pair_evidence_intent' config/skills/devlyn:resolve/references/free-form-mode.md \
1201
+ || ! grep -Fq 'pair_evidence_intent' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1202
+ || ! grep -Fq 'pair_evidence_intent' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1203
+ || ! grep -Fq 'has_actionable_solo_headroom' config/skills/devlyn:resolve/references/free-form-mode.md \
1204
+ || ! grep -Fq 'has_actionable_solo_headroom' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1205
+ || ! grep -Fq 'has_actionable_solo_headroom' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1206
+ || ! grep -Fq 'state.source.type = "generated"' config/skills/devlyn:resolve/SKILL.md \
1207
+ || ! grep -Fq 'state.source.type = "generated"' .claude/skills/devlyn:resolve/SKILL.md \
1208
+ || ! grep -Fq 'state.source.type = "generated"' .agents/skills/devlyn:resolve/SKILL.md \
1209
+ || ! grep -Fq 'state.source.criteria_path = ".devlyn/criteria.generated.md"' config/skills/devlyn:resolve/references/free-form-mode.md \
1210
+ || ! grep -Fq 'state.source.criteria_path = ".devlyn/criteria.generated.md"' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1211
+ || ! grep -Fq 'state.source.criteria_path = ".devlyn/criteria.generated.md"' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1212
+ || ! grep -Fq 'state.source.criteria_sha256' config/skills/devlyn:resolve/references/free-form-mode.md \
1213
+ || ! grep -Fq 'state.source.criteria_sha256' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1214
+ || ! grep -Fq 'state.source.criteria_sha256' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1215
+ || ! grep -Fq 'state.source.criteria_sha256` for generated free-form mode' config/skills/devlyn:resolve/references/phases/verify.md \
1216
+ || ! grep -Fq 'state.source.criteria_sha256` for generated free-form mode' .claude/skills/devlyn:resolve/references/phases/verify.md \
1217
+ || ! grep -Fq 'state.source.criteria_sha256` for generated free-form mode' .agents/skills/devlyn:resolve/references/phases/verify.md \
1218
+ || ! grep -Fq 'Free-form mode sets `type: "generated"`' config/skills/devlyn:resolve/references/state-schema.md \
1219
+ || ! grep -Fq 'Free-form mode sets `type: "generated"`' .claude/skills/devlyn:resolve/references/state-schema.md \
1220
+ || ! grep -Fq 'Free-form mode sets `type: "generated"`' .agents/skills/devlyn:resolve/references/state-schema.md \
1221
+ || ! grep -Fq 'backticked observable command line that itself contains `miss`' config/skills/devlyn:resolve/references/free-form-mode.md \
1222
+ || ! grep -Fq 'backticked observable command line that itself contains `miss`' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1223
+ || ! grep -Fq 'backticked observable command line that itself contains `miss`' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1224
+ || ! grep -Fq 'BLOCKED:solo-headroom-hypothesis-required' config/skills/devlyn:resolve/references/free-form-mode.md \
1225
+ || ! grep -Fq 'BLOCKED:solo-headroom-hypothesis-required' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1226
+ || ! grep -Fq 'BLOCKED:solo-headroom-hypothesis-required' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1227
+ || ! grep -Fq 'pair-evidence intent without an actionable solo-headroom hypothesis must halt' config/skills/devlyn:resolve/SKILL.md \
1228
+ || ! grep -Fq 'pair-evidence intent without an actionable solo-headroom hypothesis must halt' .claude/skills/devlyn:resolve/SKILL.md \
1229
+ || ! grep -Fq 'pair-evidence intent without an actionable solo-headroom hypothesis must halt' .agents/skills/devlyn:resolve/SKILL.md \
1230
+ || ! grep -Fq 'preserve that literal hypothesis in `.devlyn/criteria.generated.md`' config/skills/devlyn:resolve/references/free-form-mode.md \
1231
+ || ! grep -Fq 'preserve that literal hypothesis in `.devlyn/criteria.generated.md`' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1232
+ || ! grep -Fq 'preserve that literal hypothesis in `.devlyn/criteria.generated.md`' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1233
+ || ! grep -Fq 'emit the canonical `spec.solo_headroom_hypothesis` pair trigger reason' config/skills/devlyn:resolve/references/free-form-mode.md \
1234
+ || ! grep -Fq 'emit the canonical `spec.solo_headroom_hypothesis` pair trigger reason' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1235
+ || ! grep -Fq 'emit the canonical `spec.solo_headroom_hypothesis` pair trigger reason' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1236
+ || ! grep -Fq '/devlyn:ideate` guidance for `BLOCKED:solo-headroom-hypothesis-required`' config/skills/devlyn:resolve/references/state-schema.md \
1237
+ || ! grep -Fq '/devlyn:ideate` guidance for `BLOCKED:solo-headroom-hypothesis-required`' .claude/skills/devlyn:resolve/references/state-schema.md \
1238
+ || ! grep -Fq '/devlyn:ideate` guidance for `BLOCKED:solo-headroom-hypothesis-required`' .agents/skills/devlyn:resolve/references/state-schema.md \
1239
+ || ! grep -Fq '/devlyn:ideate` guidance after `BLOCKED:solo-headroom-hypothesis-required`' config/skills/devlyn:resolve/SKILL.md \
1240
+ || ! grep -Fq '/devlyn:ideate` guidance after `BLOCKED:solo-headroom-hypothesis-required`' .claude/skills/devlyn:resolve/SKILL.md \
1241
+ || ! grep -Fq '/devlyn:ideate` guidance after `BLOCKED:solo-headroom-hypothesis-required`' .agents/skills/devlyn:resolve/SKILL.md \
1242
+ || ! grep -Fq 'Free-form goals that ask for benchmark evidence, pair-evidence, risk-probe' README.md \
1243
+ || ! grep -Fq '`/devlyn:resolve` stops with `BLOCKED:solo-headroom-hypothesis-required`' README.md \
1244
+ || ! grep -Fq 'points you to `/devlyn:ideate` instead of inventing a weak hypothesis' README.md; then
1245
+ bad "resolve free-form mode must block pair-evidence goals without actionable solo-headroom hypothesis"
1246
+ else
1247
+ ok "resolve free-form mode blocks pair-evidence goals without actionable solo-headroom hypothesis"
1248
+ fi
1249
+ if ! grep -Fq 'unmeasured_pair_candidate_intent' config/skills/devlyn:resolve/references/free-form-mode.md \
1250
+ || ! grep -Fq 'unmeasured_pair_candidate_intent' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1251
+ || ! grep -Fq 'unmeasured_pair_candidate_intent' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1252
+ || ! grep -Fq 'has_solo_ceiling_avoidance' config/skills/devlyn:resolve/references/free-form-mode.md \
1253
+ || ! grep -Fq 'has_solo_ceiling_avoidance' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1254
+ || ! grep -Fq 'has_solo_ceiling_avoidance' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1255
+ || ! grep -Fq 'BLOCKED:solo-ceiling-avoidance-required' config/skills/devlyn:resolve/references/free-form-mode.md \
1256
+ || ! grep -Fq 'BLOCKED:solo-ceiling-avoidance-required' .claude/skills/devlyn:resolve/references/free-form-mode.md \
1257
+ || ! grep -Fq 'BLOCKED:solo-ceiling-avoidance-required' .agents/skills/devlyn:resolve/references/free-form-mode.md \
1258
+ || ! grep -Fq 'unmeasured pair-candidate intent without solo ceiling avoidance must halt' config/skills/devlyn:resolve/SKILL.md \
1259
+ || ! grep -Fq 'unmeasured pair-candidate intent without solo ceiling avoidance must halt' .claude/skills/devlyn:resolve/SKILL.md \
1260
+ || ! grep -Fq 'unmeasured pair-candidate intent without solo ceiling avoidance must halt' .agents/skills/devlyn:resolve/SKILL.md \
1261
+ || ! grep -Fq '/devlyn:ideate` guidance after `BLOCKED:solo-ceiling-avoidance-required`' config/skills/devlyn:resolve/SKILL.md \
1262
+ || ! grep -Fq '/devlyn:ideate` guidance after `BLOCKED:solo-ceiling-avoidance-required`' .claude/skills/devlyn:resolve/SKILL.md \
1263
+ || ! grep -Fq '/devlyn:ideate` guidance after `BLOCKED:solo-ceiling-avoidance-required`' .agents/skills/devlyn:resolve/SKILL.md \
1264
+ || ! grep -Fq '/devlyn:ideate` guidance for `BLOCKED:solo-ceiling-avoidance-required`' config/skills/devlyn:resolve/references/state-schema.md \
1265
+ || ! grep -Fq '/devlyn:ideate` guidance for `BLOCKED:solo-ceiling-avoidance-required`' .claude/skills/devlyn:resolve/references/state-schema.md \
1266
+ || ! grep -Fq '/devlyn:ideate` guidance for `BLOCKED:solo-ceiling-avoidance-required`' .agents/skills/devlyn:resolve/references/state-schema.md \
1267
+ || ! grep -Fq '`/devlyn:resolve` stops with `BLOCKED:solo-ceiling-avoidance-required`' README.md; then
1268
+ bad "resolve free-form mode must block new unmeasured pair candidates without solo ceiling avoidance"
1269
+ else
1270
+ ok "resolve free-form mode blocks new unmeasured pair candidates without solo ceiling avoidance"
1271
+ fi
1272
+
1273
+ section "Check 6g: resolve consumes sibling spec.expected.json"
1274
+ sibling_consume_missing=0
1275
+ for file in \
1276
+ config/skills/devlyn:resolve/SKILL.md \
1277
+ config/skills/devlyn:resolve/references/phases/build-gate.md \
1278
+ config/skills/devlyn:resolve/references/phases/verify.md
1279
+ do
1280
+ if ! grep -Fq 'sibling `spec.expected.json`' "$file"; then
1281
+ bad "$file — missing sibling spec.expected.json consumption contract"
1282
+ sibling_consume_missing=1
1283
+ fi
1284
+ done
1285
+ for pattern in \
1286
+ 'def stage_from_expected' \
1287
+ 'stage_from_expected(' \
1288
+ 'expected_found, expected_staged, expected_error, expected_path' \
1289
+ 'def expected_contract_findings' \
1290
+ 'correctness.forbidden-pattern' \
1291
+ 'scope.max-deps-added-exceeded' \
1292
+ 'SPEC_VERIFY_FINDINGS_FILE'
1293
+ do
1294
+ if ! grep -Fq "$pattern" config/skills/_shared/spec-verify-check.py; then
1295
+ bad "spec-verify-check.py missing sibling expected staging implementation: $pattern"
1296
+ sibling_consume_missing=1
1297
+ fi
1298
+ done
1299
+ if [ $sibling_consume_missing -eq 0 ]; then
1300
+ ok "resolve self-stages and mechanically checks sibling spec.expected.json"
1301
+ fi
1302
+
1303
+ section "Check 6i: VERIFY mechanical findings are merge-visible"
1304
+ verify_mech_missing=0
1305
+ for pattern in \
1306
+ 'SPEC_VERIFY_PHASE=verify_mechanical' \
1307
+ 'SPEC_VERIFY_FINDINGS_FILE=verify-mechanical.findings.jsonl' \
1308
+ 'SPEC_VERIFY_FINDING_PREFIX=VERIFY-MECH'
1309
+ do
1310
+ if ! grep -Fq "$pattern" config/skills/devlyn:resolve/SKILL.md \
1311
+ || ! grep -Fq "$pattern" config/skills/devlyn:resolve/references/phases/verify.md \
1312
+ || ! grep -Fq "$pattern" config/skills/_shared/spec-verify-check.py; then
1313
+ bad "VERIFY mechanical output contract missing: $pattern"
1314
+ verify_mech_missing=1
1315
+ fi
1316
+ done
1317
+ if ! grep -Fq '("mechanical", "verify-mechanical.findings.jsonl")' \
1318
+ config/skills/_shared/verify-merge-findings.py; then
1319
+ bad "verify-merge-findings.py does not consume verify-mechanical.findings.jsonl"
1320
+ verify_mech_missing=1
1321
+ fi
1322
+ if grep -Fq 'All paths emit a CRITICAL finding to' config/skills/_shared/spec-verify-check.py; then
1323
+ bad "spec-verify-check.py has stale single-output CRITICAL finding wording"
1324
+ verify_mech_missing=1
1325
+ fi
1326
+ if [ $verify_mech_missing -eq 0 ]; then
1327
+ ok "spec-verify VERIFY output routes into verify-merge-findings.py"
1328
+ fi
1329
+
1330
+ section "Check 6j: VERIFY pair trigger runs after primary JUDGE"
1331
+ pair_trigger_order_missing=0
1332
+ for file in \
1333
+ config/skills/devlyn:resolve/SKILL.md \
1334
+ .claude/skills/devlyn:resolve/SKILL.md \
1335
+ .agents/skills/devlyn:resolve/SKILL.md
1336
+ do
1337
+ if ! grep -Fq 'Pair-mode (cross-model JUDGE) is eligible only after MECHANICAL and the primary JUDGE have no verdict-binding findings' "$file" \
1338
+ || ! grep -Fq 'After MECHANICAL and the primary JUDGE finish, compute `pair_trigger' "$file" \
1339
+ || ! grep -Fq '`risk_profile` is strict typed state' "$file" \
1340
+ || ! grep -Fq 'malformed `risk_profile` is also a VERIFY contract violation' "$file" \
1341
+ || ! grep -Fq 'If MECHANICAL or the primary JUDGE has a verdict-binding finding' "$file"; then
1342
+ bad "$file — VERIFY pair trigger order must be after primary JUDGE"
1343
+ pair_trigger_order_missing=1
1344
+ fi
1345
+ done
1346
+ for file in \
1347
+ config/skills/devlyn:resolve/references/phases/verify.md \
1348
+ .claude/skills/devlyn:resolve/references/phases/verify.md \
1349
+ .agents/skills/devlyn:resolve/references/phases/verify.md
1350
+ do
1351
+ if ! grep -Fq 'Pair-mode is eligible only after MECHANICAL and the primary JUDGE have no' "$file" \
1352
+ || ! grep -Fq 'After MECHANICAL and the primary JUDGE finish, compute and persist this before' "$file" \
1353
+ || ! grep -Fq 'Malformed `state.risk_profile` is a VERIFY contract violation' "$file" \
1354
+ || ! grep -Fq 'primary_judge_blocker' "$file"; then
1355
+ bad "$file — VERIFY phase body must compute pair_trigger after primary JUDGE"
1356
+ pair_trigger_order_missing=1
1357
+ fi
1358
+ done
1359
+ for file in \
1360
+ config/skills/devlyn:resolve/references/state-schema.md \
1361
+ .claude/skills/devlyn:resolve/references/state-schema.md \
1362
+ .agents/skills/devlyn:resolve/references/state-schema.md
1363
+ do
1364
+ if ! grep -Fq 'MECHANICAL and the primary JUDGE have no verdict-binding blockers' "$file" \
1365
+ || ! grep -Fq 'may set only `user_no_pair`, `mechanical_blocker`, `primary_judge_blocker`, or null' "$file" \
1366
+ || ! grep -Fq '`risk_profile` must remain an object with boolean' "$file" \
1367
+ || ! grep -Fq 'state implies a pair decision is required but `pair_trigger` is missing' "$file"; then
1368
+ bad "$file — state schema must document pair_trigger blocker and missing-trigger enforcement"
1369
+ pair_trigger_order_missing=1
1370
+ fi
1371
+ done
1372
+ if [ $pair_trigger_order_missing -eq 0 ]; then
1373
+ ok "VERIFY pair trigger order waits for primary JUDGE evidence"
1374
+ fi
1375
+
1376
+ section "Check 6h: No undocumented spec.expected.json.browser_flows field"
1377
+ browser_flow_refs=$(grep -RInF 'spec.expected.json.browser_flows' \
1378
+ config/skills README.md bin/ package.json 2>/dev/null || true)
1379
+ if [ -z "$browser_flow_refs" ]; then
1380
+ ok "active docs do not advertise unsupported browser_flows schema field"
1381
+ else
1382
+ while IFS= read -r f; do bad "$f"; done <<< "$browser_flow_refs"
210
1383
  fi
211
1384
 
212
1385
  section "Check 6e: All-or-nothing probes prove mutable rollback"
@@ -283,22 +1456,2313 @@ else
283
1456
  fi
284
1457
 
285
1458
  # ---------------------------------------------------------------------------
286
- # 9. Engine-downgrade string is canonical (codex-unavailable, not codex-ping failed).
1459
+ # 10a. Bounded Codex probe/judge calls must run isolated.
1460
+ # Pair/risk-probe paths are measured read-only judges, not implementation
1461
+ # phases. They must not inherit user config, AGENTS.md, hooks, pyx-memory,
1462
+ # or other local rules that can add hidden context or transcript side
1463
+ # effects. The wrapper owns the flag expansion; skill docs own requiring
1464
+ # CODEX_MONITORED_ISOLATED=1 for probe-derive and pair-JUDGE.
1465
+ # ---------------------------------------------------------------------------
1466
+ section "Check 10a: Bounded Codex calls use isolated wrapper mode"
1467
+ isolation_missing=0
1468
+ for needle in \
1469
+ 'CODEX_MONITORED_ISOLATED=1 bash "$CODEX_MONITORED_PATH"' \
1470
+ 'CODEX_MONITORED_ISOLATED=1` and `-c model_reasoning_effort=medium' \
1471
+ 'CODEX_MONITORED_ISOLATED=1 bash .claude/skills/_shared/codex-monitored.sh'
1472
+ do
1473
+ if ! grep -RInF "$needle" config/skills >/dev/null 2>&1; then
1474
+ bad "missing isolated Codex invocation contract: $needle"
1475
+ isolation_missing=1
1476
+ fi
1477
+ done
1478
+ for flag in \
1479
+ '--ignore-user-config' \
1480
+ '--ignore-rules' \
1481
+ '--ephemeral' \
1482
+ '--disable codex_hooks' \
1483
+ '--disable hooks'
1484
+ do
1485
+ if ! grep -F -- "$flag" config/skills/_shared/codex-monitored.sh >/dev/null 2>&1; then
1486
+ bad "codex-monitored.sh missing isolation flag expansion: $flag"
1487
+ isolation_missing=1
1488
+ fi
1489
+ done
1490
+ for helper in require_positive_int require_nonnegative_int; do
1491
+ if ! grep -F "$helper" config/skills/_shared/codex-monitored.sh >/dev/null 2>&1; then
1492
+ bad "codex-monitored.sh missing numeric env validator: $helper"
1493
+ isolation_missing=1
1494
+ fi
1495
+ done
1496
+
1497
+ if make_temp_dir tmp_env /tmp/codex-monitored-env.XXXXXX; then
1498
+ if CODEX_MONITORED_HEARTBEAT=0 CODEX_BIN=/bin/true \
1499
+ bash config/skills/_shared/codex-monitored.sh prompt \
1500
+ >"$tmp_env/heartbeat.stdout" 2>"$tmp_env/heartbeat.stderr"; then
1501
+ bad "codex-monitored.sh accepted CODEX_MONITORED_HEARTBEAT=0"
1502
+ isolation_missing=1
1503
+ elif ! grep -F 'CODEX_MONITORED_HEARTBEAT must be > 0' "$tmp_env/heartbeat.stderr" >/dev/null 2>&1; then
1504
+ bad "codex-monitored.sh heartbeat validation emitted wrong error"
1505
+ isolation_missing=1
1506
+ fi
1507
+ if CODEX_MONITORED_TIMEOUT_SEC=abc CODEX_BIN=/bin/true \
1508
+ bash config/skills/_shared/codex-monitored.sh prompt \
1509
+ >"$tmp_env/timeout.stdout" 2>"$tmp_env/timeout.stderr"; then
1510
+ bad "codex-monitored.sh accepted non-numeric CODEX_MONITORED_TIMEOUT_SEC"
1511
+ isolation_missing=1
1512
+ elif ! grep -F 'CODEX_MONITORED_TIMEOUT_SEC must be a non-negative integer' "$tmp_env/timeout.stderr" >/dev/null 2>&1; then
1513
+ bad "codex-monitored.sh timeout validation emitted wrong error"
1514
+ isolation_missing=1
1515
+ fi
1516
+ rm -rf "$tmp_env"
1517
+ else
1518
+ isolation_missing=1
1519
+ fi
1520
+
1521
+ if make_temp_dir tmp_iso /tmp/codex-monitored-isolated.XXXXXX; then
1522
+ cat > "$tmp_iso/codex" <<'EOF'
1523
+ #!/usr/bin/env bash
1524
+ printf '%s\n' "$@" > "$CODEX_FAKE_ARGS_OUT"
1525
+ EOF
1526
+ chmod +x "$tmp_iso/codex"
1527
+ CODEX_FAKE_ARGS_OUT="$tmp_iso/args.txt" \
1528
+ CODEX_MONITORED_ISOLATED=1 \
1529
+ CODEX_MONITORED_HEARTBEAT=999 \
1530
+ CODEX_BIN="$tmp_iso/codex" \
1531
+ bash config/skills/_shared/codex-monitored.sh -s read-only prompt \
1532
+ >"$tmp_iso/stdout.txt" 2>"$tmp_iso/stderr.txt"
1533
+ iso_exit=$?
1534
+ if [ $iso_exit -ne 0 ]; then
1535
+ bad "codex-monitored.sh isolated fake invocation exited $iso_exit"
1536
+ isolation_missing=1
1537
+ else
1538
+ for expected in exec --ignore-user-config --ignore-rules --ephemeral --disable codex_hooks --disable hooks -s read-only prompt; do
1539
+ if ! grep -Fx -- "$expected" "$tmp_iso/args.txt" >/dev/null 2>&1; then
1540
+ bad "codex-monitored.sh isolated fake invocation missing arg: $expected"
1541
+ isolation_missing=1
1542
+ fi
1543
+ done
1544
+ if ! grep -F '[codex-monitored] isolated=1' "$tmp_iso/stderr.txt" >/dev/null 2>&1; then
1545
+ bad "codex-monitored.sh isolated fake invocation missing lifecycle marker"
1546
+ isolation_missing=1
1547
+ fi
1548
+ fi
1549
+ rm -rf "$tmp_iso"
1550
+ else
1551
+ isolation_missing=1
1552
+ fi
1553
+ if [ $isolation_missing -eq 0 ]; then
1554
+ ok "bounded Codex probe/judge calls require isolated wrapper mode"
1555
+ fi
1556
+
1557
+ # ---------------------------------------------------------------------------
1558
+ # 10b. Shared routing docs must describe the current 2-skill surface.
1559
+ # A stale auto-resolve/preflight/ideate-CHALLENGE reference can misroute
1560
+ # bounded pair work back into unisolated or retired Codex paths.
1561
+ # ---------------------------------------------------------------------------
1562
+ section "Check 10b: Shared routing docs avoid retired skill surfaces"
1563
+ offenders=$(grep -RInE 'auto-resolve/SKILL\.md|preflight/SKILL\.md|challenge-rubric\.md|ideate CHALLENGE phase|does NOT consume this file|cross-model challenge phases when configured|phase-1-build\.md|phase-2-evaluate\.md|phase-3-critic\.md' \
1564
+ config/skills/_shared 2>/dev/null || true)
1565
+ if [ -z "$offenders" ]; then
1566
+ ok "shared routing docs reference current devlyn:ideate/devlyn:resolve surface"
1567
+ else
1568
+ while IFS= read -r f; do bad "$f"; done <<< "$offenders"
1569
+ fi
1570
+
1571
+ # ---------------------------------------------------------------------------
1572
+ # 10c. User-facing current docs must not advertise retired skills.
1573
+ # Historical archive mentions are allowed, but install/package copy and
1574
+ # active helper scripts should describe ideate -> resolve, not
1575
+ # ideate -> auto-resolve -> preflight or ideate CHALLENGE.
1576
+ # ---------------------------------------------------------------------------
1577
+ section "Check 10c: User-facing current docs avoid retired skill surfaces"
1578
+ offenders=$(
1579
+ {
1580
+ grep -nE '/devlyn:auto-resolve|ideate CHALLENGE|--with-codex|Quick Start pointing to ideate → auto-resolve → preflight|auto-resolve'\''s build agent|Core pipeline skills \(`ideate`, `auto-resolve`, `preflight`\)' README.md 2>/dev/null || true
1581
+ grep -nE '"description": .*auto-resolve|"description": .*preflight' package.json 2>/dev/null || true
1582
+ grep -nE 'so auto-resolve doesn'\''t prompt' bin/devlyn.js 2>/dev/null || true
1583
+ grep -nE 'devlyn:auto-resolve|phase-1-build\.md|phase-2-evaluate\.md|phase-3-critic\.md' scripts/static-ab.sh 2>/dev/null || true
1584
+ } | sed -E 's#^#user-facing retired-surface reference: #'
1585
+ )
1586
+ if [ -z "$offenders" ]; then
1587
+ ok "README/package/helper copy describes current ideate -> resolve surface"
1588
+ else
1589
+ while IFS= read -r f; do bad "$f"; done <<< "$offenders"
1590
+ fi
1591
+
1592
+ # ---------------------------------------------------------------------------
1593
+ # 10d. Prompt adapters cite the current official model guidance.
1594
+ # Prompt edits must be model-specific where tactics differ: OpenAI guidance
1595
+ # for GPT/Codex, Anthropic guidance for Claude. This check keeps that
1596
+ # contract from becoming oral tradition.
1597
+ # ---------------------------------------------------------------------------
1598
+ section "Check 10d: Prompt adapters cite official model guidance"
1599
+ adapter_missing=0
1600
+ if ! grep -Fq 'https://developers.openai.com/api/docs/guides/prompt-guidance?model=gpt-5.5' \
1601
+ config/skills/_shared/adapters/gpt-5-5.md; then
1602
+ bad "gpt-5-5 adapter missing official OpenAI prompt guidance URL"
1603
+ adapter_missing=1
1604
+ fi
1605
+ if ! grep -Fq 'https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices' \
1606
+ config/skills/_shared/adapters/opus-4-7.md; then
1607
+ bad "opus-4-7 adapter missing official Claude prompting best-practices URL"
1608
+ adapter_missing=1
1609
+ fi
1610
+ for pattern in 'Use Markdown only where it carries structure' 'metaprompter for itself'; do
1611
+ if ! grep -Fq "$pattern" config/skills/_shared/adapters/gpt-5-5.md; then
1612
+ bad "gpt-5-5 adapter missing official-guidance tactic: $pattern"
1613
+ adapter_missing=1
1614
+ fi
1615
+ done
1616
+ for pattern in 'high` or `xhigh` effort' 'report every issue you find' 'do not filter for importance or confidence' 'prefer concise positive examples' '<example>'; do
1617
+ if ! grep -Fq "$pattern" config/skills/_shared/adapters/opus-4-7.md; then
1618
+ bad "opus-4-7 adapter missing official-guidance tactic: $pattern"
1619
+ adapter_missing=1
1620
+ fi
1621
+ done
1622
+ for file in config/skills/devlyn:resolve/SKILL.md config/skills/devlyn:ideate/SKILL.md; do
1623
+ if ! grep -Fq '_shared/adapters/<model>.md' "$file"; then
1624
+ bad "$file — missing per-engine adapter injection contract"
1625
+ adapter_missing=1
1626
+ fi
1627
+ done
1628
+ if [ $adapter_missing -eq 0 ]; then
1629
+ ok "adapters cite official GPT/Claude guidance, carry model-specific tactics, and both skills inject them"
1630
+ fi
1631
+
1632
+ # ---------------------------------------------------------------------------
1633
+ # 10d1. Opus sidecar must fail closed on score-source mapping before provider calls.
1634
+ # ---------------------------------------------------------------------------
1635
+ section "Check 10d1: Opus judge sidecar validates blind mapping before provider calls"
1636
+ if ! grep -Fq 'judge blind mapping missing arm(s)' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1637
+ || ! grep -Fq 'scores_by_arm without blind mapping' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1638
+ || ! grep -Fq 'scores_by_arm malformed score(s)' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1639
+ || ! grep -Fq 'gpt judge.json _blind_mapping must be an object' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1640
+ || ! grep -Fq 'def is_score(value):' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1641
+ || ! grep -Fq 'not isinstance(value, bool) and 0 <= value <= 100' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1642
+ || ! grep -Fq 'invalid opus score value(s)' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1643
+ || ! grep -Fq 'invalid opus disqualifier value(s)' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1644
+ || ! grep -Fq 'opus-invalid-generated-dq' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1645
+ || ! grep -Fq 'def blind_mapping(j):' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1646
+ || ! grep -Fq 'def mapped_arm_set(j):' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1647
+ || ! grep -Fq '{"solo_claude", "bare"}.issubset(mapped_arms)' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1648
+ || ! grep -Fq 'def mapped_scores(j):' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1649
+ || ! grep -Fq 'if arm in mapped_arms and is_score(score)' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1650
+ || ! grep -Fq 'def margin_from_scores(scores, left, right):' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1651
+ || ! grep -Fq 'def mapped_winner(j, scores):' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1652
+ || ! grep -Fq 'def fmt_metric(value):' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1653
+ || ! grep -Fq 'g_l1_l0 = margin_from_scores(g_scores, "solo_claude", "bare")' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1654
+ || ! grep -Fq 'g_v_l0 = margin_from_scores(g_scores, "variant", "bare")' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1655
+ || ! grep -Fq 'g_winner = mapped_winner(g, g_scores)' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1656
+ || ! grep -Fq '"winner_agree": g_winner is not None and o_winner is not None and g_winner == o_winner' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1657
+ || ! grep -Fq '"gpt_scores": g_scores' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1658
+ || ! grep -Fq '"opus_scores": o_scores' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1659
+ || ! grep -Fq 'opus-bad-mapping' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1660
+ || ! grep -Fq 'opus-malformed-mapping' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1661
+ || ! grep -Fq 'opus-malformed-score' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1662
+ || ! grep -Fq 'opus-invalid-generated-score' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1663
+ || ! grep -Fq 'arg-parse-opus-summary-mapping' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1664
+ || ! grep -Fq 'arg-parse-opus-summary-null-margin' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1665
+ || ! grep -Fq 'assert row["gpt_margin_l1_l0"] == 10' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1666
+ || ! grep -Fq 'assert row["gpt_winner"] is None' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1667
+ || ! grep -Fq "gpt_l1_l0_avg=na" benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1668
+ || ! grep -Fq 'solo_claude-bare={chosen' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1669
+ || ! grep -Fq 'variant-solo_claude={chosen' benchmark/auto-resolve/scripts/judge-opus-pass.sh; then
1670
+ offenders="${offenders}"$'\n'"judge-opus-pass.sh must validate _blind_mapping locally before invoking Claude"
1671
+ fi
1672
+
1673
+ # ---------------------------------------------------------------------------
1674
+ # 10e. Benchmark docs must describe the current solo-vs-pair topology.
1675
+ # Pair evidence work depends on the bare/solo_claude/pair contract being current:
1676
+ # bare, solo_claude, and the selected pair arm are measured today.
1677
+ # ---------------------------------------------------------------------------
1678
+ section "Check 10e: Benchmark docs describe current 3-arm pair topology"
1679
+ offenders=$(grep -RInE 'L1 .*queued|cannot directly verify the L1|auto-resolve → preflight|ideate → auto-resolve|all fixtures × 2 arms|9 fixtures × 2 arms|9 fixtures × 3 arms|≥ 7 of 9|7/9 fixtures|variant` / `bare|variant/\{input|preflight; bare|Audited by preflight|Variant'\''s CRITIC|future enhancement|release-blocker|Today the suite runs `variant`|/devlyn:auto-resolve|auto-resolve-ready|REAL auto-resolve|real auto-resolve|auto-resolve run|auto-resolve runs|Claude/auto-resolve|variant − bare|Margin \(variant|Ship thresholds use margin \(variant|both arms improve together|both arms|benchmark --n 3|run-suite\.sh --n 3|higher confidence for ship decisions|3 runs per fixture for ship decisions|One-command A/B benchmark|A/B randomized|A/B benchmark suite vs bare|not vs bare — bare is the opponent' \
1680
+ benchmark/auto-resolve/BENCHMARK-DESIGN.md \
1681
+ benchmark/auto-resolve/README.md \
1682
+ benchmark/auto-resolve/RUBRIC.md \
1683
+ benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1684
+ benchmark/auto-resolve/run-real-benchmark.md \
1685
+ benchmark/auto-resolve/fixtures/SCHEMA.md \
1686
+ benchmark/auto-resolve/shadow-fixtures/README.md \
1687
+ 2>/dev/null || true)
1688
+ fixture_note_stale=$(git grep -InE -- 'both arms|both solo and pair arms|bare or solo consistently reaches ceiling' -- \
1689
+ 'benchmark/auto-resolve/fixtures/**/NOTES.md' \
1690
+ ':!benchmark/auto-resolve/fixtures/retired/**' \
1691
+ 2>/dev/null || true)
1692
+ if [ -n "$fixture_note_stale" ]; then
1693
+ offenders="${offenders}"$'\n'"fixture NOTES must name bare and solo_claude explicitly instead of ambiguous arm wording:"$'\n'"$fixture_note_stale"
1694
+ fi
1695
+ shadow_note_stale=$(git grep -In -- 'solo headroom' -- \
1696
+ 'benchmark/auto-resolve/shadow-fixtures/**/NOTES.md' \
1697
+ 2>/dev/null || true)
1698
+ if [ -n "$shadow_note_stale" ]; then
1699
+ offenders="${offenders}"$'\n'"shadow fixture NOTES must name solo_claude headroom explicitly:"$'\n'"$shadow_note_stale"
1700
+ fi
1701
+ active_doc_stale_solo_scores=$(git grep -InE -- '(^|[^[:alnum:]_])solo [0-9]+|/ solo$|, solo [0-9]+|vs solo [0-9]+' -- \
1702
+ benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1703
+ benchmark/auto-resolve/README.md \
1704
+ benchmark/auto-resolve/run-real-benchmark.md \
1705
+ 2>/dev/null || true)
1706
+ if [ -n "$active_doc_stale_solo_scores" ]; then
1707
+ offenders="${offenders}"$'\n'"active benchmark score evidence must name solo_claude instead of shorthand solo:"$'\n'"$active_doc_stale_solo_scores"
1708
+ fi
1709
+ active_stale_margin_labels=$(git grep -InE -- 'l1-l0=|v-l1=|L1-L0 disagreement|Per-axis L1-L0|Suite-level per-axis L1-L0|Suite avg L1-L0|L2-L1 margin' -- \
1710
+ benchmark/auto-resolve/BENCHMARK-DESIGN.md \
1711
+ benchmark/auto-resolve/README.md \
1712
+ benchmark/auto-resolve/RUBRIC.md \
1713
+ benchmark/auto-resolve/scripts/judge-opus-pass.sh \
1714
+ 2>/dev/null || true)
1715
+ if [ -n "$active_stale_margin_labels" ]; then
1716
+ offenders="${offenders}"$'\n'"active benchmark docs/stdout must use key-aligned margin labels:"$'\n'"$active_stale_margin_labels"
1717
+ fi
1718
+ if grep -RInE 'gate: ≥ 7 of 9|Hard floor 3: ≥ 7 of 9|7-of-9 L1 floor|all 9 fixtures produced' \
1719
+ benchmark/auto-resolve/scripts/compile-report.py \
1720
+ benchmark/auto-resolve/scripts/ship-gate.py \
1721
+ benchmark/auto-resolve/scripts/judge-opus-pass.sh >/dev/null 2>&1; then
1722
+ offenders="${offenders}"$'\n'"benchmark scripts must describe the current extended fixture count as an explicit selected/gated set, not stale 9-fixture wording"
1723
+ fi
1724
+ if grep -Fq 'PAIR_ARM="l2_gated"' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh; then
1725
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh: default pair arm must stay on current measured l2_risk_probes path"
1726
+ fi
1727
+ if ! grep -Fq 'for required in result.json verify.json diff.patch; do' \
1728
+ benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
1729
+ || ! grep -Fq 'reuse source missing $required' \
1730
+ benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh; then
1731
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh: calibrated-arm reuse must fail closed on missing result.json, verify.json, and diff.patch"
1732
+ fi
1733
+ if ! grep -Fq 'reuse destination incomplete $required' \
1734
+ benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
1735
+ || ! grep -Fq 'reuse destination is not a directory' \
1736
+ benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh; then
1737
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh: calibrated-arm reuse must fail closed on incomplete existing destination"
1738
+ fi
1739
+ if ! grep -Fq 'diff.patch missing' benchmark/auto-resolve/scripts/headroom-gate.py \
1740
+ || ! grep -Fq 'diff.patch missing' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py; then
1741
+ offenders="${offenders}"$'\n'"benchmark pair gates must require diff.patch artifacts for measured arms"
1742
+ fi
1743
+ if ! grep -Fq 'pair-arm must be l2_risk_probes or l2_gated' \
1744
+ benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
1745
+ || ! grep -Fq 'pair-arm l2_forced is retired' \
1746
+ benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh; then
1747
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh: pair arm selection must fail closed before fixture execution"
1748
+ fi
1749
+ if ! grep -Fq 'l2_risk_probes|l2_gated) ;;' \
1750
+ benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh; then
1751
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh: runner allowlist must remain l2_risk_probes|l2_gated only"
1752
+ fi
1753
+ if grep -Fq 'default="l2_gated"' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py; then
1754
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py: --pair-arm default must stay on current measured l2_risk_probes path"
1755
+ fi
1756
+ if ! grep -Fq 'from pair_evidence_contract import (' \
1757
+ benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
1758
+ || ! grep -Fq 'ALLOWED_PAIR_ARMS,' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
1759
+ || ! grep -Fq 'ALLOWED_PAIR_ARMS = {"l2_risk_probes", "l2_gated"}' \
1760
+ benchmark/auto-resolve/scripts/pair_evidence_contract.py \
1761
+ || ! grep -Fq 'pair-arm l2_forced is retired' \
1762
+ benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py; then
1763
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py: pair arm selection must fail closed inside the gate"
1764
+ fi
1765
+ if ! grep -Fq '"l2_risk_probes"' benchmark/auto-resolve/scripts/check-f9-artifacts.py; then
1766
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/check-f9-artifacts.py: F9 skill-driven artifact check must accept current l2_risk_probes arm"
1767
+ fi
1768
+ if ! grep -Fq '_load_json_object' benchmark/auto-resolve/scripts/check-f9-artifacts.py \
1769
+ || ! grep -Fq 'expected JSON object' benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh; then
1770
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/check-f9-artifacts.py: F9 timing/state JSON must fail closed on non-object payloads"
1771
+ fi
1772
+ if grep -RInE 'asserts variant/solo|Variant-only artifact checks|Variant artifact check' \
1773
+ benchmark/auto-resolve/scripts/check-f9-artifacts.py \
1774
+ benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md \
1775
+ benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md >/dev/null 2>&1; then
1776
+ offenders="${offenders}"$'\n'"F9 artifact docs/checker wording must describe skill-driven arms, not variant-only checks"
1777
+ fi
1778
+ if grep -Fq '<variant|bare>' benchmark/auto-resolve/scripts/run-fixture.sh; then
1779
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/run-fixture.sh: usage must list current benchmark arms"
1780
+ fi
1781
+ if grep -Fq 'l2_gated/l2_forced' benchmark/auto-resolve/scripts/run-fixture.sh; then
1782
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/run-fixture.sh: comments must not omit l2_risk_probes from l2_* arm handling"
1783
+ fi
1784
+ if grep -Fq 'ENGINE_CLAUSE="--engine auto"' benchmark/auto-resolve/scripts/run-fixture.sh \
1785
+ || grep -Fq 'Run with `--engine auto`' benchmark/auto-resolve/scripts/run-fixture.sh; then
1786
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/run-fixture.sh: smoke variant arm must use current --engine claude --risk-probes path, not retired --engine auto"
1787
+ fi
1788
+ if ! grep -Fq 'ENGINE_CLAUSE="--engine claude --risk-probes"' benchmark/auto-resolve/scripts/run-fixture.sh; then
1789
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/run-fixture.sh: variant/l2 risk-probes path must remain available"
1790
+ fi
1791
+ if ! grep -Fq '{bare, solo_claude, selected pair arm}' benchmark/auto-resolve/scripts/judge.sh; then
1792
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/judge.sh: blind judge topology comment must describe current pair-candidate proof runs"
1793
+ fi
1794
+ if grep -Fq 'blind judge scores l2_gated' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
1795
+ || grep -Fq 'l2_gated is clean' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py; then
1796
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py: gate docstring must describe selected pair arm, not l2_gated-only proof"
1797
+ fi
1798
+ if grep -Fq 'only then spends a `l2_gated` arm' benchmark/auto-resolve/README.md \
1799
+ || grep -Fq 'fresh `l2_gated` measurement' benchmark/auto-resolve/README.md; then
1800
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/README.md: full-pipeline runner docs must not present l2_gated as the default measured proof path"
1801
+ fi
1802
+ if ! grep -Fq 'Pair arms are limited to current' benchmark/auto-resolve/README.md \
1803
+ || ! grep -Fq '`l2_forced` is' benchmark/auto-resolve/README.md; then
1804
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/README.md: full-pipeline docs must state allowed pair arms and retired l2_forced"
1805
+ fi
1806
+ if ! grep -Fq 'Current solo<pair' benchmark/auto-resolve/RUBRIC.md \
1807
+ || ! grep -Fq 'evidence uses the full-pipeline pair gate' benchmark/auto-resolve/RUBRIC.md \
1808
+ || ! grep -Fq 'explicit selected pair arm' benchmark/auto-resolve/RUBRIC.md \
1809
+ || ! grep -Fq 'selected pair arm over `solo_claude`' benchmark/auto-resolve/RUBRIC.md; then
1810
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/RUBRIC.md: rubric must distinguish legacy run-suite ship gate from current selected-pair-arm evidence"
1811
+ fi
1812
+ if grep -Fq 'PHASE 8' benchmark/auto-resolve/run-real-benchmark.md \
1813
+ || grep -Fq 'security_review on `--engine auto`' benchmark/auto-resolve/run-real-benchmark.md \
1814
+ || grep -Fq 'BENCHMARK-RESULTS-v3.md' benchmark/auto-resolve/run-real-benchmark.md; then
1815
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/run-real-benchmark.md: real-run docs must describe the current 5-phase resolve and pair score harness"
1816
+ fi
1817
+ if ! grep -Fq 'Archive note (2026-05-14): historical pre-cutover benchmark plan' benchmark/auto-resolve/v3.6-ab-plan.md \
1818
+ || ! grep -Fq 'Do not use this file for current' benchmark/auto-resolve/v3.6-ab-plan.md \
1819
+ || ! grep -Fq 'Archive note (2026-05-14): historical pre-cutover results' benchmark/auto-resolve/BENCHMARK-RESULTS-v3.md \
1820
+ || ! grep -Fq 'Do not treat these projected' benchmark/auto-resolve/BENCHMARK-RESULTS-v3.md \
1821
+ || ! grep -Fq 'Archive note (2026-05-14): historical v3.2 pilot' benchmark/auto-resolve/PILOT-RESULTS-v3.2.md \
1822
+ || ! grep -Fq 'This n=1 pilot is not current solo<pair evidence' benchmark/auto-resolve/PILOT-RESULTS-v3.2.md \
1823
+ || ! grep -Fq 'Archive note (2026-05-14): historical v3.2 strict-route pilot' benchmark/auto-resolve/PILOT-RESULTS-STRICT-v3.2.md \
1824
+ || ! grep -Fq 'This inline n=1 pilot is not current solo<pair evidence' benchmark/auto-resolve/PILOT-RESULTS-STRICT-v3.2.md \
1825
+ || ! grep -Fq 'archived static comparison helper for pre-cutover auto-resolve' benchmark/auto-resolve/measure-static.py \
1826
+ || ! grep -Fq 'is not current solo<pair evidence' benchmark/auto-resolve/measure-static.py; then
1827
+ offenders="${offenders}"$'\n'"archived v3 benchmark artifacts must be clearly marked as non-current solo<pair evidence"
1828
+ fi
1829
+ if ! grep -Fq 'l2_risk_probes` | current measured pair path' benchmark/auto-resolve/run-real-benchmark.md \
1830
+ || ! grep -Fq 'Dry-runs, lint, and shell tests prove wiring only' benchmark/auto-resolve/run-real-benchmark.md; then
1831
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/run-real-benchmark.md: real-run docs must name the measured pair arm and distinguish wiring checks from scores"
1832
+ fi
1833
+ if grep -RInE 'F27.*unmeasured|unmeasured.*F27' \
1834
+ benchmark/auto-resolve/README.md \
1835
+ benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1836
+ benchmark/auto-resolve/run-real-benchmark.md \
1837
+ benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md >/dev/null 2>&1; then
1838
+ offenders="${offenders}"$'\n'"F27 docs must record the measured headroom failure, not stale unmeasured-candidate wording"
1839
+ fi
1840
+ if [ -d benchmark/auto-resolve/fixtures/F27-cli-subscription-proration ]; then
1841
+ offenders="${offenders}"$'\n'"F27 must stay out of active golden fixtures after failing headroom; keep it under fixtures/retired/"
1842
+ fi
1843
+ if [ ! -f benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md ]; then
1844
+ offenders="${offenders}"$'\n'"retired F27 must keep RETIRED.md with the measured rejection reason"
1845
+ fi
1846
+ if ! grep -Fq '"benchmark/auto-resolve/fixtures/retired/F*/**"' package.json; then
1847
+ offenders="${offenders}"$'\n'"package.json must include retired fixtures so replay artifacts ship in npm packages"
1848
+ fi
1849
+ if ! grep -Fq '20260511-f27-headroom-smoke-061401' benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md \
1850
+ || ! grep -Fq '20260511-f27-headroom-smoke-061401' benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md; then
1851
+ offenders="${offenders}"$'\n'"F27 notes must cite the measured headroom smoke run"
1852
+ fi
1853
+ if grep -Fq 'better candidate for pair risk probes' benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md; then
1854
+ offenders="${offenders}"$'\n'"F27 notes must not describe rejected F27 as a better pair-risk-probe candidate"
1855
+ fi
1856
+ if grep -Fq 'F16-cli-quote-tax-rules F27-cli-subscription-proration' \
1857
+ benchmark/auto-resolve/README.md benchmark/auto-resolve/run-real-benchmark.md; then
1858
+ offenders="${offenders}"$'\n'"benchmark docs must not recommend rejected F27 in headroom/full-pipeline command examples"
1859
+ fi
1860
+ if [ -d benchmark/auto-resolve/fixtures/F28-cli-return-authorization ]; then
1861
+ offenders="${offenders}"$'\n'"F28 must stay out of active golden fixtures after corrected-oracle reverify failed headroom; keep it under fixtures/retired/"
1862
+ fi
1863
+ if [ ! -f benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md ]; then
1864
+ offenders="${offenders}"$'\n'"retired F28 must keep RETIRED.md with the corrected-oracle rejection reason"
1865
+ fi
1866
+ if ! grep -Fq '20260511-f28-headroom-smoke-085307' benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md \
1867
+ || ! grep -Fq '20260511-f28-pair-smoke-091021' benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md \
1868
+ || ! grep -Fq '20260511-f28-policy-oraclefix-reverified-pair' benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md \
1869
+ || ! grep -Fq '20260511-f28-policy-oraclefix-reverified-pair' benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md \
1870
+ || ! grep -Fq '20260511-f28-headroom-smoke-085307' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1871
+ || ! grep -Fq '20260511-f28-pair-smoke-091021' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1872
+ || ! grep -Fq '20260511-f28-headroom-smoke-085307' benchmark/auto-resolve/README.md \
1873
+ || ! grep -Fq '20260511-f28-pair-smoke-091021' benchmark/auto-resolve/README.md \
1874
+ || ! grep -Fq '20260511-f28-headroom-smoke-085307' benchmark/auto-resolve/run-real-benchmark.md \
1875
+ || ! grep -Fq '20260511-f28-pair-smoke-091021' benchmark/auto-resolve/run-real-benchmark.md; then
1876
+ offenders="${offenders}"$'\n'"F28 docs must cite measured smoke and corrected-oracle rejection run ids before anyone counts it"
1877
+ fi
1878
+ if grep -Fq 'F16-cli-quote-tax-rules F28-cli-return-authorization' \
1879
+ benchmark/auto-resolve/README.md benchmark/auto-resolve/run-real-benchmark.md README.md; then
1880
+ offenders="${offenders}"$'\n'"benchmark docs must not recommend unstable F28 in pair-evidence command examples"
1881
+ fi
1882
+ if ! bash -n benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh >/dev/null 2>&1; then
1883
+ offenders="${offenders}"$'\n'"pair-rejected-fixtures.sh must be valid bash"
1884
+ fi
1885
+ if ! grep -Fq 'trivial calibration fixture' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1886
+ || ! grep -Fq '20260512-f2-medium-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1887
+ || ! grep -Fq '20260511-f3-http-error-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1888
+ || ! grep -Fq '20260512-f4-web-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1889
+ || ! grep -Fq '20260512-f5-fixloop-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1890
+ || ! grep -Fq '20260512-f6-checksum-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1891
+ || ! grep -Fq '20260512-f7-scope-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1892
+ || ! grep -Fq 'known-limit ambiguity fixture' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1893
+ || ! grep -Fq '20260512-f9-e2e-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1894
+ || ! grep -Fq '20260507-f10-f11-tier1-full-pipeline' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1895
+ || ! grep -Fq '20260511-f12-webhook-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1896
+ || ! grep -Fq '20260511-f15-concurrency-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1897
+ || ! grep -Fq 'bare 94 / solo_claude 98 in 20260508-f22-exact-error-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1898
+ || ! grep -Fq '20260508-f26-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1899
+ || ! grep -Fq '20260511-f27-headroom-smoke-061401' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1900
+ || ! grep -Fq '20260511-f28-policy-oraclefix-reverified-pair' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1901
+ || ! grep -Fq '20260510-f29-headroom-v2' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1902
+ || ! grep -Fq '20260511-f30-headroom-v1' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1903
+ || ! grep -Fq '20260512-f31-seat-rebalance-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1904
+ || ! grep -Fq '20260512-f32-subscription-renewal-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1905
+ || ! grep -Fq '20260513-s2-inventory-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1906
+ || ! grep -Fq '20260513-s3-ticket-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1907
+ || ! grep -Fq '20260513-s4-return-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1908
+ || ! grep -Fq '20260513-s5-credit-headroom' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1909
+ || ! grep -Fq '20260514-s6-refund-headroom-v1' benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh \
1910
+ || ! grep -Fq 'source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
1911
+ || ! grep -Fq 'source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
1912
+ || ! grep -Fq 'fixture_smoke_only' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
1913
+ || ! grep -Fq 'fixture_smoke_only' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
1914
+ || ! grep -Fq 'smoke-only-s1-provider-run' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
1915
+ || ! grep -Fq 'smoke-only-s1-provider-run' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
1916
+ || ! grep -Fq 'smoke-only-s1-cli-headroom' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1917
+ || ! grep -Fq 'smoke-only-s1-cli-pair' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
1918
+ || ! grep -Fq 'declare -F rejected_pair_fixture_reason' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
1919
+ || ! grep -Fq 'declare -F rejected_pair_fixture_reason' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
1920
+ || ! grep -Fq 'rejected-f31-fixture' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
1921
+ || ! grep -Fq 'rejected-f32-fixture' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
1922
+ || ! grep -Fq 'rejected-s6-shadow-fixture' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
1923
+ || ! grep -Fq 'rejected-f31-fixture' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
1924
+ || ! grep -Fq 'rejected-f32-fixture' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
1925
+ || ! grep -Fq 'rejected-s6-shadow-fixture' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
1926
+ || ! grep -Fq 'load_rejected_short_ids' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
1927
+ || ! grep -Fq 'load_rejected_fixture_reasons' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
1928
+ || ! grep -Fq 'rejected_excluded' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
1929
+ || ! grep -Fq 'rejected_excluded_reasons' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
1930
+ || ! grep -Fq 'rejected_excluded_reasons' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
1931
+ || ! grep -Fq 'selection_rule.rejected_excluded_reasons' benchmark/auto-resolve/README.md \
1932
+ || ! grep -Fq 'rejected_excluded_reasons' benchmark/auto-resolve/run-real-benchmark.md \
1933
+ || ! grep -Fq 'fixtures_pair_eligible must not be empty' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
1934
+ || ! grep -Fq 'gate3_threshold_count must be a positive integer' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
1935
+ || ! grep -Fq 'selection_rule.rejected_excluded_reasons keys must match rejected_excluded' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
1936
+ || ! grep -Fq 'nan-threshold-manifest' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
1937
+ || ! grep -Fq 'bad-rejected-reasons-manifest' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
1938
+ || ! grep -Fq 'fixture rejected for pair-candidate runs' benchmark/auto-resolve/scripts/headroom-gate.py \
1939
+ || ! grep -Fq 'fixture rejected for pair-candidate runs' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
1940
+ || ! grep -Fq 'rejected fixture registry missing' benchmark/auto-resolve/scripts/headroom-gate.py \
1941
+ || ! grep -Fq 'rejected fixture registry missing' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
1942
+ || ! grep -Fq '([FS]\d+)-\*\|([FS]\d+)' benchmark/auto-resolve/scripts/headroom-gate.py \
1943
+ || ! grep -Fq '([FS]\d+)-\*\|([FS]\d+)' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
1944
+ || ! grep -Fq '([FS]\d+)-\*\|([FS]\d+)' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
1945
+ || ! grep -Fq '([FS]\d+)-\*\|([FS]\d+)' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
1946
+ || ! grep -Fq '([FS]\d+)-\*\|([FS]\d+)' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
1947
+ || ! grep -Fq 'missing-rejected-registry' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
1948
+ || ! grep -Fq 'missing-rejected-registry' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
1949
+ || ! grep -Fq 'rejected-shadow-direct' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
1950
+ || ! grep -Fq 'rejected-shadow-direct' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
1951
+ || ! grep -Fq 's-only-registry' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
1952
+ || ! grep -Fq 's-only-registry' benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh \
1953
+ || ! grep -Fq 'shadow solo ceiling' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
1954
+ || grep -Fq 'case "$fid" in' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
1955
+ || grep -Fq 'case "$fid" in' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh; then
1956
+ offenders="${offenders}"$'\n'"pair candidate runners, audits, and manifest builder must honor the shared rejected fixture registry, including F* fixtures and S* shadow controls, without duplicating the case table"
1957
+ fi
1958
+ if ! grep -Fq 'Pair-candidate status: rejected by design' benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md \
1959
+ || ! grep -Fq 'Pair-candidate status: rejected by design' benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md \
1960
+ || ! grep -Fq 'rejected by design' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1961
+ || ! grep -Fq 'calibration/known-limit controls' benchmark/auto-resolve/README.md \
1962
+ || ! grep -Fq 'calibration/known-limit controls' benchmark/auto-resolve/run-real-benchmark.md; then
1963
+ offenders="${offenders}"$'\n'"F1/F8 docs must mark calibration and known-limit controls as rejected by design for pair evidence"
1964
+ fi
1965
+ if ! grep -Fq '20260512-f2-medium-headroom' benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md \
1966
+ || ! grep -Fq '20260512-f2-medium-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1967
+ || ! grep -Fq '20260512-f2-medium-headroom' benchmark/auto-resolve/README.md \
1968
+ || ! grep -Fq '20260512-f2-medium-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
1969
+ offenders="${offenders}"$'\n'"F2 docs must cite the measured headroom rejection before anyone counts it"
1970
+ fi
1971
+ if ! grep -Fq '20260511-f12-webhook-headroom' benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md \
1972
+ || ! grep -Fq '20260511-f12-webhook-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1973
+ || ! grep -Fq '20260511-f12-webhook-headroom' benchmark/auto-resolve/README.md \
1974
+ || ! grep -Fq '20260511-f12-webhook-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
1975
+ offenders="${offenders}"$'\n'"F12 docs must cite the measured headroom rejection before anyone counts it"
1976
+ fi
1977
+ if ! grep -Fq '20260507-f10-f11-tier1-full-pipeline' benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md \
1978
+ || ! grep -Fq '20260507-f10-f11-tier1-full-pipeline' benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md \
1979
+ || ! grep -Fq '20260507-f10-f11-tier1-full-pipeline' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1980
+ || ! grep -Fq '20260507-f10-f11-tier1-full-pipeline' benchmark/auto-resolve/README.md \
1981
+ || ! grep -Fq '20260507-f10-f11-tier1-full-pipeline' benchmark/auto-resolve/run-real-benchmark.md; then
1982
+ offenders="${offenders}"$'\n'"F10/F11 docs must cite the measured headroom rejection before anyone counts them"
1983
+ fi
1984
+ if ! grep -Fq '20260511-f3-http-error-headroom' benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md \
1985
+ || ! grep -Fq '20260511-f3-http-error-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1986
+ || ! grep -Fq '20260511-f3-http-error-headroom' benchmark/auto-resolve/README.md \
1987
+ || ! grep -Fq '20260511-f3-http-error-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
1988
+ offenders="${offenders}"$'\n'"F3 docs must cite the measured headroom rejection before anyone counts it"
1989
+ fi
1990
+ if ! grep -Fq '20260512-f4-web-headroom' benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md \
1991
+ || ! grep -Fq '20260512-f4-web-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1992
+ || ! grep -Fq '20260512-f4-web-headroom' benchmark/auto-resolve/README.md \
1993
+ || ! grep -Fq '20260512-f4-web-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
1994
+ offenders="${offenders}"$'\n'"F4 docs must cite the measured headroom rejection before anyone counts it"
1995
+ fi
1996
+ if ! grep -Fq '20260512-f5-fixloop-headroom' benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md \
1997
+ || ! grep -Fq '20260512-f5-fixloop-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
1998
+ || ! grep -Fq '20260512-f5-fixloop-headroom' benchmark/auto-resolve/README.md \
1999
+ || ! grep -Fq '20260512-f5-fixloop-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
2000
+ offenders="${offenders}"$'\n'"F5 docs must cite the measured headroom rejection before anyone counts it"
2001
+ fi
2002
+ if ! grep -Fq '20260512-f6-checksum-headroom' benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md \
2003
+ || ! grep -Fq '20260512-f6-checksum-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2004
+ || ! grep -Fq '20260512-f6-checksum-headroom' benchmark/auto-resolve/README.md \
2005
+ || ! grep -Fq '20260512-f6-checksum-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
2006
+ offenders="${offenders}"$'\n'"F6 docs must cite the measured headroom rejection before anyone counts it"
2007
+ fi
2008
+ if ! grep -Fq '20260512-f7-scope-headroom' benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md \
2009
+ || ! grep -Fq '20260512-f7-scope-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2010
+ || ! grep -Fq '20260512-f7-scope-headroom' benchmark/auto-resolve/README.md \
2011
+ || ! grep -Fq '20260512-f7-scope-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
2012
+ offenders="${offenders}"$'\n'"F7 docs must cite the measured headroom rejection before anyone counts it"
2013
+ fi
2014
+ if ! grep -Fq '20260512-f9-e2e-headroom' benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md \
2015
+ || ! grep -Fq '20260512-f9-e2e-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2016
+ || ! grep -Fq '20260512-f9-e2e-headroom' benchmark/auto-resolve/README.md \
2017
+ || ! grep -Fq '20260512-f9-e2e-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
2018
+ offenders="${offenders}"$'\n'"F9 docs must cite the measured headroom rejection before anyone counts it"
2019
+ fi
2020
+ if ! grep -Fq '20260511-f15-concurrency-headroom' benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md \
2021
+ || ! grep -Fq '20260511-f15-concurrency-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2022
+ || ! grep -Fq '20260511-f15-concurrency-headroom' benchmark/auto-resolve/README.md \
2023
+ || ! grep -Fq '20260511-f15-concurrency-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
2024
+ offenders="${offenders}"$'\n'"F15 docs must cite the measured headroom rejection before anyone counts it"
2025
+ fi
2026
+ if ! grep -Fq '20260512-f31-seat-rebalance-headroom' benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md \
2027
+ || ! grep -Fq '20260512-f31-seat-rebalance-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2028
+ || ! grep -Fq '20260512-f31-seat-rebalance-headroom' benchmark/auto-resolve/README.md \
2029
+ || ! grep -Fq '20260512-f31-seat-rebalance-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
2030
+ offenders="${offenders}"$'\n'"F31 docs must cite the measured headroom rejection before anyone counts it"
2031
+ fi
2032
+ if grep -Fq 'execFileSync' benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js \
2033
+ || ! grep -Fq "assert.strictEqual(result.stderr, '')" benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js \
2034
+ || ! grep -Fq 'assert.deepStrictEqual(parsed, {' benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js \
2035
+ || grep -Fq 'parsed.applied' benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js \
2036
+ || ! grep -Fq 'On success, write exactly one JSON object to stdout and no stderr. Keys: `applied`, `rejected`, `accounts`.' benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json \
2037
+ || [ "$(grep -Fc 'On success, write exactly one JSON object to stdout and no stderr. Keys: `applied`, `rejected`, `accounts`.' benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json)" -ne 1 ]; then
2038
+ offenders="${offenders}"$'\n'"F31 priority verifier must bind success stderr/no-extra-output contract"
2039
+ fi
2040
+ if ! grep -Fq '20260512-f32-subscription-renewal-headroom' benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md \
2041
+ || ! grep -Fq '20260512-f32-subscription-renewal-headroom' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2042
+ || ! grep -Fq '20260512-f32-subscription-renewal-headroom' benchmark/auto-resolve/README.md \
2043
+ || ! grep -Fq '20260512-f32-subscription-renewal-headroom' benchmark/auto-resolve/run-real-benchmark.md; then
2044
+ offenders="${offenders}"$'\n'"F32 docs must cite the measured headroom rejection before anyone counts it"
2045
+ fi
2046
+ if ! grep -Fq 'assert.deepEqual(output, {' benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js \
2047
+ || ! grep -Fq 'Output row key names and nested `credits` key names match the visible spec exactly, with no aliased or extra keys.' benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json \
2048
+ || [ "$(grep -Fc 'Output row key names and nested `credits` key names match the visible spec exactly, with no aliased or extra keys.' benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json)" -ne 1 ]; then
2049
+ offenders="${offenders}"$'\n'"F32 priority verifier must own exact output-shape contract without duplicate-error overclaim"
2050
+ fi
2051
+ if ! grep -Fq 'retired_fixture_exists' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
2052
+ || ! grep -Fq 'retired_fixture_exists' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2053
+ || ! grep -Fq 'fixture is retired and is not rerun by pair-candidate runners' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
2054
+ || ! grep -Fq 'fixture is retired and is not rerun by pair-candidate runners' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2055
+ || ! grep -Fq 'historical artifact replay' benchmark/auto-resolve/README.md \
2056
+ || ! grep -Fq 'historical artifact replay' benchmark/auto-resolve/run-real-benchmark.md; then
2057
+ offenders="${offenders}"$'\n'"pair candidate runners must reject retired fixtures explicitly and docs must reserve retired fixtures for historical artifact replay"
2058
+ fi
2059
+ if ! grep -Fq 'F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules' benchmark/auto-resolve/run-real-benchmark.md; then
2060
+ offenders="${offenders}"$'\n'"run-real-benchmark.md examples must use the current measured F16/F23/F25 pair-evidence set"
2061
+ fi
2062
+ if ! grep -Fq '20260510-f16-f23-f25-combined-proof' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2063
+ || ! grep -Fq '20260510-f16-f23-f25-combined-proof' benchmark/auto-resolve/README.md \
2064
+ || ! grep -Fq '20260510-f16-f23-f25-combined-proof' benchmark/auto-resolve/run-real-benchmark.md; then
2065
+ offenders="${offenders}"$'\n'"benchmark docs must cite the current F16/F23/F25 pair proof run"
2066
+ fi
2067
+ if ! grep -Fq 'average pair margin +25.3' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2068
+ || ! grep -Fq 'average pair margin +25.3' benchmark/auto-resolve/README.md \
2069
+ || ! grep -Fq 'average pair margin was `+25.3`' benchmark/auto-resolve/run-real-benchmark.md; then
2070
+ offenders="${offenders}"$'\n'"benchmark docs must cite the F16/F23/F25 average pair margin"
2071
+ fi
2072
+ if ! grep -Fq 'average solo_claude headroom **8.0**' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2073
+ || ! grep -Fq 'minimum solo_claude' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2074
+ || ! grep -Fq '| Fixture | Bare | Solo_claude | Pair (`l2_risk_probes`) | Margin | Pair mode | Wall ratio |' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2075
+ || ! grep -Fq 'include `bare` headroom and `solo_claude` headroom' benchmark/auto-resolve/run-real-benchmark.md \
2076
+ || ! grep -Fq 'default 5-point `bare`/`solo_claude` headroom margins' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2077
+ || ! grep -Fq 'average and minimum `bare`/`solo_claude`' benchmark/auto-resolve/README.md \
2078
+ || ! grep -Fq 'average and minimum headroom' benchmark/auto-resolve/run-real-benchmark.md \
2079
+ || ! grep -Fq 'fixture pass count' benchmark/auto-resolve/README.md \
2080
+ || ! grep -Fq 'fixture pass count' benchmark/auto-resolve/run-real-benchmark.md; then
2081
+ offenders="${offenders}"$'\n'"benchmark docs must cite and explain headroom set summaries"
2082
+ fi
2083
+ if ! grep -Fq 'both baseline arms evidence-complete' \
2084
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md \
2085
+ || ! grep -Fq 'both baseline arms evidence-complete' \
2086
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json \
2087
+ || ! grep -Fq 'Fixtures passed: 3/3 (minimum required: 3)' \
2088
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md \
2089
+ || ! grep -Fq 'Average solo_claude headroom: 8.0' \
2090
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md \
2091
+ || ! grep -Fq '"min_solo_headroom": 5' \
2092
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json \
2093
+ || ! grep -Fq '| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Status | Reason |' \
2094
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md \
2095
+ || ! grep -Fq '| F16-cli-quote-tax-rules | 50 | 10 | 75 | 5 | PASS | |' \
2096
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md \
2097
+ || ! grep -Fq '"solo_headroom": 14' \
2098
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json; then
2099
+ offenders="${offenders}"$'\n'"tracked F16/F23/F25 headroom report must use current evidence-complete wording and headroom columns"
2100
+ fi
2101
+ if ! grep -Fq 'l2_risk_probes evidence-clean' \
2102
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md \
2103
+ || ! grep -Fq 'l2_risk_probes must be evidence-clean' \
2104
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json \
2105
+ || ! grep -Fq 'Fixtures passed: 3/3 (minimum required: 3)' \
2106
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md \
2107
+ || ! grep -Fq 'Average pair margin: +25.3' \
2108
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md \
2109
+ || ! grep -Fq '"avg_pair_margin": 25.333333333333332' \
2110
+ benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json; then
2111
+ offenders="${offenders}"$'\n'"tracked F16/F23/F25 pair report must use current l2_risk_probes evidence-clean wording and average pair margin"
2112
+ fi
2113
+ if ! grep -Fq 'evidence-complete `bare <= 60`' benchmark/auto-resolve/README.md \
2114
+ || ! grep -Fq 'evidence-complete `bare <= 60`' benchmark/auto-resolve/run-real-benchmark.md; then
2115
+ offenders="${offenders}"$'\n'"benchmark docs must describe baseline headroom arms as evidence-complete, not correctness-clean"
2116
+ fi
2117
+ if ! grep -Fq 'min-bare-headroom' benchmark/auto-resolve/scripts/headroom-gate.py \
2118
+ || ! grep -Fq 'min-solo-headroom' benchmark/auto-resolve/scripts/headroom-gate.py \
2119
+ || ! grep -Fq 'min-bare-headroom' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2120
+ || ! grep -Fq 'min-solo-headroom' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2121
+ || ! grep -Fq 'headroom >= 5' benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md \
2122
+ || ! grep -Fq 'headroom >= 5' benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md \
2123
+ || ! grep -Fq 'default minimum 5-point `bare`/`solo_claude` headroom margin' benchmark/auto-resolve/README.md \
2124
+ || ! grep -Fq 'default minimum 5-point `bare`/`solo_claude` headroom margin' benchmark/auto-resolve/run-real-benchmark.md \
2125
+ || ! grep -Fq 'default 5-point `bare`/`solo_claude` headroom margins' benchmark/auto-resolve/README.md \
2126
+ || ! grep -Fq 'default 5-point `bare`/`solo_claude` headroom margins' benchmark/auto-resolve/BENCHMARK-DESIGN.md \
2127
+ || ! grep -Fq 'bare headroom 1 < 5' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2128
+ || ! grep -Fq 'bare headroom 1 < 5' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh; then
2129
+ offenders="${offenders}"$'\n'"benchmark headroom gates must reject threshold-fragile candidates with default 5-point baseline headroom margins"
2130
+ fi
2131
+ if [ -d benchmark/auto-resolve/results/20260511-f28-headroom-smoke-085307/F28-cli-return-authorization ]; then
2132
+ if python3 benchmark/auto-resolve/scripts/headroom-gate.py \
2133
+ --run-id 20260511-f28-headroom-smoke-085307 \
2134
+ --min-fixtures 1 \
2135
+ --out-json /tmp/devlyn-lint-f28-headroom-gate.json \
2136
+ --out-md /tmp/devlyn-lint-f28-headroom-gate.md >/dev/null 2>&1; then
2137
+ offenders="${offenders}"$'\n'"current local F28 headroom artifact must fail the default 5-point headroom margin gate"
2138
+ elif ! grep -Fq 'bare headroom 1 < 5' /tmp/devlyn-lint-f28-headroom-gate.md; then
2139
+ offenders="${offenders}"$'\n'"current local F28 headroom failure must cite bare headroom 1 < 5"
2140
+ fi
2141
+ fi
2142
+ if [ -d benchmark/auto-resolve/results/20260511-f28-pair-smoke-091021/F28-cli-return-authorization ]; then
2143
+ if python3 benchmark/auto-resolve/scripts/headroom-gate.py \
2144
+ --run-id 20260511-f28-pair-smoke-091021 \
2145
+ --min-fixtures 1 \
2146
+ --out-json /tmp/devlyn-lint-f28-pair-smoke-headroom-gate.json \
2147
+ --out-md /tmp/devlyn-lint-f28-pair-smoke-headroom-gate.md >/dev/null 2>&1; then
2148
+ offenders="${offenders}"$'\n'"current local F28 pair-smoke artifact must fail before pair evidence can be counted"
2149
+ elif ! grep -Fq 'bare score 65 > 60' /tmp/devlyn-lint-f28-pair-smoke-headroom-gate.md; then
2150
+ offenders="${offenders}"$'\n'"current local F28 pair-smoke failure must cite bare score 65 > 60"
2151
+ fi
2152
+ fi
2153
+ if ! grep -Fq 'startup `Gate:` line' benchmark/auto-resolve/README.md \
2154
+ || ! grep -Fq 'startup `Gate:` line' benchmark/auto-resolve/run-real-benchmark.md \
2155
+ || ! grep -Fq 'headroom gate passed — candidate set accepted' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
2156
+ || ! grep -Fq 'headroom gate failed — candidate set rejected' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
2157
+ || ! grep -Fq 'candidate set was accepted or rejected' benchmark/auto-resolve/README.md \
2158
+ || ! grep -Fq 'candidate set was accepted or rejected' benchmark/auto-resolve/run-real-benchmark.md \
2159
+ || ! grep -Fq 'fixture score table with bare score, bare' benchmark/auto-resolve/README.md \
2160
+ || ! grep -Fq 'headroom, solo_claude score, solo_claude headroom' benchmark/auto-resolve/README.md \
2161
+ || ! grep -Fq 'remaining headroom against' benchmark/auto-resolve/run-real-benchmark.md \
2162
+ || ! grep -Fq 'pair minus `solo_claude` margin' benchmark/auto-resolve/run-real-benchmark.md \
2163
+ || ! grep -Fq 'Fixture Bare Solo_claude Pair Pair-Solo_claude' benchmark/auto-resolve/run-real-benchmark.md \
2164
+ || ! grep -Fq 'fixture score table with bare,' benchmark/auto-resolve/README.md \
2165
+ || ! grep -Fq 'solo_claude, pair, margin, pair-mode' benchmark/auto-resolve/README.md \
2166
+ || ! grep -Fq 'pair above `solo_claude`' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
2167
+ || ! grep -Fq 'Suite average variant-solo_claude margin' benchmark/auto-resolve/BENCHMARK-DESIGN.md \
2168
+ || ! grep -Fq 'legacy `variant`-`bare` (L2-L0)' benchmark/auto-resolve/RUBRIC.md \
2169
+ || ! grep -Fq '`solo_claude`-`bare` measures solo harness value; pair-`solo_claude` measures pair value' benchmark/auto-resolve/README.md \
2170
+ || ! grep -Fq '| fixture | bare | solo_claude | solo_claude-bare |' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
2171
+ || ! grep -Fq '| fixture | bare | solo_claude | solo_claude-bare |' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2172
+ || ! grep -Fq '| fixture | bare | solo_claude | pair | pair-solo_claude |' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2173
+ || ! grep -Fq 'startup `Headroom:` and `Pair:` lines' benchmark/auto-resolve/run-real-benchmark.md \
2174
+ || ! grep -Fq 'startup `Headroom:` / `Pair:` lines' benchmark/auto-resolve/README.md; then
2175
+ offenders="${offenders}"$'\n'"benchmark docs must describe real-run startup gate lines and headroom score columns"
2176
+ fi
2177
+ if ! grep -Fq 'DEVLYN_BENCHMARK_CLI_SUBCOMMAND: benchmarkMode' bin/devlyn.js \
2178
+ || ! grep -Fq 'npx devlyn-cli benchmark headroom --run-id "$RUN_ID"' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
2179
+ || ! grep -Fq 'npx devlyn-cli benchmark pair --run-id "$RUN_ID"' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2180
+ || ! grep -Fq 'canonical trigger, margin >= +' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2181
+ || ! grep -Fq 'canonical trigger, margin >= +5' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2182
+ || ! grep -Fq 'headroom gate failed — pair arm not executed' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2183
+ || ! grep -Fq 'pair gate failed — pair evidence rejected' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2184
+ || ! grep -Fq 'headroom gate passed — executing $PAIR_ARM' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2185
+ || ! grep -Fq 'pair gate passed — pair evidence accepted' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2186
+ || ! grep -Fq 'headroom fails, the runner explicitly says the pair arm was not executed' benchmark/auto-resolve/README.md \
2187
+ || ! grep -Fq 'pair gate fails, it explicitly says pair evidence was rejected' benchmark/auto-resolve/README.md \
2188
+ || ! grep -Fq 'then that pair evidence was accepted' benchmark/auto-resolve/README.md \
2189
+ || ! grep -Fq 'If headroom fails, it reports that the pair arm was not executed' benchmark/auto-resolve/run-real-benchmark.md \
2190
+ || ! grep -Fq 'pair gate fails, it reports that pair evidence was rejected' benchmark/auto-resolve/run-real-benchmark.md \
2191
+ || ! grep -Fq 'then that pair evidence was' benchmark/auto-resolve/run-real-benchmark.md \
2192
+ || ! grep -Fq 'accepted. When launched through' benchmark/auto-resolve/run-real-benchmark.md \
2193
+ || ! grep -Fq 'the replay `Command:` uses the' benchmark/auto-resolve/README.md \
2194
+ || ! grep -Fq 'same package CLI path' benchmark/auto-resolve/README.md \
2195
+ || ! grep -Fq 'the replay command uses' benchmark/auto-resolve/run-real-benchmark.md \
2196
+ || ! grep -Fq 'uses that same' benchmark/auto-resolve/run-real-benchmark.md \
2197
+ || ! grep -Fq 'package CLI path' benchmark/auto-resolve/run-real-benchmark.md; then
2198
+ offenders="${offenders}"$'\n'"benchmark CLI headroom/pair runs must replay as npx devlyn-cli commands and docs must state that"
2199
+ fi
2200
+ if ! grep -Fq 'npx devlyn-cli benchmark headroom --min-fixtures 3 F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules' README.md \
2201
+ || ! grep -Fq 'npx devlyn-cli benchmark frontier --out-md /tmp/devlyn-pair-frontier.md' README.md \
2202
+ || ! grep -Fq 'npx devlyn-cli benchmark pair --min-fixtures 3 --max-pair-solo-wall-ratio 3 F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules' README.md \
2203
+ || ! grep -Fq 'average pair margin' README.md \
2204
+ || ! grep -Fq 'default 5-point `bare`/`solo_claude` headroom margins' README.md \
2205
+ || ! grep -Fq 'Add `--dry-run` to either score runner' README.md \
2206
+ || ! grep -Fq 'fixture count, and the replay command' README.md \
2207
+ || ! grep -Fq 'Dry-runs' README.md \
2208
+ || ! grep -Fq 'and lint prove wiring only; real score claims must cite the run id and fixture' README.md; then
2209
+ offenders="${offenders}"$'\n'"README.md must expose score-focused benchmark headroom/pair CLI paths and distinguish wiring checks from real scores"
2210
+ fi
2211
+ if ! grep -Fq 'npx devlyn-cli benchmark pair --min-fixtures 3 --max-pair-solo-wall-ratio 3 F16-cli-quote-tax-rules F23-cli-fulfillment-wave F25-cli-cart-promotion-rules' bin/devlyn.js \
2212
+ || ! grep -Fq -- '--max-pair-solo-wall-ratio N default: 3' bin/devlyn.js \
2213
+ || ! grep -Fq 'MAX_PAIR_SOLO_WALL_RATIO=3' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2214
+ || ! grep -Fq 'parser.add_argument("--max-pair-solo-wall-ratio", type=positive_float, default=3.0)' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2215
+ || ! grep -Fq '"max_observed_pair_solo_wall_ratio": max(ratios) if ratios else None' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2216
+ || ! grep -Fq 'Allowed pair/solo wall ratio' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2217
+ || ! grep -Fq 'Maximum observed pair/solo wall ratio' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2218
+ || ! grep -Fq 'Maximum observed pair/solo wall ratio: 2.00x' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2219
+ || ! grep -Fq 'separates the allowed pair/solo wall ratio from the maximum observed pair/solo wall ratio' benchmark/auto-resolve/run-real-benchmark.md \
2220
+ || ! grep -Fq 'python3 "$GATE" --results-root "$TMP_DIR" --run-id slow-pair' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2221
+ || ! grep -Fq -- '--min-fixtures 3' benchmark/auto-resolve/run-real-benchmark.md \
2222
+ || ! grep -Fq -- '--min-fixtures 3' benchmark/auto-resolve/README.md \
2223
+ || ! grep -Fq 'average pair margin' benchmark/auto-resolve/run-real-benchmark.md \
2224
+ || ! grep -Fq 'average pair margin' benchmark/auto-resolve/README.md \
2225
+ || ! grep -Fq -- '--dry-run validate args/fixtures and print replay command only' bin/devlyn.js \
2226
+ || ! grep -Fq 'does not produce' benchmark/auto-resolve/run-real-benchmark.md \
2227
+ || ! grep -Fq 'scores. When showing' benchmark/auto-resolve/run-real-benchmark.md; then
2228
+ offenders="${offenders}"$'\n'"benchmark pair examples must explicitly gate the current F16/F23/F25 proof set with --min-fixtures 3"
2229
+ fi
2230
+ if ! grep -Fq 'use 3 for F16/F23/F25 proof reruns; audit requires 4 passing evidence rows' bin/devlyn.js \
2231
+ || ! grep -Fq 'use 3 for F16/F23/F25 proof reruns; audit requires 4 passing evidence rows' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh; then
2232
+ offenders="${offenders}"$'\n'"benchmark CLI help must distinguish proof reruns from the four-row release audit"
2233
+ fi
2234
+ if ! grep -Fq 'judge blind mapping missing' benchmark/auto-resolve/scripts/headroom-gate.py \
2235
+ || ! grep -Fq 'judge blind mapping missing' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2236
+ || ! grep -Fq 'blind_mapping_arm_missing' benchmark/auto-resolve/scripts/compile-report.py \
2237
+ || ! grep -Fq 'def trusted_winner()' benchmark/auto-resolve/scripts/compile-report.py \
2238
+ || ! grep -Fq 'raw_findings_by_arm = judge.get("findings_by_arm")' benchmark/auto-resolve/scripts/compile-report.py \
2239
+ || ! grep -Fq 'def critical_findings_for(arm: str)' benchmark/auto-resolve/scripts/compile-report.py \
2240
+ || ! grep -Fq 'def exact_bool(value)' benchmark/auto-resolve/scripts/compile-report.py \
2241
+ || ! grep -Fq '"malformed_boolean_fields": malformed_boolean_fields' benchmark/auto-resolve/scripts/compile-report.py \
2242
+ || ! grep -Fq '"dq_judge_malformed": judge_dq_malformed' benchmark/auto-resolve/scripts/compile-report.py \
2243
+ || ! grep -Fq 'def bool_or_none(value)' benchmark/auto-resolve/scripts/ship-gate.py \
2244
+ || ! grep -Fq 'l1_dq_by_fixture: dict[str, bool]' benchmark/auto-resolve/scripts/ship-gate.py \
2245
+ || ! grep -Fq 'summary arms_present malformed' benchmark/auto-resolve/scripts/ship-gate.py \
2246
+ || ! grep -Fq 'summary arms_present.solo_claude malformed' benchmark/auto-resolve/scripts/ship-gate.py \
2247
+ || ! grep -Fq 'summary margins_avg malformed' benchmark/auto-resolve/scripts/ship-gate.py \
2248
+ || ! grep -Fq 'raw_findings_letters = chosen.get("critical_findings")' benchmark/auto-resolve/scripts/judge.sh \
2249
+ || ! grep -Fq 'raw_findings_letters = chosen.get("critical_findings")' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
2250
+ || ! grep -Fq 'raw_validation = judge.get("_axis_validation")' benchmark/auto-resolve/scripts/compile-report.py \
2251
+ || ! grep -Fq 'raw_validation = judge.get("_axis_validation")' benchmark/auto-resolve/scripts/headroom-gate.py \
2252
+ || ! grep -Fq 'raw_validation = judge.get("_axis_validation")' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2253
+ || ! grep -Fq 'raw_scores_by_arm = judge.get("scores_by_arm")' benchmark/auto-resolve/scripts/compile-report.py \
2254
+ || ! grep -Fq 'from pair_evidence_contract import is_score, is_strict_number' benchmark/auto-resolve/scripts/compile-report.py \
2255
+ || ! grep -Fq 'loads_strict_json_object(path.read_text())' benchmark/auto-resolve/scripts/compile-report.py \
2256
+ || ! grep -Fq 'parse_constant=reject_json_constant' benchmark/auto-resolve/scripts/ship-gate.py \
2257
+ || ! grep -Fq 'if arm in mapped_arms and is_score(score)' benchmark/auto-resolve/scripts/compile-report.py \
2258
+ || ! grep -Fq 'def strict_number(value)' benchmark/auto-resolve/scripts/compile-report.py \
2259
+ || ! grep -Fq 'raw_scores = judge.get("scores_by_arm")' benchmark/auto-resolve/scripts/headroom-gate.py \
2260
+ || ! grep -Fq 'raw_scores = judge.get("scores_by_arm")' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2261
+ || ! grep -Fq 'from pair_evidence_contract import is_score' benchmark/auto-resolve/scripts/headroom-gate.py \
2262
+ || ! grep -Fq 'return value if is_score(value) else None' benchmark/auto-resolve/scripts/headroom-gate.py \
2263
+ || ! grep -Fq 'raw_scores = judge.get("scores_by_arm")' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2264
+ || ! grep -Fq 'from pair_evidence_contract import is_score' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2265
+ || ! grep -Fq 'def exact_bool(value: object)' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2266
+ || ! grep -Fq 'def disqualifier_flag(value: object' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2267
+ || ! grep -Fq 'if not is_score(solo) or not is_score(bare):' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2268
+ || ! grep -Fq 'if is_score(score):' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2269
+ || ! grep -Fq 'return legacy if is_score(legacy) else None' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2270
+ || ! grep -Fq 'raw_scores = judge.get("scores_by_arm")' benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py \
2271
+ || ! grep -Fq 'raw_scores = judge.get("scores_by_arm")' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2272
+ || ! grep -Fq 'from pair_evidence_contract import (' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2273
+ || ! grep -Fq 'is_score,' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2274
+ || ! grep -Fq 'is_strict_number,' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2275
+ || ! grep -Fq 'loads_strict_json_object(p.read_text())' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2276
+ || ! grep -Fq 'parse_constant=reject_json_constant' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2277
+ || ! grep -Fq 'PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2278
+ || ! grep -Fq 'def is_pair_judge_verdict(value: object) -> bool:' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2279
+ || ! grep -Fq 'def exact_bool(value: object)' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2280
+ || ! grep -Fq 'def bool_flag(value: object' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2281
+ || ! grep -Fq 'if is_score(sba.get(arm)):' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2282
+ || ! grep -Fq 'return legacy if is_score(legacy) else None' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2283
+ || ! grep -Fq 'def strict_elapsed_seconds' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2284
+ || ! grep -Fq 'def timeout_flag(result: dict | None) -> bool:' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2285
+ || ! grep -Fq 'raw_scores = judge.get("scores_by_arm")' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
2286
+ || ! grep -Fq 'raw_dq_by_arm = judge.get("disqualifiers_by_arm")' benchmark/auto-resolve/scripts/headroom-gate.py \
2287
+ || ! grep -Fq 'raw_dq_by_arm = judge.get("disqualifiers_by_arm")' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2288
+ || ! grep -Fq 'raw_dq_by_arm = judge.get("disqualifiers_by_arm")' benchmark/auto-resolve/scripts/compile-report.py \
2289
+ || ! grep -Fq 'raw_by_arm = judge.get("disqualifiers_by_arm")' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2290
+ || ! grep -Fq 'raw_dba = judge.get("disqualifiers_by_arm")' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2291
+ || ! grep -Fq 'raw_legacy = judge.get("disqualifiers")' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2292
+ || ! grep -Fq 'raw_dqs = judge.get("disqualifiers")' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2293
+ || ! grep -Fq 'def is_score(value):' benchmark/auto-resolve/scripts/judge.sh \
2294
+ || ! grep -Fq 'invalid judge score value(s)' benchmark/auto-resolve/scripts/judge.sh \
2295
+ || ! grep -Fq 'invalid judge disqualifier value(s)' benchmark/auto-resolve/scripts/judge.sh \
2296
+ || ! grep -Fq 'invalid opus disqualifier value(s)' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
2297
+ || ! grep -Fq 'if arm is not None and key in chosen and is_score(chosen[key]):' benchmark/auto-resolve/scripts/judge.sh \
2298
+ || ! grep -Fq 'raw_dq_letters = chosen.get("disqualifiers")' benchmark/auto-resolve/scripts/judge.sh \
2299
+ || ! grep -Fq 'raw_dq_letters = chosen.get("disqualifiers")' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
2300
+ || ! grep -Fq 'scores_by_arm` alone is not evidence' benchmark/auto-resolve/README.md \
2301
+ || ! grep -Fq 'without the blind slot mapping is not score evidence' benchmark/auto-resolve/run-real-benchmark.md \
2302
+ || ! grep -Fq 'absent from the blind mapping is not score evidence' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2303
+ || ! grep -Fq 'wrong-pair-mapping' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2304
+ || ! grep -Fq 'from pair_evidence_contract import (' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2305
+ || ! grep -Fq 'ALLOWED_PAIR_ARMS,' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2306
+ || ! grep -Fq 'loads_strict_json_object,' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2307
+ || ! grep -Fq 'def bool_flag_failure(value: Any, true_reason: str, malformed_reason: str) -> str | None:' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2308
+ || ! grep -Fq 'def bool_flag_failure(value: object, true_reason: str, malformed_reason: str) -> str | None:' benchmark/auto-resolve/scripts/headroom-gate.py \
2309
+ || ! grep -Fq 'return value if is_score(value) else None' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2310
+ || ! grep -Fq 'if not is_strict_number(pair_elapsed) or not is_strict_number(solo_elapsed):' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2311
+ || ! grep -Fq 'return is_strict_number(value) and value >= 1.0' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2312
+ || ! grep -Fq 'value must be finite and > 0' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2313
+ || ! grep -Fq 'overrange-score' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2314
+ || ! grep -Fq 'boolean-score' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2315
+ || ! grep -Fq 'boolean-wall-time' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2316
+ || ! grep -Fq 'invalid-max-wall-ratio' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2317
+ || ! grep -Fq 'boolean-pair-verify-score' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2318
+ || ! grep -Fq 'malformed-pair-bool' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2319
+ || ! grep -Fq 'malformed-judge-bool' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2320
+ || ! grep -Fq 'wrong-mapping' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2321
+ || ! grep -Fq 'overrange-score' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2322
+ || ! grep -Fq 'boolean-score' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2323
+ || ! grep -Fq 'malformed-bare-bool' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2324
+ || ! grep -Fq 'bare judge disqualifier malformed' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2325
+ || ! grep -Fq 'variant-mapping-disqualifies' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2326
+ || ! grep -Fq 'solo-mapping-disqualifies' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2327
+ || ! grep -Fq 'stale-margin' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2328
+ || ! grep -Fq 'stale judge margins must be recomputed from trusted scores' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2329
+ || ! grep -Fq 'winner without blind-mapped trusted score must be null' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2330
+ || ! grep -Fq 'malformed scores_by_arm must not expose' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2331
+ || ! grep -Fq 'out-of-range scores_by_arm must not expose variant score' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2332
+ || ! grep -Fq 'boolean scores_by_arm must not expose solo score' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2333
+ || ! grep -Fq 'boolean result numeric fields must not appear in compile summary' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2334
+ || ! grep -Fq 'malformed-result-bool' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2335
+ || ! grep -Fq 'malformed-judge-bool' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2336
+ || ! grep -Fq 'malformed-l1-dq-summary' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2337
+ || ! grep -Fq 'malformed-arms-present-wrapper' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2338
+ || ! grep -Fq 'malformed-arms-present' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2339
+ || ! grep -Fq 'malformed-margins-avg-wrapper' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2340
+ || ! grep -Fq 'F1 L1 disqualifier malformed' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2341
+ || ! grep -Fq 'overrange C1 row fields must not promote F17' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2342
+ || ! grep -Fq 'string C1 disqualifier must not promote F18' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2343
+ || ! grep -Fq 'f9-overrange-scores' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2344
+ || ! grep -Fq 'f9-boolean-scores' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2345
+ || ! grep -Fq 'f9-string-dq-entry' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2346
+ || ! grep -Fq 'overrange-scores-results' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2347
+ || ! grep -Fq 'boolean-scores-results' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2348
+ || ! grep -Fq 'boolean-wall-results' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2349
+ || ! grep -Fq 'nan-wall-results' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2350
+ || ! grep -Fq 'string-timeout-results' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2351
+ || ! grep -Fq 'malformed-pair-state-results' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2352
+ || ! grep -Fq 'non-list finding entry must become a one-item list' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2353
+ || ! grep -Fq 'non-dict findings_by_arm must be ignored' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2354
+ || ! grep -Fq 'non-dict _axis_validation wrapper must not crash' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2355
+ || ! grep -Fq 'load_dict_json' benchmark/auto-resolve/scripts/compile-report.py \
2356
+ || ! grep -Fq 'measurement invalid: malformed summary.json' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2357
+ || ! grep -Fq 'nan-summary' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2358
+ || ! grep -Fq 'summary rows contain non-object entries' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2359
+ || ! grep -Fq 'malformed-summary-field-types' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2360
+ || ! grep -Fq 'variant axis count malformed' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2361
+ || ! grep -Fq 'result.json malformed' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2362
+ || ! grep -Fq 'result.json malformed' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2363
+ || ! grep -Fq 'non-dict variant result.json must fail closed' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2364
+ || ! grep -Fq 'NaN variant result.json must fail closed as a disqualifier' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2365
+ || ! grep -Fq '("solo_claude (L1)", "solo_claude")' benchmark/auto-resolve/scripts/compile-report.py \
2366
+ || ! grep -Fq '| Fixture | Category | variant (L2) | solo_claude (L1) | bare (L0) | variant-bare | solo_claude-bare | variant-solo_claude | Winner | Wall variant/solo_claude/bare | Wall variant/solo_claude | Wall variant/bare |' benchmark/auto-resolve/scripts/compile-report.py \
2367
+ || ! grep -Fq '| Fixture | Category | variant (L2) | solo_claude (L1) | bare (L0) | variant-bare | solo_claude-bare | variant-solo_claude | Winner | Wall variant/solo_claude/bare | Wall variant/solo_claude | Wall variant/bare |' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2368
+ || ! grep -Fq '**solo_claude (L1):**' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2369
+ || ! grep -Fq 'variant (L2) vs solo_claude (L1) margin avg' benchmark/auto-resolve/scripts/compile-report.py \
2370
+ || ! grep -Fq '**variant (L2) vs solo_claude (L1) margin avg:** +10.0' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2371
+ || ! grep -Fq 'Wall ratio variant (L2) / solo_claude (L1)' benchmark/auto-resolve/scripts/compile-report.py \
2372
+ || ! grep -Fq '**Wall ratio variant (L2) / solo_claude (L1):** 1.0x' benchmark/auto-resolve/scripts/test-ship-gate.sh \
2373
+ || ! grep -Fq 'malformed-scores' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2374
+ || ! grep -Fq 'malformed-scores' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2375
+ || ! grep -Fq 'malformed-dq-entry' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2376
+ || ! grep -Fq 'malformed-dq-entry' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2377
+ || ! grep -Fq 'malformed-axis-wrapper' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2378
+ || ! grep -Fq 'malformed-axis-wrapper' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2379
+ || ! grep -Fq 'fixture-nan-metadata' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2380
+ || ! grep -Fq 'fixture-nan-expected' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2381
+ || ! grep -Fq 'c1-summary malformed: expected object' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2382
+ || ! grep -Fq 'f9-judge malformed: expected object' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2383
+ || ! grep -Fq 'malformed C1 row fields must not promote F1' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2384
+ || ! grep -Fq 'F9 must not be included when scores_by_arm is malformed' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2385
+ || ! grep -Fq 'truthy malformed disqualifier entry must exclude F9' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2386
+ || ! grep -Fq 'disqualifiers": ["not", "a", "dict"]' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2387
+ || ! grep -Fq 'bad-mapping' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2388
+ || ! grep -Fq 'write_fixture_with_malformed_dq_entry' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2389
+ || ! grep -Fq 'write_fixture_with_malformed_legacy_dq' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2390
+ || ! grep -Fq 'write_fixture_with_string_dq_entry' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2391
+ || ! grep -Fq 'write_fixture_with_malformed_result' benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh \
2392
+ || ! grep -Fq 'disqualifiers": ["not", "a", "dict"]' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2393
+ || ! grep -Fq 'malformed compare.json for malformed-compare' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2394
+ || ! grep -Fq 'malformed compare.json for nan-compare: invalid JSON' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2395
+ || ! grep -Fq 'malformed-compare-sections' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2396
+ || ! grep -Fq 'malformed-verdict-fields' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2397
+ || ! grep -Fq 'malformed-elapsed-fields' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2398
+ || ! grep -Fq 'string-pair-mode' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2399
+ || ! grep -Fq 'def strict_positive_number(value):' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2400
+ || ! grep -Fq 'math.isfinite(value)' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2401
+ || ! grep -Fq 'def strict_nonnegative_int(value):' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2402
+ || ! grep -Fq 'def summary_findings_count(data):' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2403
+ || ! grep -Fq 'def severity_count_sum(data):' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2404
+ || ! grep -Fq 'def strict_greater(left, right):' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2405
+ || ! grep -Fq 'metadata = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2406
+ || ! grep -Fq 'metadata timeout_seconds must be a positive integer' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2407
+ || ! grep -Fq 'state = as_dict(loads_strict_json_object(state_path.read_text())) if state_path.is_file() else {}' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2408
+ || ! grep -Fq 'json.loads(line, parse_constant=reject_json_constant)' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2409
+ || ! grep -Fq 'summary = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2410
+ || ! grep -Fq 'expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2411
+ || ! grep -Fq 'out[arm] = loads_strict_json_object(path.read_text()) if path.is_file() else {"missing": True}' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2412
+ || ! grep -Fq 'def as_dict(value):' benchmark/auto-resolve/scripts/run-fixture.sh \
2413
+ || ! grep -Fq 'metadata = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())' benchmark/auto-resolve/scripts/run-fixture.sh \
2414
+ || ! grep -Fq 'metadata timeout_seconds must be a positive integer' benchmark/auto-resolve/scripts/run-fixture.sh \
2415
+ || ! grep -Fq 'expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())' benchmark/auto-resolve/scripts/run-fixture.sh \
2416
+ || ! grep -Fq 'verify = as_dict(loads_strict_json_object(pathlib.Path(result_dir, "verify.json").read_text()))' benchmark/auto-resolve/scripts/run-fixture.sh \
2417
+ || ! grep -Fq 'loads_strict_json_object(pathlib.Path(result_dir, "timing.json").read_text())' benchmark/auto-resolve/scripts/run-fixture.sh \
2418
+ || ! grep -Fq 'loads_strict_json_object(pathlib.Path(state_path).read_text())' benchmark/auto-resolve/scripts/run-fixture.sh \
2419
+ || ! grep -Fq 'phases = as_dict(state.get("phases"))' benchmark/auto-resolve/scripts/run-fixture.sh \
2420
+ || ! grep -Fq 'legacy_verify = as_dict(state.get("verify"))' benchmark/auto-resolve/scripts/run-fixture.sh \
2421
+ || ! grep -Fq 'data = raw_oracle' benchmark/auto-resolve/scripts/run-fixture.sh \
2422
+ || ! grep -Fq 'oracle artifact malformed or unreadable' benchmark/auto-resolve/scripts/run-fixture.sh \
2423
+ || ! grep -Fq '"type": "oracle-error"' benchmark/auto-resolve/scripts/run-fixture.sh \
2424
+ || ! grep -Fq 'verify["oracle_disqualifier"] = True' benchmark/auto-resolve/scripts/run-fixture.sh \
2425
+ || ! grep -Fq 'findings = raw_findings if isinstance(raw_findings, list) else []' benchmark/auto-resolve/scripts/run-fixture.sh \
2426
+ || ! grep -Fq 'if not isinstance(finding, dict):' benchmark/auto-resolve/scripts/run-fixture.sh \
2427
+ || ! grep -Fq 'def as_dict(value):' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2428
+ || ! grep -Fq 'phases = as_dict(state.get("phases"))' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2429
+ || ! grep -Fq 'legacy_verify = as_dict(state.get("verify"))' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2430
+ || ! grep -Fq 'PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}' benchmark/auto-resolve/scripts/run-fixture.sh \
2431
+ || ! grep -Fq 'def has_pair_judge_verdict(sub_verdicts):' benchmark/auto-resolve/scripts/run-fixture.sh \
2432
+ || ! grep -Fq 'PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2433
+ || ! grep -Fq 'def has_pair_judge_verdict(sub_verdicts):' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2434
+ || ! grep -Fq 'or verify_phase.get("pair_mode") is True' benchmark/auto-resolve/scripts/run-fixture.sh \
2435
+ || ! grep -Fq 'has_pair_judge_verdict(sub_verdicts) or verify_phase.get("pair_mode") is True' benchmark/auto-resolve/scripts/run-fixture.sh \
2436
+ || ! grep -Fq 'has_pair_judge_verdict(sub_verdicts)' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2437
+ || ! grep -Fq 'raw_pair_sub = pair.get("sub_verdicts")' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2438
+ || ! grep -Fq 'raw_pair_trigger = pair.get("pair_trigger")' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2439
+ || ! grep -Fq 'pair_trigger = raw_pair_trigger if isinstance(raw_pair_trigger, dict) else {}' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2440
+ || ! grep -Fq 'pair_mode_true = pair.get("pair_mode") is True' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2441
+ || ! grep -Fq 'def fmt_trigger_reasons(value):' benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2442
+ || ! grep -Fq "| Arm | Verdict | Pair mode | Triggers | Findings | Elapsed | Invoke exit | Failure |" benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh \
2443
+ || ! grep -Fq '| Arm | Verdict | Pair mode | Triggers | Findings | Elapsed | Invoke exit | Failure |' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2444
+ || ! grep -Fq 'pair_mode = pair.get("pair_mode") is True' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2445
+ || ! grep -Fq 'parse_constant=reject_json_constant' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2446
+ || ! grep -Fq 'loads_strict_json_object(path.read_text())' benchmark/auto-resolve/scripts/check-f9-artifacts.py \
2447
+ || ! grep -Fq 'invalid JSON numeric constant: NaN' benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh \
2448
+ || ! grep -Fq 'loads_strict_json_object(path.read_text(encoding="utf8"))' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2449
+ || ! grep -Fq 'def pair_trigger_failures(' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2450
+ || ! grep -Fq 'def pair_trigger_reasons(pair: dict[str, Any]) -> list[str]:' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2451
+ || ! grep -Fq 'pair_trigger_eligible(pair)' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2452
+ || ! grep -Fq '"pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(trigger_reasons)' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2453
+ || ! grep -Fq '"pair_trigger_has_canonical_reason": true' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2454
+ || ! grep -Fq 'failed attempt: pair trigger contract: ' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2455
+ || ! grep -Fq 'has_known_pair_trigger_reason(reasons)' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2456
+ || ! grep -Fq 'all_known_pair_trigger_reasons(trigger["reasons"])' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2457
+ || ! grep -Fq 'has_canonical_pair_trigger_reason(trigger["reasons"])' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2458
+ || ! grep -Fq 'path_has_actionable_solo_headroom_hypothesis(fixture_spec)' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2459
+ || ! grep -Fq 'pair_trigger reasons missing known trigger reason' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2460
+ || ! grep -Fq 'pair_trigger reasons contain unknown trigger reason' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2461
+ || ! grep -Fq 'pair_trigger reasons missing canonical trigger reason' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2462
+ || ! grep -Fq 'pair_trigger missing spec.solo_headroom_hypothesis' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2463
+ || ! grep -Fq -- '--require-hypothesis-trigger requires --fixtures-root' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2464
+ || ! grep -Fq 'swebench-missing-hypothesis-trigger-test' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2465
+ || ! grep -Fq -- '--require-hypothesis-trigger requires --fixtures-root' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2466
+ || ! grep -Fq 'has_known_pair_trigger_reason(reasons)' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2467
+ || ! grep -Fq 'all_known_pair_trigger_reasons(reasons)' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2468
+ || ! grep -Fq 'has_canonical_pair_trigger_reason(reasons)' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2469
+ || ! grep -Fq 'def pair_trigger_reasons(pair: dict[str, Any]) -> list[str]:' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2470
+ || ! grep -Fq 'path_has_actionable_solo_headroom_hypothesis(fixtures_root / fixture_id / "spec.md")' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2471
+ || ! grep -Fq 'pair_trigger missing spec.solo_headroom_hypothesis' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2472
+ || ! grep -Fq -- '--require-hypothesis-trigger' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2473
+ || ! grep -Fq -- '--require-hypothesis-trigger' benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh \
2474
+ || ! grep -Fq 'missing-hypothesis-trigger' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2475
+ || ! grep -Fq '"pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(trigger_reasons)' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2476
+ || ! grep -Fq '"pair_trigger_has_canonical_reason": true' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2477
+ || ! grep -Fq '| Run | Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Triggers | Wall ratio | External lift | Internal lift | Status | Reason |' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2478
+ || ! grep -Fq '| Run | Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Triggers | Wall ratio | External lift | Internal lift | Status | Reason |' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2479
+ || ! grep -Fq 'pair_trigger reasons missing known trigger reason' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2480
+ || ! grep -Fq 'pair_trigger reasons contain unknown trigger reason' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2481
+ || ! grep -Fq 'pair_trigger reasons missing canonical trigger reason' benchmark/auto-resolve/scripts/frozen-verify-gate.py \
2482
+ || ! grep -Fq 'normalized-canonical-pair-trigger' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2483
+ || ! grep -Fq '| Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Pair trigger | Triggers | Wall ratio | External lift | Internal lift | Included | Classification |' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2484
+ || ! grep -Fq '| Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Pair trigger | Triggers | Wall ratio | External lift | Internal lift | Included | Classification |' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2485
+ || ! grep -Fq 'def is_true(value: Any) -> bool:' benchmark/auto-resolve/scripts/swebench-frozen-matrix.py \
2486
+ || ! grep -Fq 'swebench-bool-elapsed-test' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2487
+ || ! grep -Fq 'runner-nan-metadata' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2488
+ || ! grep -Fq 'runner-nan-expected' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2489
+ || ! grep -Fq 'json.loads(line, parse_constant=reject_json_constant)' benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh \
2490
+ || ! grep -Fq 'nan-instance-row' benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh \
2491
+ || ! grep -Fq 'manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())' benchmark/auto-resolve/scripts/run-iter-0033c.sh \
2492
+ || ! grep -Fq 'manifest fixtures_pair_eligible must be a string array' benchmark/auto-resolve/scripts/run-iter-0033c.sh \
2493
+ || ! grep -Fq 'parsed = json.loads(ln, parse_constant=reject_json_constant)' benchmark/auto-resolve/scripts/iter-0033c-compare.py \
2494
+ || ! grep -Fq 'data = loads_strict_json_object(path.read_text())' benchmark/auto-resolve/scripts/judge.sh \
2495
+ || ! grep -Fq 'decoder = json.JSONDecoder(parse_constant=reject_json_constant)' benchmark/auto-resolve/scripts/judge.sh \
2496
+ || ! grep -Fq 'judge = loads_strict_json_object(judge_path.read_text())' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
2497
+ || ! grep -Fq 'gpt = loads_strict_json_object(pathlib.Path(sys.argv[2]).read_text())' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
2498
+ || ! grep -Fq 'decoder = json.JSONDecoder(parse_constant=reject_json_constant)' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
2499
+ || ! grep -Fq 'g = loads_strict_json_object(g_f.read_text())' benchmark/auto-resolve/scripts/judge-opus-pass.sh \
2500
+ || ! grep -Fq 'swebench-malformed-pair-judge-test' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2501
+ || ! grep -Fq 'swebench-malformed-pair-trigger-test' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2502
+ || ! grep -Fq 'swebench-unknown-pair-trigger-test' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2503
+ || ! grep -Fq 'swebench-normalized-pair-trigger-test' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2504
+ || ! grep -Fq 'swebench-mixed-unknown-pair-trigger-test' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2505
+ || ! grep -Fq '"classification": "failed attempt: pair trigger contract: pair_trigger missing or malformed"' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2506
+ || ! grep -Fq '"classification": "failed attempt: pair trigger contract: pair_trigger reasons missing known trigger reason"' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2507
+ || ! grep -Fq '"classification": "failed attempt: pair trigger contract: pair_trigger reasons contain unknown trigger reason"' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2508
+ || ! grep -Fq '"classification": "failed attempt: pair trigger contract: pair_trigger reasons missing canonical trigger reason"' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2509
+ || ! grep -Fq 'historical-only-pair-trigger' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2510
+ || ! grep -Fq 'HISTORICAL_ONLY_TRIGGER_RUN_ID' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2511
+ || ! grep -Fq '| Pair trigger |' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2512
+ || ! grep -Fq '"verify_findings_count": "2"' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2513
+ || ! grep -Fq '"pair_found_more_low_or_worse": false' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2514
+ || ! grep -Fq 'pair-trigger eligibility/contract failures' benchmark/auto-resolve/README.md \
2515
+ || ! grep -Fq 'swebench-string-bool-matrix-test' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2516
+ || ! grep -Fq 'findings = raw_findings if isinstance(raw_findings, list) else []' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2517
+ || ! grep -Fq 'oracle artifact malformed or unreadable' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2518
+ || ! grep -Fq 'scope-tier-a-nan' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2519
+ || ! grep -Fq 'scope-tier-b-nan' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2520
+ || ! grep -Fq 'expected.json malformed: spec_output_files must be a string array' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2521
+ || ! grep -Fq 'loads_strict_json_object(exp_path.read_text())' benchmark/auto-resolve/scripts/oracle-scope-tier-a.py \
2522
+ || ! grep -Fq 'loads_strict_json_object(pathlib.Path(args.expected).read_text())' benchmark/auto-resolve/scripts/oracle-scope-tier-b.py \
2523
+ || ! grep -Fq 'failed attempt: malformed compare' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2524
+ || ! grep -Fq 'swebench-nan-matrix-test' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2525
+ || ! grep -Fq 'parse_prepared_case' benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py \
2526
+ || ! grep -Fq 'loads_strict_json_object(path.read_text(encoding="utf8"))' benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py \
2527
+ || ! grep -Fq 'parse_constant=reject_json_constant' benchmark/auto-resolve/scripts/collect-swebench-predictions.py \
2528
+ || ! grep -Fq 'parse_constant=reject_json_constant' benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py \
2529
+ || ! grep -Fq 'loads_strict_json_object(stdout)' benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py \
2530
+ || ! grep -Fq 'batch-bad-timeout' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2531
+ || ! grep -Fq 'prepare-bad-timeout' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2532
+ || ! grep -Fq 'prepare-nan-case' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2533
+ || ! grep -Fq 'missing patch.diff for instance ids' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2534
+ || ! grep -Fq 'no non-empty patches collected' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2535
+ || ! grep -Fq 'collect-nan-instances' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2536
+ || ! grep -Fq 'batch-nan-predictions' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2537
+ || ! grep -Fq 'invalid JSON numeric constant: NaN' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2538
+ || ! grep -Fq 'fetch-empty-limit' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2539
+ || ! grep -Fq 'batch-empty-limit' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2540
+ || ! grep -Fq 'no prediction instances selected' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2541
+ || ! grep -Fq 'manifest malformed: expected JSON object' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2542
+ || ! grep -Fq 'corpus-manifest-nan' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2543
+ || ! grep -Fq 'loads_strict_json_object(manifest_path.read_text())' benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh \
2544
+ || ! grep -Fq 'loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())' benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh \
2545
+ || ! grep -Fq 'manifest malformed: prepared must be a non-empty array' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2546
+ || ! grep -Fq 'prepared[1] expected JSON object' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2547
+ || ! grep -Fq 'run ids malformed: no run ids' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2548
+ || ! grep -Fq 'run ids malformed: line 2 is empty' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2549
+ || ! grep -Fq 'run ids malformed: line 1 has unsafe run id' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2550
+ || ! grep -Fq -- '--fixture must match [A-Za-z0-9_.-]+' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2551
+ || ! grep -Fq -- '--run-id must match [A-Za-z0-9_.-]+' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2552
+ || ! grep -Fq 'unsafe SWE-bench repo' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2553
+ || ! grep -Fq 'unsafe SWE-bench base_commit' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2554
+ || ! grep -Fq 'parse_constant=reject_json_constant' benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py \
2555
+ || ! grep -Fq 'parse_constant=reject_json_constant' benchmark/auto-resolve/scripts/fetch-swebench-instances.py \
2556
+ || ! grep -Fq 'malformed fetched row 1: row must be object' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2557
+ || ! grep -Fq 'solver-nan' benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh \
2558
+ || ! grep -Fq 'malformed scores_by_arm must not provide arm scores' benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh \
2559
+ || ! grep -Fq 'non-dict result.json must not expose result fields' benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh \
2560
+ || ! grep -Fq 'NaN result numeric fields must not appear in L1 summary' benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh \
2561
+ || ! grep -Fq 'score_for(judge' benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py \
2562
+ || ! grep -Fq 'from pair_evidence_contract import is_score, is_strict_number' benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py \
2563
+ || ! grep -Fq 'loads_strict_json_object(path.read_text(encoding="utf8"))' benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py \
2564
+ || ! grep -Fq 'return legacy if is_score(legacy) else None' benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py \
2565
+ || ! grep -Fq 'def strict_number' benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py \
2566
+ || ! grep -Fq 'out-of-range scores must not appear in L1 summary' benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh \
2567
+ || ! grep -Fq 'boolean scores must not appear in L1 summary' benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh \
2568
+ || ! grep -Fq 'boolean result numeric fields must not appear in L1 summary' benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh \
2569
+ || ! grep -Fq 'lowercase c_score' benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh; then
2570
+ offenders="${offenders}"$'\n'"benchmark gates must require judge _blind_mapping before accepting score evidence"
2571
+ fi
2572
+ if ! grep -Fq 'solo_claude beats bare' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2573
+ || ! grep -Fq 'mapped_score' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2574
+ || ! grep -Fq 'parse_constant=reject_json_constant' benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
2575
+ || ! grep -Fq 'c1-nan-score' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2576
+ || ! grep -Fq 'dirty F5 L1<=L0 row must not be promoted' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2577
+ || ! grep -Fq 'l1-rerun-summary must not override pre-registered C1 selection grounds' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh \
2578
+ || ! grep -Fq 'wrong mapping' benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh; then
2579
+ offenders="${offenders}"$'\n'"build-pair-eligible-manifest.py must use arm-mapped clean scores for F9 and L1<=L0 promotion"
2580
+ fi
2581
+ if [ -d benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/F16-cli-quote-tax-rules ] \
2582
+ && ! python3 benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2583
+ --run-id 20260510-f16-f23-f25-combined-proof \
2584
+ --pair-arm l2_risk_probes \
2585
+ --min-fixtures 3 \
2586
+ --min-pair-margin 5 \
2587
+ --max-pair-solo-wall-ratio 3 \
2588
+ --out-json /tmp/devlyn-lint-f16-f23-f25-pair-gate.json \
2589
+ --out-md /tmp/devlyn-lint-f16-f23-f25-pair-gate.md >/dev/null 2>&1; then
2590
+ offenders="${offenders}"$'\n'"current local artifacts for 20260510-f16-f23-f25-combined-proof must re-gate as PASS"
2591
+ fi
2592
+ if [ -d benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/F21-cli-scheduler-priority ] \
2593
+ && ! python3 benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2594
+ --run-id 20260511-f21-current-riskprobes-v1 \
2595
+ --pair-arm l2_risk_probes \
2596
+ --min-fixtures 1 \
2597
+ --min-pair-margin 5 \
2598
+ --max-pair-solo-wall-ratio 3 \
2599
+ --out-json /tmp/devlyn-lint-f21-pair-gate.json \
2600
+ --out-md /tmp/devlyn-lint-f21-pair-gate.md >/dev/null 2>&1; then
2601
+ offenders="${offenders}"$'\n'"current local artifacts for 20260511-f21-current-riskprobes-v1 must re-gate as PASS"
2602
+ fi
2603
+ if ! grep -Fq 'bash scripts/lint-fixtures.sh' benchmark/auto-resolve/README.md; then
2604
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/README.md: gate-change instructions must include fixture schema lint"
2605
+ fi
2606
+ if ! python3 - <<'PY'
2607
+ import runpy
2608
+
2609
+ runtime = runpy.run_path("config/skills/_shared/verify-merge-findings.py")
2610
+ benchmark = runpy.run_path("benchmark/auto-resolve/scripts/pair_evidence_contract.py")
2611
+
2612
+ runtime_known = set(runtime["KNOWN_PAIR_TRIGGER_REASONS"])
2613
+ canonical = set(benchmark["CANONICAL_PAIR_TRIGGER_REASONS"])
2614
+ aliases = set(benchmark["HISTORICAL_PAIR_TRIGGER_REASON_ALIASES"])
2615
+ normalized_aliases = set(benchmark["HISTORICAL_NORMALIZED_PAIR_TRIGGER_REASON_ALIASES"])
2616
+ known = set(benchmark["KNOWN_PAIR_TRIGGER_REASONS"])
2617
+ expected_aliases = {"risk_profile.high_risk", "risk_probes_enabled"}
2618
+ expected_normalized_aliases = {
2619
+ "complexity.high.spec.frontmatter",
2620
+ "frontmatter.complexity.high",
2621
+ "high.complexity.spec",
2622
+ "high.risk.profile",
2623
+ "spec.frontmatter.complexity.high",
2624
+ "state.complexity.high",
2625
+ }
2626
+
2627
+ errors = []
2628
+ if canonical != runtime_known:
2629
+ errors.append("benchmark canonical pair-trigger reasons must match runtime reasons")
2630
+ if aliases != expected_aliases:
2631
+ errors.append("benchmark historical pair-trigger aliases changed unexpectedly")
2632
+ if normalized_aliases != expected_normalized_aliases:
2633
+ errors.append("benchmark normalized historical pair-trigger aliases changed unexpectedly")
2634
+ if known != canonical | aliases:
2635
+ errors.append("benchmark known pair-trigger reasons must be canonical plus aliases")
2636
+ if runtime_known & aliases:
2637
+ errors.append("runtime pair-trigger reasons must not accept benchmark-only aliases")
2638
+ if not benchmark["has_historical_pair_trigger_reason"](["risk_profile.high_risk"]):
2639
+ errors.append("benchmark historical alias helper must detect exact aliases")
2640
+ if benchmark["has_historical_pair_trigger_reason"](["complexity.high"]):
2641
+ errors.append("benchmark historical alias helper must not count canonical reasons")
2642
+ if benchmark["is_known_pair_trigger_reason"]("risk high"):
2643
+ errors.append("benchmark pair-trigger reader must reject normalized canonical reason strings")
2644
+ if benchmark["is_canonical_pair_trigger_reason"]("risk high"):
2645
+ errors.append("benchmark pair-trigger canonical helper must be exact-match only")
2646
+ if not benchmark["is_known_pair_trigger_reason"]("high.risk.profile"):
2647
+ errors.append("benchmark pair-trigger reader must preserve documented normalized historical aliases")
2648
+ if benchmark["is_canonical_pair_trigger_reason"]("high.risk.profile"):
2649
+ errors.append("benchmark normalized historical aliases must not count as canonical reasons")
2650
+ if errors:
2651
+ raise SystemExit("\n".join(errors))
2652
+ PY
2653
+ then
2654
+ offenders="${offenders}"$'\n'"benchmark pair-trigger aliases must stay benchmark-only and runtime canonical reasons must stay in sync"
2655
+ fi
2656
+ if ! grep -Fq 'bash scripts/lint-shadow-fixtures.sh' benchmark/auto-resolve/README.md \
2657
+ || ! grep -Fq 'The headroom and pair candidate runners' benchmark/auto-resolve/README.md \
2658
+ || ! grep -Fq 'accept explicitly named `S*` ids for dry-run checks and candidate measurement' benchmark/auto-resolve/README.md \
2659
+ || ! grep -Fq 'Use `run-suite.sh --suite shadow` only with `--dry-run`' benchmark/auto-resolve/README.md \
2660
+ || ! grep -Fq 'rejected/smoke controls do not' benchmark/auto-resolve/README.md \
2661
+ || ! grep -Fq 'npx devlyn-cli benchmark headroom --dry-run --min-fixtures 1 S1-cli-lang-flag' benchmark/auto-resolve/shadow-fixtures/README.md \
2662
+ || ! grep -Fq 'Use non-dry-run headroom/pair only for' benchmark/auto-resolve/shadow-fixtures/README.md \
2663
+ || ! grep -Fq 'explicitly named `S*` candidates with a solo-headroom hypothesis' benchmark/auto-resolve/shadow-fixtures/README.md \
2664
+ || ! grep -Fq 'promote a validated `S*` task to an active `F*`' benchmark/auto-resolve/shadow-fixtures/README.md \
2665
+ || ! grep -Fq 'shadow suite run-suite is dry-run only' benchmark/auto-resolve/scripts/run-suite.sh \
2666
+ || ! grep -Fq 'run-suite.sh --suite shadow --dry-run' benchmark/auto-resolve/scripts/run-suite.sh \
2667
+ || ! grep -Fq 'shadow suite refuses provider/judge runs' benchmark/auto-resolve/scripts/run-suite.sh \
2668
+ || ! grep -Fq 'npx devlyn-cli benchmark suite --suite shadow --dry-run' bin/devlyn.js \
2669
+ || ! grep -Fq 'Use benchmark headroom/pair with explicit S* candidates for real provider measurement.' benchmark/auto-resolve/scripts/run-suite.sh \
2670
+ || ! grep -Fq 'shadow-suite-provider-run' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2671
+ || ! grep -Fq 'shadow-suite-judge-only-provider-run' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2672
+ || ! grep -Fq 'arg-parse-shadow-cli-suite-dry-run' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2673
+ || ! grep -Fq 'shadow suite dry-run must not invite a blocked non-dry-run suite invocation' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2674
+ || ! grep -Fq '`run-suite.sh --suite shadow` is dry-run only' benchmark/auto-resolve/shadow-fixtures/README.md \
2675
+ || ! grep -Fq 'parallel to the active golden `F*` fixtures' benchmark/auto-resolve/shadow-fixtures/README.md \
2676
+ || ! grep -Fq 'Active golden `F*` fixtures and pair-evidence audits control' benchmark/auto-resolve/shadow-fixtures/README.md \
2677
+ || ! grep -Fq 'Default `--suite golden` keeps active golden `F*` behavior' benchmark/auto-resolve/shadow-fixtures/README.md \
2678
+ || ! grep -Fq 'before any bare/solo/pair measurement' benchmark/auto-resolve/shadow-fixtures/README.md \
2679
+ || ! grep -Fq 'Do not spend' benchmark/auto-resolve/shadow-fixtures/README.md \
2680
+ || ! grep -Fq 'real provider calls on S1' benchmark/auto-resolve/shadow-fixtures/README.md \
2681
+ || ! grep -Fq 'not a solo<pair evidence candidate' benchmark/auto-resolve/shadow-fixtures/README.md \
2682
+ || grep -Fq 'F1-F9 still controls release' benchmark/auto-resolve/shadow-fixtures/README.md \
2683
+ || grep -Fq 'existing F1-F9 behavior' benchmark/auto-resolve/shadow-fixtures/README.md \
2684
+ || grep -Fq 'L0/L1/L2 measurement' benchmark/auto-resolve/shadow-fixtures/README.md; then
2685
+ offenders="${offenders}"$'\n'"shadow fixture docs must describe lint and S* headroom/pair candidate calibration before golden promotion"
2686
+ fi
2687
+ if ! grep -Fq 'write a solo-headroom hypothesis' benchmark/auto-resolve/README.md \
2688
+ || ! grep -Fq 'each new pair-candidate shadow fixture needs a' benchmark/auto-resolve/shadow-fixtures/README.md \
2689
+ || ! grep -Fq "candidate's \`spec.md\`: name the visible behavior a capable \`solo_claude\`" benchmark/auto-resolve/README.md \
2690
+ || ! grep -Fq '`solo_claude` baseline is expected to miss' benchmark/auto-resolve/shadow-fixtures/README.md \
2691
+ || ! grep -Fq "candidate's \`spec.md\`" benchmark/auto-resolve/README.md \
2692
+ || ! grep -Fq 'solo-headroom hypothesis in `spec.md`' benchmark/auto-resolve/shadow-fixtures/README.md \
2693
+ || ! grep -Fq 'observable command from `expected.json`' benchmark/auto-resolve/README.md \
2694
+ || ! grep -Fq 'observable command from' benchmark/auto-resolve/shadow-fixtures/README.md \
2695
+ || ! grep -Fq 'candidate runners enforce this as an actionable hypothesis' benchmark/auto-resolve/README.md \
2696
+ || ! grep -Fq 'runners enforce this as an actionable hypothesis' benchmark/auto-resolve/shadow-fixtures/README.md \
2697
+ || ! grep -Fq 'backticked observable command matching `expected.json`' benchmark/auto-resolve/README.md \
2698
+ || ! grep -Fq 'backticked observable command matching `expected.json`' benchmark/auto-resolve/shadow-fixtures/README.md \
2699
+ || ! grep -Fq 'command/observable' benchmark/auto-resolve/README.md \
2700
+ || ! grep -Fq 'command/observable' benchmark/auto-resolve/shadow-fixtures/README.md \
2701
+ || ! grep -Fq 'itself containing `miss`' benchmark/auto-resolve/README.md \
2702
+ || ! grep -Fq 'itself containing `miss`' benchmark/auto-resolve/shadow-fixtures/README.md \
2703
+ || ! grep -Fq '## Solo ceiling avoidance' benchmark/auto-resolve/README.md \
2704
+ || ! grep -Fq '## Solo ceiling avoidance' benchmark/auto-resolve/shadow-fixtures/README.md \
2705
+ || ! grep -Fq 'solo-saturated `S2`-`S6` controls' benchmark/auto-resolve/README.md \
2706
+ || ! grep -Fq 'calibrated solo-saturated controls (`S2`-`S6`)' benchmark/auto-resolve/shadow-fixtures/README.md; then
2707
+ offenders="${offenders}"$'\n'"benchmark shadow candidate docs must require solo-headroom hypothesis and solo-ceiling avoidance before provider spend"
2708
+ fi
2709
+ if [ ! -f benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py ] \
2710
+ || ! grep -Fq 'def has_actionable_solo_headroom_hypothesis_text' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2711
+ || ! grep -Fq '"solo-headroom hypothesis" in lower' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2712
+ || ! grep -Fq 'and "solo_claude" in lower' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2713
+ || ! grep -Fq 'and "miss" in lower' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2714
+ || ! grep -Fq 'OBSERVABLE_COMMAND_MARKERS = ("command", "observable", "expose")' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2715
+ || ! grep -Fq 'def is_command_like_backtick' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2716
+ || ! grep -Fq 'def path_has_actionable_solo_headroom_hypothesis' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2717
+ || ! grep -Fq 'has_actionable_solo_headroom_hypothesis_text(text)' benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py \
2718
+ || ! grep -Fq 'actionable_observable_commands(text)' benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py \
2719
+ || ! grep -Fq -- '--expected-json' benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py \
2720
+ || ! grep -Fq 'expected_commands(args.expected_json)' benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py \
2721
+ || ! grep -Fq 'expected UTF-8 text' benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py \
2722
+ || ! grep -Fq 'solo-headroom-hypothesis.py' scripts/lint-shadow-fixtures.sh \
2723
+ || ! grep -Fq 'has_solo_ceiling_avoidance_note' scripts/lint-shadow-fixtures.sh \
2724
+ || ! grep -Fq 'solo-ceiling-avoidance.py' scripts/lint-shadow-fixtures.sh \
2725
+ || ! grep -Fq 'SECTION_RE' benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py \
2726
+ || ! grep -Fq 'CONTROL_RE' benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py \
2727
+ || ! grep -Fq 'REASON_RE' benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py \
2728
+ || ! grep -Fq 'expected UTF-8 text' benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py \
2729
+ || ! grep -Fq 'fixture_has_solo_ceiling_avoidance_note' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
2730
+ || ! grep -Fq 'fixture_has_solo_ceiling_avoidance_note' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2731
+ || ! grep -Fq 'solo-ceiling-avoidance.py' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
2732
+ || ! grep -Fq 'solo-ceiling-avoidance.py' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2733
+ || ! grep -Fq 'shadow fixture NOTES.md needs ## Solo ceiling avoidance' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
2734
+ || ! grep -Fq 'shadow fixture NOTES.md needs ## Solo ceiling avoidance' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2735
+ || ! grep -Fq 'solo-headroom-hypothesis.py' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
2736
+ || ! grep -Fq 'solo-headroom-hypothesis.py' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2737
+ || ! grep -Fq -- '--expected-json "$dir/expected.json" "$dir/spec.md"' benchmark/auto-resolve/scripts/run-headroom-candidate.sh \
2738
+ || ! grep -Fq -- '--expected-json "$dir/expected.json" "$dir/spec.md"' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2739
+ || ! grep -Fq -- '--expected-json "$d/expected.json" "$spec"' scripts/lint-shadow-fixtures.sh \
2740
+ || ! grep -Fq 'from expected.json before provider spend' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
2741
+ || ! grep -Fq 'from expected.json before provider spend' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2742
+ || ! grep -Fq 'unrelated-backtick-solo-headroom-hypothesis' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
2743
+ || ! grep -Fq 'unrelated-backtick-solo-headroom-hypothesis' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2744
+ || ! grep -Fq 'observable-without-miss-solo-headroom-hypothesis' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
2745
+ || ! grep -Fq 'observable-without-miss-solo-headroom-hypothesis' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2746
+ || ! grep -Fq 'missing-solo-ceiling-avoidance' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
2747
+ || ! grep -Fq 'missing-solo-ceiling-avoidance' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2748
+ || ! grep -Fq 'weak-solo-ceiling-avoidance' benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh \
2749
+ || ! grep -Fq 'weak-solo-ceiling-avoidance' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2750
+ || ! grep -Fq 'hypothesis with unrelated backtick must fail' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
2751
+ || ! grep -Fq 'weak hypothesis without observable command must fail' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
2752
+ || ! grep -Fq 'actionable-hypothesis.md' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
2753
+ || ! grep -Fq 'docs-style-actionable-hypothesis.md' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
2754
+ || ! grep -Fq 'shadow-missing-solo-ceiling-avoidance' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
2755
+ || ! grep -Fq 'shadow-weak-solo-ceiling-avoidance' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
2756
+ || ! grep -Fq 'weak solo ceiling avoidance must fail' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
2757
+ || ! grep -Fq 'non-utf8-solo-ceiling.md' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
2758
+ || ! grep -Fq 'non-utf8-hypothesis.md' benchmark/auto-resolve/scripts/test-lint-fixtures.sh; then
2759
+ offenders="${offenders}"$'\n'"solo-headroom hypothesis provider-spend guards must share one actionable checker and test weak-vs-actionable cases"
2760
+ fi
2761
+ if ! grep -Fq 'pair-candidate-frontier.py' benchmark/auto-resolve/README.md \
2762
+ || ! grep -Fq 'npx devlyn-cli benchmark frontier --out-md /tmp/devlyn-pair-frontier.md' benchmark/auto-resolve/README.md \
2763
+ || ! grep -Fq 'npx devlyn-cli benchmark frontier --out-md /tmp/devlyn-pair-frontier.md' benchmark/auto-resolve/run-real-benchmark.md \
2764
+ || ! grep -Fq 'npx devlyn-cli benchmark audit --out-dir /tmp/devlyn-benchmark-audit' README.md \
2765
+ || ! grep -Fq 'npx devlyn-cli benchmark audit --out-dir /tmp/devlyn-benchmark-audit' benchmark/auto-resolve/README.md \
2766
+ || ! grep -Fq 'npx devlyn-cli benchmark audit --out-dir /tmp/devlyn-benchmark-audit' benchmark/auto-resolve/run-real-benchmark.md \
2767
+ || ! grep -Fq 'npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' README.md \
2768
+ || ! grep -Fq 'npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' benchmark/auto-resolve/README.md \
2769
+ || ! grep -Fq 'npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' benchmark/auto-resolve/run-real-benchmark.md \
2770
+ || ! grep -Fq 'npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' bin/devlyn.js \
2771
+ || ! grep -Fq 'npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2772
+ || ! grep -Fq 'npx devlyn-cli benchmark frontier Show pair candidate frontier scores/triggers without providers' bin/devlyn.js \
2773
+ || ! grep -Fq 'npx devlyn-cli benchmark frontier Show pair candidate frontier scores/triggers without providers' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2774
+ || ! grep -Fq 'npx devlyn-cli benchmark audit Audit pair evidence readiness' bin/devlyn.js \
2775
+ || ! grep -Fq 'npx devlyn-cli benchmark audit Audit pair evidence readiness' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2776
+ || ! grep -Fq 'Show active rejected/evidence/unmeasured pair candidates, scores, and triggers without providers' bin/devlyn.js \
2777
+ || ! grep -Fq 'Show active rejected/evidence/unmeasured pair candidates, scores, and triggers without providers' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2778
+ || ! grep -Fq 'Prints pair evidence score rows with trigger reasons; --out-md includes a Triggers column' bin/devlyn.js \
2779
+ || ! grep -Fq 'Prints pair evidence score rows with trigger reasons; --out-md includes a Triggers column' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2780
+ || ! grep -Fq 'Prints frontier score rows plus headroom and pair quality handoff rows' bin/devlyn.js \
2781
+ || ! grep -Fq 'Prints frontier score rows plus headroom and pair quality handoff rows' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2782
+ || ! grep -Fq 'Prints frontier score rows plus headroom_rejections=PASS/FAIL, pair_evidence_quality=PASS/FAIL, pair_trigger_reasons=PASS/FAIL, pair_evidence_hypotheses=PASS/FAIL, pair_evidence_hypothesis_triggers=PASS/WARN/FAIL, historical-alias, and hypothesis-trigger gap handoff rows' bin/devlyn.js \
2783
+ || ! grep -Fq 'Prints frontier score rows plus headroom_rejections=PASS/FAIL, pair_evidence_quality=PASS/FAIL, pair_trigger_reasons=PASS/FAIL, pair_evidence_hypotheses=PASS/FAIL, pair_evidence_hypothesis_triggers=PASS/WARN/FAIL, historical-alias, and hypothesis-trigger gap handoff rows' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2784
+ || ! grep -Fq "audit: 'audit-pair-evidence.py'" bin/devlyn.js \
2785
+ || ! grep -Fq 'npx devlyn-cli benchmark audit [options]' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2786
+ || ! grep -Fq -- '--min-pair-evidence N default: 4' bin/devlyn.js \
2787
+ || ! grep -Fq -- '--min-pair-evidence N default: 4' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2788
+ || ! grep -Fq -- '--min-pair-margin N default: 5' bin/devlyn.js \
2789
+ || ! grep -Fq -- '--min-pair-margin N default: 5' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2790
+ || ! grep -Fq -- '--max-pair-solo-wall-ratio N default: 3' bin/devlyn.js \
2791
+ || ! grep -Fq -- '--max-pair-solo-wall-ratio N default: 3' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2792
+ || ! grep -Fq -- '--require-hypothesis-trigger' bin/devlyn.js \
2793
+ || ! grep -Fq -- '--require-hypothesis-trigger' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2794
+ || ! grep -Fq -- '--require-hypothesis-trigger' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2795
+ || ! grep -Fq 'npx devlyn-cli benchmark audit-headroom --out-json /tmp/devlyn-headroom-audit.json' README.md \
2796
+ || ! grep -Fq 'npx devlyn-cli benchmark audit-headroom --out-json /tmp/devlyn-headroom-audit.json' benchmark/auto-resolve/README.md \
2797
+ || ! grep -Fq 'npx devlyn-cli benchmark audit-headroom --out-json /tmp/devlyn-headroom-audit.json' benchmark/auto-resolve/run-real-benchmark.md \
2798
+ || ! grep -Fq 'npx devlyn-cli benchmark audit-headroom Audit failed headroom results' bin/devlyn.js \
2799
+ || ! grep -Fq 'npx devlyn-cli benchmark audit-headroom Audit failed headroom results' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2800
+ || ! grep -Fq "'audit-headroom': 'audit-headroom-rejections.py'" bin/devlyn.js \
2801
+ || ! grep -Fq 'npx devlyn-cli benchmark audit-headroom [options]' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2802
+ || ! grep -Fq 'benchmarkMode === '\''frontier'\''' bin/devlyn.js \
2803
+ || ! grep -Fq 'test-pair-candidate-frontier.sh' benchmark/auto-resolve/README.md \
2804
+ || ! grep -Fq 'audit-pair-evidence.py' benchmark/auto-resolve/README.md \
2805
+ || ! grep -Fq 'PASS audit-pair-evidence' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
2806
+ || ! grep -Fq 'FAIL audit-pair-evidence' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2807
+ || ! grep -Fq 'FAIL audit-pair-evidence' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
2808
+ || ! grep -Fq 'audit.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
2809
+ || ! grep -Fq 'audit.json' README.md \
2810
+ || ! grep -Fq 'audit.json' benchmark/auto-resolve/README.md \
2811
+ || ! grep -Fq 'audit.json' benchmark/auto-resolve/run-real-benchmark.md \
2812
+ || ! grep -Fq '"frontier_summary": frontier_summary' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2813
+ || ! grep -Fq '"pair_evidence_rows": pair_evidence_rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2814
+ || ! grep -Fq 'def load_pair_evidence_rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2815
+ || ! grep -Fq 'def load_frontier_stdout_metrics' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2816
+ || ! grep -Fq '"summary_rows": summary_rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2817
+ || ! grep -Fq '"aggregate_rows": aggregate_rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2818
+ || ! grep -Fq '"trigger_rows": trigger_rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2819
+ || ! grep -Fq '"trigger_rows_match_count": trigger_rows == expected_rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2820
+ || ! grep -Fq '"hypothesis_trigger_rows_match_count": hypothesis_trigger_rows == expected_rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2821
+ || ! grep -Fq 'from pair_evidence_contract import' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2822
+ || ! grep -Fq 'from pair_evidence_contract import' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
2823
+ || ! grep -Fq 'from pair_evidence_contract import' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2824
+ || ! grep -Fq 'def reject_json_constant' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2825
+ || ! grep -Fq 'def loads_strict_json_object' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2826
+ || ! grep -Fq 'parse_constant=reject_json_constant' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2827
+ || ! grep -Fq 'loads_strict_json_object(path.read_text())' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2828
+ || ! grep -Fq 'loads_strict_json_object(path.read_text())' benchmark/auto-resolve/scripts/headroom-gate.py \
2829
+ || ! grep -Fq 'loads_strict_json_object(path.read_text())' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2830
+ || ! grep -Fq 'loads_strict_json_object(path.read_text())' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
2831
+ || ! grep -Fq 'loads_strict_json_object(path.read_text())' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2832
+ || ! grep -Fq 'nan-json-constant' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
2833
+ || ! grep -Fq 'nan-result-json' benchmark/auto-resolve/scripts/test-headroom-gate.sh \
2834
+ || ! grep -Fq 'nan-result-json' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2835
+ || ! grep -Fq 'CANONICAL_PAIR_TRIGGER_REASONS = {' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2836
+ || ! grep -Fq 'HISTORICAL_PAIR_TRIGGER_REASON_ALIASES = {' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2837
+ || ! grep -Fq 'HISTORICAL_NORMALIZED_PAIR_TRIGGER_REASON_ALIASES = {' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2838
+ || ! grep -Fq 'Benchmark readers accept historical aliases only for archived artifacts' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2839
+ || ! grep -Fq 'KNOWN_PAIR_TRIGGER_REASONS = (' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2840
+ || ! grep -Fq 'mode.pair-verify' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2841
+ || ! grep -Fq 'def normalized_pair_trigger_reason' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2842
+ || ! grep -Fq 'def is_canonical_pair_trigger_reason' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2843
+ || ! grep -Fq 'def has_known_pair_trigger_reason' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2844
+ || ! grep -Fq 'def all_known_pair_trigger_reasons' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2845
+ || ! grep -Fq 'def has_canonical_pair_trigger_reason' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2846
+ || ! grep -Fq 'spec.solo_headroom_hypothesis' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2847
+ || ! grep -Fq 'risk_profile.high_risk' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2848
+ || ! grep -Fq 'has_known_pair_trigger_reason(reasons)' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2849
+ || ! grep -Fq 'all_known_pair_trigger_reasons(reasons)' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2850
+ || ! grep -Fq 'has_canonical_pair_trigger_reason(reasons)' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2851
+ || ! grep -Fq '"pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2852
+ || ! grep -Fq '"pair_trigger_has_hypothesis_reason": (' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2853
+ || ! grep -Fq '"require_hypothesis_trigger": args.require_hypothesis_trigger' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2854
+ || ! grep -Fq '"pair_trigger_has_canonical_reason": true' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2855
+ || ! grep -Fq '"pair_trigger_has_hypothesis_reason": true' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2856
+ || ! grep -Fq '"require_hypothesis_trigger": true' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2857
+ || ! grep -Fq 'pair_trigger eligible with a canonical reason' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2858
+ || ! grep -Fq 'pair_trigger eligible with canonical reason' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2859
+ || ! grep -Fq 'pair_trigger reasons missing known trigger reason' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2860
+ || ! grep -Fq 'pair_trigger reasons contain unknown trigger reason' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2861
+ || ! grep -Fq 'pair_trigger reasons missing canonical trigger reason' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2862
+ || ! grep -Fq 'fixture_spec_has_solo_headroom_hypothesis(fixture_dir.name)' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2863
+ || ! grep -Fq 'pair_trigger missing spec.solo_headroom_hypothesis' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2864
+ || ! grep -Fq -- '--require-hypothesis-trigger' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2865
+ || ! grep -Fq -- '--require-hypothesis-trigger' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2866
+ || ! grep -Fq 'release audit: npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh \
2867
+ || ! grep -Fq 'release audit: npx devlyn-cli benchmark audit --require-hypothesis-trigger --out-dir /tmp/devlyn-benchmark-audit-strict' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2868
+ || ! grep -Fq 'missing-hypothesis-trigger' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2869
+ || ! grep -Fq 'grep -Fq -- '\''--require-hypothesis-trigger'\''' benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh \
2870
+ || ! grep -Fq 'def fmt_trigger_reasons' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2871
+ || ! grep -Fq 'Hypothesis trigger required: {str(report' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2872
+ || ! grep -Fq '| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Pair | Margin | Pair mode | Hypothesis trigger | Triggers | Wall ratio | Status | Reason |' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2873
+ || ! grep -Fq '| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Pair | Margin | Pair mode | Hypothesis trigger | Triggers | Wall ratio | Status | Reason |' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2874
+ || ! grep -Fq 'trigger-reason, and wall-ratio columns' benchmark/auto-resolve/README.md \
2875
+ || ! grep -Fq 'eligible with non-empty reasons and at least one canonical reason' benchmark/auto-resolve/README.md \
2876
+ || ! grep -Fq 'fixtures with an actionable solo-headroom hypothesis must include `spec.solo_headroom_hypothesis` in the trigger reasons' benchmark/auto-resolve/README.md \
2877
+ || ! grep -Fq 'pair trigger eligibility, trigger reasons, canonical-trigger coverage, and `spec.solo_headroom_hypothesis` coverage' benchmark/auto-resolve/run-real-benchmark.md \
2878
+ || ! grep -Fq 'unknown-pair-trigger-reason' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2879
+ || ! grep -Fq 'mixed-unknown-pair-trigger-reason' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2880
+ || ! grep -Fq 'normalized-canonical-pair-trigger-reason' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2881
+ || ! grep -Fq 'historical-only-pair-trigger-reason' benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh \
2882
+ || ! grep -Fq 'unknown-pair-trigger' benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh \
2883
+ || ! grep -Fq 'def normalize_pair_evidence_row' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2884
+ || ! grep -Fq 'def best_pair_evidence' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2885
+ || ! grep -Fq 'ALLOWED_PAIR_ARMS = {"l2_risk_probes", "l2_gated"}' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2886
+ || ! grep -Fq 'pair_arm not in ALLOWED_PAIR_ARMS' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2887
+ || ! grep -Fq 'pair_mode = row.get("pair_mode")' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2888
+ || ! grep -Fq 'if pair_mode is not True:' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2889
+ || ! grep -Fq 'pair_trigger_eligible = row.get("pair_trigger_eligible")' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2890
+ || ! grep -Fq 'if pair_trigger_eligible is not True:' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2891
+ || ! grep -Fq 'pair_trigger_reasons = row.get("pair_trigger_reasons")' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2892
+ || ! grep -Fq 'isinstance(pair_trigger_reasons, list)' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2893
+ || ! grep -Fq 'all_known_pair_trigger_reasons(pair_trigger_reasons)' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2894
+ || ! grep -Fq 'has_canonical_pair_trigger_reason(pair_trigger_reasons)' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2895
+ || ! grep -Fq '"pair_trigger_reasons": pair_trigger_reasons' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2896
+ || ! grep -Fq '"pair_trigger_has_canonical_reason": True' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2897
+ || ! grep -Fq '"pair_trigger_has_hypothesis_reason": (' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2898
+ || ! grep -Fq 'def pair_result_trigger_reasons' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2899
+ || ! grep -Fq 'candidate_row = dict(row)' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2900
+ || ! grep -Fq 'all_known_pair_trigger_reasons(reasons)' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2901
+ || ! grep -Fq 'has_canonical_pair_trigger_reason(reasons)' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2902
+ || ! grep -Fq 'candidate_row["pair_trigger_reasons"] = reasons' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2903
+ || ! grep -Fq '"reasons": ["complexity.high", "looks-hard"]' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
2904
+ || ! grep -Fq '"reasons":["risk high"]' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
2905
+ || ! grep -Fq 'triggers={triggers}' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2906
+ || ! grep -Fq 'hypothesis_trigger={hypothesis_trigger}' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2907
+ || ! grep -Fq '| Fixture | Status | Verdict | Evidence | Pair arm | Triggers | Hypothesis trigger |' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
2908
+ || ! grep -Fq 'pair-arm, and trigger-reason columns' benchmark/auto-resolve/README.md \
2909
+ || ! grep -Fq '| Fixture | Status | Verdict | Evidence | Pair arm | Triggers | Hypothesis trigger |' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
2910
+ || ! grep -Fq 'def check_pair_evidence_hypotheses' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2911
+ || ! grep -Fq 'def pair_evidence_hypothesis_trigger_rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2912
+ || ! grep -Fq 'def pair_evidence_hypothesis_trigger_gap_details' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2913
+ || ! grep -Fq 'def check_pair_evidence_hypothesis_triggers' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2914
+ || ! grep -Fq 'def print_pair_evidence_hypothesis_triggers_summary' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2915
+ || ! grep -Fq 'pair_evidence_hypotheses={status}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2916
+ || ! grep -Fq 'pair_evidence_hypothesis_triggers={status}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2917
+ || ! grep -Fq 'pair evidence hypotheses missing for fixture(s)' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2918
+ || ! grep -Fq 'pair-evidence-hypotheses' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
2919
+ || ! grep -Fq 'pair_evidence_hypotheses=PASS documented=2 total=2' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
2920
+ || ! grep -Fq 'pair_evidence_hypothesis_triggers=WARN matched=0 documented=2 total=2' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
2921
+ || ! grep -Fq 'pair_evidence_hypothesis_trigger_gaps=F16-cli-quote-tax-rules=complexity.high;F21-cli-scheduler-priority=complexity.high,risk_profile.high_risk' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
2922
+ || ! grep -Fq 'pair_evidence_hypothesis_triggers=FAIL matched=0 documented=2 total=2' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
2923
+ || ! grep -Fq 'strict benchmark audit must not report current hypothesis-trigger gaps' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2924
+ || ! grep -Fq 'pair_evidence_hypothesis_triggers=PASS matched=4 documented=4 total=4' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2925
+ || ! grep -Fq 'verdict=pair_evidence_passed triggers={triggers}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2926
+ || ! grep -Fq 'def format_trigger_reasons' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2927
+ || ! grep -Fq 'missing-frontier-triggers' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
2928
+ || ! grep -Fq 'triggers=complexity.high' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
2929
+ || ! grep -Fq '"pair_trigger_reasons": pair_trigger_reasons(pair_result)' benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py \
2930
+ || ! grep -Fq '"pair_trigger_eligible": pair_trigger_eligible' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2931
+ || ! grep -Fq '"verdict": "pair_evidence_passed"' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2932
+ || ! grep -Fq '"pair_mode": pair_mode' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2933
+ || ! grep -Fq 'def pair_result_trigger_reasons' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
2934
+ || ! grep -Fq 'candidate_row = dict(row)' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
2935
+ || ! grep -Fq 'candidate_row["pair_trigger_reasons"] = reasons' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
2936
+ || ! grep -Fq 'F16-cli-quote-tax-rules/l2_risk_probes/result.json' benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh \
2937
+ || ! grep -Fq '"reasons":["risk high"]' benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh \
2938
+ || ! grep -Fq 'def is_strict_int' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2939
+ || ! grep -Fq 'def is_score' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2940
+ || ! grep -Fq '0 <= value <= 100' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2941
+ || ! grep -Fq 'def is_strict_number' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2942
+ || ! grep -Fq 'import math' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2943
+ || ! grep -Fq 'math.isfinite(value)' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2944
+ || ! grep -Fq 'and value > 0' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2945
+ || ! grep -Fq 'if pair_margin != pair_score - solo_score:' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
2946
+ || ! grep -Fq '"min_pair_evidence": min_pair_evidence' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2947
+ || ! grep -Fq '"actual_rows": len(pair_evidence_rows)' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2948
+ || ! grep -Fq '"rows_match_count": (' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2949
+ || ! grep -Fq '"pair_evidence_quality": {' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2950
+ || ! grep -Fq 'def check_pair_evidence_quality' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2951
+ || ! grep -Fq 'def print_pair_evidence_quality' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2952
+ || ! grep -Fq 'def check_pair_trigger_reasons' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2953
+ || ! grep -Fq 'def print_pair_trigger_reasons_summary' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2954
+ || ! grep -Fq 'def pair_trigger_historical_alias_details' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2955
+ || ! grep -Fq 'pair_evidence_quality={status} min_pair_margin_actual={actual_margin}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2956
+ || ! grep -Fq 'pair_trigger_reasons={status} canonical={canonical} historical_alias={historical_alias}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2957
+ || ! grep -Fq '"historical_alias_details": historical_alias_details' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2958
+ || ! grep -Fq 'frontier pair_margin_min does not match pair evidence rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2959
+ || ! grep -Fq 'frontier pair_solo_wall_ratio_max does not match pair evidence rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2960
+ || ! grep -Fq 'is_strict_int(pair_evidence_count)' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2961
+ || ! grep -Fq 'if not is_strict_int(count):' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2962
+ || ! grep -Fq 'pair evidence count missing or malformed from frontier report' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2963
+ || ! grep -Fq 'pair evidence rows {len(rows)} do not match summary count {count}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2964
+ || ! grep -Fq 'def check_frontier_report' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2965
+ || ! grep -Fq 'frontier verdict {verdict!r} is not PASS' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2966
+ || ! grep -Fq 'frontier has {unmeasured_count} unmeasured candidate fixture(s)' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2967
+ || ! grep -Fq '"frontier_report": {' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2968
+ || ! grep -Fq '"min_pair_margin": min_pair_margin' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2969
+ || ! grep -Fq '"max_pair_solo_wall_ratio": max_pair_solo_wall_ratio' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2970
+ || ! grep -Fq 'not math.isfinite(args.max_pair_solo_wall_ratio)' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2971
+ || ! grep -Fq 'max-pair-solo-wall-ratio must be finite and > 0' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2972
+ || ! grep -Fq 'pair evidence count {count} below required minimum {minimum}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
2973
+ || ! grep -Fq 'assert audit["checks"]["min_pair_evidence"]["actual_rows"] == len(audit["pair_evidence_rows"])' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2974
+ || ! grep -Fq 'assert audit["checks"]["frontier_report"]["verdict"] == frontier["verdict"]' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2975
+ || ! grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=2.25x max_wall_allowed=3.00x' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2976
+ || ! grep -Fq 'pair_trigger_reasons=PASS canonical=4 historical_alias=0 exposed=4 total=4 summary=4 rows_match=true' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2977
+ || ! grep -Fq 'current benchmark audit must not report historical aliases or hypothesis-trigger gaps' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2978
+ || ! grep -Fq 'benchmark audit must fail when min pair evidence exceeds current evidence rows' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2979
+ || ! grep -Fq 'actual_pair_evidence=$(python3 - "$TMP/audit/audit.json"' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2980
+ || ! grep -Fq 'required_pair_evidence=$((actual_pair_evidence + 1))' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2981
+ || ! grep -Fq 'pair evidence count ${actual_pair_evidence} below required minimum ${required_pair_evidence}' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2982
+ || ! grep -Fq 'pair_margin_avg=+27.25 pair_margin_min=+21 wall_avg=1.66x wall_max=2.25x' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2983
+ || ! grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2984
+ || ! grep -Fq 'FAIL audit-pair-evidence' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2985
+ || ! grep -Fq 'assert audit["checks"]["min_pair_evidence"]["required"] == required' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2986
+ || ! grep -Fq 'assert audit["checks"]["pair_evidence_quality"]["status"] == "PASS"' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2987
+ || ! grep -Fq 'assert audit["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == 4' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2988
+ || ! grep -Fq 'assert audit["checks"]["pair_trigger_reasons"]["rows_match_count"] is True' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2989
+ || ! grep -Fq 'assert audit["checks"]["pair_trigger_reasons"]["canonical_rows"] == 4' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2990
+ || ! grep -Fq 'assert audit["checks"]["pair_trigger_reasons"]["historical_alias_details"] == []' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2991
+ || ! grep -Fq 'assert len(audit["pair_evidence_rows"]) == frontier["pair_evidence_count"]' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2992
+ || ! grep -Fq 'benchmark frontier must fail when active unmeasured candidates remain' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2993
+ || ! grep -Fq 'PASS pair-candidate-frontier' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2994
+ || ! grep -Fq 'FAIL pair-candidate-frontier' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2995
+ || ! grep -Fq 'assert report["rows"][0]["status"] == "candidate_unmeasured"' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2996
+ || ! grep -Fq 'assert row["verdict"] == "pair_evidence_passed"' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2997
+ || ! grep -Fq 'assert row["pair_mode"] is True' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2998
+ || ! grep -Fq 'assert row["pair_trigger_eligible"] is True' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
2999
+ || ! grep -Fq 'assert isinstance(row["pair_solo_wall_ratio"], (int, float))' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3000
+ || ! grep -Fq 'assert report["frontier_summary"]["pair_margin_avg"] == 27' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3001
+ || ! grep -Fq 'assert report["pair_evidence_rows"] == [' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3002
+ || ! grep -Fq 'assert report["checks"]["min_pair_evidence"]["actual_rows"] == 2' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3003
+ || ! grep -Fq 'assert report["checks"]["min_pair_evidence"]["rows_match_count"] is True' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3004
+ || ! grep -Fq 'assert report["checks"]["min_pair_evidence"]["status"] == "FAIL"' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3005
+ || ! grep -Fq 'assert report["checks"]["pair_evidence_quality"]["status"] == "PASS"' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3006
+ || ! grep -Fq 'assert report["checks"]["pair_trigger_reasons"]["status"] == "PASS"' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3007
+ || ! grep -Fq 'assert report["checks"]["pair_trigger_reasons"]["summary_pair_evidence_count"] == 2' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3008
+ || ! grep -Fq 'assert report["checks"]["pair_trigger_reasons"]["rows_match_count"] is True' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3009
+ || ! grep -Fq 'assert report["checks"]["pair_trigger_reasons"]["historical_alias_details"] == [' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3010
+ || ! grep -Fq 'normalized-canonical-trigger-reason-rows' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3011
+ || ! grep -Fq 'assert report["checks"]["pair_evidence_quality"]["min_pair_margin_actual"] == 21' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3012
+ || ! grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=1.47x max_wall_allowed=3.00x' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3013
+ || ! grep -Fq 'pair_evidence_quality=FAIL min_pair_margin_actual=+4 min_pair_margin_required=+5 max_wall_actual=1.20x max_wall_allowed=3.00x' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3014
+ || ! grep -Fq 'no-quality-rows-frontier.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3015
+ || ! grep -Fq 'pair evidence quality check has no complete rows' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3016
+ || ! grep -Fq 'low-quality-frontier.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3017
+ || ! grep -Fq 'pair evidence margin below minimum for fixture(s): F16-cli-quote-tax-rules' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3018
+ || ! grep -Fq 'high-wall-frontier.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3019
+ || ! grep -Fq 'pair evidence wall ratio above maximum for fixture(s): F16-cli-quote-tax-rules' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3020
+ || ! grep -Fq 'summary-mismatch-frontier.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3021
+ || ! grep -Fq 'frontier pair_margin_min does not match pair evidence rows' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3022
+ || ! grep -Fq 'summary-wall-mismatch-frontier.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3023
+ || ! grep -Fq 'frontier pair_solo_wall_ratio_max does not match pair evidence rows' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3024
+ || ! grep -Fq 'frontier-fail-verdict.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3025
+ || ! grep -Fq 'frontier-unmeasured.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3026
+ || ! grep -Fq 'frontier-malformed-unmeasured.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3027
+ || ! grep -Fq 'frontier-incomplete-best.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3028
+ || ! grep -Fq '"run_id": "lower-complete"' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3029
+ || ! grep -Fq 'bool-frontier-count.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3030
+ || ! grep -Fq 'malformed-pair-evidence-count' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3031
+ || ! grep -Fq 'mismatched-frontier-rows.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3032
+ || ! grep -Fq 'pair evidence rows 1 do not match summary count 2' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3033
+ || ! grep -Fq 'bad-frontier-row-fields.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3034
+ || ! grep -Fq 'nan-frontier-row-fields.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3035
+ || ! grep -Fq 'nan-max-wall-ratio' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3036
+ || ! grep -Fq 'mismatched-margin-row-fields.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3037
+ || ! grep -Fq 'mismatched-margin-row-fields' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3038
+ || ! grep -Fq 'overrange-score-row-fields.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3039
+ || ! grep -Fq 'overrange-score-row-fields' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3040
+ || ! grep -Fq 'invalid-pair-arm-row-fields.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3041
+ || ! grep -Fq 'invalid-pair-arm-row-fields' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3042
+ || ! grep -Fq 'false-pair-mode-row-fields.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3043
+ || ! grep -Fq 'false-pair-mode-row-fields' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3044
+ || ! grep -Fq 'zero-wall-row-fields.json' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3045
+ || ! grep -Fq 'zero-wall-row-fields' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3046
+ || ! grep -Fq 'pair evidence count 2 below required minimum 4' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3047
+ || ! grep -Fq 'assert rows["F22-cli-low-margin"]["status"] == "candidate_unmeasured"' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3048
+ || ! grep -Fq 'assert rows["F22-cli-low-margin"]["status"] == "pair_evidence_passed"' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3049
+ || ! grep -Fq 'assert rows["F23-cli-high-wall"]["status"] == "candidate_unmeasured"' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3050
+ || ! grep -Fq 'assert rows["F23-cli-high-wall"]["status"] == "pair_evidence_passed"' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3051
+ || ! grep -Fq 'def print_final_verdict' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3052
+ || ! grep -Fq 'PASS pair-candidate-frontier' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3053
+ || ! grep -Fq 'FAIL pair-candidate-frontier' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3054
+ || ! grep -Fq 'PASS pair-candidate-frontier' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3055
+ || ! grep -Fq 'FAIL pair-candidate-frontier' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3056
+ || ! grep -Fq 'requires at least four fixtures with passing pair evidence' README.md \
3057
+ || ! grep -Fq 'revalidates frontier `verdict: PASS`, zero unmeasured candidates' README.md \
3058
+ || ! grep -Fq 'requires at least four active fixtures with passing pair evidence' benchmark/auto-resolve/README.md \
3059
+ || ! grep -Fq 'revalidates frontier `verdict: PASS`' benchmark/auto-resolve/README.md \
3060
+ || ! grep -Fq 'requires at least four active fixtures with passing pair evidence' benchmark/auto-resolve/run-real-benchmark.md \
3061
+ || ! grep -Fq 'revalidates frontier `verdict: PASS`, zero unmeasured candidates' benchmark/auto-resolve/run-real-benchmark.md \
3062
+ || ! grep -Fq 'counted by `benchmark audit` as the fourth passing pair-evidence row' benchmark/auto-resolve/README.md \
3063
+ || ! grep -Fq 'counted by `benchmark audit` as the fourth passing pair-evidence row' benchmark/auto-resolve/run-real-benchmark.md \
3064
+ || ! grep -Fq 'counted by `benchmark audit` as the fourth passing pair-evidence row' benchmark/auto-resolve/BENCHMARK-RESULTS.md \
3065
+ || ! grep -Fq 'the default 5-point pair margin' README.md \
3066
+ || ! grep -Fq -- '`--pair-verify` and `--no-pair` are mutually exclusive' README.md \
3067
+ || ! grep -Fq 'default 5-point pair margin' benchmark/auto-resolve/README.md \
3068
+ || ! grep -Fq 'the default 5-point pair margin' benchmark/auto-resolve/run-real-benchmark.md \
3069
+ || ! grep -Fq '3x pair/solo wall ratio' README.md \
3070
+ || ! grep -Fq '3x pair/solo wall ratio' benchmark/auto-resolve/README.md \
3071
+ || ! grep -Fq '3x pair/solo wall ratio' benchmark/auto-resolve/run-real-benchmark.md \
3072
+ || ! grep -Fq 'audit.json` with the frontier summary' README.md \
3073
+ || ! grep -Fq 'audit.json` with the frontier summary' benchmark/auto-resolve/README.md \
3074
+ || ! grep -Fq 'audit.json` with the frontier summary' benchmark/auto-resolve/run-real-benchmark.md \
3075
+ || ! grep -Fq 'artifact map' README.md \
3076
+ || ! grep -Fq 'artifact map' benchmark/auto-resolve/README.md \
3077
+ || ! grep -Fq 'artifact map' benchmark/auto-resolve/run-real-benchmark.md \
3078
+ || ! grep -Fq '`checks.frontier_stdout` records summary, aggregate, final-verdict, expected, printed score-row, trigger-visible row, and hypothesis-trigger-visible row counts' README.md \
3079
+ || ! grep -Fq '`headroom_rejections=...`, `pair_evidence_quality=...`,' README.md \
3080
+ || ! grep -Fq '`pair_trigger_reasons=...`, `pair_evidence_hypotheses=...`, and' README.md \
3081
+ || ! grep -Fq '`pair_evidence_hypothesis_triggers=...` handoff rows' README.md \
3082
+ || ! grep -Fq '`pair_trigger_historical_aliases=...` when archived evidence includes legacy' README.md \
3083
+ || ! grep -Fq '`pair_evidence_hypothesis_trigger_gaps=...` when documented' README.md \
3084
+ || ! grep -Fq 'canonical trigger reason coverage' README.md \
3085
+ || ! grep -Fq '`checks.pair_evidence_quality` records the same quality thresholds from the compact rows' README.md \
3086
+ || ! grep -Fq '`checks.pair_trigger_reasons` records canonical/historical-alias/exposed/total trigger-reason row counts, fixture-level historical alias details, summary count, and row-match status' README.md \
3087
+ || ! grep -Fq '`checks.pair_evidence_hypotheses` records documented/total pair-evidence hypothesis row counts' README.md \
3088
+ || ! grep -Fq '`checks.pair_evidence_hypothesis_triggers` records whether documented hypotheses also appear as `spec.solo_headroom_hypothesis` trigger reasons plus fixture-level gap details' README.md \
3089
+ || ! grep -Fq 'regenerated pair evidence' README.md \
3090
+ || ! grep -Fq 'Historical trigger aliases are only reported for archived artifact review' README.md \
3091
+ || ! grep -Fq 'current pair-evidence gates fail historical-only or unknown trigger reasons' README.md \
3092
+ || ! grep -Fq '`checks.frontier_stdout`' benchmark/auto-resolve/README.md \
3093
+ || ! grep -Fq 'aggregate, final-verdict, expected, printed score-row, trigger-visible row, and hypothesis-trigger-visible row counts' benchmark/auto-resolve/README.md \
3094
+ || ! grep -Fq '`headroom_rejections=...`,' benchmark/auto-resolve/README.md \
3095
+ || ! grep -Fq '`pair_trigger_reasons=...`, and' benchmark/auto-resolve/README.md \
3096
+ || ! grep -Fq '`pair_evidence_hypotheses=...` and' benchmark/auto-resolve/README.md \
3097
+ || ! grep -Fq '`pair_evidence_hypothesis_triggers=...` handoff rows' benchmark/auto-resolve/README.md \
3098
+ || ! grep -Fq '`pair_trigger_historical_aliases=...` when archived evidence includes legacy' benchmark/auto-resolve/README.md \
3099
+ || ! grep -Fq '`pair_evidence_hypothesis_trigger_gaps=...` when documented' benchmark/auto-resolve/README.md \
3100
+ || ! grep -Fq 'canonical trigger reason coverage' benchmark/auto-resolve/README.md \
3101
+ || ! grep -Fq 'records the same quality thresholds from the compact rows,' benchmark/auto-resolve/README.md \
3102
+ || ! grep -Fq '`checks.pair_trigger_reasons` records canonical/historical-alias/exposed/total trigger-reason row counts, fixture-level historical alias details, summary count, and row-match status' benchmark/auto-resolve/README.md \
3103
+ || ! grep -Fq '`checks.pair_evidence_hypotheses` records documented/total pair-evidence hypothesis row counts' benchmark/auto-resolve/README.md \
3104
+ || ! grep -Fq '`checks.pair_evidence_hypothesis_triggers` records whether documented hypotheses also appear as `spec.solo_headroom_hypothesis` trigger reasons plus fixture-level gap details' benchmark/auto-resolve/README.md \
3105
+ || ! grep -Fq 'regenerated pair evidence' benchmark/auto-resolve/README.md \
3106
+ || ! grep -Fq 'Historical trigger aliases are only reported for archived artifact review' benchmark/auto-resolve/README.md \
3107
+ || ! grep -Fq 'current pair-evidence gates fail historical-only or unknown trigger reasons' benchmark/auto-resolve/README.md \
3108
+ || ! grep -Fq '`checks.frontier_stdout` records summary, aggregate, final-verdict, expected, printed score-row, trigger-visible row, and hypothesis-trigger-visible row counts' benchmark/auto-resolve/run-real-benchmark.md \
3109
+ || ! grep -Fq 'also prints `headroom_rejections=...`, `pair_evidence_quality=...`,' benchmark/auto-resolve/run-real-benchmark.md \
3110
+ || ! grep -Fq '`pair_trigger_reasons=...`, `pair_evidence_hypotheses=...`, and `pair_evidence_hypothesis_triggers=...` handoff rows' benchmark/auto-resolve/run-real-benchmark.md \
3111
+ || ! grep -Fq '`pair_trigger_historical_aliases=...` when archived evidence includes legacy' benchmark/auto-resolve/run-real-benchmark.md \
3112
+ || ! grep -Fq '`pair_evidence_hypothesis_trigger_gaps=...` when documented' benchmark/auto-resolve/run-real-benchmark.md \
3113
+ || ! grep -Fq 'canonical trigger reason coverage' benchmark/auto-resolve/run-real-benchmark.md \
3114
+ || ! grep -Fq '`checks.pair_evidence_quality` records the same quality thresholds from the compact rows' benchmark/auto-resolve/run-real-benchmark.md \
3115
+ || ! grep -Fq '`checks.pair_trigger_reasons` records canonical/historical-alias/exposed/total trigger-reason row counts, fixture-level historical alias details, summary count, and row-match status' benchmark/auto-resolve/run-real-benchmark.md \
3116
+ || ! grep -Fq '`checks.pair_evidence_hypotheses` records documented/total pair-evidence hypothesis row counts' benchmark/auto-resolve/run-real-benchmark.md \
3117
+ || ! grep -Fq '`checks.pair_evidence_hypothesis_triggers` records whether documented hypotheses also appear as `spec.solo_headroom_hypothesis` trigger reasons plus fixture-level gap details' benchmark/auto-resolve/run-real-benchmark.md \
3118
+ || ! grep -Fq 'regenerated pair evidence' benchmark/auto-resolve/run-real-benchmark.md \
3119
+ || ! grep -Fq 'Historical trigger aliases are only reported for archived artifact review' benchmark/auto-resolve/run-real-benchmark.md \
3120
+ || ! grep -Fq 'current pair-evidence gates fail historical-only or unknown trigger reasons' benchmark/auto-resolve/run-real-benchmark.md \
3121
+ || ! grep -Fq 'compact trigger-backed verdict-bearing `pair_evidence_rows`' README.md \
3122
+ || ! grep -Fq 'pair_trigger_eligible: true' README.md \
3123
+ || ! grep -Fq 'non-empty `pair_trigger_reasons`, `pair_trigger_has_canonical_reason: true`, and `pair_trigger_has_hypothesis_reason`; the audit fails rows missing trigger reasons' README.md \
3124
+ || ! grep -Fq 'pair_trigger_has_canonical_reason: true' README.md \
3125
+ || ! grep -Fq '`pair_evidence_rows`' benchmark/auto-resolve/README.md \
3126
+ || ! grep -Fq 'compact trigger-backed verdict-bearing score rows' benchmark/auto-resolve/README.md \
3127
+ || ! grep -Fq 'pair_trigger_eligible: true' benchmark/auto-resolve/README.md \
3128
+ || ! grep -Fq 'non-empty `pair_trigger_reasons`, `pair_trigger_has_canonical_reason: true`, and `pair_trigger_has_hypothesis_reason`; the audit fails rows missing trigger reasons' benchmark/auto-resolve/README.md \
3129
+ || ! grep -Fq 'pair_trigger_has_canonical_reason: true' benchmark/auto-resolve/README.md \
3130
+ || ! grep -Fq 'compact trigger-backed verdict-bearing `pair_evidence_rows`' benchmark/auto-resolve/run-real-benchmark.md \
3131
+ || ! grep -Fq 'pair_trigger_eligible: true' benchmark/auto-resolve/run-real-benchmark.md \
3132
+ || ! grep -Fq 'non-empty `pair_trigger_reasons`, `pair_trigger_has_canonical_reason: true`, and `pair_trigger_has_hypothesis_reason`; the audit fails rows missing trigger reasons' benchmark/auto-resolve/run-real-benchmark.md \
3133
+ || ! grep -Fq 'pair_trigger_has_canonical_reason: true' benchmark/auto-resolve/run-real-benchmark.md \
3134
+ || ! grep -Fq 'revalidates `pair_mode: true`' README.md \
3135
+ || ! grep -Fq 'satisfy `pair_mode: true`' benchmark/auto-resolve/README.md \
3136
+ || ! grep -Fq 'revalidates `pair_mode: true`' benchmark/auto-resolve/run-real-benchmark.md \
3137
+ || ! grep -Fq 'JSON rows expose `pair_trigger_reasons` and' benchmark/auto-resolve/README.md \
3138
+ || ! grep -Fq 'Markdown output includes a `Triggers`' benchmark/auto-resolve/README.md \
3139
+ || ! grep -Fq 'trigger reasons, canonical-trigger coverage, classification counts' benchmark/auto-resolve/README.md \
3140
+ || ! grep -Fq 'Its Markdown table includes a `Triggers` column' benchmark/auto-resolve/README.md \
3141
+ || ! grep -Fq 'compact evidence row count must match the' README.md \
3142
+ || ! grep -Fq 'compact evidence row count must match the' benchmark/auto-resolve/README.md \
3143
+ || ! grep -Fq 'evidence row count must match the frontier evidence count' benchmark/auto-resolve/run-real-benchmark.md \
3144
+ || ! grep -Fq '"verdict": "PASS" if unmeasured_candidate_total == 0 else "FAIL"' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3145
+ || ! grep -Fq 'assert frontier["unmeasured_count"] == 0' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3146
+ || ! grep -Fq 'pair_margin_avg' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3147
+ || ! grep -Fq 'from pair_evidence_contract import' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3148
+ || ! grep -Fq 'from pair_evidence_contract import' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3149
+ || ! grep -Fq 'from pair_evidence_contract import' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
3150
+ || ! grep -Fq 'def normalize_pair_evidence_row' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3151
+ || ! grep -Fq 'def best_pair_evidence' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3152
+ || ! grep -Fq 'ALLOWED_PAIR_ARMS = {"l2_risk_probes", "l2_gated"}' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3153
+ || ! grep -Fq 'pair_arm not in ALLOWED_PAIR_ARMS' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3154
+ || ! grep -Fq 'pair_mode = row.get("pair_mode")' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3155
+ || ! grep -Fq 'if pair_mode is not True:' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3156
+ || ! grep -Fq 'pair_trigger_eligible = row.get("pair_trigger_eligible")' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3157
+ || ! grep -Fq 'if pair_trigger_eligible is not True:' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3158
+ || ! grep -Fq '"pair_trigger_eligible": pair_trigger_eligible' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3159
+ || ! grep -Fq '"pair_mode": pair_mode' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3160
+ || ! grep -Fq 'def is_strict_int' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3161
+ || ! grep -Fq 'def is_score' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3162
+ || ! grep -Fq '0 <= value <= 100' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3163
+ || ! grep -Fq 'def is_strict_number' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3164
+ || ! grep -Fq 'import math' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3165
+ || ! grep -Fq 'math.isfinite(value)' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3166
+ || ! grep -Fq 'and value > 0' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3167
+ || ! grep -Fq 'if pair_margin != pair_score - solo_score:' benchmark/auto-resolve/scripts/pair_evidence_contract.py \
3168
+ || ! grep -Fq 'not math.isfinite(args.max_pair_solo_wall_ratio)' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3169
+ || ! grep -Fq 'nan-wall-run' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3170
+ || ! grep -Fq 'inflated-margin-run' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3171
+ || ! grep -Fq 'overrange-score-run' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3172
+ || ! grep -Fq 'invalid-arm-run' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3173
+ || ! grep -Fq 'false-pair-mode-run' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3174
+ || ! grep -Fq 'zero-wall-run' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3175
+ || ! grep -Fq 'nan-max-wall-ratio' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3176
+ || ! grep -Fq 'pair_margin_avg=+21.00 pair_margin_min=+21 wall_avg=1.28x wall_max=1.28x' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3177
+ || ! grep -Fq 'incomplete-high-run' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3178
+ || ! grep -Fq 'bad-pair-evidence-json' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3179
+ || ! grep -Fq 'pair evidence artifact malformed' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3180
+ || ! grep -Fq 'bad-pair-evidence-rows' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3181
+ || ! grep -Fq 'pair evidence artifact rows malformed' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3182
+ || ! grep -Fq 'assert len(rows["F16-cli-quote-tax-rules"]["passing_pair_evidence"]) == 1' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3183
+ || ! grep -Fq 'plus row-level verdicts' README.md \
3184
+ || ! grep -Fq 'including pair arm, trigger reasons, average/minimum pair margin' README.md \
3185
+ || ! grep -Fq 'Markdown frontier artifacts include a `Triggers` column' README.md \
3186
+ || ! grep -Fq 'pair arm, margin, wall ratio, run id, verdict, and trigger reasons' benchmark/auto-resolve/README.md \
3187
+ || ! grep -Fq 'with pair arm, verdict, and trigger reasons from the frontier step' benchmark/auto-resolve/run-real-benchmark.md \
3188
+ || ! grep -Fq 'pair-candidate-frontier.py --fail-on-unmeasured' benchmark/auto-resolve/run-real-benchmark.md \
3189
+ || ! grep -Fq 'audit-headroom-rejections.py' benchmark/auto-resolve/README.md \
3190
+ || ! grep -Fq 'audit-headroom-rejections.py' benchmark/auto-resolve/run-real-benchmark.md \
3191
+ || ! grep -Fq 'unrecorded headroom rejection(s)' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
3192
+ || ! grep -Fq 'unsupported registry rejection(s)' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
3193
+ || ! grep -Fq 'solo_claude={solo_score}' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
3194
+ || ! grep -Fq 'expected_solo_claude={expected_solo_score}' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
3195
+ || ! grep -Fq 'unsupported_registry_rejections' benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh \
3196
+ || ! grep -Fq 'expected_solo_claude=98' benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh \
3197
+ || ! grep -Fq 'active registry entries whose reason cites a run id or' benchmark/auto-resolve/README.md \
3198
+ || ! grep -Fq 'active rejected-registry reason is backed' benchmark/auto-resolve/run-real-benchmark.md \
3199
+ || ! grep -Fq 'invalid headroom rejections' bin/devlyn.js \
3200
+ || ! grep -Fq 'MALFORMED_JSON' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
3201
+ || ! grep -Fq 'MALFORMED_ROWS' benchmark/auto-resolve/scripts/audit-headroom-rejections.py \
3202
+ || ! grep -Fq 'F33-cli-new-candidate' benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh \
3203
+ || ! grep -Fq 'bad-json-headroom <unknown>' benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh \
3204
+ || ! grep -Fq 'malformed-headroom <unknown>' benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh \
3205
+ || ! grep -Fq 'candidate_unmeasured' benchmark/auto-resolve/run-real-benchmark.md \
3206
+ || ! grep -Fq 'pair_evidence_passed' benchmark/auto-resolve/run-real-benchmark.md \
3207
+ || ! grep -Fq 'print_summary(report)' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3208
+ || ! grep -Fq 'fail_on_unmeasured' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3209
+ || ! grep -Fq 'unmeasured candidate fixture(s)' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3210
+ || ! grep -Fq 'pure JSON stdout must not include final text verdict' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3211
+ || ! grep -Fq 'benchmark frontier pure JSON stdout must not include final text verdict' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3212
+ || ! grep -Fq -- '--fail-on-unmeasured' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3213
+ || ! grep -Fq -- '--fail-on-unmeasured' bin/devlyn.js \
3214
+ || ! grep -Fq -- '--fail-on-unmeasured' benchmark/auto-resolve/README.md \
3215
+ || ! grep -Fq -- '--fail-on-unmeasured' benchmark/auto-resolve/run-real-benchmark.md \
3216
+ || ! grep -Fq 'bare={bare} solo_claude={solo} pair={pair} arm={arm} margin={margin}' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3217
+ || ! grep -Fq 'verdict=pair_evidence_passed' benchmark/auto-resolve/scripts/pair-candidate-frontier.py \
3218
+ || ! grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3219
+ || ! grep -Fq 'verdict=pair_evidence_passed' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3220
+ || ! grep -Fq '[audit] frontier' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3221
+ || ! grep -Fq 'fixtures=21 rejected=17 candidates=4 pair_evidence=4 unmeasured=0 verdict=PASS' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3222
+ || ! grep -Fq 'F16-cli-quote-tax-rules: bare=50 solo_claude=75 pair=96 arm=l2_risk_probes margin=+21' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3223
+ || ! grep -Fq 'frontier.stdout' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3224
+ || ! grep -Fq 'headroom-rejections.stdout' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3225
+ || ! grep -Fq '"frontier_stdout": "frontier.stdout"' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3226
+ || ! grep -Fq '"headroom_rejections_stdout": "headroom-rejections.stdout"' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3227
+ || ! grep -Fq 'def load_headroom_audit_summary' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3228
+ || ! grep -Fq 'def check_headroom_audit_report' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3229
+ || ! grep -Fq 'def print_headroom_rejections_summary' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3230
+ || ! grep -Fq 'headroom_rejections={status} verdict={verdict}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3231
+ || ! grep -Fq 'headroom_report_status == 0' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3232
+ || ! grep -Fq '"report_check_exit_code": headroom_report_status' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3233
+ || ! grep -Fq 'headroom audit unsupported registry rejection count missing or malformed' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3234
+ || ! grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3235
+ || ! grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3236
+ || ! grep -Fq 'pair trigger reason rows {len(rows)} do not match summary count {count}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3237
+ || ! grep -Fq 'pair trigger reasons missing for fixture(s)' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3238
+ || ! grep -Fq 'missing-trigger-reasons' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3239
+ || ! grep -Fq 'malformed-trigger-reason-rows' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3240
+ || ! grep -Fq 'mixed-unknown-trigger-reason-rows' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3241
+ || ! grep -Fq 'pair_trigger_reasons=PASS canonical=2 historical_alias=1 exposed=2 total=2 summary=2 rows_match=true' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3242
+ || ! grep -Fq 'pair_trigger_historical_aliases=F21-cli-scheduler-priority=risk_profile.high_risk' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3243
+ || ! grep -Fq '"unsupported_registry_rejection_count": (' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3244
+ || ! grep -Fq 'len(unsupported) if isinstance(unsupported, list) else None' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3245
+ || ! grep -Fq 'headroom-missing-unsupported' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3246
+ || ! grep -Fq 'unsupported_registry_rejection_count' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3247
+ || ! grep -Fq 'unsupported_registry_rejection_count' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3248
+ || ! grep -Fq '`checks.headroom_rejections` records child verdict plus unrecorded/unsupported counts' README.md \
3249
+ || ! grep -Fq '`checks.headroom_rejections` records the child verdict plus unrecorded and' benchmark/auto-resolve/README.md \
3250
+ || ! grep -Fq '`checks.headroom_rejections` records child verdict plus unrecorded/unsupported counts' benchmark/auto-resolve/run-real-benchmark.md \
3251
+ || ! grep -Fq 'def check_frontier_stdout' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3252
+ || ! grep -Fq '"final_verdict_rows": final_verdict_rows' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3253
+ || ! grep -Fq 'frontier stdout final verdict row count is not exactly 1' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3254
+ || ! grep -Fq 'frontier stdout missing score row' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3255
+ || ! grep -Fq 'wall={wall} run={run_id}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3256
+ || ! grep -Fq 'frontier stdout aggregate score row count is not exactly 1' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3257
+ || ! grep -Fq 'frontier stdout summary score row count is not exactly 1' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3258
+ || ! grep -Fq 'frontier stdout aggregate fields malformed' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3259
+ || ! grep -Fq 'frontier stdout score row count' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3260
+ || ! grep -Fq 'summary_count = stdout.splitlines().count(required_summary)' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3261
+ || ! grep -Fq 'aggregate_count = stdout.splitlines().count(required_aggregate)' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3262
+ || ! grep -Fq 'frontier stdout summary counts malformed' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3263
+ || ! grep -Fq 'count_keys = {' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3264
+ || ! grep -Fq 'pair_margin_avg={avg} pair_margin_min={min_margin}' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3265
+ || ! grep -Fq 'def format_decimal_margin' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3266
+ || ! grep -Fq 'def format_wall_ratio' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3267
+ || ! grep -Fq 'frontier stdout check missing summary fields' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3268
+ || ! grep -Fq 'missing-frontier-score-row' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3269
+ || ! grep -Fq 'missing-frontier-aggregate-row' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3270
+ || ! grep -Fq 'duplicate-frontier-summary-row' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3271
+ || ! grep -Fq 'duplicate-frontier-aggregate-row' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3272
+ || ! grep -Fq 'pair_margin_avg=+27.00 pair_margin_min=+21 wall_avg=1.38x wall_max=1.47x' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3273
+ || ! grep -Fq 'partial-frontier-score-row' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3274
+ || ! grep -Fq 'extra-frontier-score-row' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3275
+ || ! grep -Fq 'malformed-frontier-stdout-summary' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3276
+ || ! grep -Fq 'malformed-frontier-stdout-counts' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3277
+ || ! grep -Fq 'malformed-frontier-stdout-aggregate' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3278
+ || ! grep -Fq 'audit["checks"]["frontier_stdout"]["status"] == "PASS"' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3279
+ || ! grep -Fq 'audit["checks"]["frontier_stdout"]["summary_rows"] == 1' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3280
+ || ! grep -Fq 'report["checks"]["frontier_stdout"]["aggregate_rows"] == 1' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3281
+ || ! grep -Fq 'audit["checks"]["frontier_stdout"]["final_verdict_rows"] == 1' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3282
+ || ! grep -Fq 'missing-final-frontier-verdict' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3283
+ || ! grep -Fq 'duplicate-final-frontier-verdict' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3284
+ || ! grep -Fq 'audit["checks"]["frontier_stdout"]["expected_rows"] == len(audit["pair_evidence_rows"])' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3285
+ || ! grep -Fq 'audit["checks"]["frontier_stdout"]["trigger_rows"] == len(audit["pair_evidence_rows"])' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3286
+ || ! grep -Fq 'audit["checks"]["frontier_stdout"]["hypothesis_trigger_rows"] == len(audit["pair_evidence_rows"])' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3287
+ || ! grep -Fq 'audit["checks"]["frontier_stdout"]["trigger_rows_match_count"] is True' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3288
+ || ! grep -Fq 'audit["checks"]["frontier_stdout"]["hypothesis_trigger_rows_match_count"] is True' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3289
+ || ! grep -Fq 'report["checks"]["frontier_stdout"]["trigger_rows"] == 2' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3290
+ || ! grep -Fq 'report["checks"]["frontier_stdout"]["hypothesis_trigger_rows"] == 2' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3291
+ || ! grep -Fq 'report["checks"]["frontier_stdout"]["trigger_rows_match_count"] is True' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3292
+ || ! grep -Fq 'report["checks"]["frontier_stdout"]["hypothesis_trigger_rows_match_count"] is True' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3293
+ || ! grep -Fq 'report["checks"]["frontier_stdout"]["rows_match_count"] is True' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3294
+ || ! grep -Fq 'audit["artifacts"]["frontier_stdout"] == "frontier.stdout"' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3295
+ || ! grep -Fq 'report["artifacts"] == {' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3296
+ || ! grep -Fq 'frontier.stdout' benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh \
3297
+ || ! grep -Fq 'frontier.stdout' benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh \
3298
+ || ! grep -Fq 'frontier summary and an artifact map (`artifacts`)' benchmark/auto-resolve/README.md \
3299
+ || ! grep -Fq 'artifact map' benchmark/auto-resolve/run-real-benchmark.md \
3300
+ || ! grep -Fq 'child stdout/stderr logs' benchmark/auto-resolve/run-real-benchmark.md \
3301
+ || ! grep -Fq '| Fixture | Status | Verdict | Evidence | Pair arm | Triggers | Hypothesis trigger | Bare | Solo_claude | Pair | Margin | Wall ratio | Rejected reason |' benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh \
3302
+ || ! grep -Fq 'benchmark frontier` also prints a stdout score summary for existing complete pair' README.md \
3303
+ || ! grep -Fq 'plus row-level verdicts' README.md \
3304
+ || ! grep -Fq 'including pair arm, trigger reasons, average/minimum pair margin' README.md \
3305
+ || ! grep -Fq 'Markdown frontier artifacts include a `Triggers` column' README.md \
3306
+ || ! grep -Fq 'Full-pipeline pair gate artifacts record `require_hypothesis_trigger` in JSON' README.md \
3307
+ || ! grep -Fq 'Full-pipeline pair gate artifacts record `require_hypothesis_trigger` in JSON' benchmark/auto-resolve/README.md \
3308
+ || ! grep -Fq 'Full-pipeline pair gate artifacts record `require_hypothesis_trigger` in JSON' benchmark/auto-resolve/run-real-benchmark.md \
3309
+ || ! grep -Fq 'includes a Markdown `Hypothesis trigger` column' benchmark/auto-resolve/README.md \
3310
+ || ! grep -Fq 'includes a Markdown `Hypothesis trigger` column' benchmark/auto-resolve/run-real-benchmark.md \
3311
+ || ! grep -Fq 'pair arm, margin, wall ratio, run id, verdict, and trigger reasons' benchmark/auto-resolve/README.md \
3312
+ || ! grep -Fq 'overall verdict plus row-level verdict, pair-arm, and trigger-reason columns' benchmark/auto-resolve/README.md \
3313
+ || ! grep -Fq 'with pair arm, verdict, and trigger reasons from the frontier step' benchmark/auto-resolve/run-real-benchmark.md \
3314
+ || ! grep -Fq 'overall verdict plus row-level verdict, pair-arm, and trigger-reason columns' benchmark/auto-resolve/run-real-benchmark.md \
3315
+ || ! grep -Fq 'complete pair evidence rows' benchmark/auto-resolve/README.md \
3316
+ || ! grep -Fq 'complete pair evidence rows' benchmark/auto-resolve/run-real-benchmark.md; then
3317
+ offenders="${offenders}"$'\n'"benchmark docs and CLI must expose the pair-candidate frontier report before new provider spend"
3318
+ fi
3319
+ if ! grep -Fq '"benchmark/auto-resolve/scripts/**"' package.json \
3320
+ || ! grep -Fq 'benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json' package.json \
3321
+ || ! grep -Fq 'benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.json' package.json \
3322
+ || ! grep -Fq 'benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.json' package.json \
3323
+ || ! grep -Fq 'benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.json' package.json; then
3324
+ offenders="${offenders}"$'\n'"package.json must include benchmark scripts, runner regression tests, current pair evidence, and rejected-headroom evidence in npm packages"
3325
+ fi
3326
+ if ! grep -Fq '"max_observed_pair_solo_wall_ratio": 2.2506234413965087' benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json \
3327
+ || ! grep -Fq '"max_observed_pair_solo_wall_ratio": 1.4728476821192054' benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json \
3328
+ || ! grep -Fq 'pair_trigger eligible with a canonical reason' benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json \
3329
+ || ! grep -Fq 'pair_trigger eligible with a canonical reason' benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json \
3330
+ || ! grep -Fq 'pair_trigger eligible with canonical reason' benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md \
3331
+ || ! grep -Fq 'pair_trigger eligible with canonical reason' benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.md \
3332
+ || ! grep -Fq 'Allowed pair/solo wall ratio: 3.00x' benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md \
3333
+ || ! grep -Fq 'Maximum observed pair/solo wall ratio: 2.25x' benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md \
3334
+ || ! grep -Fq 'Allowed pair/solo wall ratio: 3.00x' benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.md \
3335
+ || ! grep -Fq 'Maximum observed pair/solo wall ratio: 1.47x' benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.md; then
3336
+ offenders="${offenders}"$'\n'"packaged pair evidence artifacts must use the current observed-vs-allowed wall-ratio schema and canonical trigger rule wording"
3337
+ fi
3338
+ if make_temp_dir package_results /tmp/devlyn-lint-package-results.XXXXXX \
3339
+ && make_temp_dir package_audit /tmp/devlyn-lint-package-audit.XXXXXX \
3340
+ && make_temp_file package_audit_stdout /tmp/devlyn-lint-package-audit.XXXXXX.out; then
3341
+ cp -R benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof "$package_results/"
3342
+ cp -R benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1 "$package_results/"
3343
+ for rejected_run in \
3344
+ 20260507-f10-f11-tier1-full-pipeline \
3345
+ 20260508-f22-exact-error-headroom \
3346
+ 20260508-f26-headroom \
3347
+ 20260511-f3-http-error-headroom \
3348
+ 20260511-f12-webhook-headroom \
3349
+ 20260511-f15-concurrency-headroom \
3350
+ 20260512-f2-medium-headroom \
3351
+ 20260512-f4-web-headroom \
3352
+ 20260512-f5-fixloop-headroom \
3353
+ 20260512-f6-checksum-headroom \
3354
+ 20260512-f7-scope-headroom \
3355
+ 20260512-f9-e2e-headroom \
3356
+ 20260512-f31-seat-rebalance-headroom \
3357
+ 20260512-f32-subscription-renewal-headroom; do
3358
+ cp -R "benchmark/auto-resolve/results/$rejected_run" "$package_results/"
3359
+ done
3360
+ if ! python3 benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3361
+ --results-root "$package_results" \
3362
+ --out-dir "$package_audit" >"$package_audit_stdout" 2>&1; then
3363
+ offenders="${offenders}"$'\n'"packaged pair evidence subset must pass benchmark audit without relying on unshipped local results"
3364
+ elif ! grep -Fq 'headroom_rejections=PASS verdict=PASS unrecorded=0 unsupported=0' "$package_audit_stdout" \
3365
+ || ! grep -Fq 'pair_evidence_quality=PASS min_pair_margin_actual=+21 min_pair_margin_required=+5 max_wall_actual=2.25x max_wall_allowed=3.00x' "$package_audit_stdout" \
3366
+ || ! grep -Fq 'pair_trigger_reasons=PASS canonical=4 historical_alias=0 exposed=4 total=4 summary=4 rows_match=true' "$package_audit_stdout" \
3367
+ || ! grep -Fq 'pair_evidence_hypothesis_triggers=PASS matched=4 documented=4 total=4' "$package_audit_stdout" \
3368
+ || grep -Fq 'pair_trigger_historical_aliases=' "$package_audit_stdout" \
3369
+ || grep -Fq 'pair_evidence_hypothesis_trigger_gaps=' "$package_audit_stdout"; then
3370
+ offenders="${offenders}"$'\n'"packaged pair evidence subset audit stdout must expose headroom, pair-quality, and trigger-reason handoff rows"
3371
+ elif ! python3 - "$package_audit/audit.json" <<'PY'
3372
+ import json
3373
+ import sys
3374
+
3375
+ report = json.load(open(sys.argv[1], encoding="utf8"))
3376
+ rows = report.get("pair_evidence_rows")
3377
+ frontier_summary = report.get("frontier_summary", {})
3378
+ frontier_report = report.get("checks", {}).get("frontier_report", {})
3379
+ frontier_stdout = report.get("checks", {}).get("frontier_stdout", {})
3380
+ min_pair_evidence = report.get("checks", {}).get("min_pair_evidence", {})
3381
+ pair_evidence_quality = report.get("checks", {}).get("pair_evidence_quality", {})
3382
+ pair_trigger_reasons = report.get("checks", {}).get("pair_trigger_reasons", {})
3383
+ headroom_rejections = report.get("checks", {}).get("headroom_rejections", {})
3384
+ artifacts = report.get("artifacts", {})
3385
+ assert report.get("verdict") == "PASS"
3386
+ assert frontier_summary.get("verdict") == "PASS"
3387
+ assert frontier_summary.get("pair_evidence_count") == 4
3388
+ assert frontier_summary.get("unmeasured_count") == 0
3389
+ assert frontier_report.get("status") == "PASS"
3390
+ assert frontier_report.get("verdict") == "PASS"
3391
+ assert frontier_report.get("unmeasured_count") == 0
3392
+ assert frontier_stdout.get("status") == "PASS"
3393
+ assert frontier_stdout.get("report", "").endswith("frontier.stdout")
3394
+ assert frontier_stdout.get("summary_rows") == 1
3395
+ assert frontier_stdout.get("aggregate_rows") == 1
3396
+ assert frontier_stdout.get("final_verdict_rows") == 1
3397
+ assert frontier_stdout.get("expected_rows") == len(rows) == 4
3398
+ assert frontier_stdout.get("stdout_rows") == len(rows) == 4
3399
+ assert frontier_stdout.get("trigger_rows") == len(rows) == 4
3400
+ assert frontier_stdout.get("hypothesis_trigger_rows") == len(rows) == 4
3401
+ assert frontier_stdout.get("rows_match_count") is True
3402
+ assert frontier_stdout.get("trigger_rows_match_count") is True
3403
+ assert frontier_stdout.get("hypothesis_trigger_rows_match_count") is True
3404
+ assert headroom_rejections.get("status") == "PASS"
3405
+ assert headroom_rejections.get("report_check_exit_code") == 0
3406
+ assert headroom_rejections.get("verdict") == "PASS"
3407
+ assert headroom_rejections.get("unrecorded_failure_count") == 0
3408
+ assert headroom_rejections.get("unsupported_registry_rejection_count") == 0
3409
+ assert min_pair_evidence.get("rows_match_count") is True
3410
+ assert min_pair_evidence.get("actual_rows") == len(rows) == 4
3411
+ assert pair_evidence_quality.get("status") == "PASS"
3412
+ assert pair_evidence_quality.get("min_pair_margin_actual") == frontier_summary.get("pair_margin_min")
3413
+ assert pair_evidence_quality.get("max_pair_solo_wall_ratio_actual") == frontier_summary.get("pair_solo_wall_ratio_max")
3414
+ assert pair_trigger_reasons.get("status") == "PASS"
3415
+ assert pair_trigger_reasons.get("summary_pair_evidence_count") == 4
3416
+ assert pair_trigger_reasons.get("canonical_rows") == 4
3417
+ assert pair_trigger_reasons.get("historical_alias_rows") == 0
3418
+ assert pair_trigger_reasons.get("historical_alias_details") == []
3419
+ assert pair_trigger_reasons.get("exposed_rows") == 4
3420
+ assert pair_trigger_reasons.get("total_rows") == 4
3421
+ assert pair_trigger_reasons.get("rows_match_count") is True
3422
+ pair_hypothesis_triggers = report.get("checks", {}).get("pair_evidence_hypothesis_triggers", {})
3423
+ assert pair_hypothesis_triggers.get("status") == "PASS"
3424
+ assert pair_hypothesis_triggers.get("matched_rows") == 4
3425
+ assert pair_hypothesis_triggers.get("documented_rows") == 4
3426
+ assert pair_hypothesis_triggers.get("gap_details") == []
3427
+ assert artifacts.get("frontier_json") == "frontier.json"
3428
+ assert artifacts.get("frontier_stdout") == "frontier.stdout"
3429
+ assert artifacts.get("headroom_audit_json") == "headroom-audit.json"
3430
+ assert artifacts.get("headroom_rejections_stdout") == "headroom-rejections.stdout"
3431
+ for row in rows:
3432
+ assert row.get("verdict") == "pair_evidence_passed"
3433
+ assert row.get("pair_arm") == "l2_risk_probes"
3434
+ assert row.get("pair_mode") is True
3435
+ assert row.get("pair_trigger_eligible") is True
3436
+ assert isinstance(row.get("pair_trigger_reasons"), list)
3437
+ assert row.get("pair_trigger_reasons")
3438
+ assert row.get("pair_trigger_has_canonical_reason") is True
3439
+ assert row.get("pair_trigger_has_hypothesis_reason") is True
3440
+ PY
3441
+ then
3442
+ offenders="${offenders}"$'\n'"packaged pair evidence subset audit.json must expose frontier_report, frontier_stdout, artifact map, and 4 trigger-backed verdict-bearing pair rows with trigger reasons"
3443
+ fi
3444
+ rm -rf "$package_results" "$package_audit" "$package_audit_stdout"
3445
+ else
3446
+ offenders="${offenders}"$'\n'"packaged pair evidence subset could not allocate temporary audit workspace"
3447
+ fi
3448
+ if make_temp_file pack_json /tmp/devlyn-lint-pack.XXXXXX.json \
3449
+ && npm pack --dry-run --json > "$pack_json" 2>/dev/null; then
3450
+ if ! node - "$pack_json" <<'NODE'
3451
+ const fs = require("fs");
3452
+ const path = require("path");
3453
+ const packPath = process.argv[2];
3454
+ const pack = JSON.parse(fs.readFileSync(packPath, "utf8"))[0];
3455
+ const files = new Set(pack.files.map((file) => file.path));
3456
+ function listFiles(dir) {
3457
+ return fs.readdirSync(dir, { withFileTypes: true }).flatMap((entry) => {
3458
+ const full = path.join(dir, entry.name);
3459
+ if (entry.isDirectory()) return listFiles(full);
3460
+ if (!entry.isFile()) return [];
3461
+ return [full.split(path.sep).join("/")];
3462
+ });
3463
+ }
3464
+ const shadowRequired = fs
3465
+ .readdirSync("benchmark/auto-resolve/shadow-fixtures", { withFileTypes: true })
3466
+ .filter((entry) => entry.isDirectory() && /^S/.test(entry.name))
3467
+ .flatMap((entry) => listFiles(path.join("benchmark/auto-resolve/shadow-fixtures", entry.name)));
3468
+ const required = [
3469
+ "benchmark/auto-resolve/BENCHMARK-RESULTS.md",
3470
+ "benchmark/auto-resolve/run-real-benchmark.md",
3471
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md",
3472
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json",
3473
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md",
3474
+ "benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json",
3475
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.md",
3476
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/headroom-gate.json",
3477
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.md",
3478
+ "benchmark/auto-resolve/results/20260511-f21-current-riskprobes-v1/full-pipeline-pair-gate.json",
3479
+ "benchmark/auto-resolve/results/20260512-f2-medium-headroom/headroom-gate.json",
3480
+ "benchmark/auto-resolve/results/20260512-f31-seat-rebalance-headroom/headroom-gate.json",
3481
+ "benchmark/auto-resolve/results/20260512-f32-subscription-renewal-headroom/headroom-gate.json",
3482
+ "benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md",
3483
+ "benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md",
3484
+ "benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json",
3485
+ "benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json",
3486
+ "benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js",
3487
+ "benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js",
3488
+ "benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md",
3489
+ "benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md",
3490
+ "benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json",
3491
+ "benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json",
3492
+ "benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js",
3493
+ "benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js",
3494
+ "benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh",
3495
+ "benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh",
3496
+ "benchmark/auto-resolve/scripts/pair_evidence_contract.py",
3497
+ "benchmark/auto-resolve/scripts/pair-candidate-frontier.py",
3498
+ "benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh",
3499
+ "benchmark/auto-resolve/scripts/audit-pair-evidence.py",
3500
+ "benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py",
3501
+ "benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py",
3502
+ "benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh",
3503
+ "benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh",
3504
+ "benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh",
3505
+ "benchmark/auto-resolve/scripts/test-lint-fixtures.sh",
3506
+ "benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh",
3507
+ "benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh",
3508
+ "benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh",
3509
+ "benchmark/auto-resolve/scripts/test-ship-gate.sh",
3510
+ "benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh",
3511
+ "benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh",
3512
+ "benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md",
3513
+ "benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md",
3514
+ "benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md",
3515
+ "benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md",
3516
+ "benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md",
3517
+ "benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md",
3518
+ "benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md",
3519
+ "scripts/lint-fixtures.sh",
3520
+ "scripts/lint-shadow-fixtures.sh",
3521
+ ...shadowRequired,
3522
+ ];
3523
+ const missing = required.filter((file) => !files.has(file));
3524
+ if (missing.length > 0) {
3525
+ console.error(missing.join("\n"));
3526
+ process.exit(1);
3527
+ }
3528
+ const forbidden = pack.files
3529
+ .map((file) => file.path)
3530
+ .filter((file) => file.includes("__pycache__") || file.endsWith(".pyc"));
3531
+ if (forbidden.length > 0) {
3532
+ console.error(forbidden.join("\n"));
3533
+ process.exit(2);
3534
+ }
3535
+ NODE
3536
+ then
3537
+ offenders="${offenders}"$'\n'"npm pack dry-run must include benchmark runner/gate regression tests, all shadow fixture files, retired fixture replay docs, and exclude pycache artifacts"
3538
+ fi
3539
+ else
3540
+ offenders="${offenders}"$'\n'"npm pack dry-run failed while checking benchmark runner/gate regression tests"
3541
+ fi
3542
+ [ -n "${pack_json:-}" ] && rm -f "$pack_json"
3543
+ non_executable_shell_scripts=$(find benchmark/auto-resolve/scripts -maxdepth 1 -name '*.sh' -type f ! -perm -111 -print | sort)
3544
+ if [ -n "$non_executable_shell_scripts" ]; then
3545
+ while IFS= read -r f; do
3546
+ offenders="${offenders}"$'\n'"benchmark shell script must be executable: $f"
3547
+ done <<< "$non_executable_shell_scripts"
3548
+ fi
3549
+ for gate_test in \
3550
+ test-benchmark-arg-parsing.sh \
3551
+ test-pair-candidate-frontier.sh \
3552
+ test-audit-pair-evidence.sh \
3553
+ test-audit-headroom-rejections.sh \
3554
+ test-build-pair-eligible-manifest.sh \
3555
+ test-ship-gate.sh \
3556
+ test-headroom-gate.sh \
3557
+ test-run-headroom-candidate.sh \
3558
+ test-check-f9-artifacts.sh \
3559
+ test-lint-fixtures.sh \
3560
+ test-run-full-pipeline-pair-candidate.sh \
3561
+ test-full-pipeline-pair-gate.sh \
3562
+ test-iter-0033c-l1-summary.sh \
3563
+ test-iter-0033c-compare.sh \
3564
+ test-run-swebench-solver-batch.sh \
3565
+ test-swebench-frozen-case.sh \
3566
+ test-frozen-verify-gate.sh
3567
+ do
3568
+ if ! grep -Fq "$gate_test" benchmark/auto-resolve/README.md; then
3569
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/README.md: gate-change instructions must list $gate_test"
3570
+ fi
3571
+ done
3572
+ if ! bash benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh >/dev/null 2>&1; then
3573
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh: failed"
3574
+ fi
3575
+ if ! bash benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh >/dev/null 2>&1; then
3576
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh: failed"
3577
+ fi
3578
+ if ! bash benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh >/dev/null 2>&1; then
3579
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh: failed"
3580
+ fi
3581
+ if ! bash benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh >/dev/null 2>&1; then
3582
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh: failed"
3583
+ fi
3584
+ if ! bash benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh >/dev/null 2>&1; then
3585
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh: failed"
3586
+ fi
3587
+ if ! bash benchmark/auto-resolve/scripts/test-headroom-gate.sh >/dev/null 2>&1; then
3588
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-headroom-gate.sh: failed"
3589
+ fi
3590
+ if ! bash benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh >/dev/null 2>&1; then
3591
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh: failed"
3592
+ fi
3593
+ if ! bash benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh >/dev/null 2>&1; then
3594
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh: failed"
3595
+ fi
3596
+ if ! bash benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh >/dev/null 2>&1; then
3597
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh: failed"
3598
+ fi
3599
+ if ! bash benchmark/auto-resolve/scripts/test-ship-gate.sh >/dev/null 2>&1; then
3600
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-ship-gate.sh: failed"
3601
+ fi
3602
+ if ! bash benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh >/dev/null 2>&1; then
3603
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh: failed"
3604
+ fi
3605
+ if ! bash benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh >/dev/null 2>&1; then
3606
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh: failed"
3607
+ fi
3608
+ if ! bash benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh >/dev/null 2>&1; then
3609
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh: failed"
3610
+ fi
3611
+ if ! bash benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh >/dev/null 2>&1; then
3612
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh: failed"
3613
+ fi
3614
+ if ! bash benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh >/dev/null 2>&1; then
3615
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh: failed"
3616
+ fi
3617
+ if ! bash benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh >/dev/null 2>&1; then
3618
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh: failed"
3619
+ fi
3620
+ if ! bash scripts/lint-fixtures.sh >/dev/null 2>&1; then
3621
+ offenders="${offenders}"$'\n'"scripts/lint-fixtures.sh: failed"
3622
+ fi
3623
+ if ! bash scripts/lint-shadow-fixtures.sh >/dev/null 2>&1; then
3624
+ offenders="${offenders}"$'\n'"scripts/lint-shadow-fixtures.sh: failed"
3625
+ fi
3626
+ if ! bash benchmark/auto-resolve/scripts/test-lint-fixtures.sh >/dev/null 2>&1; then
3627
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/test-lint-fixtures.sh: failed"
3628
+ fi
3629
+ if ! python3 -m py_compile benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py >/dev/null 2>&1; then
3630
+ offenders="${offenders}"$'\n'"benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py: py_compile failed"
3631
+ fi
3632
+ if ! grep -Fq 'high-risk fixture must include a resolve risk-trigger term' scripts/lint-fixtures.sh \
3633
+ || ! grep -Fq 'SOLO_HEADROOM_CHECK="$REPO_ROOT/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py"' scripts/lint-fixtures.sh \
3634
+ || ! grep -Fq 'pair_evidence_passed fixture spec.md must document an actionable solo-headroom hypothesis with solo_claude miss and observable command from expected.json' scripts/lint-fixtures.sh \
3635
+ || ! grep -Fq -- '--expected-json "$d/expected.json" "$d/spec.md"' scripts/lint-fixtures.sh \
3636
+ || ! grep -Fq 'str(fixture_dir / "expected.json")' benchmark/auto-resolve/scripts/audit-pair-evidence.py \
3637
+ || ! grep -Fq 'hypothesis command must match expected.json verification command' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3638
+ || ! grep -Fq 'pair-evidence-hypothesis-fail.out' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3639
+ || ! grep -Fq 'pair-evidence-hypothesis-pass.out' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3640
+ || ! grep -Fq 'DEVLYN_FIXTURES_DIR' scripts/lint-fixtures.sh \
3641
+ || ! grep -Fq 'DEVLYN_FIXTURE_GLOB' scripts/lint-fixtures.sh \
3642
+ || ! grep -Fq 'DEVLYN_FIXTURE_GLOB="S*"' scripts/lint-shadow-fixtures.sh \
3643
+ || ! grep -Fq 'benchmark/auto-resolve/shadow-fixtures' scripts/lint-shadow-fixtures.sh \
3644
+ || ! grep -Fq 'category") != "high-risk"' scripts/lint-fixtures.sh \
3645
+ || ! grep -Fq 'permissions?' scripts/lint-fixtures.sh \
3646
+ || ! grep -Fq 'idempoten\w*' scripts/lint-fixtures.sh \
3647
+ || ! grep -Fq 'output-shape' scripts/lint-fixtures.sh \
3648
+ || ! grep -Fq 'high-risk trigger validation' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3649
+ || ! grep -Fq 'DEVLYN_FIXTURES_DIR="$FIXTURES_DIR" bash "$ROOT/scripts/lint-fixtures.sh"' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3650
+ || ! grep -Fq 'DEVLYN_LINT_FIXTURES_NO_JSONSCHEMA=1' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3651
+ || ! grep -Fq 'spec-verify-check --check-expected failed' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3652
+ || ! grep -Fq 'unless sibling spec.md declares all Requirements are pure-design' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3653
+ || ! grep -Fq 'verification_commands must be an array' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3654
+ || ! grep -Fq 'expected.json must be an object' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3655
+ || ! grep -Fq 'hidden oracle missing contract_refs' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3656
+ || ! grep -Fq 'contract_ref not found in spec.md' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3657
+ || ! grep -Fq 'BENCH_FIXTURE_DIR file not found' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3658
+ || ! grep -Fq 'BENCH_FIXTURE_DIR file escapes fixture dir' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3659
+ || ! grep -Fq 'hidden oracle must reference an explicit $BENCH_FIXTURE_DIR/... file' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3660
+ || ! grep -Fq 'hidden oracle must assert stdout_contains includes' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3661
+ || ! grep -Fq 'cd \"$BENCH_FIXTURE_DIR\" && node verifiers/hidden-oracle.js' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3662
+ || ! grep -Fq 'missing-hidden-oracle.js' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3663
+ || ! grep -Fq '../outside-hidden-oracle.js' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3664
+ || ! grep -Fq 'This visible contract is not in the spec.' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3665
+ || ! grep -Fq 'spec-verify-check --check failed' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3666
+ || ! grep -Fq 'frontmatter complexity must be one of' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3667
+ || ! grep -Fq 'resolve spec contract enum' benchmark/auto-resolve/fixtures/SCHEMA.md \
3668
+ || ! grep -Fq 'metadata.difficulty' benchmark/auto-resolve/fixtures/SCHEMA.md \
3669
+ || ! grep -Fq "grep -Fq 'Traceback'" benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3670
+ || ! grep -Fq 'hidden oracle missing contract_refs' scripts/lint-fixtures.sh \
3671
+ || ! grep -Fq 'contract_ref not found in spec.md' scripts/lint-fixtures.sh \
3672
+ || ! grep -Fq 'BENCH_FIXTURE_DIR file not found' scripts/lint-fixtures.sh \
3673
+ || ! grep -Fq 'BENCH_FIXTURE_DIR file escapes fixture dir' scripts/lint-fixtures.sh \
3674
+ || ! grep -Fq 'hidden oracle must reference an explicit $BENCH_FIXTURE_DIR/... file' scripts/lint-fixtures.sh \
3675
+ || ! grep -Fq 'hidden oracle must assert stdout_contains includes' scripts/lint-fixtures.sh \
3676
+ || ! grep -Fq 'SPEC_VERIFY_CHECK="$REPO_ROOT/config/skills/_shared/spec-verify-check.py"' scripts/lint-fixtures.sh \
3677
+ || ! grep -Fq 'python3 "$SPEC_VERIFY_CHECK" --check "$d/spec.md"' scripts/lint-fixtures.sh \
3678
+ || ! grep -Fq 'spec-verify-check --check failed' scripts/lint-fixtures.sh \
3679
+ || ! grep -Fq 'python3 "$SPEC_VERIFY_CHECK" --check-expected "$d/expected.json"' scripts/lint-fixtures.sh \
3680
+ || ! grep -Fq 'spec-verify-check --check-expected failed' scripts/lint-fixtures.sh \
3681
+ || ! grep -Fq 'REJECTED_REGISTRY="${DEVLYN_REJECTED_FIXTURE_REGISTRY:-$REPO_ROOT/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh}"' scripts/lint-fixtures.sh \
3682
+ || ! grep -Fq 'declare -F rejected_pair_fixture_reason' scripts/lint-fixtures.sh \
3683
+ || ! grep -Fq 'rejected fixture registry must define rejected_pair_fixture_reason' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3684
+ || ! grep -Fq 'malformed-rejected.sh' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3685
+ || ! grep -Fq 'NOTES.md records pair-candidate rejection but pair-rejected-fixtures.sh has no rejected reason' scripts/lint-fixtures.sh \
3686
+ || ! grep -Fq 'test-active-headroom' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3687
+ || ! grep -Fq 'active-calibration-rejected-missing.out' benchmark/auto-resolve/scripts/test-lint-fixtures.sh \
3688
+ || ! grep -Fq 'Rejected controls should remain replayable' benchmark/auto-resolve/fixtures/SCHEMA.md \
3689
+ || ! grep -Fq 'pair-rejected-fixtures.sh' benchmark/auto-resolve/fixtures/SCHEMA.md \
3690
+ || ! grep -Fq 'target.relative_to(fixture_root)' scripts/lint-fixtures.sh \
3691
+ || ! grep -Fq '$BENCH_FIXTURE_DIR/...' benchmark/auto-resolve/fixtures/SCHEMA.md \
3692
+ || ! grep -Fq 'must not escape the fixture directory' benchmark/auto-resolve/fixtures/SCHEMA.md \
3693
+ || ! grep -Fq 'cd "$BENCH_FIXTURE_DIR"` indirection' benchmark/auto-resolve/fixtures/SCHEMA.md \
3694
+ || ! grep -Fq 'success sentinel' benchmark/auto-resolve/fixtures/SCHEMA.md \
3695
+ || ! grep -Fq 'def fallback_validate():' scripts/lint-fixtures.sh \
3696
+ || ! grep -Fq 'expected.json must be an object' scripts/lint-fixtures.sh \
3697
+ || ! grep -Fq 'schema_ok=0' scripts/lint-fixtures.sh \
3698
+ || ! grep -Fq 'DEVLYN_LINT_FIXTURES_NO_JSONSCHEMA' scripts/lint-fixtures.sh \
3699
+ || ! grep -Fq 'conditional pair/risk-probe triggers' benchmark/auto-resolve/fixtures/SCHEMA.md; then
3700
+ offenders="${offenders}"$'\n'"fixture lint must require high-risk fixtures to include resolve pair/risk trigger terms"
3701
+ fi
3702
+ unsafe_json_parser_refs=$(python3 - <<'PY'
3703
+ import pathlib
3704
+ import re
3705
+
3706
+ root = pathlib.Path("benchmark/auto-resolve/scripts")
3707
+ patterns = [
3708
+ re.compile(r"json\.load\(open\("),
3709
+ re.compile(r"json\.load\(response\)"),
3710
+ re.compile(r"json\.loads\([^,\n]*(?:read_text\(\)|\bline\b|\bln\b)[^,\n]*\)"),
3711
+ ]
3712
+ for path in sorted(root.glob("*")):
3713
+ if path.name.startswith("test-") or path.suffix not in {".py", ".sh"}:
3714
+ continue
3715
+ text = path.read_text(errors="ignore")
3716
+ for line_no, line in enumerate(text.splitlines(), 1):
3717
+ if any(pattern.search(line) for pattern in patterns):
3718
+ print(f"{path}:{line_no}:{line.strip()}")
3719
+ PY
3720
+ )
3721
+ if [ -n "$unsafe_json_parser_refs" ]; then
3722
+ while IFS= read -r f; do
3723
+ offenders="${offenders}"$'\n'"benchmark script uses unsafe JSON parser without strict constants: $f"
3724
+ done <<< "$unsafe_json_parser_refs"
3725
+ fi
3726
+ if [ -z "$offenders" ]; then
3727
+ ok "benchmark docs use current bare / solo_claude / pair-arm topology"
3728
+ else
3729
+ while IFS= read -r f; do bad "$f"; done <<< "$offenders"
3730
+ fi
3731
+
3732
+ # ---------------------------------------------------------------------------
3733
+ # 9. Engine availability fails closed; stale silent-downgrade wording is forbidden.
287
3734
  # ---------------------------------------------------------------------------
288
- section "Check 9: Downgrade string uses 'codex-unavailable'"
289
- offenders=$(grep -RIln 'codex-ping failed\|codex-ping fail' \
290
- config/skills CLAUDE.md README.md bin/ 2>/dev/null \
3735
+ section "Check 9: Engine availability fails closed"
3736
+ offenders=$(grep -RInE 'codex-ping failed|codex-ping fail|engine downgraded: codex-unavailable|downgrades to Claude-only|silently downgrades|silently downgrade|silently switch to Claude|Codex CLI availability downgrade' \
3737
+ config/skills CLAUDE.md README.md bin/ benchmark/auto-resolve/run-real-benchmark.md 2>/dev/null \
291
3738
  | grep -v 'roadmap-archival-workspace/' \
292
3739
  | grep -v 'devlyn:auto-resolve-workspace/' \
293
3740
  | grep -v 'devlyn:ideate-workspace/' \
294
3741
  | grep -v 'preflight-workspace/' \
295
3742
  || true)
296
3743
  if [ -z "$offenders" ]; then
297
- ok "all downgrade strings canonical"
3744
+ ok "engine availability fail-closed wording canonical"
298
3745
  else
299
3746
  while IFS= read -r f; do bad "$f"; done <<< "$offenders"
300
3747
  fi
301
3748
 
3749
+ # ---------------------------------------------------------------------------
3750
+ # 9b. Release tag and package version parity.
3751
+ # v2.2.3 was tagged while package.json still said 2.2.2. A mismatched npm
3752
+ # package makes Codex/Claude installed-skill drift hard to diagnose because
3753
+ # users can be on a tag whose package metadata points at a different build.
3754
+ # ---------------------------------------------------------------------------
3755
+ section "Check 9b: package.json version matches exact release tag"
3756
+ exact_tag=$(git describe --tags --exact-match HEAD 2>/dev/null || true)
3757
+ pkg_version=$(node -p "require('./package.json').version" 2>/dev/null || true)
3758
+ if [ -z "$exact_tag" ]; then
3759
+ ok "HEAD is not an exact tag — package/tag parity not applicable"
3760
+ elif [ "$pkg_version" = "${exact_tag#v}" ]; then
3761
+ ok "package.json version matches $exact_tag"
3762
+ else
3763
+ bad "package.json version '$pkg_version' does not match HEAD tag '$exact_tag'"
3764
+ fi
3765
+
302
3766
  # ---------------------------------------------------------------------------
303
3767
  # (Check 7 retired iter-0034 Phase 4 cutover: the 4 findings-producing
304
3768
  # standalones — evaluate / review / clean / team-review — were deleted; the
@@ -368,29 +3832,31 @@ else
368
3832
 
369
3833
  # Content: byte-compare each section block via diff over temp files.
370
3834
  # awk-into-tmpfile preserves trailing newlines (command substitution strips them).
371
- tmp_rp=$(mktemp)
372
- tmp_claude=$(mktemp)
373
- for name in $expected_sections; do
374
- begin="<!-- runtime-principles:section=${name}:begin -->"
375
- end="<!-- runtime-principles:section=${name}:end -->"
376
- awk -v b="$begin" -v e="$end" '$0==b{f=1;next}$0==e{f=0}f' "$rp_src" > "$tmp_rp"
377
- awk -v b="$begin" -v e="$end" '$0==b{f=1;next}$0==e{f=0}f' "$claude_src" > "$tmp_claude"
378
- if [ ! -s "$tmp_rp" ]; then
379
- bad "${name}: empty/missing block in $rp_src"
380
- rp_drift=1
381
- continue
382
- fi
383
- if [ ! -s "$tmp_claude" ]; then
384
- bad "${name}: empty/missing block in $claude_src"
385
- rp_drift=1
386
- continue
387
- fi
388
- if ! diff -q "$tmp_rp" "$tmp_claude" >/dev/null 2>&1; then
389
- bad "${name}: CLAUDE.md and runtime-principles.md content differ"
390
- rp_drift=1
391
- fi
392
- done
393
- rm -f "$tmp_rp" "$tmp_claude"
3835
+ if make_temp_file tmp_rp && make_temp_file tmp_claude; then
3836
+ for name in $expected_sections; do
3837
+ begin="<!-- runtime-principles:section=${name}:begin -->"
3838
+ end="<!-- runtime-principles:section=${name}:end -->"
3839
+ awk -v b="$begin" -v e="$end" '$0==b{f=1;next}$0==e{f=0}f' "$rp_src" > "$tmp_rp"
3840
+ awk -v b="$begin" -v e="$end" '$0==b{f=1;next}$0==e{f=0}f' "$claude_src" > "$tmp_claude"
3841
+ if [ ! -s "$tmp_rp" ]; then
3842
+ bad "${name}: empty/missing block in $rp_src"
3843
+ rp_drift=1
3844
+ continue
3845
+ fi
3846
+ if [ ! -s "$tmp_claude" ]; then
3847
+ bad "${name}: empty/missing block in $claude_src"
3848
+ rp_drift=1
3849
+ continue
3850
+ fi
3851
+ if ! diff -q "$tmp_rp" "$tmp_claude" >/dev/null 2>&1; then
3852
+ bad "${name}: CLAUDE.md and runtime-principles.md content differ"
3853
+ rp_drift=1
3854
+ fi
3855
+ done
3856
+ rm -f "$tmp_rp" "$tmp_claude"
3857
+ else
3858
+ rp_drift=1
3859
+ fi
394
3860
 
395
3861
  if [ $rp_drift -eq 0 ]; then
396
3862
  ok "all 4 contract sections in parity (subtractive-first / goal-locked / no-workaround / evidence) — markers, topology, content"
@@ -411,20 +3877,44 @@ if [ ! -x "$idgen" ] && [ ! -f "$idgen" ]; then
411
3877
  elif [ ! -d "$fixture" ]; then
412
3878
  bad "Check 13 prerequisite missing: $fixture"
413
3879
  else
414
- tmp1=$(mktemp); tmp2=$(mktemp)
415
- if python3 "$idgen" --fixture "$fixture" --generated-at 2026-04-29T18:30:00Z --output "$tmp1" >/dev/null 2>&1 \
416
- && python3 "$idgen" --fixture "$fixture" --generated-at 2026-04-29T18:30:00Z --output "$tmp2" >/dev/null 2>&1; then
417
- sha1=$(shasum -a 256 "$tmp1" | awk '{print $1}')
418
- sha2=$(shasum -a 256 "$tmp2" | awk '{print $1}')
419
- if [ "$sha1" = "$sha2" ]; then
420
- ok "F2 registry sha256 stable across two idgen runs ($sha1)"
3880
+ if make_temp_file tmp1 && make_temp_file tmp2; then
3881
+ if python3 "$idgen" --fixture "$fixture" --generated-at 2026-04-29T18:30:00Z --output "$tmp1" >/dev/null 2>&1 \
3882
+ && python3 "$idgen" --fixture "$fixture" --generated-at 2026-04-29T18:30:00Z --output "$tmp2" >/dev/null 2>&1; then
3883
+ sha1=$(shasum -a 256 "$tmp1" | awk '{print $1}')
3884
+ sha2=$(shasum -a 256 "$tmp2" | awk '{print $1}')
3885
+ if [ "$sha1" = "$sha2" ]; then
3886
+ ok "F2 registry sha256 stable across two idgen runs ($sha1)"
3887
+ else
3888
+ bad "F2 registry sha256 drift: run1=$sha1 run2=$sha2"
3889
+ fi
421
3890
  else
422
- bad "F2 registry sha256 drift: run1=$sha1 run2=$sha2"
3891
+ bad "idgen invocation failed; cannot verify determinism"
423
3892
  fi
424
3893
  else
425
- bad "idgen invocation failed; cannot verify determinism"
3894
+ bad "idgen temp files unavailable; cannot verify determinism"
426
3895
  fi
427
- rm -f "$tmp1" "$tmp2"
3896
+ if make_temp_dir tmp_bad && make_temp_file idgen_nan_out /tmp/pair-plan-idgen-nan.XXXXXX.out; then
3897
+ cp -R "$fixture/." "$tmp_bad/"
3898
+ printf '{"verification_commands": NaN}\n' > "$tmp_bad/expected.json"
3899
+ if python3 "$idgen" --fixture "$tmp_bad" --generated-at 2026-04-29T18:30:00Z >"$idgen_nan_out" 2>&1; then
3900
+ bad "pair-plan-idgen.py accepted NaN in expected.json"
3901
+ elif ! grep -Fq 'invalid JSON numeric constant: NaN' "$idgen_nan_out"; then
3902
+ bad "pair-plan-idgen.py NaN failure did not cite invalid JSON numeric constant"
3903
+ fi
3904
+ rm -rf "$tmp_bad" "$idgen_nan_out"
3905
+ fi
3906
+
3907
+ if make_temp_file bad_plan && make_temp_file plan_lint_nan_out /tmp/pair-plan-lint-nan.XXXXXX.out; then
3908
+ printf '{"schema_version": NaN}\n' > "$bad_plan"
3909
+ if python3 benchmark/auto-resolve/scripts/pair-plan-lint.py --plan "$bad_plan" --quiet >"$plan_lint_nan_out" 2>&1; then
3910
+ bad "pair-plan-lint.py accepted NaN in pair-plan.json"
3911
+ elif ! grep -Fq '"code": "plan_invalid_json"' "$plan_lint_nan_out" \
3912
+ || ! grep -Fq 'invalid JSON numeric constant: NaN' "$plan_lint_nan_out"; then
3913
+ bad "pair-plan-lint.py NaN failure did not report plan_invalid_json with invalid numeric constant"
3914
+ fi
3915
+ rm -f "$bad_plan" "$plan_lint_nan_out"
3916
+ fi
3917
+ rm -f "${tmp1:-}" "${tmp2:-}"
428
3918
  fi
429
3919
 
430
3920
  # ---------------------------------------------------------------------------
@@ -467,6 +3957,66 @@ if [ $f9_drift -eq 0 ]; then
467
3957
  ok "F9 fixture id is canonical (F9-e2e-ideate-to-resolve); no stale refs outside retired/"
468
3958
  fi
469
3959
 
3960
+ # ---------------------------------------------------------------------------
3961
+ # 15. Current pair-evidence fixtures carry their local evidence handoff notes.
3962
+ # The audit artifacts are canonical, but fixture-level notes keep future
3963
+ # fixture edits from losing why a candidate currently counts as solo<pair
3964
+ # evidence.
3965
+ # ---------------------------------------------------------------------------
3966
+ section "Check 15: Pair evidence fixture notes cite current passing runs"
3967
+ if python3 - <<'PY'
3968
+ import importlib.util
3969
+ import pathlib
3970
+ import sys
3971
+
3972
+ script = pathlib.Path("benchmark/auto-resolve/scripts/pair-candidate-frontier.py")
3973
+ spec = importlib.util.spec_from_file_location("pair_candidate_frontier", script)
3974
+ module = importlib.util.module_from_spec(spec)
3975
+ assert spec.loader is not None
3976
+ spec.loader.exec_module(module)
3977
+
3978
+ report = module.build_report(
3979
+ fixtures_root=pathlib.Path("benchmark/auto-resolve/fixtures"),
3980
+ registry=pathlib.Path("benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh"),
3981
+ results_root=pathlib.Path("benchmark/auto-resolve/results"),
3982
+ )
3983
+ errors = []
3984
+ for row in report["rows"]:
3985
+ if row.get("status") != "pair_evidence_passed":
3986
+ continue
3987
+ fixture = row["fixture"]
3988
+ best = module.best_pair_evidence(row.get("passing_pair_evidence", []))
3989
+ if best is None:
3990
+ errors.append(f"{fixture}: missing complete pair evidence")
3991
+ continue
3992
+ notes_path = pathlib.Path("benchmark/auto-resolve/fixtures") / fixture / "NOTES.md"
3993
+ try:
3994
+ notes = notes_path.read_text(encoding="utf8")
3995
+ except OSError:
3996
+ errors.append(f"{fixture}: NOTES.md missing")
3997
+ continue
3998
+ required = [
3999
+ str(best["run_id"]),
4000
+ "pair_evidence_passed",
4001
+ f"bare `{best['bare_score']}`",
4002
+ f"solo_claude `{best['solo_score']}`",
4003
+ f"pair `{best['pair_score']}`",
4004
+ f"margin `{best['pair_margin']:+d}`",
4005
+ f"wall `{best['pair_solo_wall_ratio']:.2f}x`",
4006
+ f"arm `{best['pair_arm']}`",
4007
+ ]
4008
+ missing = [item for item in required if item not in notes]
4009
+ if missing:
4010
+ errors.append(f"{fixture}: NOTES.md missing {', '.join(missing)}")
4011
+ if errors:
4012
+ raise SystemExit("\n".join(errors))
4013
+ PY
4014
+ then
4015
+ ok "current pair-evidence fixture notes cite passing run ids"
4016
+ else
4017
+ bad "current pair-evidence fixture NOTES.md files must cite passing run ids and pair_evidence_passed"
4018
+ fi
4019
+
470
4020
  # ---------------------------------------------------------------------------
471
4021
  # Summary.
472
4022
  # ---------------------------------------------------------------------------