devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -17,28 +17,30 @@ write_fixture() {
17
17
  local pair_mode="${6:-true}"
18
18
  local pair_elapsed="${7:-200}"
19
19
  local solo_elapsed="${8:-100}"
20
- local pair_arm="${9:-l2_gated}"
20
+ local pair_arm="${9:-l2_risk_probes}"
21
21
  local dir="$TMP_DIR/$run_id/$fixture"
22
22
  mkdir -p "$dir/bare" "$dir/solo_claude" "$dir/$pair_arm"
23
23
  cat > "$dir/judge.json" <<EOF
24
24
  {
25
25
  "scores_by_arm": {"bare": $bare, "solo_claude": $solo, "$pair_arm": $pair},
26
+ "_blind_mapping": {"A": "bare", "B": "solo_claude", "C": "$pair_arm", "seed": 1},
26
27
  "disqualifiers_by_arm": {}
27
28
  }
28
29
  EOF
29
30
  for arm in bare solo_claude "$pair_arm"; do
30
31
  cat > "$dir/$arm/verify.json" <<'EOF'
31
- {"disqualifier": false}
32
+ {"disqualifier": false, "verify_score": 1.0}
32
33
  EOF
34
+ : > "$dir/$arm/diff.patch"
33
35
  done
34
36
  cat > "$dir/bare/result.json" <<'EOF'
35
37
  {"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": 20}
36
38
  EOF
37
39
  cat > "$dir/solo_claude/result.json" <<EOF
38
- {"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $solo_elapsed}
40
+ {"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $solo_elapsed, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
39
41
  EOF
40
42
  cat > "$dir/$pair_arm/result.json" <<EOF
41
- {"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $pair_elapsed, "pair_mode": $pair_mode}
43
+ {"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $pair_elapsed, "pair_mode": $pair_mode, "pair_trigger": {"eligible": true, "reasons": ["complexity.high"], "skipped_reason": null}, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
42
44
  EOF
43
45
  }
44
46
 
@@ -60,39 +62,455 @@ expect_fail_contains() {
60
62
  }
61
63
 
62
64
  write_fixture pass F21 50 75 82 true 220 110
63
- write_fixture pass F22 60 80 88 true 280 140
65
+ write_fixture pass F23 55 75 83 true 280 140
66
+ expect_fail_contains missing-rejected-registry "rejected fixture registry missing" \
67
+ env PAIR_REJECTED_FIXTURES_REGISTRY="$TMP_DIR/missing-registry.sh" \
68
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-fixtures 1
69
+ empty_registry="$TMP_DIR/empty-registry.sh"
70
+ : > "$empty_registry"
71
+ expect_fail_contains empty-rejected-registry "rejected fixture registry has no fixture entries" \
72
+ env PAIR_REJECTED_FIXTURES_REGISTRY="$empty_registry" \
73
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-fixtures 1
64
74
  python3 "$GATE" --results-root "$TMP_DIR" --run-id pass \
65
75
  --max-pair-solo-wall-ratio 3 \
66
76
  --out-json "$TMP_DIR/pass.json" \
67
77
  --out-md "$TMP_DIR/pass.md"
68
78
  grep -Fq '"verdict": "PASS"' "$TMP_DIR/pass.json"
79
+ grep -Fq '"avg_pair_margin": 7.5' "$TMP_DIR/pass.json"
69
80
  grep -Fq '"avg_pair_solo_wall_ratio": 2.0' "$TMP_DIR/pass.json"
81
+ grep -Fq '"max_pair_solo_wall_ratio": 3.0' "$TMP_DIR/pass.json"
82
+ grep -Fq '"max_observed_pair_solo_wall_ratio": 2.0' "$TMP_DIR/pass.json"
83
+ grep -Fq '"require_hypothesis_trigger": false' "$TMP_DIR/pass.json"
84
+ grep -Fq '"pair_trigger_has_canonical_reason": true' "$TMP_DIR/pass.json"
85
+ grep -Fq '"pair_trigger_has_hypothesis_reason": false' "$TMP_DIR/pass.json"
86
+ grep -Fq 'pair_trigger eligible with a canonical reason' "$TMP_DIR/pass.json"
70
87
  grep -Fq 'Verdict: **PASS**' "$TMP_DIR/pass.md"
88
+ grep -Fq 'Fixtures passed: 2/2 (minimum required: 2)' "$TMP_DIR/pass.md"
89
+ grep -Fq 'Average pair margin: +7.5' "$TMP_DIR/pass.md"
90
+ grep -Fq 'Allowed pair/solo wall ratio: 3.00x' "$TMP_DIR/pass.md"
91
+ grep -Fq 'Maximum observed pair/solo wall ratio: 2.00x' "$TMP_DIR/pass.md"
92
+ grep -Fq 'Hypothesis trigger required: false' "$TMP_DIR/pass.md"
93
+ grep -Fq 'pair_trigger eligible with canonical reason' "$TMP_DIR/pass.md"
94
+ grep -Fq '"min_bare_headroom_required": 5' "$TMP_DIR/pass.json"
95
+ grep -Fq '"min_solo_headroom_required": 5' "$TMP_DIR/pass.json"
96
+ grep -Fq '| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Pair | Margin | Pair mode | Hypothesis trigger | Triggers | Wall ratio | Status | Reason |' "$TMP_DIR/pass.md"
97
+ grep -Fq '| F21 | 50 | 10 | 75 | 5 | 82 | +7 | true | false | complexity.high | 2.00x | PASS | |' "$TMP_DIR/pass.md"
98
+ grep -Fq '| F23 | 55 | 5 | 75 | 5 | 83 | +8 | true | false | complexity.high | 2.00x | PASS | |' "$TMP_DIR/pass.md"
99
+
100
+ write_fixture nan-result F21 50 75 85 true
101
+ cat > "$TMP_DIR/nan-result/F21/l2_risk_probes/result.json" <<'EOF'
102
+ {"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": NaN, "pair_mode": true, "pair_trigger": {"eligible": true, "reasons": ["complexity.high"], "skipped_reason": null}, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
103
+ EOF
104
+ expect_fail_contains nan-result-json "l2_risk_probes result.json malformed" \
105
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id nan-result --min-fixtures 1
106
+
107
+ write_fixture rejected-direct F2 50 75 85 true
108
+ write_fixture rejected-direct F21 50 75 85 true
109
+ expect_fail_contains rejected-direct "fixture rejected for pair-candidate runs" \
110
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id rejected-direct --min-fixtures 1
111
+
112
+ write_fixture rejected-shadow-direct S3-cli-ticket-assignment 50 75 85 true
113
+ write_fixture rejected-shadow-direct F21 50 75 85 true
114
+ expect_fail_contains rejected-shadow-direct "fixture rejected for pair-candidate runs" \
115
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id rejected-shadow-direct --min-fixtures 1
116
+
117
+ write_fixture partial-baseline F21 50 75 85 true
118
+ write_fixture partial-baseline F23 55 75 90 true
119
+ python3 - "$TMP_DIR/partial-baseline/F21/solo_claude/verify.json" <<'PY'
120
+ import json, sys
121
+ path = sys.argv[1]
122
+ data = json.load(open(path))
123
+ data["verify_score"] = 0.75
124
+ json.dump(data, open(path, "w"), indent=2)
125
+ PY
126
+ python3 - "$TMP_DIR/partial-baseline/F21/solo_claude/result.json" <<'PY'
127
+ import json, sys
128
+ path = sys.argv[1]
129
+ data = json.load(open(path))
130
+ data["terminal_verdict"] = "FAIL"
131
+ data["verify_verdict"] = "FAIL"
132
+ json.dump(data, open(path, "w"), indent=2)
133
+ PY
134
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id partial-baseline \
135
+ --max-pair-solo-wall-ratio 3 \
136
+ --out-json "$TMP_DIR/partial-baseline.json" \
137
+ --out-md "$TMP_DIR/partial-baseline.md"
138
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/partial-baseline.json"
139
+ grep -Fq '| F21 | 50 | 10 | 75 | 5 | 85 | +10 | true | false | complexity.high | 2.00x | PASS | |' "$TMP_DIR/partial-baseline.md"
71
140
 
72
141
  write_fixture no-headroom F21 50 81 90 true
73
- write_fixture no-headroom F22 60 80 88 true
142
+ write_fixture no-headroom F23 55 75 83 true
74
143
  expect_fail_contains no-headroom "solo_claude score 81 > 80" \
75
144
  python3 "$GATE" --results-root "$TMP_DIR" --run-id no-headroom
76
145
 
77
- write_fixture no-pair-mode F21 50 75 85 false
78
- write_fixture no-pair-mode F22 60 80 90 true
146
+ write_fixture marginal-headroom F21 59 66 85 true
147
+ write_fixture marginal-headroom F23 50 75 82 true
148
+ expect_fail_contains marginal-headroom "bare headroom 1 < 5" \
149
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id marginal-headroom
150
+
151
+ write_fixture dirty-bare F21 50 75 85 true
152
+ python3 - "$TMP_DIR/dirty-bare/F21/bare/result.json" <<'PY'
153
+ import json, sys
154
+ path = sys.argv[1]
155
+ data = json.load(open(path))
156
+ data["disqualifier"] = True
157
+ json.dump(data, open(path, "w"), indent=2)
158
+ PY
159
+ expect_fail_contains dirty-bare "bare result disqualifier" \
160
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-bare --min-fixtures 1
161
+
162
+ write_fixture dirty-solo F21 50 75 85 true
163
+ python3 - "$TMP_DIR/dirty-solo/F21/solo_claude/verify.json" <<'PY'
164
+ import json, sys
165
+ path = sys.argv[1]
166
+ data = json.load(open(path))
167
+ data["disqualifier"] = True
168
+ json.dump(data, open(path, "w"), indent=2)
169
+ PY
170
+ expect_fail_contains dirty-solo "solo_claude verify disqualifier" \
171
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-solo --min-fixtures 1
172
+
173
+ write_fixture control-ceiling F22-cli-ledger-close 94 98 99 true 140 100 l2_risk_probes
174
+ write_fixture control-ceiling F26-cli-payout-ledger-rules 25 98 99 true 140 100 l2_risk_probes
175
+ expect_fail_contains control-ceiling "solo_claude score 98 > 80" \
176
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id control-ceiling \
177
+ --pair-arm l2_risk_probes --min-fixtures 2
178
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id control-ceiling \
179
+ --pair-arm l2_risk_probes --min-fixtures 2 \
180
+ --out-json "$TMP_DIR/control-ceiling.json" \
181
+ --out-md "$TMP_DIR/control-ceiling.md" >/dev/null 2>&1 || true
182
+ grep -Fq '"verdict": "FAIL"' "$TMP_DIR/control-ceiling.json"
183
+ grep -Fq 'F22-cli-ledger-close' "$TMP_DIR/control-ceiling.md"
184
+ grep -Fq 'F26-cli-payout-ledger-rules' "$TMP_DIR/control-ceiling.md"
185
+
186
+ write_fixture no-pair-mode F21 50 75 85 false 200 100 l2_gated
187
+ write_fixture no-pair-mode F23 55 75 85 true 200 100 l2_gated
79
188
  expect_fail_contains no-pair-mode "l2_gated pair_mode not true" \
80
- python3 "$GATE" --results-root "$TMP_DIR" --run-id no-pair-mode
189
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id no-pair-mode \
190
+ --pair-arm l2_gated
191
+
192
+ write_fixture malformed-pair-trigger F21 50 75 85 true
193
+ python3 - "$TMP_DIR/malformed-pair-trigger/F21/l2_risk_probes/result.json" <<'PY'
194
+ import json, sys
195
+ path = sys.argv[1]
196
+ data = json.load(open(path))
197
+ data["pair_trigger"] = {"eligible": True, "reasons": "complexity.high", "skipped_reason": None}
198
+ json.dump(data, open(path, "w"), indent=2)
199
+ PY
200
+ expect_fail_contains malformed-pair-trigger "l2_risk_probes pair_trigger.reasons malformed" \
201
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-pair-trigger --min-fixtures 1
202
+
203
+ write_fixture unknown-pair-trigger-reason F21 50 75 85 true
204
+ python3 - "$TMP_DIR/unknown-pair-trigger-reason/F21/l2_risk_probes/result.json" <<'PY'
205
+ import json, sys
206
+ path = sys.argv[1]
207
+ data = json.load(open(path))
208
+ data["pair_trigger"] = {"eligible": True, "reasons": ["looks-hard"], "skipped_reason": None}
209
+ json.dump(data, open(path, "w"), indent=2)
210
+ PY
211
+ expect_fail_contains unknown-pair-trigger-reason "l2_risk_probes pair_trigger reasons missing known trigger reason" \
212
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id unknown-pair-trigger-reason --min-fixtures 1
213
+
214
+ write_fixture mixed-unknown-pair-trigger-reason F21 50 75 85 true
215
+ python3 - "$TMP_DIR/mixed-unknown-pair-trigger-reason/F21/l2_risk_probes/result.json" <<'PY'
216
+ import json, sys
217
+ path = sys.argv[1]
218
+ data = json.load(open(path))
219
+ data["pair_trigger"] = {"eligible": True, "reasons": ["complexity.high", "looks-hard"], "skipped_reason": None}
220
+ json.dump(data, open(path, "w"), indent=2)
221
+ PY
222
+ expect_fail_contains mixed-unknown-pair-trigger-reason "l2_risk_probes pair_trigger reasons contain unknown trigger reason" \
223
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id mixed-unknown-pair-trigger-reason --min-fixtures 1
224
+
225
+ write_fixture normalized-canonical-pair-trigger-reason F21 50 75 85 true
226
+ python3 - "$TMP_DIR/normalized-canonical-pair-trigger-reason/F21/l2_risk_probes/result.json" <<'PY'
227
+ import json, sys
228
+ path = sys.argv[1]
229
+ data = json.load(open(path))
230
+ data["pair_trigger"] = {"eligible": True, "reasons": ["risk high"], "skipped_reason": None}
231
+ json.dump(data, open(path, "w"), indent=2)
232
+ PY
233
+ expect_fail_contains normalized-canonical-pair-trigger-reason "l2_risk_probes pair_trigger reasons missing known trigger reason" \
234
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id normalized-canonical-pair-trigger-reason --min-fixtures 1
235
+
236
+ write_fixture historical-only-pair-trigger-reason F21 50 75 85 true
237
+ python3 - "$TMP_DIR/historical-only-pair-trigger-reason/F21/l2_risk_probes/result.json" <<'PY'
238
+ import json, sys
239
+ path = sys.argv[1]
240
+ data = json.load(open(path))
241
+ data["pair_trigger"] = {"eligible": True, "reasons": ["risk_profile.high_risk"], "skipped_reason": None}
242
+ json.dump(data, open(path, "w"), indent=2)
243
+ PY
244
+ expect_fail_contains historical-only-pair-trigger-reason "l2_risk_probes pair_trigger reasons missing canonical trigger reason" \
245
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id historical-only-pair-trigger-reason --min-fixtures 1
246
+
247
+ write_fixture missing-hypothesis-trigger F16-cli-quote-tax-rules 50 75 85 true
248
+ expect_fail_contains missing-hypothesis-trigger "l2_risk_probes pair_trigger missing spec.solo_headroom_hypothesis" \
249
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-hypothesis-trigger --min-fixtures 1 --require-hypothesis-trigger
250
+ python3 - "$TMP_DIR/missing-hypothesis-trigger/F16-cli-quote-tax-rules/l2_risk_probes/result.json" <<'PY'
251
+ import json, sys
252
+ path = sys.argv[1]
253
+ data = json.load(open(path))
254
+ data["pair_trigger"]["reasons"] = ["complexity.high", "spec.solo_headroom_hypothesis"]
255
+ json.dump(data, open(path, "w"), indent=2)
256
+ PY
257
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-hypothesis-trigger \
258
+ --min-fixtures 1 \
259
+ --require-hypothesis-trigger \
260
+ --out-json "$TMP_DIR/hypothesis-trigger-pass.json" \
261
+ --out-md "$TMP_DIR/hypothesis-trigger-pass.md"
262
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/hypothesis-trigger-pass.json"
263
+ grep -Fq '"require_hypothesis_trigger": true' "$TMP_DIR/hypothesis-trigger-pass.json"
264
+ grep -Fq '"pair_trigger_has_hypothesis_reason": true' "$TMP_DIR/hypothesis-trigger-pass.json"
265
+ grep -Fq 'Hypothesis trigger required: true' "$TMP_DIR/hypothesis-trigger-pass.md"
266
+ grep -Fq '| F16-cli-quote-tax-rules | 50 | 10 | 75 | 5 | 85 | +10 | true | true | complexity.high,spec.solo_headroom_hypothesis | 2.00x | PASS | |' "$TMP_DIR/hypothesis-trigger-pass.md"
267
+ grep -Fq 'complexity.high,spec.solo_headroom_hypothesis' "$TMP_DIR/hypothesis-trigger-pass.md"
81
268
 
82
269
  write_fixture weak-margin F21 50 75 79 true
83
- write_fixture weak-margin F22 60 80 88 true
84
- expect_fail_contains weak-margin "l2_gated margin +4 < +5" \
270
+ write_fixture weak-margin F23 55 75 88 true
271
+ expect_fail_contains weak-margin "l2_risk_probes margin +4 < +5" \
85
272
  python3 "$GATE" --results-root "$TMP_DIR" --run-id weak-margin
86
273
 
87
- write_fixture custom-pair-arm F21 50 75 82 true 220 110 l2_risk_probes
88
- write_fixture custom-pair-arm F22 60 80 88 true 280 140 l2_risk_probes
274
+ write_fixture dirty-pair F21 50 75 85 true
275
+ python3 - "$TMP_DIR/dirty-pair/F21/l2_risk_probes/verify.json" <<'PY'
276
+ import json, sys
277
+ path = sys.argv[1]
278
+ data = json.load(open(path))
279
+ data["disqualifier"] = True
280
+ json.dump(data, open(path, "w"), indent=2)
281
+ PY
282
+ expect_fail_contains dirty-pair "l2_risk_probes verify disqualifier" \
283
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair --min-fixtures 1
284
+
285
+ write_fixture dirty-pair-verify-score F21 50 75 85 true
286
+ python3 - "$TMP_DIR/dirty-pair-verify-score/F21/l2_risk_probes/verify.json" <<'PY'
287
+ import json, sys
288
+ path = sys.argv[1]
289
+ data = json.load(open(path))
290
+ data["verify_score"] = 0.75
291
+ json.dump(data, open(path, "w"), indent=2)
292
+ PY
293
+ expect_fail_contains dirty-pair-verify-score "l2_risk_probes verify_score < 1.0" \
294
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair-verify-score --min-fixtures 1
295
+
296
+ write_fixture boolean-pair-verify-score F21 50 75 85 true
297
+ python3 - "$TMP_DIR/boolean-pair-verify-score/F21/l2_risk_probes/verify.json" <<'PY'
298
+ import json, sys
299
+ path = sys.argv[1]
300
+ data = json.load(open(path))
301
+ data["verify_score"] = True
302
+ json.dump(data, open(path, "w"), indent=2)
303
+ PY
304
+ expect_fail_contains boolean-pair-verify-score "l2_risk_probes verify_score < 1.0" \
305
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id boolean-pair-verify-score --min-fixtures 1
306
+
307
+ write_fixture dirty-pair-verdict F21 50 75 85 true
308
+ python3 - "$TMP_DIR/dirty-pair-verdict/F21/l2_risk_probes/result.json" <<'PY'
309
+ import json, sys
310
+ path = sys.argv[1]
311
+ data = json.load(open(path))
312
+ data["terminal_verdict"] = "BLOCKED:probe-derive-malformed"
313
+ data["verify_verdict"] = "BLOCKED"
314
+ json.dump(data, open(path, "w"), indent=2)
315
+ PY
316
+ expect_fail_contains dirty-pair-verdict "l2_risk_probes terminal verdict not pass" \
317
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair-verdict --min-fixtures 1
318
+
319
+ write_fixture dirty-pair-axis F21 50 75 85 true
320
+ python3 - "$TMP_DIR/dirty-pair-axis/F21/judge.json" <<'PY'
321
+ import json, sys
322
+ path = sys.argv[1]
323
+ data = json.load(open(path))
324
+ data["_blind_mapping"] = {"A": "bare", "B": "solo_claude", "C": "l2_risk_probes", "seed": 1}
325
+ data["_axis_validation"] = {
326
+ "out_of_range_count": 1,
327
+ "out_of_range_cells": [{"breakdown": "c_breakdown", "axis": "quality", "value": 26}],
328
+ "axis_range": [0, 25],
329
+ }
330
+ json.dump(data, open(path, "w"), indent=2)
331
+ PY
332
+ expect_fail_contains dirty-pair-axis "l2_risk_probes judge axis-invalid (1)" \
333
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair-axis --min-fixtures 1
334
+
335
+ write_fixture dirty-solo-axis F21 50 75 85 true
336
+ python3 - "$TMP_DIR/dirty-solo-axis/F21/judge.json" <<'PY'
337
+ import json, sys
338
+ path = sys.argv[1]
339
+ data = json.load(open(path))
340
+ data["_blind_mapping"] = {"A": "bare", "B": "solo_claude", "C": "l2_risk_probes", "seed": 1}
341
+ data["_axis_validation"] = {
342
+ "out_of_range_count": 1,
343
+ "out_of_range_cells": [{"breakdown": "b_breakdown", "axis": "quality", "value": 26}],
344
+ "axis_range": [0, 25],
345
+ }
346
+ json.dump(data, open(path, "w"), indent=2)
347
+ PY
348
+ expect_fail_contains dirty-solo-axis "solo_claude judge axis-invalid (1)" \
349
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-solo-axis --min-fixtures 1
350
+
351
+ write_fixture unmapped-axis F21 50 75 85 true
352
+ python3 - "$TMP_DIR/unmapped-axis/F21/judge.json" <<'PY'
353
+ import json, sys
354
+ path = sys.argv[1]
355
+ data = json.load(open(path))
356
+ data["_blind_mapping"] = {"A": "bare", "B": "solo_claude", "C": "l2_forced", "seed": 1}
357
+ data["_axis_validation"] = {
358
+ "out_of_range_count": 1,
359
+ "out_of_range_cells": [{"breakdown": "c_breakdown", "axis": "quality", "value": 26}],
360
+ "axis_range": [0, 25],
361
+ }
362
+ json.dump(data, open(path, "w"), indent=2)
363
+ PY
364
+ expect_fail_contains unmapped-axis "judge axis-invalid unmapped (1)" \
365
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id unmapped-axis --min-fixtures 1
366
+
367
+ write_fixture missing-mapping F21 50 75 85 true
368
+ python3 - "$TMP_DIR/missing-mapping/F21/judge.json" <<'PY'
369
+ import json, sys
370
+ path = sys.argv[1]
371
+ data = json.load(open(path))
372
+ del data["_blind_mapping"]
373
+ json.dump(data, open(path, "w"), indent=2)
374
+ PY
375
+ expect_fail_contains missing-mapping "judge blind mapping missing" \
376
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-mapping --min-fixtures 1
377
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-mapping --min-fixtures 1 \
378
+ --out-json "$TMP_DIR/missing-mapping.json" >/dev/null 2>&1 || true
379
+ grep -Fq '"bare_score": null' "$TMP_DIR/missing-mapping.json"
380
+ grep -Fq '"solo_score": null' "$TMP_DIR/missing-mapping.json"
381
+ grep -Fq '"pair_score": null' "$TMP_DIR/missing-mapping.json"
382
+
383
+ write_fixture malformed-mapping-axis F21 50 75 85 true
384
+ python3 - "$TMP_DIR/malformed-mapping-axis/F21/judge.json" <<'PY'
385
+ import json, sys
386
+ path = sys.argv[1]
387
+ data = json.load(open(path))
388
+ data["_blind_mapping"] = "not-a-dict"
389
+ data["_axis_validation"] = {
390
+ "out_of_range_count": 1,
391
+ "out_of_range_cells": [{"breakdown": "c_breakdown", "axis": "quality", "value": 26}],
392
+ "axis_range": [0, 25],
393
+ }
394
+ json.dump(data, open(path, "w"), indent=2)
395
+ PY
396
+ expect_fail_contains malformed-mapping-axis "judge blind mapping missing" \
397
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-mapping-axis --min-fixtures 1
398
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-mapping-axis --min-fixtures 1 \
399
+ --out-json "$TMP_DIR/malformed-mapping-axis.json" >/dev/null 2>&1 || true
400
+ grep -Fq '"bare_score": null' "$TMP_DIR/malformed-mapping-axis.json"
401
+ grep -Fq '"solo_score": null' "$TMP_DIR/malformed-mapping-axis.json"
402
+ grep -Fq '"pair_score": null' "$TMP_DIR/malformed-mapping-axis.json"
403
+
404
+ write_fixture wrong-pair-mapping F21 50 75 85 true
405
+ python3 - "$TMP_DIR/wrong-pair-mapping/F21/judge.json" <<'PY'
406
+ import json, sys
407
+ path = sys.argv[1]
408
+ data = json.load(open(path))
409
+ data["_blind_mapping"] = {"A": "bare", "B": "solo_claude", "C": "l2_gated", "seed": 1}
410
+ json.dump(data, open(path, "w"), indent=2)
411
+ PY
412
+ expect_fail_contains wrong-pair-mapping "judge blind mapping missing arm(s): l2_risk_probes" \
413
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id wrong-pair-mapping --min-fixtures 1
414
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id wrong-pair-mapping --min-fixtures 1 \
415
+ --out-json "$TMP_DIR/wrong-pair-mapping.json" >/dev/null 2>&1 || true
416
+ grep -Fq '"bare_score": 50' "$TMP_DIR/wrong-pair-mapping.json"
417
+ grep -Fq '"solo_score": 75' "$TMP_DIR/wrong-pair-mapping.json"
418
+ grep -Fq '"pair_score": null' "$TMP_DIR/wrong-pair-mapping.json"
419
+ grep -Fq '"pair_margin": null' "$TMP_DIR/wrong-pair-mapping.json"
420
+
421
+ write_fixture malformed-scores F21 50 75 85 true
422
+ python3 - "$TMP_DIR/malformed-scores/F21/judge.json" <<'PY'
423
+ import json, sys
424
+ path = sys.argv[1]
425
+ data = json.load(open(path))
426
+ data["scores_by_arm"] = ["not", "a", "dict"]
427
+ json.dump(data, open(path, "w"), indent=2)
428
+ PY
429
+ expect_fail_contains malformed-scores "bare score missing" \
430
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-scores --min-fixtures 1
431
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-scores --min-fixtures 1 \
432
+ --out-json "$TMP_DIR/malformed-scores.json" >/dev/null 2>&1 || true
433
+ grep -Fq '"bare_score": null' "$TMP_DIR/malformed-scores.json"
434
+ grep -Fq '"solo_score": null' "$TMP_DIR/malformed-scores.json"
435
+ grep -Fq '"pair_score": null' "$TMP_DIR/malformed-scores.json"
436
+ grep -Fq '"pair_margin": null' "$TMP_DIR/malformed-scores.json"
437
+
438
+ write_fixture overrange-score F21 50 75 101 true
439
+ expect_fail_contains overrange-score "l2_risk_probes score missing" \
440
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id overrange-score --min-fixtures 1
441
+
442
+ write_fixture boolean-score F21 true 75 85 true
443
+ expect_fail_contains boolean-score "bare score missing" \
444
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id boolean-score --min-fixtures 1
445
+
446
+ write_fixture boolean-wall-time F21 50 75 85 true true 100
447
+ expect_fail_contains boolean-wall-time "pair/solo wall ratio missing" \
448
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id boolean-wall-time --min-fixtures 1
449
+
450
+ write_fixture dirty-pair-env F21 50 75 85 true
451
+ python3 - "$TMP_DIR/dirty-pair-env/F21/l2_risk_probes/result.json" <<'PY'
452
+ import json, sys
453
+ path = sys.argv[1]
454
+ data = json.load(open(path))
455
+ data["environment_contamination"] = True
456
+ json.dump(data, open(path, "w"), indent=2)
457
+ PY
458
+ expect_fail_contains dirty-pair-env "l2_risk_probes environment contamination" \
459
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-pair-env --min-fixtures 1
460
+
461
+ write_fixture malformed-pair-bool F21 50 75 85 true
462
+ python3 - "$TMP_DIR/malformed-pair-bool/F21/l2_risk_probes/result.json" <<'PY'
463
+ import json, sys
464
+ path = sys.argv[1]
465
+ data = json.load(open(path))
466
+ data["timed_out"] = "false"
467
+ json.dump(data, open(path, "w"), indent=2)
468
+ PY
469
+ expect_fail_contains malformed-pair-bool "l2_risk_probes result timed_out malformed" \
470
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-pair-bool --min-fixtures 1
471
+
472
+ write_fixture malformed-judge-bool F21 50 75 85 true
473
+ python3 - "$TMP_DIR/malformed-judge-bool/F21/judge.json" <<'PY'
474
+ import json, sys
475
+ path = sys.argv[1]
476
+ data = json.load(open(path))
477
+ data["disqualifiers_by_arm"] = {"l2_risk_probes": {"disqualifier": "false"}}
478
+ json.dump(data, open(path, "w"), indent=2)
479
+ PY
480
+ expect_fail_contains malformed-judge-bool "l2_risk_probes judge disqualifier malformed" \
481
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-judge-bool --min-fixtures 1
482
+
483
+ write_fixture missing-pair-diff F21 50 75 85 true
484
+ rm "$TMP_DIR/missing-pair-diff/F21/l2_risk_probes/diff.patch"
485
+ expect_fail_contains missing-pair-diff "l2_risk_probes diff.patch missing" \
486
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-pair-diff --min-fixtures 1
487
+
488
+ write_fixture malformed-result-artifact F21 50 75 85 true
489
+ printf '["not", "a", "dict"]\n' > "$TMP_DIR/malformed-result-artifact/F21/l2_risk_probes/result.json"
490
+ expect_fail_contains malformed-result-artifact "l2_risk_probes result.json malformed" \
491
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-result-artifact --min-fixtures 1
492
+
493
+ write_fixture malformed-verify-artifact F21 50 75 85 true
494
+ printf '["not", "a", "dict"]\n' > "$TMP_DIR/malformed-verify-artifact/F21/l2_risk_probes/verify.json"
495
+ expect_fail_contains malformed-verify-artifact "l2_risk_probes verify.json malformed" \
496
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-verify-artifact --min-fixtures 1
497
+
498
+ write_fixture malformed-judge-artifact F21 50 75 85 true
499
+ printf '["not", "a", "dict"]\n' > "$TMP_DIR/malformed-judge-artifact/F21/judge.json"
500
+ expect_fail_contains malformed-judge-artifact "judge.json malformed" \
501
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-judge-artifact --min-fixtures 1
502
+
503
+ write_fixture custom-pair-arm F21 50 75 82 true 220 110 l2_gated
504
+ write_fixture custom-pair-arm F23 55 75 83 true 280 140 l2_gated
89
505
  python3 "$GATE" --results-root "$TMP_DIR" --run-id custom-pair-arm \
90
- --pair-arm l2_risk_probes \
506
+ --pair-arm l2_gated \
91
507
  --max-pair-solo-wall-ratio 3 \
92
508
  --out-json "$TMP_DIR/custom-pair-arm.json" \
93
509
  --out-md "$TMP_DIR/custom-pair-arm.md"
94
- grep -Fq '"pair_arm": "l2_risk_probes"' "$TMP_DIR/custom-pair-arm.json"
95
- grep -Fq 'l2_risk_probes - solo_claude >= 5' "$TMP_DIR/custom-pair-arm.md"
510
+ grep -Fq '"pair_arm": "l2_gated"' "$TMP_DIR/custom-pair-arm.json"
511
+ grep -Fq 'l2_gated must be evidence-clean' "$TMP_DIR/custom-pair-arm.json"
512
+ grep -Fq 'pair_trigger eligible with a canonical reason' "$TMP_DIR/custom-pair-arm.json"
513
+ grep -Fq 'l2_gated - solo_claude >= 5' "$TMP_DIR/custom-pair-arm.md"
96
514
 
97
515
  write_fixture provider-limit F21 50 75 85 true 37 100 l2_risk_probes
98
516
  python3 - "$TMP_DIR/provider-limit/F21/l2_risk_probes/result.json" <<'PY'
@@ -111,6 +529,7 @@ python3 "$GATE" --results-root "$TMP_DIR" --run-id provider-limit \
111
529
  --out-json "$TMP_DIR/provider-limit.json" \
112
530
  --out-md "$TMP_DIR/provider-limit.md" >/dev/null 2>&1 || true
113
531
  grep -Fq '"pair_margin": null' "$TMP_DIR/provider-limit.json"
532
+ grep -Fq '"avg_pair_margin": null' "$TMP_DIR/provider-limit.json"
114
533
  grep -Fq '"pair_solo_wall_ratio": null' "$TMP_DIR/provider-limit.json"
115
534
  if grep -Fq 'margin -' "$TMP_DIR/provider-limit.md"; then
116
535
  echo "provider-limit row must not report quality margin" >&2
@@ -119,13 +538,73 @@ if grep -Fq 'margin -' "$TMP_DIR/provider-limit.md"; then
119
538
  fi
120
539
 
121
540
  write_fixture slow-pair F21 50 75 85 true 401 100
122
- write_fixture slow-pair F22 60 80 88 true 280 140
541
+ write_fixture slow-pair F23 55 75 83 true 280 140
123
542
  expect_fail_contains slow-pair "pair/solo wall ratio 4.01 > 3.00" \
124
- python3 "$GATE" --results-root "$TMP_DIR" --run-id slow-pair --max-pair-solo-wall-ratio 3
543
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id slow-pair
544
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id slow-pair \
545
+ --max-pair-solo-wall-ratio 5 \
546
+ --out-json "$TMP_DIR/slow-pair-diagnostic.json" >/dev/null
547
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/slow-pair-diagnostic.json"
125
548
 
126
549
  write_fixture one-fixture F21 50 75 85 true
127
550
  expect_fail_contains one-fixture "fixture_count_ok" \
128
551
  python3 "$GATE" --results-root "$TMP_DIR" --run-id one-fixture --out-json "$TMP_DIR/one-fixture.json"
129
552
  grep -Fq '"fixture_count_ok": false' "$TMP_DIR/one-fixture.json"
130
553
 
554
+ write_fixture malformed-dq F21 50 75 85 true
555
+ python3 - "$TMP_DIR/malformed-dq/F21/judge.json" <<'PY'
556
+ import json, sys
557
+ path = sys.argv[1]
558
+ data = json.load(open(path))
559
+ data["disqualifiers_by_arm"] = ["not", "a", "dict"]
560
+ json.dump(data, open(path, "w"), indent=2)
561
+ PY
562
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-dq --min-fixtures 1 \
563
+ --out-json "$TMP_DIR/malformed-dq.json" >/dev/null
564
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/malformed-dq.json"
565
+
566
+ write_fixture malformed-dq-entry F21 50 75 85 true
567
+ python3 - "$TMP_DIR/malformed-dq-entry/F21/judge.json" <<'PY'
568
+ import json, sys
569
+ path = sys.argv[1]
570
+ data = json.load(open(path))
571
+ data["disqualifiers_by_arm"] = {"l2_risk_probes": True}
572
+ json.dump(data, open(path, "w"), indent=2)
573
+ PY
574
+ expect_fail_contains malformed-dq-entry "l2_risk_probes judge disqualifier" \
575
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-dq-entry --min-fixtures 1
576
+
577
+ write_fixture malformed-axis-wrapper F21 50 75 85 true
578
+ python3 - "$TMP_DIR/malformed-axis-wrapper/F21/judge.json" <<'PY'
579
+ import json, sys
580
+ path = sys.argv[1]
581
+ data = json.load(open(path))
582
+ data["_axis_validation"] = ["not", "a", "dict"]
583
+ json.dump(data, open(path, "w"), indent=2)
584
+ PY
585
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-axis-wrapper --min-fixtures 1 \
586
+ --out-json "$TMP_DIR/malformed-axis-wrapper.json" >/dev/null
587
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/malformed-axis-wrapper.json"
588
+
589
+ expect_fail_contains invalid-min-pair-margin "value must be > 0" \
590
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-pair-margin 0
591
+
592
+ expect_fail_contains invalid-max-wall-ratio "value must be finite and > 0" \
593
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --max-pair-solo-wall-ratio nan
594
+
595
+ expect_fail_contains invalid-min-fixtures "value must be > 0" \
596
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-fixtures 0
597
+
598
+ expect_fail_contains invalid-min-bare-headroom "value must be >= 0" \
599
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-bare-headroom -1
600
+
601
+ expect_fail_contains invalid-min-solo-headroom "value must be >= 0" \
602
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --min-solo-headroom -1
603
+
604
+ expect_fail_contains invalid-pair-arm "pair-arm must be one of" \
605
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --pair-arm variant
606
+
607
+ expect_fail_contains retired-pair-arm "pair-arm l2_forced is retired" \
608
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id pass --pair-arm l2_forced
609
+
131
610
  echo "PASS test-full-pipeline-pair-gate"