devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220)
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -19,6 +19,7 @@ write_fixture() {
19
19
  cat > "$dir/judge.json" <<EOF
20
20
  {
21
21
  "scores_by_arm": {"bare": $bare, "solo_claude": $solo},
22
+ "_blind_mapping": {"A": "bare", "B": "solo_claude", "seed": 1},
22
23
  "disqualifiers_by_arm": {}
23
24
  }
24
25
  EOF
@@ -26,14 +27,16 @@ EOF
26
27
  {"timed_out": false, "invoke_failure": false}
27
28
  EOF
28
29
  cat > "$dir/bare/verify.json" <<'EOF'
29
- {"disqualifier": false}
30
+ {"disqualifier": false, "verify_score": 1.0}
30
31
  EOF
32
+ : > "$dir/bare/diff.patch"
31
33
  cat > "$dir/solo_claude/result.json" <<EOF
32
- {"timed_out": $solo_timed_out, "invoke_failure": false}
34
+ {"timed_out": $solo_timed_out, "invoke_failure": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
33
35
  EOF
34
36
  cat > "$dir/solo_claude/verify.json" <<'EOF'
35
- {"disqualifier": false}
37
+ {"disqualifier": false, "verify_score": 1.0}
36
38
  EOF
39
+ : > "$dir/solo_claude/diff.patch"
37
40
  }
38
41
 
39
42
  expect_fail_contains() {
@@ -53,32 +56,346 @@ expect_fail_contains() {
53
56
  fi
54
57
  }
55
58
 
56
- write_fixture one-pass F10 50 75
59
+ write_fixture one-pass F16 50 75
60
+ expect_fail_contains missing-rejected-registry "rejected fixture registry missing" \
61
+ env PAIR_REJECTED_FIXTURES_REGISTRY="$TMP_DIR/missing-registry.sh" \
62
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id one-pass --min-fixtures 1
63
+ empty_registry="$TMP_DIR/empty-registry.sh"
64
+ : > "$empty_registry"
65
+ expect_fail_contains empty-rejected-registry "rejected fixture registry has no fixture entries" \
66
+ env PAIR_REJECTED_FIXTURES_REGISTRY="$empty_registry" \
67
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id one-pass --min-fixtures 1
57
68
  expect_fail_contains min-fixtures 'Verdict: **FAIL**' \
58
69
  python3 "$GATE" --results-root "$TMP_DIR" --run-id one-pass --out-json "$TMP_DIR/one-pass.json"
59
70
  grep -Fq '"fixture_count_ok": false' "$TMP_DIR/one-pass.json"
60
71
 
61
- write_fixture two-pass F10 50 75
62
- write_fixture two-pass F12 60 80
72
+ expect_fail_contains invalid-min-fixtures "value must be > 0" \
73
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id one-pass --min-fixtures 0
74
+
75
+ expect_fail_contains invalid-min-bare-headroom "value must be >= 0" \
76
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id one-pass --min-bare-headroom -1
77
+
78
+ expect_fail_contains invalid-min-solo-headroom "value must be >= 0" \
79
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id one-pass --min-solo-headroom -1
80
+
81
+ write_fixture nan-result F16 50 75
82
+ cat > "$TMP_DIR/nan-result/F16/bare/result.json" <<'EOF'
83
+ {"timed_out": false, "invoke_failure": false, "elapsed_seconds": NaN}
84
+ EOF
85
+ expect_fail_contains nan-result-json "bare result.json malformed" \
86
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id nan-result --min-fixtures 1
87
+
88
+ write_fixture two-pass F16 50 75
89
+ write_fixture two-pass F21 55 75
63
90
  python3 "$GATE" --results-root "$TMP_DIR" --run-id two-pass --out-json "$TMP_DIR/two-pass.json" \
64
- > "$TMP_DIR/two-pass.out"
91
+ --out-md "$TMP_DIR/two-pass.md" > "$TMP_DIR/two-pass.out"
65
92
  grep -Fq '"verdict": "PASS"' "$TMP_DIR/two-pass.json"
66
93
  grep -Fq '"fixture_count_ok": true' "$TMP_DIR/two-pass.json"
94
+ grep -Fq '"min_bare_headroom_required": 5' "$TMP_DIR/two-pass.json"
95
+ grep -Fq '"min_solo_headroom_required": 5' "$TMP_DIR/two-pass.json"
96
+ grep -Fq '"avg_bare_headroom": 7.5' "$TMP_DIR/two-pass.json"
97
+ grep -Fq '"min_bare_headroom": 5' "$TMP_DIR/two-pass.json"
98
+ grep -Fq '"avg_solo_headroom": 5.0' "$TMP_DIR/two-pass.json"
99
+ grep -Fq '"min_solo_headroom": 5' "$TMP_DIR/two-pass.json"
100
+ grep -Fq '"bare_headroom": 10' "$TMP_DIR/two-pass.json"
101
+ grep -Fq '"solo_headroom": 5' "$TMP_DIR/two-pass.json"
102
+ grep -Fq 'headroom >= 5' "$TMP_DIR/two-pass.md"
103
+ grep -Fq 'Average bare headroom: 7.5' "$TMP_DIR/two-pass.md"
104
+ grep -Fq 'Minimum bare headroom: 5' "$TMP_DIR/two-pass.md"
105
+ grep -Fq 'Average solo_claude headroom: 5.0' "$TMP_DIR/two-pass.md"
106
+ grep -Fq 'Minimum solo_claude headroom: 5' "$TMP_DIR/two-pass.md"
107
+ grep -Fq 'Fixtures passed: 2/2 (minimum required: 2)' "$TMP_DIR/two-pass.md"
108
+ grep -Fq '| Fixture | Bare | Bare headroom | Solo_claude | Solo_claude headroom | Status | Reason |' "$TMP_DIR/two-pass.md"
109
+ grep -Fq '| F16 | 50 | 10 | 75 | 5 | PASS | |' "$TMP_DIR/two-pass.md"
110
+ grep -Fq '| F21 | 55 | 5 | 75 | 5 | PASS | |' "$TMP_DIR/two-pass.md"
111
+
112
+ write_fixture rejected-direct F2 50 75
113
+ write_fixture rejected-direct F16 50 75
114
+ expect_fail_contains rejected-direct "fixture rejected for pair-candidate runs" \
115
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id rejected-direct --min-fixtures 1
116
+
117
+ write_fixture rejected-shadow-direct S3-cli-ticket-assignment 50 75
118
+ write_fixture rejected-shadow-direct F16 50 75
119
+ expect_fail_contains rejected-shadow-direct "fixture rejected for pair-candidate runs" \
120
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id rejected-shadow-direct --min-fixtures 1
121
+
122
+ write_fixture marginal-bare F16 59 66
123
+ write_fixture marginal-bare F21 50 75
124
+ expect_fail_contains marginal-bare "bare headroom 1 < 5" \
125
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id marginal-bare
126
+
127
+ write_fixture marginal-solo F16 50 78
128
+ write_fixture marginal-solo F21 50 75
129
+ expect_fail_contains marginal-solo "solo_claude headroom 2 < 5" \
130
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id marginal-solo
67
131
 
68
- write_fixture solo-ceiling F10 50 75
69
- write_fixture solo-ceiling F12 20 92
132
+ write_fixture explicit-zero-margin F16 60 80
133
+ write_fixture explicit-zero-margin F21 50 75
134
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id explicit-zero-margin \
135
+ --min-bare-headroom 0 --min-solo-headroom 0 \
136
+ --out-json "$TMP_DIR/explicit-zero-margin.json"
137
+ grep -Fq '"verdict": "PASS"' "$TMP_DIR/explicit-zero-margin.json"
138
+
139
+ write_fixture solo-ceiling F16 50 75
140
+ write_fixture solo-ceiling F21 20 92
70
141
  expect_fail_contains solo-ceiling "solo_claude score 92 > 80" \
71
142
  python3 "$GATE" --results-root "$TMP_DIR" --run-id solo-ceiling
72
143
 
73
- write_fixture dirty-solo F10 50 75
74
- write_fixture dirty-solo F12 20 70 true
144
+ write_fixture dirty-solo F16 50 75
145
+ write_fixture dirty-solo F21 20 70 true
75
146
  expect_fail_contains dirty-solo "solo_claude timed out" \
76
147
  python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-solo
77
148
 
78
- write_fixture missing-artifact F10 50 75
79
- write_fixture missing-artifact F12 20 70
80
- rm "$TMP_DIR/missing-artifact/F12/solo_claude/verify.json"
149
+ write_fixture dirty-bare F16 50 75
150
+ write_fixture dirty-bare F21 20 70
151
+ python3 - "$TMP_DIR/dirty-bare/F16/bare/result.json" <<'PY'
152
+ import json, sys
153
+ path = sys.argv[1]
154
+ data = json.load(open(path))
155
+ data["disqualifier"] = True
156
+ json.dump(data, open(path, "w"), indent=2)
157
+ PY
158
+ expect_fail_contains dirty-bare "bare result disqualifier" \
159
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-bare
160
+
161
+ write_fixture dirty-solo-axis F16 50 75
162
+ write_fixture dirty-solo-axis F21 20 70
163
+ python3 - "$TMP_DIR/dirty-solo-axis/F21/judge.json" <<'PY'
164
+ import json, sys
165
+ path = sys.argv[1]
166
+ data = json.load(open(path))
167
+ data["_blind_mapping"] = {"A": "bare", "B": "solo_claude", "seed": 1}
168
+ data["_axis_validation"] = {
169
+ "out_of_range_count": 1,
170
+ "out_of_range_cells": [{"breakdown": "b_breakdown", "axis": "quality", "value": 26}],
171
+ "axis_range": [0, 25],
172
+ }
173
+ json.dump(data, open(path, "w"), indent=2)
174
+ PY
175
+ expect_fail_contains dirty-solo-axis "solo_claude judge axis-invalid (1)" \
176
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-solo-axis
177
+
178
+ write_fixture unmapped-axis F16 50 75
179
+ write_fixture unmapped-axis F21 20 70
180
+ python3 - "$TMP_DIR/unmapped-axis/F21/judge.json" <<'PY'
181
+ import json, sys
182
+ path = sys.argv[1]
183
+ data = json.load(open(path))
184
+ data["_blind_mapping"] = {"A": "bare", "B": "variant", "seed": 1}
185
+ data["_axis_validation"] = {
186
+ "out_of_range_count": 1,
187
+ "out_of_range_cells": [{"breakdown": "b_breakdown", "axis": "quality", "value": 26}],
188
+ "axis_range": [0, 25],
189
+ }
190
+ json.dump(data, open(path, "w"), indent=2)
191
+ PY
192
+ expect_fail_contains unmapped-axis "judge axis-invalid unmapped (1)" \
193
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id unmapped-axis
194
+
195
+ write_fixture missing-mapping F16 50 75
196
+ write_fixture missing-mapping F21 20 70
197
+ python3 - "$TMP_DIR/missing-mapping/F21/judge.json" <<'PY'
198
+ import json, sys
199
+ path = sys.argv[1]
200
+ data = json.load(open(path))
201
+ del data["_blind_mapping"]
202
+ json.dump(data, open(path, "w"), indent=2)
203
+ PY
204
+ expect_fail_contains missing-mapping "judge blind mapping missing" \
205
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-mapping
206
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-mapping \
207
+ --out-json "$TMP_DIR/missing-mapping.json" >/dev/null 2>&1 || true
208
+ grep -Fq '"bare_score": null' "$TMP_DIR/missing-mapping.json"
209
+ grep -Fq '"solo_score": null' "$TMP_DIR/missing-mapping.json"
210
+
211
+ write_fixture malformed-mapping-axis F16 50 75
212
+ write_fixture malformed-mapping-axis F21 20 70
213
+ python3 - "$TMP_DIR/malformed-mapping-axis/F21/judge.json" <<'PY'
214
+ import json, sys
215
+ path = sys.argv[1]
216
+ data = json.load(open(path))
217
+ data["_blind_mapping"] = "not-a-dict"
218
+ data["_axis_validation"] = {
219
+ "out_of_range_count": 1,
220
+ "out_of_range_cells": [{"breakdown": "b_breakdown", "axis": "quality", "value": 26}],
221
+ "axis_range": [0, 25],
222
+ }
223
+ json.dump(data, open(path, "w"), indent=2)
224
+ PY
225
+ expect_fail_contains malformed-mapping-axis "judge blind mapping missing" \
226
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-mapping-axis
227
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-mapping-axis \
228
+ --out-json "$TMP_DIR/malformed-mapping-axis.json" >/dev/null 2>&1 || true
229
+ grep -Fq '"bare_score": null' "$TMP_DIR/malformed-mapping-axis.json"
230
+ grep -Fq '"solo_score": null' "$TMP_DIR/malformed-mapping-axis.json"
231
+
232
+ write_fixture wrong-mapping F16 50 75
233
+ write_fixture wrong-mapping F21 20 70
234
+ python3 - "$TMP_DIR/wrong-mapping/F21/judge.json" <<'PY'
235
+ import json, sys
236
+ path = sys.argv[1]
237
+ data = json.load(open(path))
238
+ data["_blind_mapping"] = {"A": "bare", "B": "variant", "seed": 1}
239
+ json.dump(data, open(path, "w"), indent=2)
240
+ PY
241
+ expect_fail_contains wrong-mapping "judge blind mapping missing arm(s): solo_claude" \
242
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id wrong-mapping
243
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id wrong-mapping \
244
+ --out-json "$TMP_DIR/wrong-mapping.json" >/dev/null 2>&1 || true
245
+ grep -Fq '"bare_score": 20' "$TMP_DIR/wrong-mapping.json"
246
+ grep -Fq '"solo_score": null' "$TMP_DIR/wrong-mapping.json"
247
+
248
+ write_fixture malformed-scores F16 50 75
249
+ write_fixture malformed-scores F21 20 70
250
+ python3 - "$TMP_DIR/malformed-scores/F21/judge.json" <<'PY'
251
+ import json, sys
252
+ path = sys.argv[1]
253
+ data = json.load(open(path))
254
+ data["scores_by_arm"] = ["not", "a", "dict"]
255
+ json.dump(data, open(path, "w"), indent=2)
256
+ PY
257
+ expect_fail_contains malformed-scores "bare score missing" \
258
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-scores
259
+ python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-scores \
260
+ --out-json "$TMP_DIR/malformed-scores.json" >/dev/null 2>&1 || true
261
+ grep -Fq '"bare_score": null' "$TMP_DIR/malformed-scores.json"
262
+ grep -Fq '"solo_score": null' "$TMP_DIR/malformed-scores.json"
263
+
# Scenario: overrange-score
# F21 is written with 101 in the fourth write_fixture argument; the gate run
# is expected to fail with a "solo_claude score missing" message.
scenario=overrange-score
write_fixture "$scenario" F16 50 75
write_fixture "$scenario" F21 20 101
expect_fail_contains "$scenario" "solo_claude score missing" \
  python3 "$GATE" \
  --results-root "$TMP_DIR" \
  --run-id "$scenario"

# Scenario: boolean-score
# F16 is written with the literal `true` in the third write_fixture argument;
# the gate run is expected to fail with a "bare score missing" message.
scenario=boolean-score
write_fixture "$scenario" F16 true 75
write_fixture "$scenario" F21 20 70
expect_fail_contains "$scenario" "bare score missing" \
  python3 "$GATE" \
  --results-root "$TMP_DIR" \
  --run-id "$scenario"
273
+
# Scenario: partial-baseline
# F21's solo_claude arm is edited to carry verify_score 0.75 and FAIL
# verdicts. The gate is then run without any expected-failure wrapper and its
# JSON and Markdown reports are checked for a PASS verdict and the F21 row.
write_fixture partial-baseline F16 50 75
write_fixture partial-baseline F21 20 70
python3 - "$TMP_DIR/partial-baseline/F21/solo_claude/verify.json" <<'PY'
import json, sys

path = sys.argv[1]
# Context managers close (and flush) the handles deterministically instead of
# leaving anonymous file objects for the garbage collector.
with open(path) as fh:
    data = json.load(fh)
data["verify_score"] = 0.75
with open(path, "w") as fh:
    json.dump(data, fh, indent=2)
PY
python3 - "$TMP_DIR/partial-baseline/F21/solo_claude/result.json" <<'PY'
import json, sys

path = sys.argv[1]
with open(path) as fh:
    data = json.load(fh)
data["terminal_verdict"] = "FAIL"
data["verify_verdict"] = "FAIL"
with open(path, "w") as fh:
    json.dump(data, fh, indent=2)
PY
# Output is not redirected here, so the gate's own stdout/stderr stay visible.
python3 "$GATE" --results-root "$TMP_DIR" --run-id partial-baseline \
  --out-json "$TMP_DIR/partial-baseline.json" \
  --out-md "$TMP_DIR/partial-baseline.md"
grep -Fq '"verdict": "PASS"' "$TMP_DIR/partial-baseline.json"
grep -Fq '| F21 | 20 | 40 | 70 | 10 | PASS | |' "$TMP_DIR/partial-baseline.md"
296
+
# Scenario: dirty-bare-env
# F16's bare result.json is flagged with environment_contamination = true;
# the gate run is expected to fail with a "bare environment contamination"
# message. write_fixture and expect_fail_contains are helpers defined earlier
# in this script.
write_fixture dirty-bare-env F16 50 75
write_fixture dirty-bare-env F21 20 70
python3 - "$TMP_DIR/dirty-bare-env/F16/bare/result.json" <<'PY'
import json, sys

path = sys.argv[1]
# Context managers close (and flush) the handles deterministically instead of
# leaving anonymous file objects for the garbage collector.
with open(path) as fh:
    data = json.load(fh)
data["environment_contamination"] = True
with open(path, "w") as fh:
    json.dump(data, fh, indent=2)
PY
expect_fail_contains dirty-bare-env "bare environment contamination" \
  python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-bare-env
308
+
# Scenario: malformed-bare-bool
# F16's bare result.json gets timed_out set to the *string* "false" rather
# than a JSON boolean; the gate run is expected to fail with a
# "bare result timed_out malformed" message.
write_fixture malformed-bare-bool F16 50 75
write_fixture malformed-bare-bool F21 20 70
python3 - "$TMP_DIR/malformed-bare-bool/F16/bare/result.json" <<'PY'
import json, sys

path = sys.argv[1]
# Context managers close (and flush) the handles deterministically instead of
# leaving anonymous file objects for the garbage collector.
with open(path) as fh:
    data = json.load(fh)
data["timed_out"] = "false"
with open(path, "w") as fh:
    json.dump(data, fh, indent=2)
PY
expect_fail_contains malformed-bare-bool "bare result timed_out malformed" \
  python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-bare-bool

# Scenario: malformed-judge-bool
# F16's judge.json gets a bare disqualifier entry whose value is the *string*
# "false"; the gate run is expected to fail with a
# "bare judge disqualifier malformed" message.
write_fixture malformed-judge-bool F16 50 75
write_fixture malformed-judge-bool F21 20 70
python3 - "$TMP_DIR/malformed-judge-bool/F16/judge.json" <<'PY'
import json, sys

path = sys.argv[1]
with open(path) as fh:
    data = json.load(fh)
data["disqualifiers_by_arm"] = {"bare": {"disqualifier": "false"}}
with open(path, "w") as fh:
    json.dump(data, fh, indent=2)
PY
expect_fail_contains malformed-judge-bool "bare judge disqualifier malformed" \
  python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-judge-bool
332
+
# Scenario: missing-artifact
# Remove F21's solo_claude verify.json after the fixtures are written; the
# gate run is expected to fail with a "solo_claude verify.json missing"
# message.
scenario=missing-artifact
scenario_dir="$TMP_DIR/$scenario"
write_fixture "$scenario" F16 50 75
write_fixture "$scenario" F21 20 70
rm "$scenario_dir/F21/solo_claude/verify.json"
expect_fail_contains "$scenario" "solo_claude verify.json missing" \
  python3 "$GATE" \
  --results-root "$TMP_DIR" \
  --run-id "$scenario"
83
338
 
# Scenario: malformed-result-artifact
# Overwrite F21's solo_claude result.json with a JSON list; the gate run is
# expected to fail with a "solo_claude result.json malformed" message.
scenario=malformed-result-artifact
write_fixture "$scenario" F16 50 75
write_fixture "$scenario" F21 20 70
printf '%s\n' '["not", "a", "dict"]' > "$TMP_DIR/$scenario/F21/solo_claude/result.json"
expect_fail_contains "$scenario" "solo_claude result.json malformed" \
  python3 "$GATE" \
  --results-root "$TMP_DIR" \
  --run-id "$scenario"

# Scenario: malformed-verify-artifact
# Same overwrite applied to F21's solo_claude verify.json; expected message
# is "solo_claude verify.json malformed".
scenario=malformed-verify-artifact
write_fixture "$scenario" F16 50 75
write_fixture "$scenario" F21 20 70
printf '%s\n' '["not", "a", "dict"]' > "$TMP_DIR/$scenario/F21/solo_claude/verify.json"
expect_fail_contains "$scenario" "solo_claude verify.json malformed" \
  python3 "$GATE" \
  --results-root "$TMP_DIR" \
  --run-id "$scenario"

# Scenario: malformed-judge-artifact
# Same overwrite applied to F21's judge.json; expected message is
# "judge.json malformed".
scenario=malformed-judge-artifact
write_fixture "$scenario" F16 50 75
write_fixture "$scenario" F21 20 70
printf '%s\n' '["not", "a", "dict"]' > "$TMP_DIR/$scenario/F21/judge.json"
expect_fail_contains "$scenario" "judge.json malformed" \
  python3 "$GATE" \
  --results-root "$TMP_DIR" \
  --run-id "$scenario"
356
+
# Scenario: missing-diff
# Remove F21's solo_claude diff.patch after the fixtures are written; the gate
# run is expected to fail with a "solo_claude diff.patch missing" message.
scenario=missing-diff
scenario_dir="$TMP_DIR/$scenario"
write_fixture "$scenario" F16 50 75
write_fixture "$scenario" F21 20 70
rm "$scenario_dir/F21/solo_claude/diff.patch"
expect_fail_contains "$scenario" "solo_claude diff.patch missing" \
  python3 "$GATE" \
  --results-root "$TMP_DIR" \
  --run-id "$scenario"
362
+
# Scenario: malformed-dq
# F21's judge.json gets disqualifiers_by_arm replaced by a list. The gate is
# run with --min-fixtures 1 and without an expected-failure wrapper, and the
# JSON report must contain a PASS verdict.
write_fixture malformed-dq F16 50 75
write_fixture malformed-dq F21 20 70
python3 - "$TMP_DIR/malformed-dq/F21/judge.json" <<'PY'
import json, sys

path = sys.argv[1]
# Context managers close (and flush) the handles deterministically instead of
# leaving anonymous file objects for the garbage collector.
with open(path) as fh:
    data = json.load(fh)
data["disqualifiers_by_arm"] = ["not", "a", "dict"]
with open(path, "w") as fh:
    json.dump(data, fh, indent=2)
PY
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-dq --min-fixtures 1 \
  --out-json "$TMP_DIR/malformed-dq.json" >/dev/null
grep -Fq '"verdict": "PASS"' "$TMP_DIR/malformed-dq.json"

# Scenario: malformed-dq-entry
# F21's judge.json maps solo_claude directly to a boolean inside
# disqualifiers_by_arm; the gate run is expected to fail with a message
# containing "solo_claude judge disqualifier".
write_fixture malformed-dq-entry F16 50 75
write_fixture malformed-dq-entry F21 20 70
python3 - "$TMP_DIR/malformed-dq-entry/F21/judge.json" <<'PY'
import json, sys

path = sys.argv[1]
with open(path) as fh:
    data = json.load(fh)
data["disqualifiers_by_arm"] = {"solo_claude": True}
with open(path, "w") as fh:
    json.dump(data, fh, indent=2)
PY
expect_fail_contains malformed-dq-entry "solo_claude judge disqualifier" \
  python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-dq-entry
387
+
# Scenario: malformed-axis-wrapper
# F21's judge.json gets _axis_validation replaced by a list. The gate is run
# with --min-fixtures 1 and without an expected-failure wrapper, and the JSON
# report must contain a PASS verdict.
write_fixture malformed-axis-wrapper F16 50 75
write_fixture malformed-axis-wrapper F21 20 70
python3 - "$TMP_DIR/malformed-axis-wrapper/F21/judge.json" <<'PY'
import json, sys

path = sys.argv[1]
# Context managers close (and flush) the handles deterministically instead of
# leaving anonymous file objects for the garbage collector.
with open(path) as fh:
    data = json.load(fh)
data["_axis_validation"] = ["not", "a", "dict"]
with open(path, "w") as fh:
    json.dump(data, fh, indent=2)
PY
python3 "$GATE" --results-root "$TMP_DIR" --run-id malformed-axis-wrapper --min-fixtures 1 \
  --out-json "$TMP_DIR/malformed-axis-wrapper.json" >/dev/null
grep -Fq '"verdict": "PASS"' "$TMP_DIR/malformed-axis-wrapper.json"

# Final success marker, printed once every scenario above has run.
echo "✓ test-headroom-gate"