devlyn-cli 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (219) hide show
  1. package/AGENTS.md +1 -1
  2. package/CLAUDE.md +2 -2
  3. package/README.md +82 -29
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +211 -18
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +3 -2
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +1 -1
  201. package/config/skills/_shared/runtime-principles.md +5 -8
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  205. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  206. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  207. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  208. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  209. package/config/skills/devlyn:resolve/SKILL.md +49 -18
  210. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  211. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  212. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  213. package/config/skills/devlyn:resolve/references/phases/verify.md +62 -28
  214. package/config/skills/devlyn:resolve/references/state-schema.md +7 -4
  215. package/package.json +47 -2
  216. package/scripts/lint-fixtures.sh +349 -0
  217. package/scripts/lint-shadow-fixtures.sh +58 -0
  218. package/scripts/lint-skills.sh +3642 -92
  219. /package/{optional-skills → config/skills}/devlyn:design-ui/SKILL.md +0 -0
@@ -0,0 +1,1189 @@
1
+ #!/usr/bin/env bash
2
+ # Regression tests for ship-gate.py summary semantics.
3
+
4
# Strict mode: abort on any failing command, unset variable, or failing pipeline stage.
+ set -euo pipefail
5
+
6
# Absolute path of the directory holding this script, and of its parent (the benchmark root).
+ SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
7
+ BENCH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
8
# The two scripts exercised by these tests; both are looked up next to this file.
+ GATE="$SCRIPT_DIR/ship-gate.py"
9
+ COMPILE="$SCRIPT_DIR/compile-report.py"
10
# Scratch directory for captured command output and for the baseline backup copy.
+ TMP_DIR="$(mktemp -d /tmp/ship-gate-test.XXXXXX)"
11
# Baseline file that one test below overwrites; a backup path and an "existed" flag
# are kept so the exit handler can put the original state back.
+ BASELINE="$BENCH_ROOT/history/baselines/shipped.json"
12
+ BASELINE_BACKUP="$TMP_DIR/shipped.backup.json"
13
+ BASELINE_EXISTED=0
14
+
15
# Undo every filesystem side effect of this test run. Installed as the EXIT trap.
#
# Reads globals: BASELINE, BASELINE_BACKUP, BASELINE_EXISTED, BENCH_ROOT,
# RUN_PREFIX, TMP_DIR.
#
# The baseline is put back FIRST. The script runs under `set -e`, which also
# applies inside the trap, so any later cleanup command that fails ends the
# handler; restoring first guarantees the pre-existing baseline is never left
# overwritten because an unrelated `rm` failed.
restore_baseline() {
  if [ "$BASELINE_EXISTED" -eq 1 ]; then
    # A baseline existed before the run: copy the saved bytes back.
    mkdir -p "$(dirname "$BASELINE")"
    cp "$BASELINE_BACKUP" "$BASELINE"
  else
    # No baseline existed: remove the one the tests created, then remove the
    # directories only if they are empty (rmdir refuses otherwise; errors ignored).
    rm -f "$BASELINE"
    rmdir "$BENCH_ROOT/history/baselines" "$BENCH_ROOT/history" 2>/dev/null || true
  fi
  # The backup lives inside TMP_DIR, so this must come after the restore above.
  rm -rf "$TMP_DIR"
  # Remove every results directory created by this run. ${RUN_PREFIX:?} aborts
  # instead of letting an empty or unset prefix expand the glob to "results/*".
  rm -rf "$BENCH_ROOT/results/${RUN_PREFIX:?}"*
}
26
+
27
# Unique per-run prefix for results directories, derived from the mktemp directory name.
+ RUN_PREFIX="ship-gate-test-$(basename "$TMP_DIR")"
28
# Save a copy of any existing baseline so restore_baseline can put it back on exit.
+ if [ -f "$BASELINE" ]; then
29
+ BASELINE_EXISTED=1
30
+ cp "$BASELINE" "$BASELINE_BACKUP"
31
+ fi
32
# From here on, cleanup runs on every exit path (success, failure, or explicit exit).
+ trap restore_baseline EXIT
33
+
34
# Write a synthetic summary.json for a run under "$BENCH_ROOT/results/<run_id>/".
#
# Usage: write_summary <run_id> <non_edge_passes> [f8_score]
#   run_id           - results sub-directory name, also stored as "run_id" in the file
#   non_edge_passes  - total number of non-edge rows to emit: (non_edge_passes - 1)
#                      "trivial" rows named F1, F2, ... plus one "e2e" F9 row
#   f8_score         - variant_score of the single "edge" F8 row (default 80)
#
# Every row carries margin 5, bare score 50, and solo_claude score 55 with
# disqualifier false. The summary-level counters are fixed constants
# (margin_ge_5_count and gated_fixtures are both 7) regardless of the row count.
write_summary() {
  local run_id="$1" non_edge_passes="$2" f8_score="${3:-80}"
  local run_dir="$BENCH_ROOT/results/$run_id"
  mkdir -p "$run_dir"
  python3 - "$run_dir/summary.json" "$run_id" "$non_edge_passes" "$f8_score" <<'PY'
import json
import sys

out_path, run_id = sys.argv[1], sys.argv[2]
non_edge_passes, f8_score = int(sys.argv[3]), int(sys.argv[4])


def make_row(fixture, category, variant_score):
    """Build one fixture row; only the three arguments vary between rows."""
    return {
        "fixture": fixture,
        "category": category,
        "variant_score": variant_score,
        "margin": 5,
        "margins": {"solo_over_bare": 5},
        "arms": {
            "bare": {"score": 50},
            "solo_claude": {"score": 55, "disqualifier": False},
        },
    }


# range(1, n) yields n - 1 trivial rows; the F9 row below brings the
# non-edge total up to n.
rows = [make_row(f"F{i}", "trivial", 90) for i in range(1, non_edge_passes)]
rows.append(make_row("F9-e2e-ideate-to-resolve", "e2e", 90))
rows.append(make_row("F8-known-limit-ambiguous", "edge", f8_score))

summary = {
    "run_id": run_id,
    "hard_floor_violations": 0,
    "margin_ge_5_count": 7,
    "gated_fixtures": 7,
    "margin_avg": 5,
    "arms_present": {"solo_claude": True},
    "margins_avg": {"solo_over_bare": 5},
    "rows": rows,
}
with open(out_path, "w", encoding="utf8") as fh:
    json.dump(summary, fh)
PY
}
94
+
95
# Assert that a command fails AND that its combined output contains a fixed string.
#
# Usage: expect_fail_contains <label> <needle> <command> [args...]
#   label    - names the capture file "$TMP_DIR/<label>.out" and appears in diagnostics
#   needle   - literal text (grep -F, not a regex) that must occur in the output
#   command  - executed with stdout and stderr redirected into the capture file
#
# When either expectation is violated, a diagnostic and the captured output go
# to stderr and the whole script exits with status 1.
expect_fail_contains() {
  local label="$1"
  local needle="$2"
  shift 2
  local out="$TMP_DIR/$label.out"
  if "$@" > "$out" 2>&1; then
    echo "expected failure for $label" >&2
    cat "$out" >&2
    exit 1
  fi
  # "--" ends option parsing, so a needle that begins with "-" is still
  # treated as the pattern rather than as a grep option.
  if ! grep -Fq -- "$needle" "$out"; then
    echo "missing expected text for $label: $needle" >&2
    cat "$out" >&2
    exit 1
  fi
}
111
+
112
# Assert that a command succeeds.
#
# Usage: expect_pass <label> <command> [args...]
# The command's stdout and stderr are captured in "$TMP_DIR/<label>.out" so
# later assertions can inspect them. On failure, a diagnostic and the captured
# output go to stderr and the whole script exits with status 1.
expect_pass() {
  local name="$1"
  shift
  local capture="$TMP_DIR/$name.out"
  if "$@" > "$capture" 2>&1; then
    return 0
  fi
  echo "expected pass for $name" >&2
  cat "$capture" >&2
  exit 1
}
122
+
123
# Case: summary.json whose top-level value is a JSON array instead of an object.
# The gate must fail and report "expected object".
+ MALFORMED_SUMMARY_RUN="$RUN_PREFIX-malformed-summary"
124
+ mkdir -p "$BENCH_ROOT/results/$MALFORMED_SUMMARY_RUN"
125
+ printf '["not", "a", "dict"]\n' > "$BENCH_ROOT/results/$MALFORMED_SUMMARY_RUN/summary.json"
126
+ expect_fail_contains malformed-summary \
127
+ "measurement invalid: malformed summary.json (expected object)" \
128
+ python3 "$GATE" --run-id "$MALFORMED_SUMMARY_RUN"
129
+
130
# Case: summary.json containing a bare NaN token for margin_avg.
# The gate must fail and report the file as invalid JSON.
+ NAN_SUMMARY_RUN="$RUN_PREFIX-nan-summary"
131
+ mkdir -p "$BENCH_ROOT/results/$NAN_SUMMARY_RUN"
132
+ cat > "$BENCH_ROOT/results/$NAN_SUMMARY_RUN/summary.json" <<'JSON'
133
+ {"hard_floor_violations": 0, "margin_avg": NaN, "rows": []}
134
+ JSON
135
+ expect_fail_contains nan-summary \
136
+ "measurement invalid: malformed summary.json (invalid JSON)" \
137
+ python3 "$GATE" --run-id "$NAN_SUMMARY_RUN"
138
+
139
# Case: "rows" holds a string entry instead of row objects.
# Run with --accept-missing; the gate must fail with the non-object-entries message.
+ MALFORMED_SUMMARY_ROWS_RUN="$RUN_PREFIX-malformed-summary-rows"
140
+ mkdir -p "$BENCH_ROOT/results/$MALFORMED_SUMMARY_ROWS_RUN"
141
+ cat > "$BENCH_ROOT/results/$MALFORMED_SUMMARY_ROWS_RUN/summary.json" <<'JSON'
142
+ {
143
+ "hard_floor_violations": 0,
144
+ "margin_ge_5_count": 7,
145
+ "gated_fixtures": 7,
146
+ "rows": ["not-a-row"]
147
+ }
148
+ JSON
149
+ expect_fail_contains malformed-summary-rows \
150
+ "summary rows contain non-object entries" \
151
+ python3 "$GATE" --run-id "$MALFORMED_SUMMARY_ROWS_RUN" --accept-missing
152
+
153
# Case: hard_floor_violations is the string "zero" rather than a number.
# Run with --accept-missing; the gate must fail naming that field as missing or malformed.
+ MALFORMED_SUMMARY_COUNTS_RUN="$RUN_PREFIX-malformed-summary-counts"
154
+ mkdir -p "$BENCH_ROOT/results/$MALFORMED_SUMMARY_COUNTS_RUN"
155
+ cat > "$BENCH_ROOT/results/$MALFORMED_SUMMARY_COUNTS_RUN/summary.json" <<'JSON'
156
+ {
157
+ "hard_floor_violations": "zero",
158
+ "margin_ge_5_count": 7,
159
+ "gated_fixtures": 7,
160
+ "rows": []
161
+ }
162
+ JSON
163
+ expect_fail_contains malformed-summary-counts \
164
+ "summary hard_floor_violations missing or malformed" \
165
+ python3 "$GATE" --run-id "$MALFORMED_SUMMARY_COUNTS_RUN" --accept-missing
166
+
167
# Case: a single row where many fields carry the wrong type (numeric category,
# string margins, string axis-validation counts on the row and on every arm).
# Only the "variant axis count malformed" message is asserted here; the other
# malformed fields are present in the fixture but are not checked individually.
+ MALFORMED_SUMMARY_FIELD_TYPES_RUN="$RUN_PREFIX-malformed-summary-field-types"
168
+ mkdir -p "$BENCH_ROOT/results/$MALFORMED_SUMMARY_FIELD_TYPES_RUN"
169
+ cat > "$BENCH_ROOT/results/$MALFORMED_SUMMARY_FIELD_TYPES_RUN/summary.json" <<'JSON'
170
+ {
171
+ "hard_floor_violations": 0,
172
+ "margin_ge_5_count": 7,
173
+ "gated_fixtures": 7,
174
+ "arms_present": {"solo_claude": true},
175
+ "margins_avg": {"solo_over_bare": "bad"},
176
+ "rows": [
177
+ {
178
+ "fixture": "F9-e2e-ideate-to-resolve",
179
+ "category": 123,
180
+ "margin": "bad",
181
+ "margins": {"solo_over_bare": "bad"},
182
+ "_axis_validation_unmapped_out_of_range_count": "bad",
183
+ "arms": {
184
+ "variant": {"_axis_validation_out_of_range_count": "bad"},
185
+ "bare": {"score": 50, "_axis_validation_out_of_range_count": "bad"},
186
+ "solo_claude": {"score": 55, "disqualifier": false, "_axis_validation_out_of_range_count": "bad"}
187
+ }
188
+ }
189
+ ]
190
+ }
191
+ JSON
192
+ expect_fail_contains malformed-summary-field-types \
193
+ "variant axis count malformed" \
194
+ python3 "$GATE" --run-id "$MALFORMED_SUMMARY_FIELD_TYPES_RUN" --accept-missing
195
+
196
# Case: start from a well-formed synthetic summary, then replace the whole
# "arms_present" value with a JSON array. The gate must fail and report
# "summary arms_present malformed".
MALFORMED_ARMS_PRESENT_WRAPPER_RUN="$RUN_PREFIX-malformed-arms-present-wrapper"
write_summary "$MALFORMED_ARMS_PRESENT_WRAPPER_RUN" 7
python3 - "$BENCH_ROOT/results/$MALFORMED_ARMS_PRESENT_WRAPPER_RUN/summary.json" <<'PY'
import json
import sys

path = sys.argv[1]
# Context managers close the handles deterministically, so the rewritten file
# is fully flushed before the gate reads it.
with open(path, encoding="utf8") as fh:
    summary = json.load(fh)
summary["arms_present"] = ["not", "an", "object"]
with open(path, "w", encoding="utf8") as fh:
    json.dump(summary, fh, indent=2)
PY
expect_fail_contains malformed-arms-present-wrapper \
  "summary arms_present malformed" \
  python3 "$GATE" --run-id "$MALFORMED_ARMS_PRESENT_WRAPPER_RUN"
210
+
211
# Case: start from a well-formed synthetic summary, then replace the whole
# "margins_avg" value with a JSON array. The gate must fail and report
# "summary margins_avg malformed".
MALFORMED_MARGINS_AVG_WRAPPER_RUN="$RUN_PREFIX-malformed-margins-avg-wrapper"
write_summary "$MALFORMED_MARGINS_AVG_WRAPPER_RUN" 7
python3 - "$BENCH_ROOT/results/$MALFORMED_MARGINS_AVG_WRAPPER_RUN/summary.json" <<'PY'
import json
import sys

path = sys.argv[1]
# Context managers close the handles deterministically, so the rewritten file
# is fully flushed before the gate reads it.
with open(path, encoding="utf8") as fh:
    summary = json.load(fh)
summary["margins_avg"] = ["not", "an", "object"]
with open(path, "w", encoding="utf8") as fh:
    json.dump(summary, fh, indent=2)
PY
expect_fail_contains malformed-margins-avg-wrapper \
  "summary margins_avg malformed" \
  python3 "$GATE" --run-id "$MALFORMED_MARGINS_AVG_WRAPPER_RUN"
225
+
226
# Case: "arms_present" stays an object, but its solo_claude entry becomes the
# string "false" instead of a boolean. The gate must fail and report
# "summary arms_present.solo_claude malformed".
MALFORMED_ARMS_PRESENT_RUN="$RUN_PREFIX-malformed-arms-present"
write_summary "$MALFORMED_ARMS_PRESENT_RUN" 7
python3 - "$BENCH_ROOT/results/$MALFORMED_ARMS_PRESENT_RUN/summary.json" <<'PY'
import json
import sys

path = sys.argv[1]
# Context managers close the handles deterministically, so the rewritten file
# is fully flushed before the gate reads it.
with open(path, encoding="utf8") as fh:
    summary = json.load(fh)
summary["arms_present"]["solo_claude"] = "false"
with open(path, "w", encoding="utf8") as fh:
    json.dump(summary, fh, indent=2)
PY
expect_fail_contains malformed-arms-present \
  "summary arms_present.solo_claude malformed" \
  python3 "$GATE" --run-id "$MALFORMED_ARMS_PRESENT_RUN"
240
+
241
# Case: the first row's solo_claude disqualifier becomes the string "false"
# instead of a boolean. write_summary emits F1 as the first row, so the gate
# must fail and name that fixture: "F1 L1 disqualifier malformed".
MALFORMED_L1_DQ_SUMMARY_RUN="$RUN_PREFIX-malformed-l1-dq-summary"
write_summary "$MALFORMED_L1_DQ_SUMMARY_RUN" 7
python3 - "$BENCH_ROOT/results/$MALFORMED_L1_DQ_SUMMARY_RUN/summary.json" <<'PY'
import json
import sys

path = sys.argv[1]
# Context managers close the handles deterministically, so the rewritten file
# is fully flushed before the gate reads it.
with open(path, encoding="utf8") as fh:
    summary = json.load(fh)
summary["rows"][0]["arms"]["solo_claude"]["disqualifier"] = "false"
with open(path, "w", encoding="utf8") as fh:
    json.dump(summary, fh, indent=2)
PY
expect_fail_contains malformed-l1-dq-summary \
  "F1 L1 disqualifier malformed" \
  python3 "$GATE" --run-id "$MALFORMED_L1_DQ_SUMMARY_RUN"
255
+
256
# Case: write_summary with "6 90" emits six non-edge rows (F1..F5 plus F9) and an
# F8 edge row scored 90. The gate must fail with an L1 count of "6 of 6".
# NOTE(review): presumably this shows the edge row is not counted toward L1
# (inferred from the case label) - confirm against ship-gate.py.
+ write_summary "$RUN_PREFIX-edge-cannot-count-for-l1" 6 90
257
+ expect_fail_contains edge-cannot-count-for-l1 \
258
+ "L1: only 6 of 6 headroom-available fixtures" \
259
+ python3 "$GATE" --run-id "$RUN_PREFIX-edge-cannot-count-for-l1"
260
+
261
# Case: overwrite the baseline file with one that records the F8 edge fixture at
# variant_score 90, then gate a run whose F8 score is 80. The gate is expected
# to pass and to print the known-limit margin note checked below.
# The baseline written here is put back (or removed) by the EXIT trap.
+ mkdir -p "$(dirname "$BASELINE")"
262
+ cat > "$BASELINE" <<'JSON'
263
+ {
264
+ "margin_avg": 5,
265
+ "rows": [
266
+ {"fixture": "F8-known-limit-ambiguous", "category": "edge", "variant_score": 90, "margin": 5}
267
+ ]
268
+ }
269
+ JSON
270
+ write_summary "$RUN_PREFIX-edge-regression-excluded" 7 80
271
+ expect_pass edge-regression-excluded \
272
+ python3 "$GATE" --run-id "$RUN_PREFIX-edge-regression-excluded"
273
# Bare grep under `set -e`: a missing line aborts the script with grep's exit
# status and prints no diagnostic of its own.
+ grep -Fq "known-limit margin +5 outside expected [-3,+3] range" "$TMP_DIR/edge-regression-excluded.out"
274
+
275
# Case: build an on-disk run with one fixture (F9) and three arms, then run the
# report compiler over it. Compiler output is captured for the assertions that follow.
+ AXIS_RUN="$RUN_PREFIX-axis-invalid"
275
+ AXIS_DIR="$BENCH_ROOT/results/$AXIS_RUN/F9-e2e-ideate-to-resolve"
276
+ mkdir -p "$AXIS_DIR"/{solo_claude,bare,variant}
277
# Every arm gets the same passing result.json with a 10-second elapsed time.
+ for arm in solo_claude bare variant; do
278
+ cat > "$AXIS_DIR/$arm/result.json" <<'JSON'
279
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
280
+ JSON
281
+ done
282
# judge.json records one out-of-range axis cell (value 26, range [0, 25]) in
# a_breakdown; _blind_mapping assigns letter A to the solo_claude arm.
+ cat > "$AXIS_DIR/judge.json" <<'JSON'
283
+ {
284
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
285
+ "_axis_validation": {
286
+ "out_of_range_count": 1,
287
+ "out_of_range_cells": [{"breakdown": "a_breakdown", "axis": "quality", "value": 26}],
288
+ "axis_range": [0, 25]
289
+ },
290
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
291
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
292
+ "winner_arm": "variant",
293
+ "disqualifiers_by_arm": {
294
+ "solo_claude": {"disqualifier": false},
295
+ "bare": {"disqualifier": false},
296
+ "variant": {"disqualifier": false}
297
+ }
298
+ }
299
+ JSON
300
# Not wrapped in expect_pass: under `set -e` a compiler failure aborts the script here.
+ python3 "$COMPILE" --run-id "$AXIS_RUN" > "$TMP_DIR/axis-compile.out" 2>&1
302
# Assertions on the compiled report text. Each grep is a fixed-string match, and
# under `set -e` the first one that does not match aborts the script.
# Checked in order: table header, the F9 data row, the margin-count line, the
# three pairwise margin averages, and the variant/solo_claude wall ratio.
+ grep -Fq '| Fixture | Category | variant (L2) | solo_claude (L1) | bare (L0) | variant-bare | solo_claude-bare | variant-solo_claude | Winner | Wall variant/solo_claude/bare | Wall variant/solo_claude | Wall variant/bare |' \
302
+ "$TMP_DIR/axis-compile.out"
303
+ grep -Fq '| F9-e2e-ideate-to-resolve | e2e | 70 | 60 | 50 | +20 | +10 | +10 | variant | 10s/10s/10s | 1.0x | 1.0x |' \
304
+ "$TMP_DIR/axis-compile.out"
305
+ grep -Fq '**Fixtures with margin ≥ +5:** 1 / 1 (gate: ≥ 7)' "$TMP_DIR/axis-compile.out"
306
+ grep -Fq '**variant (L2) vs bare (L0) margin avg:** +20.0' "$TMP_DIR/axis-compile.out"
307
+ grep -Fq '**solo_claude (L1) vs bare (L0) margin avg:** +10.0' "$TMP_DIR/axis-compile.out"
308
+ grep -Fq '**variant (L2) vs solo_claude (L1) margin avg:** +10.0' "$TMP_DIR/axis-compile.out"
309
+ grep -Fq '**Wall ratio variant (L2) / solo_claude (L1):** 1.0x' "$TMP_DIR/axis-compile.out"
310
# Negative check: the report must NOT contain the older "7 of 9" gate wording.
+ if grep -Fq 'gate: ≥ 7 of 9' "$TMP_DIR/axis-compile.out"; then
311
+ echo "compile report must not use stale 7-of-9 gate wording" >&2
312
+ cat "$TMP_DIR/axis-compile.out" >&2
313
+ exit 1
314
+ fi
316
# Verify the summary.json written for the axis-invalid run: the first row's
# solo_claude arm must carry an out-of-range axis count of exactly 1.
# A mismatch exits non-zero, which aborts the script under `set -e`.
python3 - "$BENCH_ROOT/results/$AXIS_RUN/summary.json" <<'PY'
import json
import sys

# The context manager closes the handle instead of leaving it to finalization.
with open(sys.argv[1], encoding="utf8") as fh:
    summary = json.load(fh)
row = summary["rows"][0]
count = row["arms"]["solo_claude"].get("_axis_validation_out_of_range_count")
if count != 1:
    raise SystemExit(f"expected solo_claude axis count 1, got {count!r}")
PY
326
+ NO_JUDGE_RUN="$RUN_PREFIX-no-judge-row"
327
+ NO_JUDGE_DIR="$BENCH_ROOT/results/$NO_JUDGE_RUN/F0-no-judge"
328
+ mkdir -p "$NO_JUDGE_DIR"
329
+ python3 "$COMPILE" --run-id "$NO_JUDGE_RUN" > "$TMP_DIR/no-judge-compile.out" 2>&1
330
+ grep -Fq '| F0-no-judge | — | — | — | — | — | — | — | NO_JUDGE | — | — | — |' \
331
+ "$TMP_DIR/no-judge-compile.out"
332
+
333
+ expect_fail_contains axis-invalid-through-compile \
334
+ "L1 axis-invalid: 1 fixture(s)" \
335
+ python3 "$GATE" --run-id "$AXIS_RUN" --accept-missing
336
+
337
+ VARIANT_AXIS_RUN="$RUN_PREFIX-variant-axis-invalid"
338
+ VARIANT_AXIS_DIR="$BENCH_ROOT/results/$VARIANT_AXIS_RUN/F9-e2e-ideate-to-resolve"
339
+ mkdir -p "$VARIANT_AXIS_DIR"/{solo_claude,bare,variant}
340
+ for arm in solo_claude bare variant; do
341
+ cat > "$VARIANT_AXIS_DIR/$arm/result.json" <<'JSON'
342
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
343
+ JSON
344
+ done
345
+ cat > "$VARIANT_AXIS_DIR/judge.json" <<'JSON'
346
+ {
347
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
348
+ "_axis_validation": {
349
+ "out_of_range_count": 1,
350
+ "out_of_range_cells": [{"breakdown": "c_breakdown", "axis": "quality", "value": 26}],
351
+ "axis_range": [0, 25]
352
+ },
353
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
354
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
355
+ "winner_arm": "variant",
356
+ "disqualifiers_by_arm": {
357
+ "solo_claude": {"disqualifier": false},
358
+ "bare": {"disqualifier": false},
359
+ "variant": {"disqualifier": false}
360
+ }
361
+ }
362
+ JSON
363
+ python3 "$COMPILE" --run-id "$VARIANT_AXIS_RUN" > "$TMP_DIR/variant-axis-compile.out" 2>&1
364
+ python3 - "$BENCH_ROOT/results/$VARIANT_AXIS_RUN/summary.json" <<'PY'
365
+ import json
366
+ import sys
367
+
368
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
369
+ row = summary["rows"][0]
370
+ count = row["arms"]["variant"].get("_axis_validation_out_of_range_count")
371
+ if count != 1:
372
+ raise SystemExit(f"expected variant axis count 1, got {count!r}")
373
+ PY
374
+ expect_fail_contains variant-axis-invalid-through-compile \
375
+ "variant axis-invalid: 1 fixture(s)" \
376
+ python3 "$GATE" --run-id "$VARIANT_AXIS_RUN" --accept-missing
377
+
378
+ BARE_AXIS_RUN="$RUN_PREFIX-bare-axis-invalid"
379
+ BARE_AXIS_DIR="$BENCH_ROOT/results/$BARE_AXIS_RUN/F9-e2e-ideate-to-resolve"
380
+ mkdir -p "$BARE_AXIS_DIR"/{solo_claude,bare,variant}
381
+ for arm in solo_claude bare variant; do
382
+ cat > "$BARE_AXIS_DIR/$arm/result.json" <<'JSON'
383
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
384
+ JSON
385
+ done
386
+ cat > "$BARE_AXIS_DIR/judge.json" <<'JSON'
387
+ {
388
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
389
+ "_axis_validation": {
390
+ "out_of_range_count": 1,
391
+ "out_of_range_cells": [{"breakdown": "b_breakdown", "axis": "quality", "value": 26}],
392
+ "axis_range": [0, 25]
393
+ },
394
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
395
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
396
+ "winner_arm": "variant",
397
+ "disqualifiers_by_arm": {
398
+ "solo_claude": {"disqualifier": false},
399
+ "bare": {"disqualifier": false},
400
+ "variant": {"disqualifier": false}
401
+ }
402
+ }
403
+ JSON
404
+ python3 "$COMPILE" --run-id "$BARE_AXIS_RUN" > "$TMP_DIR/bare-axis-compile.out" 2>&1
405
+ python3 - "$BENCH_ROOT/results/$BARE_AXIS_RUN/summary.json" <<'PY'
406
+ import json
407
+ import sys
408
+
409
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
410
+ row = summary["rows"][0]
411
+ count = row["arms"]["bare"].get("_axis_validation_out_of_range_count")
412
+ if count != 1:
413
+ raise SystemExit(f"expected bare axis count 1, got {count!r}")
414
+ PY
415
+ expect_fail_contains bare-axis-invalid-through-compile \
416
+ "bare axis-invalid: 1 fixture(s)" \
417
+ python3 "$GATE" --run-id "$BARE_AXIS_RUN" --accept-missing
418
+
419
+ UNMAPPED_AXIS_RUN="$RUN_PREFIX-unmapped-axis-invalid"
420
+ UNMAPPED_AXIS_DIR="$BENCH_ROOT/results/$UNMAPPED_AXIS_RUN/F9-e2e-ideate-to-resolve"
421
+ mkdir -p "$UNMAPPED_AXIS_DIR"/{solo_claude,bare,variant}
422
+ for arm in solo_claude bare variant; do
423
+ cat > "$UNMAPPED_AXIS_DIR/$arm/result.json" <<'JSON'
424
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
425
+ JSON
426
+ done
427
+ cat > "$UNMAPPED_AXIS_DIR/judge.json" <<'JSON'
428
+ {
429
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "l2_forced", "seed": 1},
430
+ "_axis_validation": {
431
+ "out_of_range_count": 1,
432
+ "out_of_range_cells": [{"breakdown": "c_breakdown", "axis": "quality", "value": 26}],
433
+ "axis_range": [0, 25]
434
+ },
435
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
436
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
437
+ "winner_arm": "variant",
438
+ "disqualifiers_by_arm": {
439
+ "solo_claude": {"disqualifier": false},
440
+ "bare": {"disqualifier": false},
441
+ "variant": {"disqualifier": false}
442
+ }
443
+ }
444
+ JSON
445
+ python3 "$COMPILE" --run-id "$UNMAPPED_AXIS_RUN" > "$TMP_DIR/unmapped-axis-compile.out" 2>&1
446
+ python3 - "$BENCH_ROOT/results/$UNMAPPED_AXIS_RUN/summary.json" <<'PY'
447
+ import json
448
+ import sys
449
+
450
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
451
+ row = summary["rows"][0]
452
+ count = row.get("_axis_validation_unmapped_out_of_range_count")
453
+ if count != 1:
454
+ raise SystemExit(f"expected unmapped axis count 1, got {count!r}")
455
+ PY
456
+ expect_fail_contains unmapped-axis-invalid-through-compile \
457
+ "judge axis-invalid unmapped: 1 fixture(s)" \
458
+ python3 "$GATE" --run-id "$UNMAPPED_AXIS_RUN" --accept-missing
459
+
460
+ VARIANT_MAPPING_RUN="$RUN_PREFIX-variant-mapping-missing"
461
+ VARIANT_MAPPING_DIR="$BENCH_ROOT/results/$VARIANT_MAPPING_RUN/F9-e2e-ideate-to-resolve"
462
+ mkdir -p "$VARIANT_MAPPING_DIR"/{solo_claude,bare,variant}
463
+ for arm in solo_claude bare variant; do
464
+ cat > "$VARIANT_MAPPING_DIR/$arm/result.json" <<'JSON'
465
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
466
+ JSON
467
+ done
468
+ cat > "$VARIANT_MAPPING_DIR/judge.json" <<'JSON'
469
+ {
470
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "l2_forced", "seed": 1},
471
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
472
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
473
+ "winner_arm": "variant",
474
+ "disqualifiers_by_arm": {
475
+ "solo_claude": {"disqualifier": false},
476
+ "bare": {"disqualifier": false},
477
+ "variant": {"disqualifier": false}
478
+ }
479
+ }
480
+ JSON
481
+ python3 "$COMPILE" --run-id "$VARIANT_MAPPING_RUN" > "$TMP_DIR/variant-mapping-compile.out" 2>&1
482
+ python3 - "$BENCH_ROOT/results/$VARIANT_MAPPING_RUN/summary.json" <<'PY'
483
+ import json
484
+ import sys
485
+
486
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
487
+ row = summary["rows"][0]
488
+ variant = row["arms"]["variant"]
489
+ if variant.get("blind_mapping_arm_missing") is not True:
490
+ raise SystemExit("expected variant score without blind mapping to be marked")
491
+ if row["variant_disqualifier"] is not True:
492
+ raise SystemExit("expected variant score without blind mapping to disqualify")
493
+ PY
494
+ expect_fail_contains variant-mapping-disqualifies \
495
+ "variant disqualifier(s)" \
496
+ python3 "$GATE" --run-id "$VARIANT_MAPPING_RUN" --accept-missing
497
+
498
+ SOLO_MAPPING_RUN="$RUN_PREFIX-solo-mapping-missing"
499
+ SOLO_MAPPING_DIR="$BENCH_ROOT/results/$SOLO_MAPPING_RUN/F9-e2e-ideate-to-resolve"
500
+ mkdir -p "$SOLO_MAPPING_DIR"/{solo_claude,bare,variant}
501
+ for arm in solo_claude bare variant; do
502
+ cat > "$SOLO_MAPPING_DIR/$arm/result.json" <<'JSON'
503
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
504
+ JSON
505
+ done
506
+ cat > "$SOLO_MAPPING_DIR/judge.json" <<'JSON'
507
+ {
508
+ "_blind_mapping": {"A": "variant", "B": "bare", "seed": 1},
509
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
510
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
511
+ "winner_arm": "solo_claude",
512
+ "disqualifiers_by_arm": {
513
+ "solo_claude": {"disqualifier": false},
514
+ "bare": {"disqualifier": false},
515
+ "variant": {"disqualifier": false}
516
+ }
517
+ }
518
+ JSON
519
+ python3 "$COMPILE" --run-id "$SOLO_MAPPING_RUN" > "$TMP_DIR/solo-mapping-compile.out" 2>&1
520
+ python3 - "$BENCH_ROOT/results/$SOLO_MAPPING_RUN/summary.json" <<'PY'
521
+ import json
522
+ import sys
523
+
524
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
525
+ row = summary["rows"][0]
526
+ solo = row["arms"]["solo_claude"]
527
+ if solo.get("blind_mapping_arm_missing") is not True:
528
+ raise SystemExit("expected solo score without blind mapping to be marked")
529
+ if solo.get("score") is not None:
530
+ raise SystemExit("solo score without blind mapping must not be displayed")
531
+ if solo.get("disqualifier") is not True:
532
+ raise SystemExit("expected solo score without blind mapping to disqualify")
533
+ if row["margins"].get("solo_over_bare") is not None:
534
+ raise SystemExit("solo_over_bare margin without solo blind mapping must be null")
535
+ if row["margins"].get("variant_over_solo") is not None:
536
+ raise SystemExit("variant_over_solo margin without solo blind mapping must be null")
537
+ if row["margins"].get("variant_over_bare") != 20:
538
+ raise SystemExit("variant_over_bare should remain available when variant and bare are mapped")
539
+ if row.get("winner") is not None:
540
+ raise SystemExit("winner without blind-mapped trusted score must be null")
541
+ PY
542
+ expect_fail_contains solo-mapping-disqualifies \
543
+ "L1 disqualifier(s): 1" \
544
+ python3 "$GATE" --run-id "$SOLO_MAPPING_RUN" --accept-missing
545
+
546
+ STALE_MARGIN_RUN="$RUN_PREFIX-stale-margin"
547
+ STALE_MARGIN_DIR="$BENCH_ROOT/results/$STALE_MARGIN_RUN/F9-e2e-ideate-to-resolve"
548
+ mkdir -p "$STALE_MARGIN_DIR"/{solo_claude,bare,variant}
549
+ for arm in solo_claude bare variant; do
550
+ cat > "$STALE_MARGIN_DIR/$arm/result.json" <<'JSON'
551
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
552
+ JSON
553
+ done
554
+ cat > "$STALE_MARGIN_DIR/judge.json" <<'JSON'
555
+ {
556
+ "_blind_mapping": {"A": "variant", "B": "bare", "C": "solo_claude", "seed": 1},
557
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
558
+ "margins": {"solo_over_bare": 999, "variant_over_bare": 888, "variant_over_solo": 777},
559
+ "winner_arm": "variant",
560
+ "disqualifiers_by_arm": {
561
+ "solo_claude": {"disqualifier": false},
562
+ "bare": {"disqualifier": false},
563
+ "variant": {"disqualifier": false}
564
+ }
565
+ }
566
+ JSON
567
+ python3 "$COMPILE" --run-id "$STALE_MARGIN_RUN" > "$TMP_DIR/stale-margin-compile.out" 2>&1
568
+ python3 - "$BENCH_ROOT/results/$STALE_MARGIN_RUN/summary.json" <<'PY'
569
+ import json
570
+ import sys
571
+
572
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
573
+ row = summary["rows"][0]
574
+ expected = {
575
+ "solo_over_bare": 10,
576
+ "variant_over_bare": 20,
577
+ "variant_over_solo": 10,
578
+ }
579
+ if row["margins"] != expected:
580
+ raise SystemExit(f"stale judge margins must be recomputed from trusted scores: {row['margins']}")
581
+ if row["margin"] != 20:
582
+ raise SystemExit("legacy margin must also be recomputed from trusted scores")
583
+ if summary["margins_avg"] != expected:
584
+ raise SystemExit(f"summary margin averages must use recomputed margins: {summary['margins_avg']}")
585
+ PY
586
+
587
+ MALFORMED_SCORES_RUN="$RUN_PREFIX-malformed-scores"
588
+ MALFORMED_SCORES_DIR="$BENCH_ROOT/results/$MALFORMED_SCORES_RUN/F9-e2e-ideate-to-resolve"
589
+ mkdir -p "$MALFORMED_SCORES_DIR"/{solo_claude,bare,variant}
590
+ for arm in solo_claude bare variant; do
591
+ cat > "$MALFORMED_SCORES_DIR/$arm/result.json" <<'JSON'
592
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
593
+ JSON
594
+ done
595
+ cat > "$MALFORMED_SCORES_DIR/judge.json" <<'JSON'
596
+ {
597
+ "_blind_mapping": {"A": "variant", "B": "bare", "C": "solo_claude", "seed": 1},
598
+ "scores_by_arm": ["not", "a", "dict"],
599
+ "winner_arm": "variant",
600
+ "disqualifiers_by_arm": {
601
+ "solo_claude": {"disqualifier": false},
602
+ "bare": {"disqualifier": false},
603
+ "variant": {"disqualifier": false}
604
+ }
605
+ }
606
+ JSON
607
+ python3 "$COMPILE" --run-id "$MALFORMED_SCORES_RUN" > "$TMP_DIR/malformed-scores-compile.out" 2>&1
608
+ python3 - "$BENCH_ROOT/results/$MALFORMED_SCORES_RUN/summary.json" <<'PY'
609
+ import json
610
+ import sys
611
+
612
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
613
+ row = summary["rows"][0]
614
+ for arm in ("variant", "bare", "solo_claude"):
615
+ if row["arms"][arm]["score"] is not None:
616
+ raise SystemExit(f"malformed scores_by_arm must not expose {arm} score")
617
+ if any(value is not None for value in row["margins"].values()):
618
+ raise SystemExit(f"malformed scores_by_arm must null margins: {row['margins']}")
619
+ if row.get("winner") is not None:
620
+ raise SystemExit("winner without trusted score must be null")
621
+ PY
622
+
623
+ OVERRANGE_SCORES_RUN="$RUN_PREFIX-overrange-scores"
624
+ OVERRANGE_SCORES_DIR="$BENCH_ROOT/results/$OVERRANGE_SCORES_RUN/F9-e2e-ideate-to-resolve"
625
+ mkdir -p "$OVERRANGE_SCORES_DIR"/{solo_claude,bare,variant}
626
+ for arm in solo_claude bare variant; do
627
+ cat > "$OVERRANGE_SCORES_DIR/$arm/result.json" <<'JSON'
628
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
629
+ JSON
630
+ done
631
+ cat > "$OVERRANGE_SCORES_DIR/judge.json" <<'JSON'
632
+ {
633
+ "_blind_mapping": {"A": "variant", "B": "bare", "C": "solo_claude", "seed": 1},
634
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 101},
635
+ "variant_score": 101,
636
+ "winner_arm": "variant",
637
+ "disqualifiers_by_arm": {
638
+ "solo_claude": {"disqualifier": false},
639
+ "bare": {"disqualifier": false},
640
+ "variant": {"disqualifier": false}
641
+ }
642
+ }
643
+ JSON
644
+ python3 "$COMPILE" --run-id "$OVERRANGE_SCORES_RUN" > "$TMP_DIR/overrange-scores-compile.out" 2>&1
645
+ python3 - "$BENCH_ROOT/results/$OVERRANGE_SCORES_RUN/summary.json" <<'PY'
646
+ import json
647
+ import sys
648
+
649
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
650
+ row = summary["rows"][0]
651
+ if row["arms"]["variant"]["score"] is not None:
652
+ raise SystemExit("out-of-range scores_by_arm must not expose variant score")
653
+ if row.get("variant_score") is not None:
654
+ raise SystemExit("legacy variant_score must also be null for out-of-range scores")
655
+ if row.get("winner") is not None:
656
+ raise SystemExit("winner without trusted score must be null for out-of-range scores")
657
+ if row["margins"]["variant_over_bare"] is not None:
658
+ raise SystemExit("out-of-range variant score must null dependent margins")
659
+ PY
660
+
661
+ BOOLEAN_SCORES_RUN="$RUN_PREFIX-boolean-scores"
662
+ BOOLEAN_SCORES_DIR="$BENCH_ROOT/results/$BOOLEAN_SCORES_RUN/F9-e2e-ideate-to-resolve"
663
+ mkdir -p "$BOOLEAN_SCORES_DIR"/{solo_claude,bare,variant}
664
+ for arm in solo_claude bare variant; do
665
+ cat > "$BOOLEAN_SCORES_DIR/$arm/result.json" <<'JSON'
666
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
667
+ JSON
668
+ done
669
+ cat > "$BOOLEAN_SCORES_DIR/judge.json" <<'JSON'
670
+ {
671
+ "_blind_mapping": {"A": "variant", "B": "bare", "C": "solo_claude", "seed": 1},
672
+ "scores_by_arm": {"solo_claude": true, "bare": 50, "variant": 70},
673
+ "winner_arm": "variant",
674
+ "disqualifiers_by_arm": {
675
+ "solo_claude": {"disqualifier": false},
676
+ "bare": {"disqualifier": false},
677
+ "variant": {"disqualifier": false}
678
+ }
679
+ }
680
+ JSON
681
+ python3 "$COMPILE" --run-id "$BOOLEAN_SCORES_RUN" > "$TMP_DIR/boolean-scores-compile.out" 2>&1
682
+ python3 - "$BENCH_ROOT/results/$BOOLEAN_SCORES_RUN/summary.json" <<'PY'
683
+ import json
684
+ import sys
685
+
686
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
687
+ row = summary["rows"][0]
688
+ if row["arms"]["solo_claude"]["score"] is not None:
689
+ raise SystemExit("boolean scores_by_arm must not expose solo score")
690
+ if row["margins"]["solo_over_bare"] is not None:
691
+ raise SystemExit("boolean solo score must null dependent margins")
692
+ PY
693
+
694
+ BOOLEAN_WALL_RUN="$RUN_PREFIX-boolean-wall"
695
+ BOOLEAN_WALL_DIR="$BENCH_ROOT/results/$BOOLEAN_WALL_RUN/F9-e2e-ideate-to-resolve"
696
+ mkdir -p "$BOOLEAN_WALL_DIR"/{solo_claude,bare,variant}
697
+ cat > "$BOOLEAN_WALL_DIR/variant/result.json" <<'JSON'
698
+ {"elapsed_seconds": true, "verify_score": true, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
699
+ JSON
700
+ for arm in solo_claude bare; do
701
+ cat > "$BOOLEAN_WALL_DIR/$arm/result.json" <<'JSON'
702
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
703
+ JSON
704
+ done
705
+ cat > "$BOOLEAN_WALL_DIR/judge.json" <<'JSON'
706
+ {
707
+ "_blind_mapping": {"A": "variant", "B": "bare", "C": "solo_claude", "seed": 1},
708
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
709
+ "winner_arm": "variant",
710
+ "disqualifiers_by_arm": {
711
+ "solo_claude": {"disqualifier": false},
712
+ "bare": {"disqualifier": false},
713
+ "variant": {"disqualifier": false}
714
+ }
715
+ }
716
+ JSON
717
+ python3 "$COMPILE" --run-id "$BOOLEAN_WALL_RUN" > "$TMP_DIR/boolean-wall-compile.out" 2>&1
718
+ python3 - "$BENCH_ROOT/results/$BOOLEAN_WALL_RUN/summary.json" <<'PY'
719
+ import json
720
+ import sys
721
+
722
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
723
+ row = summary["rows"][0]
724
+ variant = row["arms"]["variant"]
725
+ if variant["wall_s"] is not None or variant["verify_score"] is not None:
726
+ raise SystemExit("boolean result numeric fields must not appear in compile summary")
727
+ if row["wall_ratios"]["variant_over_bare"] is not None:
728
+ raise SystemExit("boolean wall time must null dependent wall ratios")
729
+ PY
730
+
731
+ MALFORMED_RESULT_BOOL_RUN="$RUN_PREFIX-malformed-result-bool"
732
+ MALFORMED_RESULT_BOOL_DIR="$BENCH_ROOT/results/$MALFORMED_RESULT_BOOL_RUN/F9-e2e-ideate-to-resolve"
733
+ mkdir -p "$MALFORMED_RESULT_BOOL_DIR"/{solo_claude,bare,variant}
734
+ cat > "$MALFORMED_RESULT_BOOL_DIR/variant/result.json" <<'JSON'
735
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "timed_out": "false", "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
736
+ JSON
737
+ for arm in solo_claude bare; do
738
+ cat > "$MALFORMED_RESULT_BOOL_DIR/$arm/result.json" <<'JSON'
739
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
740
+ JSON
741
+ done
742
+ cat > "$MALFORMED_RESULT_BOOL_DIR/judge.json" <<'JSON'
743
+ {
744
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
745
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
746
+ "winner_arm": "variant",
747
+ "disqualifiers_by_arm": {
748
+ "solo_claude": {"disqualifier": false},
749
+ "bare": {"disqualifier": false},
750
+ "variant": {"disqualifier": false}
751
+ }
752
+ }
753
+ JSON
754
+ python3 "$COMPILE" --run-id "$MALFORMED_RESULT_BOOL_RUN" > "$TMP_DIR/malformed-result-bool-compile.out" 2>&1
755
+ python3 - "$BENCH_ROOT/results/$MALFORMED_RESULT_BOOL_RUN/summary.json" <<'PY'
756
+ import json
757
+ import sys
758
+
759
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
760
+ variant = summary["rows"][0]["arms"]["variant"]
761
+ if variant["timed_out"] is not False:
762
+ raise SystemExit("malformed timed_out must not become exact true")
763
+ if variant["malformed_boolean_fields"] != ["timed_out"]:
764
+ raise SystemExit("malformed timed_out must be recorded")
765
+ if variant["dq_deterministic"] is not True or variant["disqualifier"] is not True:
766
+ raise SystemExit("malformed boolean artifact must disqualify deterministically")
767
+ PY
768
+
769
+ MALFORMED_JUDGE_BOOL_RUN="$RUN_PREFIX-malformed-judge-bool"
770
+ MALFORMED_JUDGE_BOOL_DIR="$BENCH_ROOT/results/$MALFORMED_JUDGE_BOOL_RUN/F9-e2e-ideate-to-resolve"
771
+ mkdir -p "$MALFORMED_JUDGE_BOOL_DIR"/{solo_claude,bare,variant}
772
+ for arm in solo_claude bare variant; do
773
+ cat > "$MALFORMED_JUDGE_BOOL_DIR/$arm/result.json" <<'JSON'
774
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
775
+ JSON
776
+ done
777
+ cat > "$MALFORMED_JUDGE_BOOL_DIR/judge.json" <<'JSON'
778
+ {
779
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
780
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
781
+ "winner_arm": "variant",
782
+ "disqualifiers_by_arm": {
783
+ "solo_claude": {"disqualifier": false},
784
+ "bare": {"disqualifier": false},
785
+ "variant": {"disqualifier": "false"}
786
+ }
787
+ }
788
+ JSON
789
+ python3 "$COMPILE" --run-id "$MALFORMED_JUDGE_BOOL_RUN" > "$TMP_DIR/malformed-judge-bool-compile.out" 2>&1
790
+ python3 - "$BENCH_ROOT/results/$MALFORMED_JUDGE_BOOL_RUN/summary.json" <<'PY'
791
+ import json
792
+ import sys
793
+
794
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
795
+ variant = summary["rows"][0]["arms"]["variant"]
796
+ if variant["dq_judge_malformed"] is not True:
797
+ raise SystemExit("malformed judge disqualifier must be recorded")
798
+ if variant["dq_judge"] is not True or variant["disqualifier"] is not True:
799
+ raise SystemExit("malformed judge disqualifier must fail closed")
800
+ PY
801
+
802
+ MALFORMED_MAPPING_RUN="$RUN_PREFIX-malformed-mapping"
803
+ MALFORMED_MAPPING_DIR="$BENCH_ROOT/results/$MALFORMED_MAPPING_RUN/F9-e2e-ideate-to-resolve"
804
+ mkdir -p "$MALFORMED_MAPPING_DIR"/{solo_claude,bare,variant}
805
+ for arm in solo_claude bare variant; do
806
+ cat > "$MALFORMED_MAPPING_DIR/$arm/result.json" <<'JSON'
807
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
808
+ JSON
809
+ done
810
+ cat > "$MALFORMED_MAPPING_DIR/judge.json" <<'JSON'
811
+ {
812
+ "_blind_mapping": "not-a-dict",
813
+ "_axis_validation": {
814
+ "out_of_range_count": 1,
815
+ "out_of_range_cells": [{"breakdown": "c_breakdown", "axis": "quality", "value": 26}],
816
+ "axis_range": [0, 25]
817
+ },
818
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
819
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
820
+ "winner_arm": "variant",
821
+ "disqualifiers_by_arm": {
822
+ "solo_claude": {"disqualifier": false},
823
+ "bare": {"disqualifier": false},
824
+ "variant": {"disqualifier": false}
825
+ }
826
+ }
827
+ JSON
828
+ python3 "$COMPILE" --run-id "$MALFORMED_MAPPING_RUN" > "$TMP_DIR/malformed-mapping-compile.out" 2>&1
829
+ python3 - "$BENCH_ROOT/results/$MALFORMED_MAPPING_RUN/summary.json" <<'PY'
830
+ import json
831
+ import sys
832
+
833
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
834
+ row = summary["rows"][0]
835
+ if row.get("_axis_validation_unmapped_out_of_range_count") != 1:
836
+ raise SystemExit("expected malformed mapping axis cell to be unmapped")
837
+ for arm in ("solo_claude", "bare", "variant"):
838
+ payload = row["arms"][arm]
839
+ if payload.get("blind_mapping_arm_missing") is not True:
840
+ raise SystemExit(f"expected {arm} score without dict blind mapping to be marked")
841
+ if payload.get("score") is not None:
842
+ raise SystemExit(f"{arm} score without dict blind mapping must not be displayed")
843
+ if payload.get("disqualifier") is not True:
844
+ raise SystemExit(f"expected {arm} score without dict blind mapping to disqualify")
845
+ for key, value in row["margins"].items():
846
+ if value is not None:
847
+ raise SystemExit(f"{key} without dict blind mapping must be null")
848
+ PY
849
+ expect_fail_contains malformed-mapping-disqualifies \
850
+ "judge axis-invalid unmapped: 1 fixture(s)" \
851
+ python3 "$GATE" --run-id "$MALFORMED_MAPPING_RUN" --accept-missing
852
+
853
+ VARIANT_TIMEOUT_RUN="$RUN_PREFIX-variant-timeout"
854
+ VARIANT_TIMEOUT_DIR="$BENCH_ROOT/results/$VARIANT_TIMEOUT_RUN/F9-e2e-ideate-to-resolve"
855
+ mkdir -p "$VARIANT_TIMEOUT_DIR"/{solo_claude,bare,variant}
856
+ cat > "$VARIANT_TIMEOUT_DIR/variant/result.json" <<'JSON'
857
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "timed_out": true, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
858
+ JSON
859
+ for arm in solo_claude bare; do
860
+ cat > "$VARIANT_TIMEOUT_DIR/$arm/result.json" <<'JSON'
861
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
862
+ JSON
863
+ done
864
+ cat > "$VARIANT_TIMEOUT_DIR/judge.json" <<'JSON'
865
+ {
866
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
867
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
868
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
869
+ "winner_arm": "variant",
870
+ "disqualifiers_by_arm": {
871
+ "solo_claude": {"disqualifier": false},
872
+ "bare": {"disqualifier": false},
873
+ "variant": {"disqualifier": false}
874
+ }
875
+ }
876
+ JSON
877
+ python3 "$COMPILE" --run-id "$VARIANT_TIMEOUT_RUN" > "$TMP_DIR/variant-timeout-compile.out" 2>&1
878
+ python3 - "$BENCH_ROOT/results/$VARIANT_TIMEOUT_RUN/summary.json" <<'PY'
879
+ import json
880
+ import sys
881
+
882
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
883
+ row = summary["rows"][0]
884
+ if row["variant_disqualifier"] is not True:
885
+ raise SystemExit("expected variant timeout to become variant_disqualifier")
886
+ PY
887
+ expect_fail_contains variant-timeout-disqualifies \
888
+ "variant disqualifier(s)" \
889
+ python3 "$GATE" --run-id "$VARIANT_TIMEOUT_RUN" --accept-missing
890
+
891
+ VARIANT_VERIFY_SCORE_RUN="$RUN_PREFIX-variant-verify-score"
892
+ VARIANT_VERIFY_SCORE_DIR="$BENCH_ROOT/results/$VARIANT_VERIFY_SCORE_RUN/F9-e2e-ideate-to-resolve"
893
+ mkdir -p "$VARIANT_VERIFY_SCORE_DIR"/{solo_claude,bare,variant}
894
+ cat > "$VARIANT_VERIFY_SCORE_DIR/variant/result.json" <<'JSON'
895
+ {"elapsed_seconds": 10, "verify_score": 0.75, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
896
+ JSON
897
+ for arm in solo_claude bare; do
898
+ cat > "$VARIANT_VERIFY_SCORE_DIR/$arm/result.json" <<'JSON'
899
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
900
+ JSON
901
+ done
902
+ cat > "$VARIANT_VERIFY_SCORE_DIR/judge.json" <<'JSON'
903
+ {
904
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
905
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
906
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
907
+ "winner_arm": "variant",
908
+ "disqualifiers_by_arm": {
909
+ "solo_claude": {"disqualifier": false},
910
+ "bare": {"disqualifier": false},
911
+ "variant": {"disqualifier": false}
912
+ }
913
+ }
914
+ JSON
915
+ python3 "$COMPILE" --run-id "$VARIANT_VERIFY_SCORE_RUN" > "$TMP_DIR/variant-verify-score-compile.out" 2>&1
916
+ python3 - "$BENCH_ROOT/results/$VARIANT_VERIFY_SCORE_RUN/summary.json" <<'PY'
917
+ import json
918
+ import sys
919
+
920
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
921
+ row = summary["rows"][0]
922
+ if row["variant_disqualifier"] is not True:
923
+ raise SystemExit("expected variant verify_score < 1.0 to become variant_disqualifier")
924
+ PY
925
+ expect_fail_contains variant-verify-score-disqualifies \
926
+ "variant disqualifier(s)" \
927
+ python3 "$GATE" --run-id "$VARIANT_VERIFY_SCORE_RUN" --accept-missing
928
+
929
+ VARIANT_VERDICT_RUN="$RUN_PREFIX-variant-verdict"
930
+ VARIANT_VERDICT_DIR="$BENCH_ROOT/results/$VARIANT_VERDICT_RUN/F9-e2e-ideate-to-resolve"
931
+ mkdir -p "$VARIANT_VERDICT_DIR"/{solo_claude,bare,variant}
932
+ cat > "$VARIANT_VERDICT_DIR/variant/result.json" <<'JSON'
933
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "BLOCKED:probe-derive-malformed", "verify_verdict": "BLOCKED"}
934
+ JSON
935
+ for arm in solo_claude bare; do
936
+ cat > "$VARIANT_VERDICT_DIR/$arm/result.json" <<'JSON'
937
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
938
+ JSON
939
+ done
940
+ cat > "$VARIANT_VERDICT_DIR/judge.json" <<'JSON'
941
+ {
942
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
943
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
944
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
945
+ "winner_arm": "variant",
946
+ "disqualifiers_by_arm": {
947
+ "solo_claude": {"disqualifier": false},
948
+ "bare": {"disqualifier": false},
949
+ "variant": {"disqualifier": false}
950
+ }
951
+ }
952
+ JSON
953
+ python3 "$COMPILE" --run-id "$VARIANT_VERDICT_RUN" > "$TMP_DIR/variant-verdict-compile.out" 2>&1
954
+ python3 - "$BENCH_ROOT/results/$VARIANT_VERDICT_RUN/summary.json" <<'PY'
955
+ import json
956
+ import sys
957
+
958
+ summary = json.load(open(sys.argv[1], encoding="utf8"))
959
+ row = summary["rows"][0]
960
+ if row["variant_disqualifier"] is not True:
961
+ raise SystemExit("expected variant blocked verdict to become variant_disqualifier")
962
+ PY
963
+ expect_fail_contains variant-verdict-disqualifies \
964
+ "variant disqualifier(s)" \
965
+ python3 "$GATE" --run-id "$VARIANT_VERDICT_RUN" --accept-missing
966
+
967
+ SOLO_INVOKE_RUN="$RUN_PREFIX-solo-invoke-failure"  # Scenario: solo_claude's result.json sets invoke_failure=true while its own disqualifier flag is false
968
+ SOLO_INVOKE_DIR="$BENCH_ROOT/results/$SOLO_INVOKE_RUN/F9-e2e-ideate-to-resolve"
969
+ mkdir -p "$SOLO_INVOKE_DIR"/{solo_claude,bare,variant}  # one directory per arm
970
+ cat > "$SOLO_INVOKE_DIR/solo_claude/result.json" <<'JSON'
971
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "invoke_failure": true, "invoke_failure_reason": "provider_limit", "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
972
+ JSON
973
+ for arm in variant bare; do  # the other two arms get a plain passing result with no invoke_failure field
974
+ cat > "$SOLO_INVOKE_DIR/$arm/result.json" <<'JSON'
975
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
976
+ JSON
977
+ done
978
+ cat > "$SOLO_INVOKE_DIR/judge.json" <<'JSON'
979
+ {
980
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
981
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
982
+ "margins": {"solo_over_bare": 10, "variant_over_bare": 20, "variant_over_solo": 10},
983
+ "winner_arm": "variant",
984
+ "disqualifiers_by_arm": {
985
+ "solo_claude": {"disqualifier": false},
986
+ "bare": {"disqualifier": false},
987
+ "variant": {"disqualifier": false}
988
+ }
989
+ }
990
+ JSON
991
+ python3 "$COMPILE" --run-id "$SOLO_INVOKE_RUN" > "$TMP_DIR/solo-invoke-compile.out" 2>&1  # compile this run; stdout and stderr are captured to a temp file
992
+ python3 - "$BENCH_ROOT/results/$SOLO_INVOKE_RUN/summary.json" <<'PY'
993
+ import json
994
+ import sys
995
+
996
+ summary = json.load(open(sys.argv[1], encoding="utf8"))  # argv[1] is the summary.json path passed on the python3 command line
997
+ row = summary["rows"][0]  # only the first row of the summary is inspected
998
+ if row["arms"]["solo_claude"].get("disqualifier") is not True:  # the summary must flag the arm even though neither result.json nor judge.json marked it disqualified
999
+ raise SystemExit("expected solo invoke_failure to become arm disqualifier")
1000
+ PY
1001
+ expect_fail_contains solo-invoke-disqualifies \
1002
+ "L1 disqualifier(s): 1" \
1003
+ python3 "$GATE" --run-id "$SOLO_INVOKE_RUN" --accept-missing  # NOTE(review): expect_fail_contains is defined outside this view; presumably it requires the command to fail with output containing the quoted text - confirm
1004
+
1005
+ MALFORMED_FINDINGS_RUN="$RUN_PREFIX-malformed-findings"  # Scenario: judge.json findings_by_arm holds a bare string for variant and a proper list for solo_claude
1006
+ MALFORMED_FINDINGS_DIR="$BENCH_ROOT/results/$MALFORMED_FINDINGS_RUN/F9-e2e-ideate-to-resolve"
1007
+ mkdir -p "$MALFORMED_FINDINGS_DIR"/{solo_claude,bare,variant}  # one directory per arm
1008
+ for arm in solo_claude bare variant; do  # all three arms get the same plain passing result
1009
+ cat > "$MALFORMED_FINDINGS_DIR/$arm/result.json" <<'JSON'
1010
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
1011
+ JSON
1012
+ done
1013
+ cat > "$MALFORMED_FINDINGS_DIR/judge.json" <<'JSON'
1014
+ {
1015
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
1016
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
1017
+ "winner_arm": "variant",
1018
+ "findings_by_arm": {
1019
+ "variant": "single finding string",
1020
+ "solo_claude": ["structured finding"]
1021
+ },
1022
+ "disqualifiers_by_arm": {
1023
+ "solo_claude": {"disqualifier": false},
1024
+ "bare": {"disqualifier": false},
1025
+ "variant": {"disqualifier": false}
1026
+ }
1027
+ }
1028
+ JSON
1029
+ python3 "$COMPILE" --run-id "$MALFORMED_FINDINGS_RUN" > "$TMP_DIR/malformed-findings-compile.out" 2>&1  # compile this run; stdout and stderr are captured to a temp file
1030
+ python3 - "$BENCH_ROOT/results/$MALFORMED_FINDINGS_RUN/summary.json" <<'PY'
1031
+ import json
1032
+ import sys
1033
+
1034
+ summary = json.load(open(sys.argv[1], encoding="utf8"))  # argv[1] is the summary.json path passed on the python3 command line
1035
+ row = summary["rows"][0]  # only the first row of the summary is inspected
1036
+ if row["arms"]["variant"]["critical_findings"] != ["single finding string"]:  # the bare string must come out wrapped in a one-element list
1037
+ raise SystemExit("non-list finding entry must become a one-item list")
1038
+ if row["arms"]["solo_claude"]["critical_findings"] != ["structured finding"]:  # an entry that was already a list must come out unchanged
1039
+ raise SystemExit("list finding entry must be preserved")
1040
+ PY
1041
+ grep -Fq '**variant (L2):**' "$BENCH_ROOT/results/$MALFORMED_FINDINGS_RUN/report.md"  # -F matches the text literally (the asterisks are not a pattern), -q prints nothing; only the exit status matters
1042
+ grep -Fq '**solo_claude (L1):**' "$BENCH_ROOT/results/$MALFORMED_FINDINGS_RUN/report.md"
1043
+ grep -Fq -- '- single finding string' "$BENCH_ROOT/results/$MALFORMED_FINDINGS_RUN/report.md"  # "--" ends option parsing so the pattern's leading "-" is not read as a flag
1044
+
1045
+ MALFORMED_FINDINGS_MAP_RUN="$RUN_PREFIX-malformed-findings-map"  # Scenario: judge.json findings_by_arm is a JSON array instead of an object keyed by arm
1046
+ MALFORMED_FINDINGS_MAP_DIR="$BENCH_ROOT/results/$MALFORMED_FINDINGS_MAP_RUN/F9-e2e-ideate-to-resolve"
1047
+ mkdir -p "$MALFORMED_FINDINGS_MAP_DIR"/{solo_claude,bare,variant}  # one directory per arm
1048
+ for arm in solo_claude bare variant; do  # all three arms get the same plain passing result
1049
+ cat > "$MALFORMED_FINDINGS_MAP_DIR/$arm/result.json" <<'JSON'
1050
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
1051
+ JSON
1052
+ done
1053
+ cat > "$MALFORMED_FINDINGS_MAP_DIR/judge.json" <<'JSON'
1054
+ {
1055
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
1056
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
1057
+ "winner_arm": "variant",
1058
+ "findings_by_arm": ["not", "a", "dict"],
1059
+ "disqualifiers_by_arm": {
1060
+ "solo_claude": {"disqualifier": false},
1061
+ "bare": {"disqualifier": false},
1062
+ "variant": {"disqualifier": false}
1063
+ }
1064
+ }
1065
+ JSON
1066
+ python3 "$COMPILE" --run-id "$MALFORMED_FINDINGS_MAP_RUN" > "$TMP_DIR/malformed-findings-map-compile.out" 2>&1  # compile this run; stdout and stderr are captured to a temp file
1067
+ python3 - "$BENCH_ROOT/results/$MALFORMED_FINDINGS_MAP_RUN/summary.json" <<'PY'
1068
+ import json
1069
+ import sys
1070
+
1071
+ summary = json.load(open(sys.argv[1], encoding="utf8"))  # argv[1] is the summary.json path passed on the python3 command line
1072
+ row = summary["rows"][0]  # only the first row of the summary is inspected
1073
+ for arm in ("variant", "solo_claude", "bare"):
1074
+ if row["arms"][arm]["critical_findings"]:  # any truthy value fails; the key itself must exist or this lookup raises KeyError
1075
+ raise SystemExit("non-dict findings_by_arm must be ignored")
1076
+ PY
1077
+
1078
+ MALFORMED_AXIS_WRAPPER_RUN="$RUN_PREFIX-malformed-axis-wrapper"  # Scenario: judge.json _axis_validation is a JSON array instead of an object
1079
+ MALFORMED_AXIS_WRAPPER_DIR="$BENCH_ROOT/results/$MALFORMED_AXIS_WRAPPER_RUN/F9-e2e-ideate-to-resolve"
1080
+ mkdir -p "$MALFORMED_AXIS_WRAPPER_DIR"/{solo_claude,bare,variant}  # one directory per arm
1081
+ for arm in solo_claude bare variant; do  # all three arms get the same plain passing result
1082
+ cat > "$MALFORMED_AXIS_WRAPPER_DIR/$arm/result.json" <<'JSON'
1083
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
1084
+ JSON
1085
+ done
1086
+ cat > "$MALFORMED_AXIS_WRAPPER_DIR/judge.json" <<'JSON'
1087
+ {
1088
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
1089
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
1090
+ "winner_arm": "variant",
1091
+ "_axis_validation": ["not", "a", "dict"],
1092
+ "disqualifiers_by_arm": {
1093
+ "solo_claude": {"disqualifier": false},
1094
+ "bare": {"disqualifier": false},
1095
+ "variant": {"disqualifier": false}
1096
+ }
1097
+ }
1098
+ JSON
1099
+ python3 "$COMPILE" --run-id "$MALFORMED_AXIS_WRAPPER_RUN" > "$TMP_DIR/malformed-axis-wrapper-compile.out" 2>&1  # compile this run; stdout and stderr are captured to a temp file
1100
+ python3 - "$BENCH_ROOT/results/$MALFORMED_AXIS_WRAPPER_RUN/summary.json" <<'PY'
1101
+ import json
1102
+ import sys
1103
+
1104
+ summary = json.load(open(sys.argv[1], encoding="utf8"))  # argv[1] is the summary.json path passed on the python3 command line
1105
+ row = summary["rows"][0]  # only the first row of the summary is inspected
1106
+ if row.get("_axis_validation_unmapped_out_of_range_count") != 0:  # a missing key yields None from .get, which also fails this check
1107
+ raise SystemExit("non-dict _axis_validation wrapper must not crash or invent invalid cells")
1108
+ for arm in ("variant", "solo_claude", "bare"):
1109
+ if row["arms"][arm]["_axis_validation_out_of_range_count"] != 0:  # the per-arm counter must be present and exactly zero
1110
+ raise SystemExit("non-dict _axis_validation wrapper must not mark arm axis invalid")
1111
+ PY
1112
+
1113
+ MALFORMED_RESULT_RUN="$RUN_PREFIX-malformed-result-artifact"  # Scenario: the variant arm's result.json is a JSON array rather than an object
1114
+ MALFORMED_RESULT_DIR="$BENCH_ROOT/results/$MALFORMED_RESULT_RUN/F9-e2e-ideate-to-resolve"
1115
+ mkdir -p "$MALFORMED_RESULT_DIR"/{solo_claude,bare,variant}  # one directory per arm
1116
+ for arm in solo_claude bare; do  # only these two arms get a well-formed passing result
1117
+ cat > "$MALFORMED_RESULT_DIR/$arm/result.json" <<'JSON'
1118
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
1119
+ JSON
1120
+ done
1121
+ printf '["not", "a", "dict"]\n' > "$MALFORMED_RESULT_DIR/variant/result.json"  # syntactically valid JSON, but an array where the other arms have an object
1122
+ cat > "$MALFORMED_RESULT_DIR/judge.json" <<'JSON'
1123
+ {
1124
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
1125
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
1126
+ "winner_arm": "variant",
1127
+ "disqualifiers_by_arm": {
1128
+ "solo_claude": {"disqualifier": false},
1129
+ "bare": {"disqualifier": false},
1130
+ "variant": {"disqualifier": false}
1131
+ }
1132
+ }
1133
+ JSON
1134
+ python3 "$COMPILE" --run-id "$MALFORMED_RESULT_RUN" > "$TMP_DIR/malformed-result-compile.out" 2>&1  # compile this run; stdout and stderr are captured to a temp file
1135
+ python3 - "$BENCH_ROOT/results/$MALFORMED_RESULT_RUN/summary.json" <<'PY'
1136
+ import json
1137
+ import sys
1138
+
1139
+ summary = json.load(open(sys.argv[1], encoding="utf8"))  # argv[1] is the summary.json path passed on the python3 command line
1140
+ row = summary["rows"][0]  # only the first row of the summary is inspected
1141
+ if row["variant_disqualifier"] is not True:  # strict identity: only a JSON true passes, a merely truthy value does not
1142
+ raise SystemExit("non-dict variant result.json must fail closed as a disqualifier")
1143
+ if row["arms"]["variant"].get("wall_s") is not None:  # wall_s must be absent or null for the malformed arm
1144
+ raise SystemExit("non-dict variant result.json must not expose timing fields")
1145
+ PY
1146
+ expect_fail_contains malformed-result-artifact-disqualifies \
1147
+ "variant disqualifier(s)" \
1148
+ python3 "$GATE" --run-id "$MALFORMED_RESULT_RUN" --accept-missing  # NOTE(review): expect_fail_contains is defined outside this view; presumably it requires the command to fail with output containing the quoted text - confirm
1149
+
1150
+ NAN_RESULT_RUN="$RUN_PREFIX-nan-result-artifact"  # Scenario: the variant arm's result.json carries NaN for elapsed_seconds and verify_score
1151
+ NAN_RESULT_DIR="$BENCH_ROOT/results/$NAN_RESULT_RUN/F9-e2e-ideate-to-resolve"
1152
+ mkdir -p "$NAN_RESULT_DIR"/{solo_claude,bare,variant}  # one directory per arm
1153
+ for arm in solo_claude bare; do  # only these two arms get a well-formed passing result
1154
+ cat > "$NAN_RESULT_DIR/$arm/result.json" <<'JSON'
1155
+ {"elapsed_seconds": 10, "verify_score": 1, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
1156
+ JSON
1157
+ done
1158
+ cat > "$NAN_RESULT_DIR/variant/result.json" <<'JSON'
1159
+ {"elapsed_seconds": NaN, "verify_score": NaN, "files_changed": 1, "disqualifier": false, "terminal_verdict": "PASS", "verify_verdict": "PASS"}
1160
+ JSON
1161
+ cat > "$NAN_RESULT_DIR/judge.json" <<'JSON'
1162
+ {
1163
+ "_blind_mapping": {"A": "solo_claude", "B": "bare", "C": "variant", "seed": 1},
1164
+ "scores_by_arm": {"solo_claude": 60, "bare": 50, "variant": 70},
1165
+ "winner_arm": "variant",
1166
+ "disqualifiers_by_arm": {
1167
+ "solo_claude": {"disqualifier": false},
1168
+ "bare": {"disqualifier": false},
1169
+ "variant": {"disqualifier": false}
1170
+ }
1171
+ }
1172
+ JSON
1173
+ python3 "$COMPILE" --run-id "$NAN_RESULT_RUN" > "$TMP_DIR/nan-result-compile.out" 2>&1  # compile this run; NaN is not strict JSON, and how $COMPILE parses it is not visible here
1174
+ python3 - "$BENCH_ROOT/results/$NAN_RESULT_RUN/summary.json" <<'PY'
1175
+ import json
1176
+ import sys
1177
+
1178
+ summary = json.load(open(sys.argv[1], encoding="utf8"))  # argv[1] is the summary.json path passed on the python3 command line
1179
+ row = summary["rows"][0]  # only the first row of the summary is inspected
1180
+ if row["variant_disqualifier"] is not True:  # strict identity: only a JSON true passes, a merely truthy value does not
1181
+ raise SystemExit("NaN variant result.json must fail closed as a disqualifier")
1182
+ if row["arms"]["variant"].get("wall_s") is not None:  # wall_s must be absent or null for the NaN arm
1183
+ raise SystemExit("NaN variant result.json must not expose timing fields")
1184
+ PY
1185
+ expect_fail_contains nan-result-artifact-disqualifies \
1186
+ "variant disqualifier(s)" \
1187
+ python3 "$GATE" --run-id "$NAN_RESULT_RUN" --accept-missing  # NOTE(review): expect_fail_contains is defined outside this view; presumably it requires the command to fail with output containing the quoted text - confirm
1188
+
1189
+ echo "PASS test-ship-gate"  # final success marker; NOTE(review): presumably only reached when every check above passed, which depends on errexit being enabled earlier in the script - confirm