devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (220) hide show
  1. package/AGENTS.md +2 -2
  2. package/CLAUDE.md +4 -4
  3. package/README.md +85 -34
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +61 -44
  5. package/benchmark/auto-resolve/BENCHMARK-RESULTS.md +341 -0
  6. package/benchmark/auto-resolve/README.md +307 -44
  7. package/benchmark/auto-resolve/RUBRIC.md +23 -14
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +7 -3
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +8 -3
  10. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +8 -3
  11. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +10 -4
  12. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +10 -4
  13. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +12 -0
  14. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +6 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +7 -4
  16. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +12 -0
  17. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +6 -0
  18. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +8 -0
  19. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +12 -0
  20. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +6 -0
  21. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +16 -4
  22. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +7 -0
  23. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +11 -5
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +8 -1
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +4 -2
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +1 -1
  27. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/NOTES.md +34 -0
  28. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/expected.json +57 -0
  29. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/metadata.json +10 -0
  30. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/setup.sh +2 -0
  31. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/spec.md +67 -0
  32. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/task.txt +7 -0
  33. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/duplicate-event-error.js +35 -0
  34. package/benchmark/auto-resolve/fixtures/F31-cli-seat-rebalance/verifiers/priority-transfer-rollback.js +53 -0
  35. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/expected.json +57 -0
  37. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/setup.sh +2 -0
  39. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/spec.md +70 -0
  40. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/task.txt +3 -0
  41. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/duplicate-renewal-error.js +42 -0
  42. package/benchmark/auto-resolve/fixtures/F32-cli-subscription-renewal/verifiers/priority-credit-rollback.js +70 -0
  43. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +10 -3
  44. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +7 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +5 -0
  46. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +7 -0
  47. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +3 -0
  48. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +1 -1
  49. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +15 -3
  50. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +1 -1
  51. package/benchmark/auto-resolve/fixtures/SCHEMA.md +53 -7
  52. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/NOTES.md +37 -0
  53. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/RETIRED.md +13 -0
  54. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/expected.json +56 -0
  55. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/setup.sh +18 -0
  57. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/spec.md +69 -0
  58. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/task.txt +7 -0
  59. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/exact-proration.js +48 -0
  60. package/benchmark/auto-resolve/fixtures/retired/F27-cli-subscription-proration/verifiers/rules-source-and-conflict.js +79 -0
  61. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/NOTES.md +54 -0
  62. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/RETIRED.md +7 -0
  63. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/expected.json +67 -0
  64. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/metadata.json +10 -0
  65. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/setup.sh +2 -0
  66. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/spec.md +67 -0
  67. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/task.txt +5 -0
  68. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/policy-precedence.js +72 -0
  69. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-and-immutability.js +43 -0
  70. package/benchmark/auto-resolve/fixtures/retired/F28-cli-return-authorization/verifiers/validation-boundary.js +116 -0
  71. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/NOTES.md +35 -0
  72. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/RETIRED.md +12 -0
  73. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/expected.json +58 -0
  74. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/metadata.json +10 -0
  75. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/setup.sh +2 -0
  76. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/spec.md +73 -0
  77. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/task.txt +17 -0
  78. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/mixed-idempotent-settlement.js +53 -0
  79. package/benchmark/auto-resolve/fixtures/retired/F30-cli-credit-hold-settlement/verifiers/rejection-boundaries.js +74 -0
  80. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/NOTES.md +60 -0
  81. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/RETIRED.md +29 -0
  82. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/expected.json +73 -0
  83. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/metadata.json +10 -0
  84. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/setup.sh +28 -0
  85. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/spec.md +58 -0
  86. package/benchmark/auto-resolve/fixtures/retired/F9-e2e-ideate-to-preflight/task.txt +5 -0
  87. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.json +82 -0
  88. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/full-pipeline-pair-gate.md +18 -0
  89. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.json +46 -0
  90. package/benchmark/auto-resolve/results/20260510-f16-f23-f25-combined-proof/headroom-gate.md +17 -0
  91. package/benchmark/auto-resolve/run-real-benchmark.md +303 -0
  92. package/benchmark/auto-resolve/scripts/audit-headroom-rejections.py +441 -0
  93. package/benchmark/auto-resolve/scripts/audit-pair-evidence.py +1256 -0
  94. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +147 -15
  95. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +28 -16
  96. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +11 -1
  97. package/benchmark/auto-resolve/scripts/compile-report.py +208 -46
  98. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +22 -4
  99. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +175 -30
  100. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +408 -46
  101. package/benchmark/auto-resolve/scripts/headroom-gate.py +270 -39
  102. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +164 -33
  103. package/benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py +97 -0
  104. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +150 -38
  105. package/benchmark/auto-resolve/scripts/judge.sh +153 -26
  106. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +12 -5
  107. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +25 -2
  108. package/benchmark/auto-resolve/scripts/pair-candidate-frontier.py +469 -0
  109. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +5 -5
  110. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +9 -2
  111. package/benchmark/auto-resolve/scripts/pair-rejected-fixtures.sh +91 -0
  112. package/benchmark/auto-resolve/scripts/pair_evidence_contract.py +269 -0
  113. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +39 -10
  114. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +34 -4
  115. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +23 -5
  116. package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py +232 -0
  117. package/benchmark/auto-resolve/scripts/run-fixture.sh +118 -51
  118. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +211 -39
  119. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +335 -39
  120. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +249 -6
  121. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +22 -48
  122. package/benchmark/auto-resolve/scripts/run-suite.sh +44 -7
  123. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +120 -19
  124. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +32 -14
  125. package/benchmark/auto-resolve/scripts/ship-gate.py +219 -50
  126. package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py +53 -0
  127. package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py +77 -0
  128. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +239 -26
  129. package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh +288 -0
  130. package/benchmark/auto-resolve/scripts/test-audit-pair-evidence.sh +1672 -0
  131. package/benchmark/auto-resolve/scripts/test-benchmark-arg-parsing.sh +933 -0
  132. package/benchmark/auto-resolve/scripts/test-build-pair-eligible-manifest.sh +491 -0
  133. package/benchmark/auto-resolve/scripts/test-check-f9-artifacts.sh +91 -0
  134. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +328 -3
  135. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +497 -18
  136. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +331 -14
  137. package/benchmark/auto-resolve/scripts/test-iter-0033c-compare.sh +525 -0
  138. package/benchmark/auto-resolve/scripts/test-iter-0033c-l1-summary.sh +254 -0
  139. package/benchmark/auto-resolve/scripts/test-lint-fixtures.sh +580 -0
  140. package/benchmark/auto-resolve/scripts/test-pair-candidate-frontier.sh +591 -0
  141. package/benchmark/auto-resolve/scripts/test-run-full-pipeline-pair-candidate.sh +497 -0
  142. package/benchmark/auto-resolve/scripts/test-run-headroom-candidate.sh +401 -0
  143. package/benchmark/auto-resolve/scripts/test-run-swebench-solver-batch.sh +111 -0
  144. package/benchmark/auto-resolve/scripts/test-ship-gate.sh +1189 -0
  145. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +924 -5
  146. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/NOTES.md +28 -0
  147. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/expected.json +63 -0
  148. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/metadata.json +10 -0
  149. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/setup.sh +3 -0
  150. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/spec.md +47 -0
  151. package/benchmark/auto-resolve/shadow-fixtures/S1-cli-lang-flag/task.txt +1 -0
  152. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/NOTES.md +34 -0
  153. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/expected.json +53 -0
  154. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/metadata.json +10 -0
  155. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/setup.sh +3 -0
  156. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/spec.md +50 -0
  157. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/task.txt +1 -0
  158. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/duplicate-order-error.js +27 -0
  159. package/benchmark/auto-resolve/shadow-fixtures/S2-cli-inventory-reservation/verifiers/priority-stock-reservation.js +44 -0
  160. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/NOTES.md +34 -0
  161. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/expected.json +55 -0
  162. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/metadata.json +10 -0
  163. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/setup.sh +3 -0
  164. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/spec.md +52 -0
  165. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/task.txt +1 -0
  166. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/duplicate-ticket-error.js +29 -0
  167. package/benchmark/auto-resolve/shadow-fixtures/S3-cli-ticket-assignment/verifiers/priority-agent-assignment.js +48 -0
  168. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/NOTES.md +34 -0
  169. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/expected.json +55 -0
  170. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/metadata.json +10 -0
  171. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/setup.sh +3 -0
  172. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/spec.md +55 -0
  173. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/task.txt +1 -0
  174. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/duplicate-return-error.js +43 -0
  175. package/benchmark/auto-resolve/shadow-fixtures/S4-cli-return-routing/verifiers/priority-return-routing.js +70 -0
  176. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/NOTES.md +37 -0
  177. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/expected.json +54 -0
  178. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/metadata.json +10 -0
  179. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/setup.sh +3 -0
  180. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/spec.md +59 -0
  181. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/task.txt +1 -0
  182. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/credit-ledger-priority.js +98 -0
  183. package/benchmark/auto-resolve/shadow-fixtures/S5-cli-credit-grant-ledger/verifiers/duplicate-charge-error.js +38 -0
  184. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/NOTES.md +36 -0
  185. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/expected.json +56 -0
  186. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/metadata.json +10 -0
  187. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/setup.sh +3 -0
  188. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/spec.md +59 -0
  189. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/task.txt +1 -0
  190. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/duplicate-refund-error.js +41 -0
  191. package/benchmark/auto-resolve/shadow-fixtures/S6-cli-refund-window-ledger/verifiers/priority-refund-ledger.js +65 -0
  192. package/bin/devlyn.js +221 -17
  193. package/config/skills/_shared/adapters/README.md +3 -0
  194. package/config/skills/_shared/adapters/gpt-5-5.md +5 -1
  195. package/config/skills/_shared/adapters/opus-4-7.md +9 -1
  196. package/config/skills/_shared/archive_run.py +78 -6
  197. package/config/skills/_shared/codex-config.md +5 -4
  198. package/config/skills/_shared/codex-monitored.sh +46 -1
  199. package/config/skills/_shared/collect-codex-findings.py +20 -5
  200. package/config/skills/_shared/engine-preflight.md +17 -13
  201. package/config/skills/_shared/runtime-principles.md +6 -9
  202. package/config/skills/_shared/spec-verify-check.py +2664 -107
  203. package/config/skills/_shared/verify-merge-findings.py +1369 -19
  204. package/config/skills/devlyn:design-ui/SKILL.md +364 -0
  205. package/config/skills/devlyn:ideate/SKILL.md +7 -4
  206. package/config/skills/devlyn:ideate/references/elicitation.md +50 -4
  207. package/config/skills/devlyn:ideate/references/from-spec-mode.md +26 -4
  208. package/config/skills/devlyn:ideate/references/project-mode.md +20 -1
  209. package/config/skills/devlyn:ideate/references/spec-template.md +10 -1
  210. package/config/skills/devlyn:resolve/SKILL.md +78 -26
  211. package/config/skills/devlyn:resolve/references/free-form-mode.md +15 -0
  212. package/config/skills/devlyn:resolve/references/phases/build-gate.md +2 -2
  213. package/config/skills/devlyn:resolve/references/phases/implement.md +1 -1
  214. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +74 -2
  215. package/config/skills/devlyn:resolve/references/phases/verify.md +80 -29
  216. package/config/skills/devlyn:resolve/references/state-schema.md +9 -4
  217. package/package.json +47 -2
  218. package/scripts/lint-fixtures.sh +349 -0
  219. package/scripts/lint-shadow-fixtures.sh +58 -0
  220. package/scripts/lint-skills.sh +3645 -95
@@ -23,9 +23,9 @@
23
23
  # `_axis_validation`, same shape as judge.sh).
24
24
  # - Always re-judges (no skip-on-exists) so cross-judge results never go
25
25
  # stale.
26
- # - Aggregator computes per-axis L1-L0 disagreement vs GPT (the decisive
27
- # metric per Codex R0 Q1 — falsification rule: any axis disagreement >2
28
- # means iter-0021/0023 L1 readout is single-judge artifact).
26
+ # - Aggregator computes per-axis solo_claude-bare (L1-L0) disagreement vs
27
+ # GPT (the decisive metric per Codex R0 Q1 — falsification rule: any axis
28
+ # disagreement >2 means iter-0021/0023 L1 readout is single-judge artifact).
29
29
  #
30
30
  # Usage:
31
31
  # judge-opus-pass.sh --run-id <ID>
@@ -38,10 +38,19 @@
38
38
  set -euo pipefail
39
39
 
40
40
  usage() { echo "usage: $0 --run-id <ID>"; exit 1; }
41
+ require_value() {
42
+ local flag="$1"
43
+ local value="${2:-}"
44
+ if [ -z "$value" ] || [[ "$value" == --* ]]; then
45
+ echo "$flag requires a value" >&2
46
+ exit 1
47
+ fi
48
+ }
49
+
41
50
  RUN_ID=""
42
51
  while [ $# -gt 0 ]; do
43
52
  case "$1" in
44
- --run-id) RUN_ID="$2"; shift 2;;
53
+ --run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
45
54
  *) usage;;
46
55
  esac
47
56
  done
@@ -51,6 +60,50 @@ BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
51
60
  RES_ROOT="$BENCH_ROOT/results/$RUN_ID"
52
61
  [ -d "$RES_ROOT" ] || { echo "no results dir: $RES_ROOT"; exit 1; }
53
62
 
63
+ python3 - "$RES_ROOT" "$BENCH_ROOT/scripts" <<'PY'
64
+ import json
65
+ import pathlib
66
+ import sys
67
+ sys.path.insert(0, sys.argv[2])
68
+ from pair_evidence_contract import loads_strict_json_object
69
+
70
+ def is_score(value):
71
+ return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 100
72
+
73
+ res_root = pathlib.Path(sys.argv[1])
74
+ errors = []
75
+ for fixture_dir in sorted(p for p in res_root.glob("F*/") if p.is_dir()):
76
+ judge_path = fixture_dir / "judge.json"
77
+ prompt_path = fixture_dir / "judge-prompt.txt"
78
+ if not judge_path.exists() or not prompt_path.exists():
79
+ continue
80
+ judge = loads_strict_json_object(judge_path.read_text())
81
+ mapping = judge.get("_blind_mapping")
82
+ if not isinstance(mapping, dict):
83
+ errors.append(f"{fixture_dir.name}: judge blind mapping missing")
84
+ continue
85
+ mapped_arms = {arm for slot, arm in mapping.items() if slot in {"A", "B", "C"}}
86
+ required = {"bare", "solo_claude"}
87
+ raw_scores = judge.get("scores_by_arm")
88
+ scores = raw_scores if isinstance(raw_scores, dict) else {}
89
+ if "variant" in scores:
90
+ required.add("variant")
91
+ malformed_scores = sorted(arm for arm, score in scores.items() if not is_score(score))
92
+ if malformed_scores:
93
+ errors.append(f"{fixture_dir.name}: scores_by_arm malformed score(s): {', '.join(malformed_scores)}")
94
+ missing = sorted(required - mapped_arms)
95
+ if missing:
96
+ errors.append(f"{fixture_dir.name}: judge blind mapping missing arm(s): {', '.join(missing)}")
97
+ unmapped_scores = sorted(set(scores) - mapped_arms)
98
+ if unmapped_scores:
99
+ errors.append(f"{fixture_dir.name}: scores_by_arm without blind mapping: {', '.join(unmapped_scores)}")
100
+
101
+ if errors:
102
+ for error in errors:
103
+ print(f"[opus-judge] ✗ {error}", file=sys.stderr)
104
+ raise SystemExit(2)
105
+ PY
106
+
54
107
  command -v claude >/dev/null 2>&1 || { echo "claude CLI not on PATH"; exit 1; }
55
108
 
56
109
  CLAUDE_CLI_VER=$(claude --version 2>/dev/null || echo "claude-cli unknown")
@@ -105,22 +158,32 @@ for fixture_dir in "$RES_ROOT"/F*/; do
105
158
  continue
106
159
  fi
107
160
 
108
- python3 - "$opus_out_raw" "$gpt_judge_f" "$opus_judge_f" "$CLAUDE_CLI_VER" "$JUDGE_MODEL_ALIAS" <<'PY' || { echo "[opus-judge] ✗ $fid parse failed"; failed=$((failed + 1)); continue; }
161
+ python3 - "$opus_out_raw" "$gpt_judge_f" "$opus_judge_f" "$CLAUDE_CLI_VER" "$JUDGE_MODEL_ALIAS" "$BENCH_ROOT/scripts" <<'PY' || { echo "[opus-judge] ✗ $fid parse failed"; failed=$((failed + 1)); continue; }
109
162
  import sys, json, pathlib
163
+ sys.path.insert(0, sys.argv[6])
164
+ from pair_evidence_contract import loads_strict_json_object, reject_json_constant
165
+
166
+ def is_score(value):
167
+ return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 100
168
+
169
+ def is_bool(value):
170
+ return isinstance(value, bool)
110
171
 
111
172
  raw = pathlib.Path(sys.argv[1]).read_text()
112
- gpt = json.loads(pathlib.Path(sys.argv[2]).read_text())
173
+ gpt = loads_strict_json_object(pathlib.Path(sys.argv[2]).read_text())
113
174
  target = pathlib.Path(sys.argv[3])
114
175
  cli_ver = sys.argv[4].strip()
115
176
  model_alias = sys.argv[5].strip()
116
177
 
117
178
  # Robust JSON extraction — last valid {} block with required score keys.
118
- mapping = gpt.get("_blind_mapping") or {}
179
+ mapping = gpt.get("_blind_mapping")
180
+ if not isinstance(mapping, dict):
181
+ raise SystemExit("gpt judge.json _blind_mapping must be an object")
119
182
  required_score_keys = ["a_score", "b_score"]
120
183
  if "C" in mapping:
121
184
  required_score_keys.append("c_score")
122
185
 
123
- decoder = json.JSONDecoder()
186
+ decoder = json.JSONDecoder(parse_constant=reject_json_constant)
124
187
  brace_positions = [i for i, c in enumerate(raw) if c == '{']
125
188
  chosen = None
126
189
  for pos in reversed(brace_positions):
@@ -135,6 +198,9 @@ if chosen is None:
135
198
  raise SystemExit(
136
199
  f"no valid JSON with keys {required_score_keys} in opus output: {sys.argv[1]}"
137
200
  )
201
+ invalid_scores = [key for key in required_score_keys if not is_score(chosen.get(key))]
202
+ if invalid_scores:
203
+ raise SystemExit(f"invalid opus score value(s): {', '.join(invalid_scores)}")
138
204
 
139
205
  # Axis validation — mirror judge.sh post iter-0023.
140
206
  AXIS_KEYS = ("spec", "constraint", "scope", "quality")
@@ -147,9 +213,9 @@ for bk in BREAKDOWN_KEYS:
147
213
  if axis not in chosen[bk]:
148
214
  continue
149
215
  v = chosen[bk][axis]
150
- if not isinstance(v, (int, float)) or v < 0 or v > 25:
216
+ if isinstance(v, bool) or not isinstance(v, (int, float)) or v < 0 or v > 25:
151
217
  axis_invalid_cells.append({"breakdown": bk, "axis": axis, "value": v})
152
- chosen[bk][axis] = max(0, min(25, int(v) if isinstance(v, (int, float)) else 0))
218
+ chosen[bk][axis] = max(0, min(25, int(v) if not isinstance(v, bool) and isinstance(v, (int, float)) else 0))
153
219
  chosen["_axis_validation"] = {
154
220
  "out_of_range_count": len(axis_invalid_cells),
155
221
  "out_of_range_cells": axis_invalid_cells,
@@ -172,7 +238,7 @@ slot_letters = ["A", "B", "C"]
172
238
  scores_by_arm = {}
173
239
  for letter, key in zip(slot_letters, slot_keys):
174
240
  arm = mapping.get(letter)
175
- if arm is not None and key in chosen:
241
+ if arm is not None and key in chosen and is_score(chosen[key]):
176
242
  scores_by_arm[arm] = chosen[key]
177
243
  chosen["scores_by_arm"] = scores_by_arm
178
244
 
@@ -196,18 +262,26 @@ for letter, bk in zip(slot_letters, BREAKDOWN_KEYS):
196
262
  chosen["breakdowns_by_arm"] = breakdowns_by_arm
197
263
 
198
264
  # Per-arm critical_findings + disqualifiers (same shape judge.sh emits).
199
- findings_letters = chosen.get("critical_findings", {}) or {}
265
+ raw_findings_letters = chosen.get("critical_findings")
266
+ findings_letters = raw_findings_letters if isinstance(raw_findings_letters, dict) else {}
200
267
  chosen["findings_by_arm"] = {
201
268
  mapping[l]: findings_letters.get(l, []) for l in slot_letters if l in mapping
202
269
  }
203
- dq_letters = chosen.get("disqualifiers", {}) or {}
270
+ raw_dq_letters = chosen.get("disqualifiers")
271
+ dq_letters = raw_dq_letters if isinstance(raw_dq_letters, dict) else {}
272
+ invalid_dq_letters = [
273
+ letter for letter in slot_letters
274
+ if letter in dq_letters and not is_bool(dq_letters.get(letter))
275
+ ]
276
+ if invalid_dq_letters:
277
+ raise SystemExit(f"invalid opus disqualifier value(s): {', '.join(invalid_dq_letters)}")
204
278
  dq_by_arm = {}
205
279
  for l in slot_letters:
206
280
  if l not in mapping:
207
281
  continue
208
282
  arm = mapping[l]
209
283
  dq_by_arm[arm] = {
210
- "disqualifier": bool(dq_letters.get(l, False)),
284
+ "disqualifier": dq_letters.get(l, False) is True,
211
285
  "reason": str(dq_letters.get(f"{l}_reason", "") or ""),
212
286
  }
213
287
  chosen["disqualifiers_by_arm"] = dq_by_arm
@@ -236,7 +310,8 @@ target.write_text(json.dumps(chosen, indent=2))
236
310
  print(
237
311
  f"[opus-judge] {target.parent.name} "
238
312
  f"v={chosen.get('variant_score')} l1={chosen.get('solo_score')} l0={chosen.get('bare_score')} "
239
- f"l1-l0={chosen['margins']['solo_over_bare']} v-l1={chosen['margins']['variant_over_solo']}"
313
+ f"solo_claude-bare={chosen['margins']['solo_over_bare']} "
314
+ f"variant-solo_claude={chosen['margins']['variant_over_solo']}"
240
315
  )
241
316
  PY
242
317
  processed=$((processed + 1))
@@ -244,9 +319,14 @@ done
244
319
 
245
320
  echo "[opus-judge] judge passes: processed=$processed skipped=$skipped failed=$failed"
246
321
 
247
- # Aggregate cross-judge agreement, including per-axis L1-L0 disagreement.
248
- python3 - "$RES_ROOT" <<'PY'
322
+ # Aggregate cross-judge agreement, including per-axis solo_claude-bare (L1-L0) disagreement.
323
+ python3 - "$RES_ROOT" "$BENCH_ROOT/scripts" <<'PY'
249
324
  import json, pathlib, sys, math
325
+ sys.path.insert(0, sys.argv[2])
326
+ from pair_evidence_contract import loads_strict_json_object
327
+
328
+ def is_score(value):
329
+ return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 100
250
330
 
251
331
  res_root = pathlib.Path(sys.argv[1])
252
332
  rows = []
@@ -257,21 +337,30 @@ for fdir in sorted(res_root.glob("F*/")):
257
337
  o_f = fdir / "judge-opus.json"
258
338
  if not g_f.exists() or not o_f.exists():
259
339
  continue
260
- g = json.loads(g_f.read_text())
261
- o = json.loads(o_f.read_text())
340
+ g = loads_strict_json_object(g_f.read_text())
341
+ o = loads_strict_json_object(o_f.read_text())
262
342
 
263
- # Per-axis L1-L0 (solo_claude − bare) for both judges.
343
+ # Per-axis solo_claude-bare (L1-L0) for both judges.
264
344
  # Codex R1 #1: judge.sh historically writes `a/b/c_breakdown` plus
265
345
  # `_blind_mapping`, NOT `breakdowns_by_arm`. iter-0020 judge.json files
266
346
  # are in that historical shape. Derive per-arm breakdowns from letter
267
347
  # fields when `breakdowns_by_arm` is absent; fail loudly when neither
268
348
  # source is available so axis disagreement never silently falls to zero.
349
+ def blind_mapping(j):
350
+ raw_mapping = j.get("_blind_mapping")
351
+ return raw_mapping if isinstance(raw_mapping, dict) else {}
352
+
353
+ def mapped_arm_set(j):
354
+ mapping = blind_mapping(j)
355
+ return {arm for slot, arm in mapping.items() if slot in {"A", "B", "C"}}
356
+
269
357
  def axis_l1_l0(j, label):
358
+ mapped_arms = mapped_arm_set(j)
270
359
  bka = j.get("breakdowns_by_arm") or {}
271
- if "solo_claude" in bka and "bare" in bka:
360
+ if {"solo_claude", "bare"}.issubset(mapped_arms) and "solo_claude" in bka and "bare" in bka:
272
361
  l1 = bka["solo_claude"]; l0 = bka["bare"]
273
362
  else:
274
- mapping = j.get("_blind_mapping") or {}
363
+ mapping = blind_mapping(j)
275
364
  slot_letters = ["A", "B", "C"]
276
365
  slot_breakdowns = ["a_breakdown", "b_breakdown", "c_breakdown"]
277
366
  derived = {}
@@ -287,16 +376,35 @@ for fdir in sorted(res_root.glob("F*/")):
287
376
  l1 = derived["solo_claude"]; l0 = derived["bare"]
288
377
  return {a: (l1.get(a, 0) - l0.get(a, 0)) for a in axis_keys}
289
378
 
379
+ def mapped_scores(j):
380
+ mapped_arms = mapped_arm_set(j)
381
+ raw_scores = j.get("scores_by_arm")
382
+ scores = raw_scores if isinstance(raw_scores, dict) else {}
383
+ return {arm: score for arm, score in scores.items() if arm in mapped_arms and is_score(score)}
384
+
385
+ def margin_from_scores(scores, left, right):
386
+ if left in scores and right in scores:
387
+ return scores[left] - scores[right]
388
+ return None
389
+
390
+ def mapped_winner(j, scores):
391
+ winner = j.get("winner_arm")
392
+ if winner == "tie" or winner in scores:
393
+ return winner
394
+ return None
395
+
290
396
  g_axes = axis_l1_l0(g, f"gpt {fdir.name}")
291
397
  o_axes = axis_l1_l0(o, f"opus {fdir.name}")
292
398
  axis_disagreement = {a: o_axes[a] - g_axes[a] for a in axis_keys}
293
399
 
294
- g_margins = (g.get("margins") or {})
295
- o_margins = (o.get("margins") or {})
296
- g_l1_l0 = g_margins.get("solo_over_bare")
297
- o_l1_l0 = o_margins.get("solo_over_bare")
298
- g_v_l0 = g_margins.get("variant_over_bare")
299
- o_v_l0 = o_margins.get("variant_over_bare")
400
+ g_scores = mapped_scores(g)
401
+ o_scores = mapped_scores(o)
402
+ g_l1_l0 = margin_from_scores(g_scores, "solo_claude", "bare")
403
+ o_l1_l0 = margin_from_scores(o_scores, "solo_claude", "bare")
404
+ g_v_l0 = margin_from_scores(g_scores, "variant", "bare")
405
+ o_v_l0 = margin_from_scores(o_scores, "variant", "bare")
406
+ g_winner = mapped_winner(g, g_scores)
407
+ o_winner = mapped_winner(o, o_scores)
300
408
  margin_l1_l0_diff = (
301
409
  abs(g_l1_l0 - o_l1_l0) if g_l1_l0 is not None and o_l1_l0 is not None else None
302
410
  )
@@ -306,8 +414,8 @@ for fdir in sorted(res_root.glob("F*/")):
306
414
 
307
415
  rows.append({
308
416
  "fixture": fdir.name,
309
- "gpt_scores": g.get("scores_by_arm") or {},
310
- "opus_scores": o.get("scores_by_arm") or {},
417
+ "gpt_scores": g_scores,
418
+ "opus_scores": o_scores,
311
419
  "gpt_margin_l1_l0": g_l1_l0,
312
420
  "opus_margin_l1_l0": o_l1_l0,
313
421
  "margin_l1_l0_diff": margin_l1_l0_diff,
@@ -317,9 +425,9 @@ for fdir in sorted(res_root.glob("F*/")):
317
425
  "gpt_axis_l1_l0": g_axes,
318
426
  "opus_axis_l1_l0": o_axes,
319
427
  "axis_disagreement": axis_disagreement,
320
- "winner_agree": g.get("winner_arm") == o.get("winner_arm"),
321
- "gpt_winner": g.get("winner_arm"),
322
- "opus_winner": o.get("winner_arm"),
428
+ "winner_agree": g_winner is not None and o_winner is not None and g_winner == o_winner,
429
+ "gpt_winner": g_winner,
430
+ "opus_winner": o_winner,
323
431
  })
324
432
 
325
433
  if not rows:
@@ -328,7 +436,7 @@ if not rows:
328
436
 
329
437
  n = len(rows)
330
438
 
331
- # Suite-level per-axis L1-L0 sum (both judges) and disagreement.
439
+ # Suite-level per-axis solo_claude-bare (L1-L0) sum (both judges) and disagreement.
332
440
  g_axis_sum = {a: sum(r["gpt_axis_l1_l0"][a] for r in rows) for a in axis_keys}
333
441
  o_axis_sum = {a: sum(r["opus_axis_l1_l0"][a] for r in rows) for a in axis_keys}
334
442
  axis_sum_disagreement = {a: o_axis_sum[a] - g_axis_sum[a] for a in axis_keys}
@@ -339,7 +447,7 @@ THRESHOLD = 2
339
447
  falsified_by_axis = max_abs_axis_disagreement > THRESHOLD
340
448
  flipped_axes = [a for a, v in axis_sum_disagreement.items() if abs(v) > THRESHOLD]
341
449
 
342
- # Suite avg L1-L0 (both judges) — Codex R1 #3: divide by valid-count, report denom.
450
+ # Suite avg solo_claude-bare (L1-L0, both judges) — Codex R1 #3: divide by valid-count, report denom.
343
451
  gpt_l1_l0_valid = [r["gpt_margin_l1_l0"] for r in rows if r["gpt_margin_l1_l0"] is not None]
344
452
  opus_l1_l0_valid = [r["opus_margin_l1_l0"] for r in rows if r["opus_margin_l1_l0"] is not None]
345
453
  gpt_l1_l0_avg = (sum(gpt_l1_l0_valid) / len(gpt_l1_l0_valid)) if gpt_l1_l0_valid else None
@@ -408,18 +516,22 @@ summary = {
408
516
  out = res_root / "cross-judge-summary.json"
409
517
  out.write_text(json.dumps(summary, indent=2))
410
518
 
519
+ def fmt_metric(value):
520
+ return f"{value:.2f}" if value is not None else "na"
521
+
411
522
  print(
412
523
  f"[cross-judge] n={n} "
413
524
  f"falsified={falsified_by_axis} flipped_axes={flipped_axes} "
414
525
  f"max_axis_disagreement={max_abs_axis_disagreement} "
415
- f"gpt_l1_l0_avg={gpt_l1_l0_avg:.2f} opus_l1_l0_avg={opus_l1_l0_avg:.2f} "
416
- f"suite_avg_diff={suite_avg_diff:.2f}"
526
+ f"gpt_l1_l0_avg={fmt_metric(gpt_l1_l0_avg)} "
527
+ f"opus_l1_l0_avg={fmt_metric(opus_l1_l0_avg)} "
528
+ f"suite_avg_diff={fmt_metric(suite_avg_diff)}"
417
529
  )
418
530
  print(f"[cross-judge] axis_sum_l1_l0: gpt={g_axis_sum} opus={o_axis_sum} disagree={axis_sum_disagreement}")
419
531
  print(f"[cross-judge] wrote {out}")
420
532
  PY
421
533
 
422
- # Hard-fail summary if not all 9 fixtures produced paired judgements.
534
+ # Hard-fail summary if any fixture in the selected result set lacks a paired judgement.
423
535
  EXPECTED_FIXTURES=$(ls -d "$RES_ROOT"/F*/ 2>/dev/null | wc -l | awk '{print $1}')
424
536
  PAIRED=$(find "$RES_ROOT" -maxdepth 2 -name 'judge-opus.json' | wc -l | awk '{print $1}')
425
537
  echo "[opus-judge] expected_fixtures=$EXPECTED_FIXTURES paired=$PAIRED"
@@ -7,30 +7,46 @@
7
7
  # Reads:
8
8
  # results/<run-id>/<fixture>/variant/diff.patch + verify.json
9
9
  # results/<run-id>/<fixture>/bare/diff.patch + verify.json
10
- # fixtures/<fixture>/spec.md + expected.json + NOTES.md
10
+ # fixtures/<fixture>/spec.md + expected.json, or shadow-fixtures/<fixture>/...
11
11
  # RUBRIC.md (stable rubric)
12
12
  #
13
13
  # Writes:
14
14
  # results/<run-id>/<fixture>/judge.json
15
15
  #
16
- # Blind: A/B assignment randomized per fixture, seed stored in judge.json.
16
+ # Blind: arm-to-slot assignment randomized per fixture, seed stored in judge.json.
17
17
 
18
18
  set -euo pipefail
19
19
 
20
20
  usage() { echo "usage: $0 --fixture <FID> --run-id <ID>"; exit 1; }
21
+ require_value() {
22
+ local flag="$1"
23
+ local value="${2:-}"
24
+ if [ -z "$value" ] || [[ "$value" == --* ]]; then
25
+ echo "$flag requires a value" >&2
26
+ exit 1
27
+ fi
28
+ }
29
+
21
30
  FIXTURE=""; RUN_ID=""
22
31
  while [ $# -gt 0 ]; do
23
32
  case "$1" in
24
- --fixture) FIXTURE="$2"; shift 2;;
25
- --run-id) RUN_ID="$2"; shift 2;;
33
+ --fixture) require_value "$1" "${2:-}"; FIXTURE="$2"; shift 2;;
34
+ --run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
26
35
  *) usage;;
27
36
  esac
28
37
  done
29
38
  [ -n "$FIXTURE" ] && [ -n "$RUN_ID" ] || usage
30
39
 
31
40
  BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
32
- FIX_DIR="$BENCH_ROOT/fixtures/$FIXTURE"
33
41
  RES_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE"
42
+ FIX_DIR=""
43
+ for candidate in "$BENCH_ROOT/fixtures/$FIXTURE" "$BENCH_ROOT/shadow-fixtures/$FIXTURE"; do
44
+ if [ -d "$candidate" ]; then
45
+ FIX_DIR="$candidate"
46
+ break
47
+ fi
48
+ done
49
+ [ -n "$FIX_DIR" ] || { echo "fixture not found in fixtures/ or shadow-fixtures/: $FIXTURE"; exit 1; }
34
50
 
35
51
  # iter-0019: 3 arms — variant (L2), solo_claude (L1), bare (L0). The judge
36
52
  # scores all three in a single pass with the same prompt + same model so
@@ -43,10 +59,11 @@ ARMS_PRESENT=()
43
59
  # iter-0033c: l2_gated/l2_forced added for NEW L2 vs NEW L1 measurement.
44
60
  # iter-0037: l2_risk_probes adds bounded visible-verification probes before
45
61
  # IMPLEMENT; judge treats it as another blind arm when artifacts exist.
46
- # Slot count is still A/B/C max 3 pair-eligible iter-0033c fixtures supply
47
- # {solo_claude, l2_gated, l2_forced}; non-pair-eligible fixtures supply
48
- # {solo_claude, l2_gated}. The blind-shuffle slot mapping below already
49
- # tolerates arbitrary ARMS_PRESENT counts ≥2.
62
+ # Slot count is still A/B/C max 3. Current pair-candidate proof runs supply
63
+ # {bare, solo_claude, selected pair arm} where the selected pair arm is usually
64
+ # l2_risk_probes; older diagnostic runs may instead supply l2_gated/l2_forced.
65
+ # The blind-shuffle slot mapping below already tolerates arbitrary
66
+ # ARMS_PRESENT counts >=2.
50
67
  for arm in variant solo_claude bare l2_gated l2_risk_probes l2_forced; do
51
68
  if [ -f "$RES_DIR/$arm/diff.patch" ] && [ -f "$RES_DIR/$arm/verify.json" ]; then
52
69
  ARMS_PRESENT+=("$arm")
@@ -100,12 +117,15 @@ fi
100
117
  # pipeline-commit markers, .devlyn/ archive lines) don't leak to the judge.
101
118
  # Judge sees only file-content changes; the transcript, arm label, NOTES.md,
102
119
  # and all process artifacts stay out of the prompt.
103
- python3 - "$PROMPT_FILE" "$FIX_DIR/spec.md" "$FIX_DIR/expected.json" "$BENCH_ROOT/RUBRIC.md" "$A_DIFF" "$B_DIFF" "$A_VERIFY" "$B_VERIFY" "$C_DIFF" "$C_VERIFY" <<'PY'
120
+ python3 - "$PROMPT_FILE" "$FIX_DIR/spec.md" "$FIX_DIR/expected.json" "$BENCH_ROOT/RUBRIC.md" "$A_DIFF" "$B_DIFF" "$A_VERIFY" "$B_VERIFY" "$C_DIFF" "$C_VERIFY" "$BENCH_ROOT/scripts" <<'PY'
104
121
  import sys, pathlib, re, json
105
122
  args = sys.argv[1:]
106
123
  out_p, spec_p, exp_p, rubric_p = map(pathlib.Path, args[:4])
107
124
  a_diff, b_diff, a_ver, b_ver = map(pathlib.Path, args[4:8])
108
125
  c_diff_arg, c_ver_arg = args[8], args[9]
126
+ sys.path.insert(0, args[10])
127
+ from pair_evidence_contract import loads_strict_json_object
128
+
109
129
  c_diff = pathlib.Path(c_diff_arg) if c_diff_arg else None
110
130
  c_ver = pathlib.Path(c_ver_arg) if c_ver_arg else None
111
131
  out = out_p
@@ -139,7 +159,7 @@ def sanitize(diff: str) -> str:
139
159
 
140
160
  # Also strip arm-identifying fields from verify.json before passing to judge.
141
161
  def sanitize_verify(path: pathlib.Path) -> str:
142
- data = json.loads(path.read_text())
162
+ data = loads_strict_json_object(path.read_text())
143
163
  # Remove anything that could name the arm
144
164
  data.pop("arm", None)
145
165
  return json.dumps(data, indent=2)
@@ -253,8 +273,88 @@ rm -rf "$JUDGE_CWD"
253
273
  mkdir -p "$JUDGE_CWD"
254
274
 
255
275
  JUDGE_OUT="$RES_DIR/judge-output.txt"
276
+ JUDGE_LAST="$RES_DIR/judge-last-message.txt"
277
+ JUDGE_SCHEMA="$RES_DIR/judge-output.schema.json"
278
+ python3 - "$JUDGE_SCHEMA" "$C_ARM" <<'PY'
279
+ import json
280
+ import pathlib
281
+ import sys
282
+
283
+ out = pathlib.Path(sys.argv[1])
284
+ have_c = bool(sys.argv[2])
285
+ letters = ["A", "B"] + (["C"] if have_c else [])
286
+
287
+ breakdown = {
288
+ "type": "object",
289
+ "required": ["spec", "constraint", "scope", "quality", "notes"],
290
+ "properties": {
291
+ "spec": {"type": "integer", "minimum": 0, "maximum": 25},
292
+ "constraint": {"type": "integer", "minimum": 0, "maximum": 25},
293
+ "scope": {"type": "integer", "minimum": 0, "maximum": 25},
294
+ "quality": {"type": "integer", "minimum": 0, "maximum": 25},
295
+ "notes": {"type": "string"},
296
+ },
297
+ "additionalProperties": False,
298
+ }
299
+ findings = {
300
+ "type": "object",
301
+ "required": letters,
302
+ "properties": {letter: {"type": "array", "items": {"type": "string"}} for letter in letters},
303
+ "additionalProperties": False,
304
+ }
305
+ disqualifier_props = {}
306
+ for letter in letters:
307
+ disqualifier_props[letter] = {"type": "boolean"}
308
+ disqualifier_props[f"{letter}_reason"] = {"type": "string"}
309
+ schema = {
310
+ "type": "object",
311
+ "required": [
312
+ "a_score",
313
+ "b_score",
314
+ "winner",
315
+ "a_breakdown",
316
+ "b_breakdown",
317
+ "critical_findings",
318
+ "disqualifiers",
319
+ "overall_reasoning",
320
+ ] + (["c_score", "c_breakdown"] if have_c else []),
321
+ "properties": {
322
+ "a_score": {"type": "integer", "minimum": 0, "maximum": 100},
323
+ "b_score": {"type": "integer", "minimum": 0, "maximum": 100},
324
+ "winner": {"type": "string", "enum": letters + ["tie"]},
325
+ "a_breakdown": breakdown,
326
+ "b_breakdown": breakdown,
327
+ "critical_findings": findings,
328
+ "disqualifiers": {
329
+ "type": "object",
330
+ "required": list(disqualifier_props),
331
+ "properties": disqualifier_props,
332
+ "additionalProperties": False,
333
+ },
334
+ "overall_reasoning": {"type": "string"},
335
+ },
336
+ "additionalProperties": False,
337
+ }
338
+ if have_c:
339
+ schema["properties"]["c_score"] = {"type": "integer", "minimum": 0, "maximum": 100}
340
+ schema["properties"]["c_breakdown"] = breakdown
341
+ out.write_text(json.dumps(schema, indent=2))
342
+ PY
256
343
  set +e
257
- cat "$PROMPT_FILE" | (cd "$JUDGE_CWD" && codex exec -s read-only --skip-git-repo-check -c model_reasoning_effort=xhigh - ) > "$JUDGE_OUT" 2>&1
344
+ cat "$PROMPT_FILE" | (
345
+ cd "$JUDGE_CWD" && codex exec \
346
+ --ignore-user-config \
347
+ --ignore-rules \
348
+ --ephemeral \
349
+ --disable codex_hooks \
350
+ --disable hooks \
351
+ -s read-only \
352
+ --skip-git-repo-check \
353
+ -c model_reasoning_effort=xhigh \
354
+ --output-schema "$JUDGE_SCHEMA" \
355
+ --output-last-message "$JUDGE_LAST" \
356
+ -
357
+ ) > "$JUDGE_OUT" 2>&1
258
358
  JUDGE_EXIT=$?
259
359
  set -e
260
360
  rm -rf "$JUDGE_CWD"
@@ -264,18 +364,23 @@ if [ $JUDGE_EXIT -ne 0 ]; then
264
364
  fi
265
365
 
266
366
  # Extract JSON (codex wraps with banners; pick the last {...} block)
267
- python3 - "$JUDGE_OUT" "$RES_DIR/judge.json" "$A_ARM" "$B_ARM" "$C_ARM" "$SEED" "$CODEX_CLI_VER" "$JUDGE_MODEL" <<'PY'
367
+ python3 - "$JUDGE_OUT" "$JUDGE_LAST" "$RES_DIR/judge.json" "$A_ARM" "$B_ARM" "$C_ARM" "$SEED" "$CODEX_CLI_VER" "$JUDGE_MODEL" "$BENCH_ROOT/scripts" <<'PY'
268
368
  import math
269
369
  import sys, re, json, pathlib
270
- out = pathlib.Path(sys.argv[1]).read_text()
271
- target = pathlib.Path(sys.argv[2])
272
- a_arm, b_arm, c_arm, seed, codex_ver, judge_model = sys.argv[3:9]
370
+ out_path = pathlib.Path(sys.argv[1])
371
+ last_path = pathlib.Path(sys.argv[2])
372
+ target = pathlib.Path(sys.argv[3])
373
+ a_arm, b_arm, c_arm, seed, codex_ver, judge_model = sys.argv[4:10]
374
+ sys.path.insert(0, sys.argv[10])
375
+ from pair_evidence_contract import loads_strict_json_object, reject_json_constant
376
+
377
+ out = last_path.read_text() if last_path.is_file() and last_path.stat().st_size else out_path.read_text()
273
378
 
274
379
  # Extract the last valid judgment JSON. A naive brace-counter breaks on
275
380
  # `{`/`}` that appear inside strings (e.g. JS source embedded in the arms'
276
381
  # diffs), so use json.JSONDecoder.raw_decode starting at each `{` position
277
382
  # and keep the last successful parse with the required keys.
278
- decoder = json.JSONDecoder()
383
+ decoder = json.JSONDecoder(parse_constant=reject_json_constant)
279
384
  brace_positions = [i for i, c in enumerate(out) if c == '{']
280
385
  chosen = None
281
386
  for pos in reversed(brace_positions):
@@ -287,7 +392,21 @@ for pos in reversed(brace_positions):
287
392
  chosen = obj
288
393
  break
289
394
  if chosen is None:
290
- raise SystemExit(f"no valid JSON in judge output; see {sys.argv[1]}")
395
+ raise SystemExit(f"no valid JSON in judge output; see {out_path}")
396
+
397
+ def is_score(value):
398
+ return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 100
399
+
400
+ def is_bool(value):
401
+ return isinstance(value, bool)
402
+
403
+ def is_axis_score(value):
404
+ return isinstance(value, int) and not isinstance(value, bool) and 0 <= value <= 25
405
+
406
+ required_score_keys = ["a_score", "b_score"] + (["c_score"] if c_arm else [])
407
+ invalid_scores = [key for key in required_score_keys if not is_score(chosen.get(key))]
408
+ if invalid_scores:
409
+ raise SystemExit(f"invalid judge score value(s): {', '.join(invalid_scores)}")
291
410
 
292
411
  # Decode blind labels — record full mapping so summary code can iterate
293
412
  mapping = {"A": a_arm, "B": b_arm}
@@ -313,9 +432,9 @@ for bk in BREAKDOWN_KEYS:
313
432
  if axis not in chosen[bk]:
314
433
  continue
315
434
  v = chosen[bk][axis]
316
- if not isinstance(v, (int, float)) or v < 0 or v > 25:
435
+ if not is_axis_score(v):
317
436
  axis_invalid_cells.append({"breakdown": bk, "axis": axis, "value": v})
318
- chosen[bk][axis] = max(0, min(25, int(v) if isinstance(v, (int, float)) else 0))
437
+ chosen[bk][axis] = max(0, min(25, int(v) if not isinstance(v, bool) and isinstance(v, (int, float)) else 0))
319
438
  chosen["_axis_validation"] = {
320
439
  "out_of_range_count": len(axis_invalid_cells),
321
440
  "out_of_range_cells": axis_invalid_cells,
@@ -335,7 +454,7 @@ def arm_verify_score(arm: str):
335
454
  path = target.parent / arm / "verify.json"
336
455
  if not path.is_file():
337
456
  return None
338
- data = json.loads(path.read_text())
457
+ data = loads_strict_json_object(path.read_text())
339
458
  value = data.get("verify_score")
340
459
  return float(value) if isinstance(value, (int, float)) else None
341
460
 
@@ -367,7 +486,7 @@ for letter, score_key, breakdown_key in (
367
486
  "score_capped": False,
368
487
  "spec_capped": False,
369
488
  }
370
- if isinstance(raw_score, (int, float)) and raw_score > score_cap:
489
+ if is_score(raw_score) and raw_score > score_cap:
371
490
  chosen[score_key] = score_cap
372
491
  row["score_capped"] = True
373
492
  breakdown = chosen.get(breakdown_key)
@@ -386,23 +505,31 @@ slot_keys = ["a_score", "b_score", "c_score"]
386
505
  slot_letters = ["A", "B", "C"]
387
506
  for letter, key in zip(slot_letters, slot_keys):
388
507
  arm = mapping.get(letter)
389
- if arm is not None and key in chosen:
508
+ if arm is not None and key in chosen and is_score(chosen[key]):
390
509
  scores_by_arm[arm] = chosen[key]
391
510
  chosen["scores_by_arm"] = scores_by_arm
392
511
 
393
512
  # Per-letter critical_findings / disqualifiers also rotated to per-arm.
394
- findings_letters = chosen.get("critical_findings", {}) or {}
513
+ raw_findings_letters = chosen.get("critical_findings")
514
+ findings_letters = raw_findings_letters if isinstance(raw_findings_letters, dict) else {}
395
515
  findings_by_arm = {mapping[l]: findings_letters.get(l, []) for l in slot_letters if l in mapping}
396
516
  chosen["findings_by_arm"] = findings_by_arm
397
517
 
398
- dq_letters = chosen.get("disqualifiers", {}) or {}
518
+ raw_dq_letters = chosen.get("disqualifiers")
519
+ dq_letters = raw_dq_letters if isinstance(raw_dq_letters, dict) else {}
520
+ invalid_dq_letters = [
521
+ letter for letter in slot_letters
522
+ if letter in dq_letters and not is_bool(dq_letters.get(letter))
523
+ ]
524
+ if invalid_dq_letters:
525
+ raise SystemExit(f"invalid judge disqualifier value(s): {', '.join(invalid_dq_letters)}")
399
526
  dq_by_arm = {}
400
527
  for l in slot_letters:
401
528
  if l not in mapping:
402
529
  continue
403
530
  arm = mapping[l]
404
531
  dq_by_arm[arm] = {
405
- "disqualifier": bool(dq_letters.get(l, False)),
532
+ "disqualifier": dq_letters.get(l, False) is True,
406
533
  "reason": str(dq_letters.get(f"{l}_reason", "") or ""),
407
534
  }
408
535
  chosen["disqualifiers_by_arm"] = dq_by_arm