devlyn-cli 1.15.0 → 2.1.0

Files changed (158)
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
package/benchmark/auto-resolve/scripts/judge-opus-pass.sh
@@ -0,0 +1,430 @@
+ #!/usr/bin/env bash
+ # judge-opus-pass.sh — POST-RUN Opus 4.7 dual-judge sidecar.
+ #
+ # Runs Opus 4.7 over the SAME sanitized blind prompts that judge.sh wrote
+ # (judge-prompt.txt), producing a parallel judge-opus.json alongside the
+ # canonical judge.json. Does NOT modify judge.sh and does NOT change the
+ # measurement contract for the in-flight run — this is a sidecar that adds
+ # cross-judge data after the suite finishes.
+ #
+ # Why: judge.sh runs GPT-5.5 today; variant arms run Codex BUILD/FIX on
+ # GPT-5.5 too — self-judgment bias. Opus pass tests inter-judge agreement.
+ #
+ # iter-0025 (B-1) rewrite — 3-arm aware. Prior version computed only legacy
+ # `variant_score / bare_score / margin` from A/B slots (Codex R0 caught: F1
+ # has variant in C; F6/F9 have bare in C — legacy parser silently produced
+ # wrong margins on iter-0020 mappings). This version mirrors judge.sh post
+ # iter-0023:
+ # - Reads `_blind_mapping` from gpt judge.json (A/B/C → arm name).
+ # - Requires C-slot score when C is present in mapping.
+ # - Computes `scores_by_arm`, `margins.{variant_over_bare, solo_over_bare,
+ #   variant_over_solo}`, and per-arm 4-axis breakdown.
+ # - Validates each axis cell ∈ [0, 25] (clamp + record under
+ #   `_axis_validation`, same shape as judge.sh).
+ # - Always re-judges (no skip-on-exists) so cross-judge results never go
+ #   stale.
+ # - Aggregator computes per-axis L1-L0 disagreement vs GPT (the decisive
+ #   metric per Codex R0 Q1 — falsification rule: any axis disagreement >2
+ #   means iter-0021/0023 L1 readout is single-judge artifact).
+ #
+ # Usage:
+ #   judge-opus-pass.sh --run-id <ID>
+ #
+ # Reads:  results/<run-id>/<fixture>/judge-prompt.txt (blind A/B/C prompt)
+ #         results/<run-id>/<fixture>/judge.json (blind mapping)
+ # Writes: results/<run-id>/<fixture>/judge-opus.json (Opus's parsed result)
+ #         results/<run-id>/cross-judge-summary.json (agreement metrics)
+
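+ # Illustrative derivation of the fields listed above (hypothetical example
+ # values, not taken from any real run): given
+ #   _blind_mapping = {"A": "variant", "B": "bare", "C": "solo_claude"}
+ #   a_score = 78, b_score = 64, c_score = 70
+ # the sidecar would record
+ #   scores_by_arm = {"variant": 78, "bare": 64, "solo_claude": 70}
+ #   margins = {"variant_over_bare": 14, "solo_over_bare": 6, "variant_over_solo": 8}
+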
+ set -euo pipefail
+
+ usage() { echo "usage: $0 --run-id <ID>"; exit 1; }
+ RUN_ID=""
+ while [ $# -gt 0 ]; do
+   case "$1" in
+     --run-id) RUN_ID="$2"; shift 2;;
+     *) usage;;
+   esac
+ done
+ [ -n "$RUN_ID" ] || usage
+
+ BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+ RES_ROOT="$BENCH_ROOT/results/$RUN_ID"
+ [ -d "$RES_ROOT" ] || { echo "no results dir: $RES_ROOT"; exit 1; }
+
+ command -v claude >/dev/null 2>&1 || { echo "claude CLI not on PATH"; exit 1; }
+
+ CLAUDE_CLI_VER=$(claude --version 2>/dev/null || echo "claude-cli unknown")
+ JUDGE_MODEL_ALIAS="opus-4.7"
+
+ echo "[opus-judge] run-id=$RUN_ID cli=$CLAUDE_CLI_VER model=$JUDGE_MODEL_ALIAS"
+
+ processed=0
+ skipped=0
+ failed=0
+
+ for fixture_dir in "$RES_ROOT"/F*/; do
+   [ -d "$fixture_dir" ] || continue
+   fid=$(basename "$fixture_dir")
+   prompt_f="$fixture_dir/judge-prompt.txt"
+   gpt_judge_f="$fixture_dir/judge.json"
+   opus_out_raw="$fixture_dir/judge-opus-output.txt"
+   opus_judge_f="$fixture_dir/judge-opus.json"
+
+   if [ ! -f "$prompt_f" ] || [ ! -f "$gpt_judge_f" ]; then
+     echo "[opus-judge] skip $fid (missing prompt or gpt judge)"
+     skipped=$((skipped + 1))
+     continue
+   fi
+
+   echo "[opus-judge] judging $fid ..."
+
+   # Codex R1 #2: drop any stale judge-opus.json before re-judging. Otherwise
+   # a parse failure here keeps the previous run's file and the aggregator
+   # silently reads it.
+   rm -f "$opus_judge_f"
+
+   # Strict MCP isolation matches the variant arm contract — no user MCP
+   # leakage into the judge call. Run from a clean tmp CWD so it can't peek
+   # at project files that might leak arm identity.
+   JUDGE_CWD="/tmp/opus-judge-$RUN_ID-$fid"
+   rm -rf "$JUDGE_CWD"
+   mkdir -p "$JUDGE_CWD"
+
+   set +e
+   ( cd "$JUDGE_CWD" && claude -p "$(cat "$prompt_f")" \
+       --dangerously-skip-permissions \
+       --strict-mcp-config --mcp-config '{"mcpServers":{}}' \
+   ) > "$opus_out_raw" 2>&1
+   rc=$?
+   set -e
+   rm -rf "$JUDGE_CWD"
+
+   if [ $rc -ne 0 ]; then
+     echo "[opus-judge] ✗ $fid claude -p exit=$rc (output preserved at $opus_out_raw)"
+     failed=$((failed + 1))
+     continue
+   fi
+
+   python3 - "$opus_out_raw" "$gpt_judge_f" "$opus_judge_f" "$CLAUDE_CLI_VER" "$JUDGE_MODEL_ALIAS" <<'PY' || { echo "[opus-judge] ✗ $fid parse failed"; failed=$((failed + 1)); continue; }
+ import sys, json, pathlib
+
+ raw = pathlib.Path(sys.argv[1]).read_text()
+ gpt = json.loads(pathlib.Path(sys.argv[2]).read_text())
+ target = pathlib.Path(sys.argv[3])
+ cli_ver = sys.argv[4].strip()
+ model_alias = sys.argv[5].strip()
+
+ # Robust JSON extraction — last valid {} block with required score keys.
+ mapping = gpt.get("_blind_mapping") or {}
+ required_score_keys = ["a_score", "b_score"]
+ if "C" in mapping:
+     required_score_keys.append("c_score")
+
+ decoder = json.JSONDecoder()
+ brace_positions = [i for i, c in enumerate(raw) if c == '{']
+ chosen = None
+ for pos in reversed(brace_positions):
+     try:
+         obj, _ = decoder.raw_decode(raw[pos:])
+     except json.JSONDecodeError:
+         continue
+     if isinstance(obj, dict) and all(k in obj for k in required_score_keys):
+         chosen = obj
+         break
+ if chosen is None:
+     raise SystemExit(
+         f"no valid JSON with keys {required_score_keys} in opus output: {sys.argv[1]}"
+     )
+
+ # Axis validation — mirror judge.sh post iter-0023.
+ AXIS_KEYS = ("spec", "constraint", "scope", "quality")
+ BREAKDOWN_KEYS = ("a_breakdown", "b_breakdown", "c_breakdown")
+ axis_invalid_cells = []
+ for bk in BREAKDOWN_KEYS:
+     if bk not in chosen or not isinstance(chosen[bk], dict):
+         continue
+     for axis in AXIS_KEYS:
+         if axis not in chosen[bk]:
+             continue
+         v = chosen[bk][axis]
+         if not isinstance(v, (int, float)) or v < 0 or v > 25:
+             axis_invalid_cells.append({"breakdown": bk, "axis": axis, "value": v})
+             chosen[bk][axis] = max(0, min(25, int(v) if isinstance(v, (int, float)) else 0))
+ chosen["_axis_validation"] = {
+     "out_of_range_count": len(axis_invalid_cells),
+     "out_of_range_cells": axis_invalid_cells,
+     "axis_range": [0, 25],
+ }
+ if axis_invalid_cells:
+     sys.stderr.write(
+         f"[opus-judge] WARNING: {len(axis_invalid_cells)} axis cell(s) out of [0,25] "
+         f"clamped: {axis_invalid_cells}\n"
+     )
+
+ # Reuse GPT's blind mapping verbatim (same blind A/B/C decoded the same way).
+ chosen["_blind_mapping"] = mapping
+ chosen["_judge_cli"] = cli_ver
+ chosen["_judge_model"] = model_alias
+
+ # scores_by_arm + margins — mirror judge.sh post iter-0023.
+ slot_keys = ["a_score", "b_score", "c_score"]
+ slot_letters = ["A", "B", "C"]
+ scores_by_arm = {}
+ for letter, key in zip(slot_letters, slot_keys):
+     arm = mapping.get(letter)
+     if arm is not None and key in chosen:
+         scores_by_arm[arm] = chosen[key]
+ chosen["scores_by_arm"] = scores_by_arm
+
+ def margin(left, right):
+     if left in scores_by_arm and right in scores_by_arm:
+         return scores_by_arm[left] - scores_by_arm[right]
+     return None
+
+ chosen["margins"] = {
+     "variant_over_bare": margin("variant", "bare"),
+     "solo_over_bare": margin("solo_claude", "bare"),
+     "variant_over_solo": margin("variant", "solo_claude"),
+ }
+
+ # Per-arm axis breakdown.
+ breakdowns_by_arm = {}
+ for letter, bk in zip(slot_letters, BREAKDOWN_KEYS):
+     arm = mapping.get(letter)
+     if arm is not None and bk in chosen:
+         breakdowns_by_arm[arm] = chosen[bk]
+ chosen["breakdowns_by_arm"] = breakdowns_by_arm
+
+ # Per-arm critical_findings + disqualifiers (same shape judge.sh emits).
+ findings_letters = chosen.get("critical_findings", {}) or {}
+ chosen["findings_by_arm"] = {
+     mapping[l]: findings_letters.get(l, []) for l in slot_letters if l in mapping
+ }
+ dq_letters = chosen.get("disqualifiers", {}) or {}
+ dq_by_arm = {}
+ for l in slot_letters:
+     if l not in mapping:
+         continue
+     arm = mapping[l]
+     dq_by_arm[arm] = {
+         "disqualifier": bool(dq_letters.get(l, False)),
+         "reason": str(dq_letters.get(f"{l}_reason", "") or ""),
+     }
+ chosen["disqualifiers_by_arm"] = dq_by_arm
+
+ # Winner letter → arm.
+ w = chosen.get("winner")
+ if w in slot_letters and w in mapping:
+     chosen["winner_arm"] = mapping[w]
+ elif w == "tie":
+     chosen["winner_arm"] = "tie"
+ else:
+     chosen["winner_arm"] = None
+
+ # Legacy fields kept for back-compat.
+ a_arm = mapping.get("A")
+ b_arm = mapping.get("B")
+ chosen["variant_score"] = scores_by_arm.get("variant")
+ chosen["bare_score"] = scores_by_arm.get("bare")
+ chosen["solo_score"] = scores_by_arm.get("solo_claude")
+ if chosen["variant_score"] is not None and chosen["bare_score"] is not None:
+     chosen["margin"] = chosen["variant_score"] - chosen["bare_score"]
+ else:
+     chosen["margin"] = None
+
+ target.write_text(json.dumps(chosen, indent=2))
+ print(
+     f"[opus-judge] {target.parent.name} "
+     f"v={chosen.get('variant_score')} l1={chosen.get('solo_score')} l0={chosen.get('bare_score')} "
+     f"l1-l0={chosen['margins']['solo_over_bare']} v-l1={chosen['margins']['variant_over_solo']}"
+ )
+ PY
+   processed=$((processed + 1))
+ done
+
+ echo "[opus-judge] judge passes: processed=$processed skipped=$skipped failed=$failed"
+
+ # Aggregate cross-judge agreement, including per-axis L1-L0 disagreement.
+ python3 - "$RES_ROOT" <<'PY'
+ import json, pathlib, sys, math
+
+ res_root = pathlib.Path(sys.argv[1])
+ rows = []
+ axis_keys = ("spec", "constraint", "scope", "quality")
+
+ for fdir in sorted(res_root.glob("F*/")):
+     g_f = fdir / "judge.json"
+     o_f = fdir / "judge-opus.json"
+     if not g_f.exists() or not o_f.exists():
+         continue
+     g = json.loads(g_f.read_text())
+     o = json.loads(o_f.read_text())
+
+     # Per-axis L1-L0 (solo_claude − bare) for both judges.
+     # Codex R1 #1: judge.sh historically writes `a/b/c_breakdown` plus
+     # `_blind_mapping`, NOT `breakdowns_by_arm`. iter-0020 judge.json files
+     # are in that historical shape. Derive per-arm breakdowns from letter
+     # fields when `breakdowns_by_arm` is absent; fail loudly when neither
+     # source is available so axis disagreement never silently falls to zero.
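+     # For illustration (hypothetical field values), both accepted shapes look like:
+     #   post-0023:  {"breakdowns_by_arm": {"solo_claude": {"spec": 21, ...}, "bare": {"spec": 19, ...}}}
+     #   historical: {"_blind_mapping": {"A": "solo_claude", "B": "bare"},
+     #                "a_breakdown": {"spec": 21, ...}, "b_breakdown": {"spec": 19, ...}}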
+     def axis_l1_l0(j, label):
+         bka = j.get("breakdowns_by_arm") or {}
+         if "solo_claude" in bka and "bare" in bka:
+             l1 = bka["solo_claude"]; l0 = bka["bare"]
+         else:
+             mapping = j.get("_blind_mapping") or {}
+             slot_letters = ["A", "B", "C"]
+             slot_breakdowns = ["a_breakdown", "b_breakdown", "c_breakdown"]
+             derived = {}
+             for letter, bk in zip(slot_letters, slot_breakdowns):
+                 arm = mapping.get(letter)
+                 if arm is not None and bk in j:
+                     derived[arm] = j[bk]
+             if "solo_claude" not in derived or "bare" not in derived:
+                 raise SystemExit(
+                     f"[cross-judge] {label} judge.json missing breakdowns for solo_claude/bare; "
+                     "expected either `breakdowns_by_arm` or `a/b/c_breakdown` + `_blind_mapping`"
+                 )
+             l1 = derived["solo_claude"]; l0 = derived["bare"]
+         return {a: (l1.get(a, 0) - l0.get(a, 0)) for a in axis_keys}
+
+     g_axes = axis_l1_l0(g, f"gpt {fdir.name}")
+     o_axes = axis_l1_l0(o, f"opus {fdir.name}")
+     axis_disagreement = {a: o_axes[a] - g_axes[a] for a in axis_keys}
+
+     g_margins = (g.get("margins") or {})
+     o_margins = (o.get("margins") or {})
+     g_l1_l0 = g_margins.get("solo_over_bare")
+     o_l1_l0 = o_margins.get("solo_over_bare")
+     g_v_l0 = g_margins.get("variant_over_bare")
+     o_v_l0 = o_margins.get("variant_over_bare")
+     margin_l1_l0_diff = (
+         abs(g_l1_l0 - o_l1_l0) if g_l1_l0 is not None and o_l1_l0 is not None else None
+     )
+     margin_v_l0_diff = (
+         abs(g_v_l0 - o_v_l0) if g_v_l0 is not None and o_v_l0 is not None else None
+     )
+
+     rows.append({
+         "fixture": fdir.name,
+         "gpt_scores": g.get("scores_by_arm") or {},
+         "opus_scores": o.get("scores_by_arm") or {},
+         "gpt_margin_l1_l0": g_l1_l0,
+         "opus_margin_l1_l0": o_l1_l0,
+         "margin_l1_l0_diff": margin_l1_l0_diff,
+         "gpt_margin_v_l0": g_v_l0,
+         "opus_margin_v_l0": o_v_l0,
+         "margin_v_l0_diff": margin_v_l0_diff,
+         "gpt_axis_l1_l0": g_axes,
+         "opus_axis_l1_l0": o_axes,
+         "axis_disagreement": axis_disagreement,
+         "winner_agree": g.get("winner_arm") == o.get("winner_arm"),
+         "gpt_winner": g.get("winner_arm"),
+         "opus_winner": o.get("winner_arm"),
+     })
+
+ if not rows:
+     print("[cross-judge] no paired judgements found")
+     sys.exit(0)
+
+ n = len(rows)
+
+ # Suite-level per-axis L1-L0 sum (both judges) and disagreement.
+ g_axis_sum = {a: sum(r["gpt_axis_l1_l0"][a] for r in rows) for a in axis_keys}
+ o_axis_sum = {a: sum(r["opus_axis_l1_l0"][a] for r in rows) for a in axis_keys}
+ axis_sum_disagreement = {a: o_axis_sum[a] - g_axis_sum[a] for a in axis_keys}
+ max_abs_axis_disagreement = max(abs(v) for v in axis_sum_disagreement.values())
+
+ # Per-axis falsification rule (Codex R0 Q1): any axis-sum disagreement >2 → single-judge artifact.
+ THRESHOLD = 2
+ falsified_by_axis = max_abs_axis_disagreement > THRESHOLD
+ flipped_axes = [a for a, v in axis_sum_disagreement.items() if abs(v) > THRESHOLD]
+
+ # Suite avg L1-L0 (both judges) — Codex R1 #3: divide by valid-count, report denom.
+ gpt_l1_l0_valid = [r["gpt_margin_l1_l0"] for r in rows if r["gpt_margin_l1_l0"] is not None]
+ opus_l1_l0_valid = [r["opus_margin_l1_l0"] for r in rows if r["opus_margin_l1_l0"] is not None]
+ gpt_l1_l0_avg = (sum(gpt_l1_l0_valid) / len(gpt_l1_l0_valid)) if gpt_l1_l0_valid else None
+ opus_l1_l0_avg = (sum(opus_l1_l0_valid) / len(opus_l1_l0_valid)) if opus_l1_l0_valid else None
+ suite_avg_diff = (
+     abs(gpt_l1_l0_avg - opus_l1_l0_avg)
+     if gpt_l1_l0_avg is not None and opus_l1_l0_avg is not None else None
+ )
+
+ # Sign agreement — Codex R1 #4: 3-way sign (-, 0, +) so tie vs positive don't agree.
+ def sign3(v):
+     if v is None:
+         return None
+     return 1 if v > 0 else (-1 if v < 0 else 0)
+
+ sign_agree_v_l0 = sum(
+     1 for r in rows
+     if r["gpt_margin_v_l0"] is not None and r["opus_margin_v_l0"] is not None
+     and sign3(r["gpt_margin_v_l0"]) == sign3(r["opus_margin_v_l0"])
+ )
+ sign_agree_l1_l0 = sum(
+     1 for r in rows
+     if r["gpt_margin_l1_l0"] is not None and r["opus_margin_l1_l0"] is not None
+     and sign3(r["gpt_margin_l1_l0"]) == sign3(r["opus_margin_l1_l0"])
+ )
+ sign_valid_l1_l0 = sum(
+     1 for r in rows
+     if r["gpt_margin_l1_l0"] is not None and r["opus_margin_l1_l0"] is not None
+ )
+ sign_valid_v_l0 = sum(
+     1 for r in rows
+     if r["gpt_margin_v_l0"] is not None and r["opus_margin_v_l0"] is not None
+ )
+ winner_agree = sum(1 for r in rows if r["winner_agree"])
+
+ l1_l0_diffs = [r["margin_l1_l0_diff"] for r in rows if r["margin_l1_l0_diff"] is not None]
+ mean_abs_l1_l0_diff = (sum(l1_l0_diffs) / len(l1_l0_diffs)) if l1_l0_diffs else None
+
+ summary = {
+     "n_fixtures": n,
+     "threshold": THRESHOLD,
+     "falsified_by_axis_disagreement": falsified_by_axis,
+     "flipped_axes": flipped_axes,
+     "axis_sum_l1_l0": {
+         "gpt": g_axis_sum,
+         "opus": o_axis_sum,
+         "disagreement": axis_sum_disagreement,
+         "max_abs_disagreement": max_abs_axis_disagreement,
+     },
+     "suite_avg_l1_l0": {
+         "gpt": gpt_l1_l0_avg,
+         "opus": opus_l1_l0_avg,
+         "abs_diff": suite_avg_diff,
+         "gpt_valid_count": len(gpt_l1_l0_valid),
+         "opus_valid_count": len(opus_l1_l0_valid),
+     },
+     "winner_agree_count": winner_agree,
+     "sign_agree_l1_l0": sign_agree_l1_l0,
+     "sign_valid_count_l1_l0": sign_valid_l1_l0,
+     "sign_agree_variant_over_bare": sign_agree_v_l0,
+     "sign_valid_count_variant_over_bare": sign_valid_v_l0,
+     "mean_abs_l1_l0_margin_diff": mean_abs_l1_l0_diff,
+     "mean_abs_l1_l0_valid_count": len(l1_l0_diffs),
+     "rows": rows,
+ }
+ out = res_root / "cross-judge-summary.json"
+ out.write_text(json.dumps(summary, indent=2))
+
+ print(
+     f"[cross-judge] n={n} "
+     f"falsified={falsified_by_axis} flipped_axes={flipped_axes} "
+     f"max_axis_disagreement={max_abs_axis_disagreement} "
+     f"gpt_l1_l0_avg={gpt_l1_l0_avg:.2f} opus_l1_l0_avg={opus_l1_l0_avg:.2f} "
+     f"suite_avg_diff={suite_avg_diff:.2f}"
+ )
+ print(f"[cross-judge] axis_sum_l1_l0: gpt={g_axis_sum} opus={o_axis_sum} disagree={axis_sum_disagreement}")
+ print(f"[cross-judge] wrote {out}")
+ PY
+
+ # Hard-fail summary if not all 9 fixtures produced paired judgements.
+ EXPECTED_FIXTURES=$(ls -d "$RES_ROOT"/F*/ 2>/dev/null | wc -l | awk '{print $1}')
+ PAIRED=$(find "$RES_ROOT" -maxdepth 2 -name 'judge-opus.json' | wc -l | awk '{print $1}')
+ echo "[opus-judge] expected_fixtures=$EXPECTED_FIXTURES paired=$PAIRED"
+ if [ "$PAIRED" -ne "$EXPECTED_FIXTURES" ]; then
+   echo "[opus-judge] ✗ paired ($PAIRED) != expected ($EXPECTED_FIXTURES) — sidecar incomplete; do not interpret cross-judge until all fixtures judged."
+   exit 2
+ fi
+ echo "[opus-judge] done: processed=$processed skipped=$skipped failed=$failed"