devlyn-cli 1.15.0 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (158) hide show
  1. package/AGENTS.md +104 -0
  2. package/CLAUDE.md +135 -21
  3. package/README.md +43 -125
  4. package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
  5. package/benchmark/auto-resolve/README.md +114 -0
  6. package/benchmark/auto-resolve/RUBRIC.md +162 -0
  7. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
  8. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
  9. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
  10. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
  11. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
  12. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
  13. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
  14. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
  15. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
  16. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
  17. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
  18. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
  19. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
  20. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
  21. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
  22. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
  23. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
  24. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
  25. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
  26. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
  27. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
  28. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
  29. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
  30. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
  31. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
  32. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
  33. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
  34. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
  35. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
  36. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
  37. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
  38. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
  39. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
  40. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
  41. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
  42. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
  43. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
  45. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
  46. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
  47. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
  48. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
  49. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
  50. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
  51. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
  52. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
  53. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
  54. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
  55. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
  56. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
  57. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
  58. package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
  59. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
  60. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
  61. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
  62. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
  63. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
  64. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
  65. package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
  66. package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
  67. package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
  68. package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
  69. package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
  70. package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
  71. package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
  72. package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
  73. package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
  74. package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
  75. package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
  76. package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
  77. package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
  78. package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
  79. package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
  80. package/benchmark/auto-resolve/scripts/judge.sh +359 -0
  81. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
  82. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
  83. package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
  84. package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
  85. package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
  86. package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
  87. package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
  88. package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
  89. package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
  90. package/bin/devlyn.js +175 -17
  91. package/config/skills/_shared/adapters/README.md +64 -0
  92. package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
  93. package/config/skills/_shared/adapters/opus-4-7.md +29 -0
  94. package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
  95. package/config/skills/_shared/codex-config.md +54 -0
  96. package/config/skills/_shared/codex-monitored.sh +141 -0
  97. package/config/skills/_shared/engine-preflight.md +35 -0
  98. package/config/skills/_shared/expected.schema.json +93 -0
  99. package/config/skills/_shared/pair-plan-schema.md +298 -0
  100. package/config/skills/_shared/runtime-principles.md +110 -0
  101. package/config/skills/_shared/spec-verify-check.py +519 -0
  102. package/config/skills/devlyn:ideate/SKILL.md +99 -429
  103. package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
  104. package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
  105. package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
  106. package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
  107. package/config/skills/devlyn:resolve/SKILL.md +172 -184
  108. package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
  109. package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
  110. package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
  111. package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
  112. package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
  113. package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
  114. package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
  115. package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
  116. package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
  117. package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
  118. package/package.json +12 -2
  119. package/scripts/lint-skills.sh +431 -0
  120. package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
  121. package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
  122. package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
  123. package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
  124. package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
  125. package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
  126. package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
  127. package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
  128. package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
  129. package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
  130. package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
  131. package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
  132. package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
  133. package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
  134. package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
  135. package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
  136. package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
  137. package/config/skills/devlyn:clean/SKILL.md +0 -285
  138. package/config/skills/devlyn:design-ui/SKILL.md +0 -351
  139. package/config/skills/devlyn:discover-product/SKILL.md +0 -124
  140. package/config/skills/devlyn:evaluate/SKILL.md +0 -564
  141. package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
  142. package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
  143. package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
  144. package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
  145. package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
  146. package/config/skills/devlyn:preflight/SKILL.md +0 -355
  147. package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
  148. package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
  149. package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
  150. package/config/skills/devlyn:product-spec/SKILL.md +0 -603
  151. package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
  152. package/config/skills/devlyn:review/SKILL.md +0 -161
  153. package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
  154. package/config/skills/devlyn:team-review/SKILL.md +0 -493
  155. package/config/skills/devlyn:update-docs/SKILL.md +0 -463
  156. package/config/skills/workflow-routing/SKILL.md +0 -73
  157. /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
  158. /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
@@ -0,0 +1,359 @@
1
+ #!/usr/bin/env bash
2
+ # judge.sh — Codex (CLI's current flagship, inherited) blind judge for ONE fixture.
3
+ #
4
+ # Usage:
5
+ # judge.sh --fixture <FID> --run-id <ID>
6
+ #
7
+ # Reads:
8
+ # results/<run-id>/<fixture>/variant/diff.patch + verify.json
9
+ # results/<run-id>/<fixture>/bare/diff.patch + verify.json
10
+ # fixtures/<fixture>/spec.md + expected.json + NOTES.md
11
+ # RUBRIC.md (stable rubric)
12
+ #
13
+ # Writes:
14
+ # results/<run-id>/<fixture>/judge.json
15
+ #
16
+ # Blind: A/B assignment randomized per fixture, seed stored in judge.json.
17
+
18
+ set -euo pipefail
19
+
20
# Print the invocation synopsis and abort with a non-zero status.
usage() { echo "usage: $0 --fixture <FID> --run-id <ID>"; exit 1; }

# Parse the two required flags. Each flag consumes the next argument as its
# value. Under `set -u`, a flag passed as the very last argument would abort
# with an opaque "unbound variable" error when "$2" is read, so the value's
# presence is checked first and the usage message is shown instead.
FIXTURE=""; RUN_ID=""
while [ $# -gt 0 ]; do
  case "$1" in
    --fixture) [ $# -ge 2 ] || usage; FIXTURE="$2"; shift 2;;
    --run-id)  [ $# -ge 2 ] || usage; RUN_ID="$2"; shift 2;;
    *) usage;;
  esac
done
# Both flags are mandatory and must be non-empty.
[ -n "$FIXTURE" ] && [ -n "$RUN_ID" ] || usage
30
+
31
+ BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
32
+ FIX_DIR="$BENCH_ROOT/fixtures/$FIXTURE"
33
+ RES_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE"
34
+
35
+ # iter-0019: 3 arms — variant (L2), solo_claude (L1), bare (L0). The judge
36
+ # scores all three in a single pass with the same prompt + same model so
37
+ # margin derivations (L2-vs-L0, L1-vs-L0, L2-vs-L1) are calibrated against
38
+ # each other and not against separate judge calls. ARMS_PRESENT enumerates
39
+ # whichever subset actually has artifacts (so a missing arm doesn't abort
40
+ # the whole judge step). Two-arm judge mode is preserved for runs that pre-
41
+ # date iter-0019.
42
+ ARMS_PRESENT=()
43
+ # iter-0033c: l2_gated/l2_forced added for NEW L2 vs NEW L1 measurement.
44
+ # Slot count is still A/B/C max 3 — pair-eligible iter-0033c fixtures supply
45
+ # {solo_claude, l2_gated, l2_forced}; non-pair-eligible fixtures supply
46
+ # {solo_claude, l2_gated}. The blind-shuffle slot mapping below already
47
+ # tolerates arbitrary ARMS_PRESENT counts ≥2.
48
+ for arm in variant solo_claude bare l2_gated l2_forced; do
49
+ if [ -f "$RES_DIR/$arm/diff.patch" ] && [ -f "$RES_DIR/$arm/verify.json" ]; then
50
+ ARMS_PRESENT+=("$arm")
51
+ fi
52
+ done
53
+ if [ ${#ARMS_PRESENT[@]} -lt 2 ]; then
54
+ echo "judge needs at least 2 arms with diff.patch + verify.json; have: ${ARMS_PRESENT[*]:-(none)}"
55
+ exit 1
56
+ fi
57
+ for f in "$FIX_DIR/spec.md" "$FIX_DIR/expected.json" "$BENCH_ROOT/RUBRIC.md"; do
58
+ [ -f "$f" ] || { echo "missing required input: $f"; exit 1; }
59
+ done
60
+
61
+ # Blind randomization: shuffle ARMS_PRESENT into ABC order. Seed recorded
62
+ # in judge.json so runs are reproducible if rejudged.
63
SEED=$RANDOM
# iter-0019.4: Bash 3.2 compatible (macOS /bin/bash). `mapfile` is Bash 4+
# only; replaced with while-read loop. The `|| [ -n "$line" ]` guard
# preserves exact `mapfile -t` behavior on a final unterminated line (Python
# print() emits trailing \n so this guard is belt-and-suspenders here, but
# matches mapfile semantics for future producers).
SLOTS=()
while IFS= read -r line || [ -n "$line" ]; do
  SLOTS+=("$line")
done < <(python3 - "$SEED" "${ARMS_PRESENT[@]}" <<'PY'
import sys, random
seed = int(sys.argv[1]); arms = sys.argv[2:]
random.seed(seed)
random.shuffle(arms)
print("\n".join(arms))
PY
)
# Only three blind slots (A/B/C) are consumed below, while the arm scan can
# find up to five arms. Which arms fall outside the first three slots depends
# on the seed, so say so explicitly instead of dropping them silently.
if [ ${#SLOTS[@]} -gt 3 ]; then
  echo "[judge.sh] WARNING: ${#SLOTS[@]} arms present but only 3 blind slots (A/B/C); arms shuffled past the third slot are not judged" >&2
fi
# Slot -> arm assignment; C_ARM stays empty in two-arm mode.
A_ARM="${SLOTS[0]:-}"
B_ARM="${SLOTS[1]:-}"
C_ARM="${SLOTS[2]:-}"
83
+
84
+ PROMPT_FILE="$RES_DIR/judge-prompt.txt"
85
+ A_DIFF="$RES_DIR/$A_ARM/diff.patch"
86
+ A_VERIFY="$RES_DIR/$A_ARM/verify.json"
87
+ B_DIFF="$RES_DIR/$B_ARM/diff.patch"
88
+ B_VERIFY="$RES_DIR/$B_ARM/verify.json"
89
+ if [ -n "$C_ARM" ]; then
90
+ C_DIFF="$RES_DIR/$C_ARM/diff.patch"
91
+ C_VERIFY="$RES_DIR/$C_ARM/verify.json"
92
+ else
93
+ C_DIFF=""
94
+ C_VERIFY=""
95
+ fi
96
+
97
+ # Sanitize diffs so stylistic tells that correlate with variant (e.g.
98
+ # pipeline-commit markers, .devlyn/ archive lines) don't leak to the judge.
99
+ # Judge sees only file-content changes; the transcript, arm label, NOTES.md,
100
+ # and all process artifacts stay out of the prompt.
101
+ python3 - "$PROMPT_FILE" "$FIX_DIR/spec.md" "$FIX_DIR/expected.json" "$BENCH_ROOT/RUBRIC.md" "$A_DIFF" "$B_DIFF" "$A_VERIFY" "$B_VERIFY" "$C_DIFF" "$C_VERIFY" <<'PY'
102
+ import sys, pathlib, re, json
103
+ args = sys.argv[1:]
104
+ out_p, spec_p, exp_p, rubric_p = map(pathlib.Path, args[:4])
105
+ a_diff, b_diff, a_ver, b_ver = map(pathlib.Path, args[4:8])
106
+ c_diff_arg, c_ver_arg = args[8], args[9]
107
+ c_diff = pathlib.Path(c_diff_arg) if c_diff_arg else None
108
+ c_ver = pathlib.Path(c_ver_arg) if c_ver_arg else None
109
+ out = out_p
110
+ spec = spec_p.read_text()
111
+ expected = exp_p.read_text()
112
+ rubric = rubric_p.read_text()
113
+
114
+ # Strip pipeline-origin tells from the diff before the judge sees it.
115
# Line-anchored patterns for text that would reveal which pipeline produced
# a diff; every match is blanked out by sanitize().
TELLS = [
    re.compile(pattern, re.M)
    for pattern in (
        r"^diff --git.*\.devlyn/.*$",
        r"^chore\(pipeline\):.*$",
        r"^\.devlyn/.*$",
        r"^Co-Authored-By:.*$",
    )
]


def sanitize(diff: str) -> str:
    """Return *diff* with pipeline-identifying content removed.

    Two passes are applied:

    1. Every per-file section whose ``diff --git`` header mentions
       ``.devlyn/`` is dropped in full (header and all following lines up to
       the next ``diff --git`` header).
    2. Any remaining line matching one of the ``TELLS`` patterns is replaced
       with an empty string; its line terminator is left in place.
    """
    kept = []
    dropping = False
    for raw in diff.splitlines(keepends=True):
        if raw.startswith("diff --git "):
            # Each file header decides the fate of the section it opens.
            dropping = ".devlyn/" in raw
        if not dropping:
            kept.append(raw)

    cleaned = "".join(kept)
    for tell in TELLS:
        cleaned = tell.sub("", cleaned)
    return cleaned
137
+
138
+ # Also strip arm-identifying fields from verify.json before passing to judge.
139
def sanitize_verify(path: pathlib.Path) -> str:
    """Load the verify.json at *path* and return it without the arm label.

    The top-level ``"arm"`` key, if present, is removed so the judge cannot
    read the arm name from the verification payload. The result is
    re-serialized as JSON with a two-space indent.
    """
    with path.open() as handle:
        record = json.load(handle)
    # Remove the only field that names the arm; absent keys are ignored.
    record.pop("arm", None)
    return json.dumps(record, indent=2)
144
+
145
+ a_diff_text = sanitize(a_diff.read_text())
146
+ b_diff_text = sanitize(b_diff.read_text())
147
+ a_ver_text = sanitize_verify(a_ver)
148
+ b_ver_text = sanitize_verify(b_ver)
149
+ have_c = c_diff is not None
150
+ if have_c:
151
+ c_diff_text = sanitize(c_diff.read_text())
152
+ c_ver_text = sanitize_verify(c_ver)
153
+
154
+ n_arms = 3 if have_c else 2
155
+ arms_phrase = "Three engineers" if have_c else "Two engineers"
156
+ slot_keys = ["a_score", "b_score", "c_score"][:n_arms]
157
+ slot_breakdowns = ["a_breakdown", "b_breakdown", "c_breakdown"][:n_arms]
158
+ slot_letters = ["A", "B", "C"][:n_arms]
159
+
160
+ # Build the JSON-format hint dynamically so the judge sees the right shape
161
+ # for either 2 or 3 arms. Same scoring rules; same rubric.
162
+ score_lines = ",\n ".join(f'"{k}": <int 0-100>' for k in slot_keys)
163
+ breakdown_lines = ",\n ".join(
164
+ f'"{b}": {{"spec": 0-25, "constraint": 0-25, "scope": 0-25, "quality": 0-25, "notes": "<3-5 bullets>"}}'
165
+ for b in slot_breakdowns
166
+ )
167
+ findings_keys = ", ".join(f'"{l}": ["..."]' for l in slot_letters)
168
+ dq_keys = ", ".join(f'"{l}": bool' for l in slot_letters)
169
+ dq_reasons = ", ".join(f'"{l}_reason": "..."' for l in slot_letters)
170
+ winner_choices = " | ".join(f'"{l}"' for l in slot_letters) + ' | "tie"'
171
+
172
+ # Per-arm sections of the prompt
173
def section(label: str, diff_text: str, verify_text: str) -> str:
    """Render the prompt section for one blind implementation slot.

    *label* is the slot letter shown to the judge; *diff_text* and
    *verify_text* are embedded verbatim inside fenced ``diff`` and ``json``
    blocks respectively.
    """
    heading = f"=== IMPLEMENTATION {label} ===\nDiff:\n"
    diff_block = f"```diff\n{diff_text}\n```\n"
    verify_block = f"Verification results:\n```json\n{verify_text}\n```\n"
    return heading + diff_block + verify_block
179
+
180
+ impl_sections = section("A", a_diff_text, a_ver_text) + "\n" + section("B", b_diff_text, b_ver_text)
181
+ if have_c:
182
+ impl_sections += "\n" + section("C", c_diff_text, c_ver_text)
183
+
184
+ prompt = f"""You are a blind code-review judge. {arms_phrase} implemented the same spec. You do NOT know which implementation came from which process — grade them only on the merits of the code and its behavior.
185
+
186
+ Apply the 4-axis rubric from RUBRIC.md below. Each axis is 0-25, total 100. Score every implementation independently — do not let one arm's score anchor another's. The judge's job is to apply the rubric absolutely; relative ordering falls out from the absolute scores.
187
+
188
+ Return STRICT JSON only — no prose outside the JSON. Format:
189
+
190
+ {{
191
+ {score_lines},
192
+ "winner": {winner_choices},
193
+ {breakdown_lines},
194
+ "critical_findings": {{{findings_keys}}},
195
+ "disqualifiers": {{{dq_keys}, {dq_reasons}}},
196
+ "overall_reasoning": "<5-8 sentences>"
197
+ }}
198
+
199
+ === RUBRIC ===
200
+ {rubric}
201
+
202
+ === SPEC ===
203
+ {spec}
204
+
205
+ === EXPECTED (machine-readable acceptance) ===
206
+ {expected}
207
+
208
+ {impl_sections}
209
+ Return the JSON and nothing else.
210
+ """
211
+ out.write_text(prompt)
212
+ PY
213
+
214
+ # Invoke Codex — no -m so CLI flagship is inherited. Model identity is
215
+ # recorded from the codex config.toml so rejudging with a newer flagship is
216
+ # traceable. Run from a clean temp CWD so the judge can't peek at project
217
+ # files that would leak arm identity.
218
# The judge cannot run without the codex CLI.
command -v codex >/dev/null 2>&1 || { echo "codex CLI not on PATH; cannot judge"; exit 1; }
CODEX_CLI_VER=$(codex --version 2>/dev/null || echo "codex-cli unknown")
# Record the model configured in codex's config.toml (first `model = ...`
# line) for traceability.
#  - `|| true`: under `set -euo pipefail` a missing config file or a file
#    with no `model` line makes grep exit non-zero, which would fail the
#    whole assignment and abort the script before the fallback below runs.
#  - `[[:space:]]` replaces `\s`, which is a GNU extension and not part of
#    POSIX EREs (this script is meant to run on macOS as well).
JUDGE_MODEL=$(grep -E '^model[[:space:]]*=' "${HOME}/.codex/config.toml" 2>/dev/null | head -1 | sed -E 's/.*=[[:space:]]*"?([^"]+)"?.*/\1/' || true)
[ -z "$JUDGE_MODEL" ] && JUDGE_MODEL="(unknown — codex config.toml not readable)"
222
+
223
# Scratch working directory for the judge. `mktemp -d` yields a fresh,
# uniquely named directory, so concurrent judges cannot collide and no
# path assembled from --run-id / --fixture is ever passed to `rm -rf`.
JUDGE_CWD="$(mktemp -d "${TMPDIR:-/tmp}/judge-XXXXXX")"

JUDGE_OUT="$RES_DIR/judge-output.txt"
# Capture the exit status manually so the scratch directory is always
# removed before deciding whether to fail.
set +e
# The prompt is fed on stdin via redirection rather than `cat |`: with
# `pipefail`, a SIGPIPE'd `cat` would otherwise be reported as a judge failure.
(cd "$JUDGE_CWD" && codex exec -s read-only --skip-git-repo-check -c model_reasoning_effort=xhigh - ) < "$PROMPT_FILE" > "$JUDGE_OUT" 2>&1
JUDGE_EXIT=$?
set -e
rm -rf "$JUDGE_CWD"
if [ $JUDGE_EXIT -ne 0 ]; then
  echo "codex exec failed (exit $JUDGE_EXIT); see $JUDGE_OUT"
  exit 1
fi
237
+
238
+ # Extract JSON (codex wraps with banners; pick the last {...} block)
239
+ python3 - "$JUDGE_OUT" "$RES_DIR/judge.json" "$A_ARM" "$B_ARM" "$C_ARM" "$SEED" "$CODEX_CLI_VER" "$JUDGE_MODEL" <<'PY'
240
+ import sys, re, json, pathlib
241
+ out = pathlib.Path(sys.argv[1]).read_text()
242
+ target = pathlib.Path(sys.argv[2])
243
+ a_arm, b_arm, c_arm, seed, codex_ver, judge_model = sys.argv[3:9]
244
+
245
+ # Extract the last valid judgment JSON. A naive brace-counter breaks on
246
+ # `{`/`}` that appear inside strings (e.g. JS source embedded in the arms'
247
+ # diffs), so use json.JSONDecoder.raw_decode starting at each `{` position
248
+ # and keep the last successful parse with the required keys.
249
+ decoder = json.JSONDecoder()
250
+ brace_positions = [i for i, c in enumerate(out) if c == '{']
251
+ chosen = None
252
+ for pos in reversed(brace_positions):
253
+ try:
254
+ obj, _ = decoder.raw_decode(out[pos:])
255
+ except json.JSONDecodeError:
256
+ continue
257
+ if isinstance(obj, dict) and "a_score" in obj and "b_score" in obj:
258
+ chosen = obj
259
+ break
260
+ if chosen is None:
261
+ raise SystemExit(f"no valid JSON in judge output; see {sys.argv[1]}")
262
+
263
+ # Decode blind labels — record full mapping so summary code can iterate
264
+ mapping = {"A": a_arm, "B": b_arm}
265
+ if c_arm:
266
+ mapping["C"] = c_arm
267
+ chosen["_blind_mapping"] = {**mapping, "seed": int(seed)}
268
+ chosen["_judge_cli"] = codex_ver.strip()
269
+ chosen["_judge_model"] = judge_model.strip()
270
+
271
+ # iter-0023 — axis breakdown validation. Rubric axes are 0-25 (RUBRIC.md
272
+ # "Scoring — 4 axes, 25 points each"). Past runs (iter-0020 F9) recorded
273
+ # `quality: -1` because judge LLM occasionally emits sentinel/negative
274
+ # values; ship-gate then averaged invalid cells. Detect, clamp to [0, 25],
275
+ # and record the invalid cells under `_axis_validation` so downstream
276
+ # consumers can refuse to trust that fixture's margin.
277
import math

# Validate every axis cell of every breakdown the judge returned. A cell is
# valid only if it is a finite number within [0, 25]; anything else is
# recorded and replaced by a clamped integer.
AXIS_KEYS = ("spec", "constraint", "scope", "quality")
BREAKDOWN_KEYS = ("a_breakdown", "b_breakdown", "c_breakdown")
axis_invalid_cells = []
for bk in BREAKDOWN_KEYS:
    breakdown = chosen.get(bk)
    if not isinstance(breakdown, dict):
        continue
    for axis in AXIS_KEYS:
        if axis not in breakdown:
            continue
        v = breakdown[axis]
        is_number = isinstance(v, (int, float))
        # json accepts the NaN / Infinity literals. NaN fails both `< 0` and
        # `> 25`, so a plain range check would accept it, and int(inf)
        # raises OverflowError. Only floats are probed so that arbitrarily
        # large ints are never converted to float.
        is_finite = is_number and (not isinstance(v, float) or math.isfinite(v))
        if is_finite and 0 <= v <= 25:
            continue
        if is_finite:
            clamped = max(0, min(25, int(v)))
        elif is_number and v > 0:
            clamped = 25  # +Infinity
        else:
            clamped = 0  # NaN, -Infinity, or a non-numeric value
        axis_invalid_cells.append({
            "breakdown": bk,
            "axis": axis,
            # Non-finite floats are stored as text so judge.json stays
            # strict JSON.
            "value": repr(v) if (is_number and not is_finite) else v,
        })
        breakdown[axis] = clamped
chosen["_axis_validation"] = {
    "out_of_range_count": len(axis_invalid_cells),
    "out_of_range_cells": axis_invalid_cells,
    "axis_range": [0, 25],
}
if axis_invalid_cells:
    sys.stderr.write(
        f"[judge.sh] WARNING: {len(axis_invalid_cells)} axis cell(s) out of [0,25] "
        f"clamped: {axis_invalid_cells}\n"
    )
300
+
301
+ # scores_by_arm: arm-name → score, computed from the blind A/B/C scores.
302
+ # This is the canonical 3-arm-aware shape the report consumer reads. The
303
+ # legacy variant_score / bare_score / margin fields below are derived from
304
+ # scores_by_arm for backward compatibility with pre-iter-0019 callers.
305
+ scores_by_arm = {}
306
+ slot_keys = ["a_score", "b_score", "c_score"]
307
+ slot_letters = ["A", "B", "C"]
308
+ for letter, key in zip(slot_letters, slot_keys):
309
+ arm = mapping.get(letter)
310
+ if arm is not None and key in chosen:
311
+ scores_by_arm[arm] = chosen[key]
312
+ chosen["scores_by_arm"] = scores_by_arm
313
+
314
+ # Per-letter critical_findings / disqualifiers also rotated to per-arm.
315
+ findings_letters = chosen.get("critical_findings", {}) or {}
316
+ findings_by_arm = {mapping[l]: findings_letters.get(l, []) for l in slot_letters if l in mapping}
317
+ chosen["findings_by_arm"] = findings_by_arm
318
+
319
+ dq_letters = chosen.get("disqualifiers", {}) or {}
320
+ dq_by_arm = {}
321
+ for l in slot_letters:
322
+ if l not in mapping:
323
+ continue
324
+ arm = mapping[l]
325
+ dq_by_arm[arm] = {
326
+ "disqualifier": bool(dq_letters.get(l, False)),
327
+ "reason": str(dq_letters.get(f"{l}_reason", "") or ""),
328
+ }
329
+ chosen["disqualifiers_by_arm"] = dq_by_arm
330
+
331
+ # Pairwise margins (positive = first arm beat second).
332
def margin(left: str, right: str):
    """Return ``scores_by_arm[left] - scores_by_arm[right]``.

    Yields ``None`` when either arm has no recorded score, so callers can
    distinguish "arm not judged" from a zero margin.
    """
    if left not in scores_by_arm or right not in scores_by_arm:
        return None
    left_score = scores_by_arm[left]
    right_score = scores_by_arm[right]
    return left_score - right_score
336
+
337
+ chosen["margins"] = {
338
+ "variant_over_bare": margin("variant", "bare"),
339
+ "solo_over_bare": margin("solo_claude", "bare"),
340
+ "variant_over_solo": margin("variant", "solo_claude"),
341
+ }
342
+
343
+ # Translate winner letter to arm
344
+ w = chosen.get("winner")
345
+ chosen["winner_arm"] = mapping.get(w, "tie") if w in mapping else "tie"
346
+
347
+ # Legacy 2-arm fields preserved so older summary code still parses. When
348
+ # solo_claude is present, variant/bare margin is derived from scores_by_arm.
349
+ chosen["variant_score"] = scores_by_arm.get("variant")
350
+ chosen["bare_score"] = scores_by_arm.get("bare")
351
+ if chosen.get("variant_score") is not None and chosen.get("bare_score") is not None:
352
+ chosen["margin"] = chosen["variant_score"] - chosen["bare_score"]
353
+
354
+ target.write_text(json.dumps(chosen, indent=2))
355
# One-line summary: per-arm scores followed by the pairwise margins that
# could be computed.
parts = [f"{arm}={s}" for arm, s in scores_by_arm.items()]
mline = chosen.get("margins") or {}
# Margins are differences of judge-supplied scores and become floats when the
# judge returns a fractional score. The `d` presentation type raises
# ValueError for floats, so it is used for ints only and `g` otherwise.
mparts = [
    f"{k}={v:+d}" if isinstance(v, int) else f"{k}={v:+g}"
    for k, v in mline.items()
    if v is not None
]
print("[judge] " + " ".join(parts) + (" | " + " ".join(mparts) if mparts else ""))
359
+ PY
@@ -0,0 +1,260 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ oracle-scope-tier-a.py — deterministic detector for categorical file-path
4
+ violations. Flags touches of paths that are never legitimately needed by an
5
+ implementation task (planning docs, CI config, dep-install output, runtime
6
+ artifacts, env/secret files) and lockfile deletions.
7
+
8
+ Complementary to oracle-test-fidelity.py, which handles weakening INSIDE
9
+ existing test files. This oracle only cares about WHICH files are touched.
10
+
11
+ Path matching uses fnmatch with normalized, repo-root-relative paths.
12
+ `docs/**` matches `docs/<anything>` but NOT `server/docs/readme.md` — the
13
+ anchoring is left-only, which is what we want. Per-oracle convention
14
+ documented here; step 1's content oracle uses regex instead.
15
+
16
+ Fixtures can waive any Tier A pattern via `expected.json::tier_a_waivers`
17
+ (list of fnmatch globs). Load-bearing case: F9 e2e-ideate-to-preflight
18
+ legitimately creates docs/VISION.md, docs/ROADMAP.md, docs/roadmap/**.
19
+
20
+ Step 2 scope: findings only. Scoring integration is a later step.
21
+ """
22
+ import argparse
23
+ import fnmatch
24
+ import json
25
+ import os
26
+ import pathlib
27
+ import subprocess
28
+ import sys
29
+
30
# Oracle identifier; emitted in the `oracle` field of the `--list-categories`
# output.
ORACLE_NAME = "scope-tier-a"

# iter-0022: stable category enumeration. See header comment in
# oracle-test-fidelity.py for the edit-discipline rules. tier-a-violation is
# ONE umbrella category covering the 5 path-glob groups (planning-doc,
# ci-config, node-modules, test-results-or-coverage, env-secrets) plus the 2
# basename rules (.log suffix, .env/secrets. prefix); splitting into 7 sub-
# categories was rejected during iter-0022 R0 because the oracle emits a
# single finding-row per touched path regardless.
# NOTE(review): in this file the env/secret rule exists only as a basename
# prefix (TIER_A_BASENAME_PREFIXES); TIER_A_PATH_GLOBS holds four comment
# groups. The "5 + 2" count above may double-count env-secrets — confirm.
# Each entry's `id` suffix equals a finding `type` emitted by analyze()
# ("lockfile-deletion", "tier-a-violation").
CATEGORIES = [
    {
        "id": "scope-tier-a:lockfile-deletion",
        "severity": "hard",
        "applies_when": "scaffold contains a lockfile (package-lock.json / yarn.lock / pnpm-lock.yaml / bun.lock / bun.lockb)",
        "operational_check": "variant arm MUST NOT delete a scaffold-present lockfile",
        "evidence_source_files": ["oracle-scope-tier-a.py"],
    },
    {
        "id": "scope-tier-a:tier-a-violation",
        "severity": "hard",
        "applies_when": "any fixture (the protected paths exist conceptually for every JS/TS repo)",
        "operational_check": "variant arm MUST NOT add or modify paths matching: docs/roadmap/** | docs/VISION.md | docs/ROADMAP.md | .github/** | node_modules/** | **/node_modules/** | test-results/** | coverage/** | .nyc_output/** | basename suffix .log | basename prefix .env or secrets.",
        "evidence_source_files": ["oracle-scope-tier-a.py"],
    },
]
55
+
56
# Path globs — fnmatch-style, left-anchored. `**` is treated as `*` by
# fnmatch (no special recursive semantics), so `docs/roadmap/**` matches
# `docs/roadmap/anything/nested/here` because `*` matches `/` in fnmatch.
# Each pattern is tested against the full path string reported by git diff;
# the first pattern that matches is recorded as the finding's
# `matched_pattern`.
TIER_A_PATH_GLOBS = [
    # Planning docs — implementation tasks should never edit these.
    "docs/roadmap/**",
    "docs/VISION.md",
    "docs/ROADMAP.md",
    # CI / repo-wide config.
    ".github/**",
    # Install outputs.
    "node_modules/**",
    "**/node_modules/**",
    # Runtime / test artifacts.
    "test-results/**",
    "coverage/**",
    ".nyc_output/**",
]
74
+
75
# Basename suffix matches (run against os.path.basename so any depth hits).
# A hit is reported as the pattern `*<suffix>`.
TIER_A_BASENAME_SUFFIXES = {
    ".log",
}

# Basename prefix matches. `.env` → `.env`, `.env.local`, `.env.production`.
# `secrets.` → `secrets.json`, `secrets.yaml`.
# These are plain `str.startswith` checks, so any basename that merely begins
# with `.env` (for example `.envrc`) also matches. A hit is reported as the
# pattern `<prefix>*`.
TIER_A_BASENAME_PREFIXES = {
    ".env",
    "secrets.",
}
86
+
87
# Lockfiles — modification is legitimate when deps change; deletion is not.
# Only flag D status AND only if the file existed at scaffold.
# Compared against os.path.basename of the touched path, so a lockfile in any
# directory is covered.
LOCKFILE_NAMES = {
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    "bun.lock",
    "bun.lockb",
}
96
+
97
+
98
def run_git(args, cwd):
    """Run ``git <args>`` in directory *cwd* and return the finished process.

    stdout and stderr are captured as text. A non-zero exit status does not
    raise; callers inspect ``returncode`` / ``stdout`` on the returned
    ``subprocess.CompletedProcess`` themselves.
    """
    command = ["git"]
    command.extend(args)
    return subprocess.run(command, cwd=cwd, capture_output=True, text=True)
102
+
103
+
104
def git_diff_status(scaffold_sha, cwd):
    """List the paths that differ between *scaffold_sha* and the work tree.

    Runs ``git diff --name-status -M <scaffold_sha>`` in *cwd* and returns a
    list of ``(status, path)`` tuples:

    * plain entries keep git's status string and the single path;
    * copies (``C<score>``) become ``("C", new_path)``;
    * renames (``R<score>``) become ``("R", new_path)`` preceded by
      ``("D", old_path)``. The old path no longer exists after a rename, and
      without the extra entry a lockfile that was renamed away would never
      be seen as deleted.

    If git exits non-zero, a warning is written to stderr and whatever could
    be parsed from stdout (normally nothing) is returned, so a broken diff
    is not silently mistaken for "no changes".
    """
    r = run_git(["diff", "--name-status", "-M", scaffold_sha], cwd=cwd)
    if r.returncode != 0:
        sys.stderr.write(
            f"[oracle-scope-tier-a] git diff against {scaffold_sha} failed "
            f"(exit {r.returncode}): {(r.stderr or '').strip()}\n"
        )

    entries = []
    for line in r.stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        # Fields are tab-separated: status, path[, new_path for R/C].
        parts = line.split("\t")
        status = parts[0]
        if status.startswith(("R", "C")):
            if len(parts) >= 3:
                if status.startswith("R"):
                    # A rename removes the source path; a copy does not.
                    entries.append(("D", parts[1]))
                # Treat as new path; keep R/C letter for reporting.
                entries.append((status[0], parts[2]))
        elif len(parts) >= 2:
            entries.append((status, parts[1]))
    return entries
121
+
122
+
123
def existed_at_scaffold(scaffold_sha, path, cwd):
    """Report whether *path* is present in the tree of *scaffold_sha*.

    Probes with ``git cat-file -e <sha>:<path>`` run in *cwd*; an exit status
    of zero is treated as "exists", anything else as "does not exist".
    """
    object_spec = f"{scaffold_sha}:{path}"
    probe = run_git(["cat-file", "-e", object_spec], cwd=cwd)
    return probe.returncode == 0
126
+
127
+
128
def matches_any_glob(path, patterns):
    """Return the first glob in *patterns* that matches *path*, else ``None``.

    Matching is delegated to ``fnmatch.fnmatch`` on the whole path string;
    patterns are tried in iteration order.
    """
    return next(
        (glob for glob in patterns if fnmatch.fnmatch(path, glob)),
        None,
    )
133
+
134
+
135
def matches_basename(path, suffixes, prefixes):
    """Match the final path component against suffix and prefix rules.

    Suffixes are checked before prefixes. Returns ``"*<suffix>"`` for the
    first suffix the basename ends with, otherwise ``"<prefix>*"`` for the
    first prefix it starts with, otherwise ``None``.
    """
    filename = os.path.basename(path)

    suffix_hit = next((suf for suf in suffixes if filename.endswith(suf)), None)
    if suffix_hit is not None:
        return f"*{suffix_hit}"

    prefix_hit = next((pre for pre in prefixes if filename.startswith(pre)), None)
    if prefix_hit is not None:
        return f"{prefix_hit}*"

    return None
144
+
145
+
146
def is_waived(path, waivers):
    """Return True when *path* matches at least one fnmatch glob in *waivers*."""
    return any(fnmatch.fnmatch(path, waiver) for waiver in waivers)
151
+
152
+
153
def analyze(work_dir, scaffold_sha, waivers, fixture_id=None):
    """Collect Tier A scope findings for the diff of *work_dir* vs *scaffold_sha*.

    Args:
        work_dir: directory in which the git commands are run.
        scaffold_sha: revision the work tree is diffed against.
        waivers: fnmatch globs; any touched path matching one is ignored.
        fixture_id: optional fixture name. When given, the fixture's own spec
            at ``docs/roadmap/phase-*/<fixture_id>.md`` is always exempt.

    Returns:
        A list of finding dicts, one per offending path, of type
        ``lockfile-deletion`` or ``tier-a-violation``.
    """
    # Structural exemption: every benchmark fixture has its own spec under
    # docs/roadmap, and auto-resolve's DOCS phase legitimately flips its
    # frontmatter status. That is a skill feature, not a scope violation, so
    # it is exempt regardless of per-fixture waivers.
    own_spec_globs = [f"docs/roadmap/phase-*/{fixture_id}.md"] if fixture_id else []

    violations = []
    for status, path in git_diff_status(scaffold_sha, work_dir):
        if is_waived(path, waivers) or is_waived(path, own_spec_globs):
            continue

        # Deleted lockfiles are handled on their own and never fall through
        # to the path rules; flag only when the file existed at scaffold.
        lockfile_removed = status == "D" and os.path.basename(path) in LOCKFILE_NAMES
        if lockfile_removed:
            if existed_at_scaffold(scaffold_sha, path, work_dir):
                violations.append({
                    "file": path,
                    "type": "lockfile-deletion",
                    "severity": "hard",
                    "status": status,
                    "verdict": "Lockfile deleted (existed at scaffold)",
                })
            continue

        # Categorical path rules apply only to added / modified / renamed /
        # copied paths; plain deletions of non-lockfiles are left alone.
        if status not in ("A", "M", "R", "C"):
            continue

        # Full-path globs take precedence; basename rules are the fallback.
        pattern = matches_any_glob(path, TIER_A_PATH_GLOBS)
        if pattern is None:
            pattern = matches_basename(
                path, TIER_A_BASENAME_SUFFIXES, TIER_A_BASENAME_PREFIXES
            )
        if pattern is None:
            continue

        violations.append({
            "file": path,
            "type": "tier-a-violation",
            "severity": "hard",
            "status": status,
            "matched_pattern": pattern,
            "verdict": "Touched a path categorically outside implementation scope",
        })

    return violations
209
+
210
+
211
def main():
    """Command-line entry point: parse arguments, run the oracle, print JSON.

    Flags:
        --work: work directory to analyze (required unless --list-categories).
        --scaffold: scaffold revision to diff against (required unless
            --list-categories).
        --expected: optional path to a fixture ``expected.json``. Its
            ``tier_a_waivers`` list supplies waiver globs and its parent
            directory name is used as the fixture id.
        --list-categories: print the CATEGORIES enum as JSON and exit.

    Problems reading ``expected.json`` are reported on stderr and the run
    continues without waivers; the fixture id is kept either way.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--work")
    ap.add_argument("--scaffold")
    ap.add_argument(
        "--expected",
        help="Path to fixture expected.json (for tier_a_waivers)",
        default=None,
    )
    ap.add_argument(
        "--list-categories",
        action="store_true",
        help="Emit the stable oracle CATEGORIES enum as JSON and exit (iter-0022).",
    )
    args = ap.parse_args()

    if args.list_categories:
        print(json.dumps({"oracle": ORACLE_NAME, "categories": CATEGORIES}, indent=2, sort_keys=True))
        return

    if not args.work or not args.scaffold:
        ap.error("--work and --scaffold are required unless --list-categories is set")

    waivers = []
    fixture_id = None
    if args.expected:
        exp_path = pathlib.Path(args.expected)
        # fixture_id = parent directory name of expected.json
        fixture_id = exp_path.parent.name
        try:
            # JSON is read as UTF-8 rather than the locale's default encoding.
            expected = json.loads(exp_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            sys.stderr.write(
                f"[oracle-scope-tier-a] could not read waivers from {args.expected}: {e}\n"
            )
        else:
            if isinstance(expected, dict):
                raw = expected.get("tier_a_waivers", [])
                if isinstance(raw, list):
                    # Non-string entries are dropped rather than rejected.
                    waivers = [w for w in raw if isinstance(w, str)]
            else:
                # A top-level list/string/number has no .get(); warn instead
                # of crashing with AttributeError.
                sys.stderr.write(
                    f"[oracle-scope-tier-a] could not read waivers from {args.expected}: "
                    "top-level JSON value is not an object\n"
                )

    findings = analyze(args.work, args.scaffold, waivers, fixture_id=fixture_id)
    print(json.dumps({
        "oracle": ORACLE_NAME,
        "waivers": waivers,
        "fixture_id": fixture_id,
        "findings": findings,
    }, indent=2))
257
+
258
+
259
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()