devlyn-cli 1.15.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +104 -0
- package/CLAUDE.md +135 -21
- package/README.md +43 -125
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
- package/benchmark/auto-resolve/README.md +114 -0
- package/benchmark/auto-resolve/RUBRIC.md +162 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
- package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
- package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
- package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
- package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
- package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
- package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
- package/benchmark/auto-resolve/scripts/judge.sh +359 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
- package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
- package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
- package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
- package/bin/devlyn.js +175 -17
- package/config/skills/_shared/adapters/README.md +64 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
- package/config/skills/_shared/adapters/opus-4-7.md +29 -0
- package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
- package/config/skills/_shared/codex-config.md +54 -0
- package/config/skills/_shared/codex-monitored.sh +141 -0
- package/config/skills/_shared/engine-preflight.md +35 -0
- package/config/skills/_shared/expected.schema.json +93 -0
- package/config/skills/_shared/pair-plan-schema.md +298 -0
- package/config/skills/_shared/runtime-principles.md +110 -0
- package/config/skills/_shared/spec-verify-check.py +519 -0
- package/config/skills/devlyn:ideate/SKILL.md +99 -429
- package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
- package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
- package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
- package/config/skills/devlyn:resolve/SKILL.md +172 -184
- package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
- package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
- package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
- package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
- package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
- package/package.json +12 -2
- package/scripts/lint-skills.sh +431 -0
- package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
- package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
- package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
- package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
- package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
- package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
- package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
- package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
- package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
- package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
- package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
- package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
- package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
- package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
- package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
- package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
- package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
- package/config/skills/devlyn:clean/SKILL.md +0 -285
- package/config/skills/devlyn:design-ui/SKILL.md +0 -351
- package/config/skills/devlyn:discover-product/SKILL.md +0 -124
- package/config/skills/devlyn:evaluate/SKILL.md +0 -564
- package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
- package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
- package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
- package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
- package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
- package/config/skills/devlyn:preflight/SKILL.md +0 -355
- package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
- package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
- package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
- package/config/skills/devlyn:product-spec/SKILL.md +0 -603
- package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
- package/config/skills/devlyn:review/SKILL.md +0 -161
- package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
- package/config/skills/devlyn:team-review/SKILL.md +0 -493
- package/config/skills/devlyn:update-docs/SKILL.md +0 -463
- package/config/skills/workflow-routing/SKILL.md +0 -73
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
|
@@ -0,0 +1,359 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# judge.sh — Codex (CLI's current flagship, inherited) blind judge for ONE fixture.
|
|
3
|
+
#
|
|
4
|
+
# Usage:
|
|
5
|
+
# judge.sh --fixture <FID> --run-id <ID>
|
|
6
|
+
#
|
|
7
|
+
# Reads:
|
|
8
|
+
# results/<run-id>/<fixture>/<arm>/diff.patch + verify.json
|
|
9
|
+
# for each arm present among: variant, solo_claude, bare, l2_gated, l2_forced (at least 2 required)
|
|
10
|
+
# fixtures/<fixture>/spec.md + expected.json (NOTES.md is deliberately NOT read; it stays out of the judge prompt)
|
|
11
|
+
# RUBRIC.md (stable rubric)
|
|
12
|
+
#
|
|
13
|
+
# Writes:
|
|
14
|
+
# results/<run-id>/<fixture>/judge.json (plus judge-prompt.txt and judge-output.txt in the same directory)
|
|
15
|
+
#
|
|
16
|
+
# Blind: A/B assignment randomized per fixture, seed stored in judge.json.
|
|
17
|
+
|
|
18
|
+
set -euo pipefail
|
|
19
|
+
|
|
20
|
+
# Print the invocation synopsis and abort with status 1.
usage() { echo "usage: $0 --fixture <FID> --run-id <ID>"; exit 1; }

# Both flags are mandatory and each takes exactly one value.
FIXTURE=""; RUN_ID=""
while [ $# -gt 0 ]; do
  case "$1" in
    --fixture|--run-id)
      # Check arity before touching "$2": under `set -u` a flag given as the
      # last argument would otherwise abort with "unbound variable" instead
      # of showing the usage line.
      [ $# -ge 2 ] || usage
      if [ "$1" = "--fixture" ]; then FIXTURE="$2"; else RUN_ID="$2"; fi
      shift 2;;
    *) usage;;
  esac
done
# Empty values (e.g. --fixture "") are rejected the same way as missing flags.
[ -n "$FIXTURE" ] && [ -n "$RUN_ID" ] || usage
|
|
30
|
+
|
|
31
|
+
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
|
32
|
+
FIX_DIR="$BENCH_ROOT/fixtures/$FIXTURE"
|
|
33
|
+
RES_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE"
|
|
34
|
+
|
|
35
|
+
# iter-0019: 3 arms — variant (L2), solo_claude (L1), bare (L0). The judge
|
|
36
|
+
# scores all three in a single pass with the same prompt + same model so
|
|
37
|
+
# margin derivations (L2-vs-L0, L1-vs-L0, L2-vs-L1) are calibrated against
|
|
38
|
+
# each other and not against separate judge calls. ARMS_PRESENT enumerates
|
|
39
|
+
# whichever subset actually has artifacts (so a missing arm doesn't abort
|
|
40
|
+
# the whole judge step). Two-arm judge mode is preserved for runs that pre-
|
|
41
|
+
# date iter-0019.
|
|
42
|
+
ARMS_PRESENT=()
|
|
43
|
+
# iter-0033c: l2_gated/l2_forced added for NEW L2 vs NEW L1 measurement.
|
|
44
|
+
# Slot count is still A/B/C max 3 — pair-eligible iter-0033c fixtures supply
|
|
45
|
+
# {solo_claude, l2_gated, l2_forced}; non-pair-eligible fixtures supply
|
|
46
|
+
# {solo_claude, l2_gated}. The blind-shuffle slot mapping below already
|
|
47
|
+
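# tolerates ARMS_PRESENT counts of 2 or 3. Only slots A/B/C exist, so if more than 3 arms are present, the arms shuffled past slot C are left unjudged.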
|
|
48
|
+
for arm in variant solo_claude bare l2_gated l2_forced; do
|
|
49
|
+
if [ -f "$RES_DIR/$arm/diff.patch" ] && [ -f "$RES_DIR/$arm/verify.json" ]; then
|
|
50
|
+
ARMS_PRESENT+=("$arm")
|
|
51
|
+
fi
|
|
52
|
+
done
|
|
53
|
+
if [ ${#ARMS_PRESENT[@]} -lt 2 ]; then
|
|
54
|
+
echo "judge needs at least 2 arms with diff.patch + verify.json; have: ${ARMS_PRESENT[*]:-(none)}"
|
|
55
|
+
exit 1
|
|
56
|
+
fi
|
|
57
|
+
for f in "$FIX_DIR/spec.md" "$FIX_DIR/expected.json" "$BENCH_ROOT/RUBRIC.md"; do
|
|
58
|
+
[ -f "$f" ] || { echo "missing required input: $f"; exit 1; }
|
|
59
|
+
done
|
|
60
|
+
|
|
61
|
+
# Blind randomization: shuffle ARMS_PRESENT into ABC order. Seed recorded
|
|
62
|
+
# in judge.json so runs are reproducible if rejudged.
|
|
63
|
+
SEED=$RANDOM
|
|
64
|
+
# iter-0019.4: Bash 3.2 compatible (macOS /bin/bash). `mapfile` is Bash 4+
|
|
65
|
+
# only; replaced with while-read loop. The `|| [ -n "$line" ]` guard
|
|
66
|
+
# preserves exact `mapfile -t` behavior on a final unterminated line (Python
|
|
67
|
+
# print() emits trailing \n so this guard is belt-and-suspenders here, but
|
|
68
|
+
# matches mapfile semantics for future producers).
|
|
69
|
+
SLOTS=()
|
|
70
|
+
while IFS= read -r line || [ -n "$line" ]; do
|
|
71
|
+
SLOTS+=("$line")
|
|
72
|
+
done < <(python3 - "$SEED" "${ARMS_PRESENT[@]}" <<'PY'
|
|
73
|
+
import sys, random
|
|
74
|
+
seed = int(sys.argv[1]); arms = sys.argv[2:]
|
|
75
|
+
random.seed(seed)
|
|
76
|
+
random.shuffle(arms)
|
|
77
|
+
print("\n".join(arms))
|
|
78
|
+
PY
|
|
79
|
+
)
|
|
80
|
+
A_ARM="${SLOTS[0]:-}"
|
|
81
|
+
B_ARM="${SLOTS[1]:-}"
|
|
82
|
+
C_ARM="${SLOTS[2]:-}"
|
|
83
|
+
|
|
84
|
+
PROMPT_FILE="$RES_DIR/judge-prompt.txt"
|
|
85
|
+
A_DIFF="$RES_DIR/$A_ARM/diff.patch"
|
|
86
|
+
A_VERIFY="$RES_DIR/$A_ARM/verify.json"
|
|
87
|
+
B_DIFF="$RES_DIR/$B_ARM/diff.patch"
|
|
88
|
+
B_VERIFY="$RES_DIR/$B_ARM/verify.json"
|
|
89
|
+
if [ -n "$C_ARM" ]; then
|
|
90
|
+
C_DIFF="$RES_DIR/$C_ARM/diff.patch"
|
|
91
|
+
C_VERIFY="$RES_DIR/$C_ARM/verify.json"
|
|
92
|
+
else
|
|
93
|
+
C_DIFF=""
|
|
94
|
+
C_VERIFY=""
|
|
95
|
+
fi
|
|
96
|
+
|
|
97
|
+
# Sanitize diffs so stylistic tells that correlate with variant (e.g.
|
|
98
|
+
# pipeline-commit markers, .devlyn/ archive lines) don't leak to the judge.
|
|
99
|
+
# Judge sees only file-content changes; the transcript, arm label, NOTES.md,
|
|
100
|
+
# and all process artifacts stay out of the prompt.
|
|
101
|
+
python3 - "$PROMPT_FILE" "$FIX_DIR/spec.md" "$FIX_DIR/expected.json" "$BENCH_ROOT/RUBRIC.md" "$A_DIFF" "$B_DIFF" "$A_VERIFY" "$B_VERIFY" "$C_DIFF" "$C_VERIFY" <<'PY'
|
|
102
|
+
import sys, pathlib, re, json
|
|
103
|
+
args = sys.argv[1:]
|
|
104
|
+
out_p, spec_p, exp_p, rubric_p = map(pathlib.Path, args[:4])
|
|
105
|
+
a_diff, b_diff, a_ver, b_ver = map(pathlib.Path, args[4:8])
|
|
106
|
+
c_diff_arg, c_ver_arg = args[8], args[9]
|
|
107
|
+
c_diff = pathlib.Path(c_diff_arg) if c_diff_arg else None
|
|
108
|
+
c_ver = pathlib.Path(c_ver_arg) if c_ver_arg else None
|
|
109
|
+
out = out_p
|
|
110
|
+
spec = spec_p.read_text()
|
|
111
|
+
expected = exp_p.read_text()
|
|
112
|
+
rubric = rubric_p.read_text()
|
|
113
|
+
|
|
114
|
+
# Strip pipeline-origin tells from the diff before the judge sees it.
|
|
115
|
+
TELLS = [
|
|
116
|
+
re.compile(r"^diff --git.*\.devlyn/.*$", re.M),
|
|
117
|
+
re.compile(r"^chore\(pipeline\):.*$", re.M),
|
|
118
|
+
re.compile(r"^\.devlyn/.*$", re.M),
|
|
119
|
+
re.compile(r"^Co-Authored-By:.*$", re.M),
|
|
120
|
+
]
|
|
121
|
+
def sanitize(diff: str) -> str:
|
|
122
|
+
# Drop whole-file hunks under .devlyn/
|
|
123
|
+
out_lines, skip = [], False
|
|
124
|
+
for line in diff.splitlines(keepends=True):
|
|
125
|
+
if line.startswith("diff --git ") and ".devlyn/" in line:
|
|
126
|
+
skip = True
|
|
127
|
+
continue
|
|
128
|
+
if line.startswith("diff --git "):
|
|
129
|
+
skip = False
|
|
130
|
+
if skip:
|
|
131
|
+
continue
|
|
132
|
+
out_lines.append(line)
|
|
133
|
+
text = "".join(out_lines)
|
|
134
|
+
for pat in TELLS:
|
|
135
|
+
text = pat.sub("", text)
|
|
136
|
+
return text
|
|
137
|
+
|
|
138
|
+
# Also strip arm-identifying fields from verify.json before passing to judge.
|
|
139
|
+
def sanitize_verify(path: pathlib.Path) -> str:
    """Return one arm's verify.json as pretty-printed JSON without its arm label.

    Args:
        path: Location of the arm's ``verify.json`` file.

    Returns:
        The document re-serialized with ``indent=2`` and with the top-level
        ``"arm"`` key removed, so the judge prompt does not name the arm.

    Raises:
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than with the locale's default
    # encoding, so the result does not depend on the environment.
    data = json.loads(path.read_text(encoding="utf-8"))
    # Only the top-level "arm" key is dropped; nested values are left as-is.
    data.pop("arm", None)
    return json.dumps(data, indent=2)
|
|
144
|
+
|
|
145
|
+
a_diff_text = sanitize(a_diff.read_text())
|
|
146
|
+
b_diff_text = sanitize(b_diff.read_text())
|
|
147
|
+
a_ver_text = sanitize_verify(a_ver)
|
|
148
|
+
b_ver_text = sanitize_verify(b_ver)
|
|
149
|
+
have_c = c_diff is not None
|
|
150
|
+
if have_c:
|
|
151
|
+
c_diff_text = sanitize(c_diff.read_text())
|
|
152
|
+
c_ver_text = sanitize_verify(c_ver)
|
|
153
|
+
|
|
154
|
+
n_arms = 3 if have_c else 2
|
|
155
|
+
arms_phrase = "Three engineers" if have_c else "Two engineers"
|
|
156
|
+
slot_keys = ["a_score", "b_score", "c_score"][:n_arms]
|
|
157
|
+
slot_breakdowns = ["a_breakdown", "b_breakdown", "c_breakdown"][:n_arms]
|
|
158
|
+
slot_letters = ["A", "B", "C"][:n_arms]
|
|
159
|
+
|
|
160
|
+
# Build the JSON-format hint dynamically so the judge sees the right shape
|
|
161
|
+
# for either 2 or 3 arms. Same scoring rules; same rubric.
|
|
162
|
+
score_lines = ",\n ".join(f'"{k}": <int 0-100>' for k in slot_keys)
|
|
163
|
+
breakdown_lines = ",\n ".join(
|
|
164
|
+
f'"{b}": {{"spec": 0-25, "constraint": 0-25, "scope": 0-25, "quality": 0-25, "notes": "<3-5 bullets>"}}'
|
|
165
|
+
for b in slot_breakdowns
|
|
166
|
+
)
|
|
167
|
+
findings_keys = ", ".join(f'"{l}": ["..."]' for l in slot_letters)
|
|
168
|
+
dq_keys = ", ".join(f'"{l}": bool' for l in slot_letters)
|
|
169
|
+
dq_reasons = ", ".join(f'"{l}_reason": "..."' for l in slot_letters)
|
|
170
|
+
winner_choices = " | ".join(f'"{l}"' for l in slot_letters) + ' | "tie"'
|
|
171
|
+
|
|
172
|
+
# Per-arm sections of the prompt
|
|
173
|
+
def section(label: str, diff_text: str, verify_text: str) -> str:
|
|
174
|
+
return (
|
|
175
|
+
f"=== IMPLEMENTATION {label} ===\nDiff:\n"
|
|
176
|
+
f"```diff\n{diff_text}\n```\n"
|
|
177
|
+
f"Verification results:\n```json\n{verify_text}\n```\n"
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
impl_sections = section("A", a_diff_text, a_ver_text) + "\n" + section("B", b_diff_text, b_ver_text)
|
|
181
|
+
if have_c:
|
|
182
|
+
impl_sections += "\n" + section("C", c_diff_text, c_ver_text)
|
|
183
|
+
|
|
184
|
+
prompt = f"""You are a blind code-review judge. {arms_phrase} implemented the same spec. You do NOT know which implementation came from which process — grade them only on the merits of the code and its behavior.
|
|
185
|
+
|
|
186
|
+
Apply the 4-axis rubric from RUBRIC.md below. Each axis is 0-25, total 100. Score every implementation independently — do not let one arm's score anchor another's. The judge's job is to apply the rubric absolutely; relative ordering falls out from the absolute scores.
|
|
187
|
+
|
|
188
|
+
Return STRICT JSON only — no prose outside the JSON. Format:
|
|
189
|
+
|
|
190
|
+
{{
|
|
191
|
+
{score_lines},
|
|
192
|
+
"winner": {winner_choices},
|
|
193
|
+
{breakdown_lines},
|
|
194
|
+
"critical_findings": {{{findings_keys}}},
|
|
195
|
+
"disqualifiers": {{{dq_keys}, {dq_reasons}}},
|
|
196
|
+
"overall_reasoning": "<5-8 sentences>"
|
|
197
|
+
}}
|
|
198
|
+
|
|
199
|
+
=== RUBRIC ===
|
|
200
|
+
{rubric}
|
|
201
|
+
|
|
202
|
+
=== SPEC ===
|
|
203
|
+
{spec}
|
|
204
|
+
|
|
205
|
+
=== EXPECTED (machine-readable acceptance) ===
|
|
206
|
+
{expected}
|
|
207
|
+
|
|
208
|
+
{impl_sections}
|
|
209
|
+
Return the JSON and nothing else.
|
|
210
|
+
"""
|
|
211
|
+
out.write_text(prompt)
|
|
212
|
+
PY
|
|
213
|
+
|
|
214
|
+
# Invoke Codex — no -m so CLI flagship is inherited. Model identity is
|
|
215
|
+
# recorded from the codex config.toml so rejudging with a newer flagship is
|
|
216
|
+
# traceable. Run from a clean temp CWD so the judge can't peek at project
|
|
217
|
+
# files that would leak arm identity.
|
|
218
|
+
command -v codex >/dev/null 2>&1 || { echo "codex CLI not on PATH; cannot judge"; exit 1; }
|
|
219
|
+
CODEX_CLI_VER=$(codex --version 2>/dev/null || echo "codex-cli unknown")
|
|
220
|
+
# Best-effort lookup of the configured model name from the codex config.
# `|| true` matters: under `set -euo pipefail`, grep exiting non-zero (file
# missing, or no `model =` line) would fail this assignment and abort the
# script before the "(unknown ...)" fallback in the next statement can run.
# [[:space:]] is used instead of \s because \s is not portable to every
# grep/sed implementation.
JUDGE_MODEL=$(grep -E '^model[[:space:]]*=' "${HOME}/.codex/config.toml" 2>/dev/null | head -1 | sed -E 's/.*=[[:space:]]*"?([^"]+)"?.*/\1/' || true)
|
|
221
|
+
[ -z "$JUDGE_MODEL" ] && JUDGE_MODEL="(unknown — codex config.toml not readable)"
|
|
222
|
+
|
|
223
|
+
JUDGE_CWD="/tmp/judge-$RUN_ID-$FIXTURE"
|
|
224
|
+
rm -rf "$JUDGE_CWD"
|
|
225
|
+
mkdir -p "$JUDGE_CWD"
|
|
226
|
+
|
|
227
|
+
JUDGE_OUT="$RES_DIR/judge-output.txt"
|
|
228
|
+
set +e
|
|
229
|
+
cat "$PROMPT_FILE" | (cd "$JUDGE_CWD" && codex exec -s read-only --skip-git-repo-check -c model_reasoning_effort=xhigh - ) > "$JUDGE_OUT" 2>&1
|
|
230
|
+
JUDGE_EXIT=$?
|
|
231
|
+
set -e
|
|
232
|
+
rm -rf "$JUDGE_CWD"
|
|
233
|
+
if [ $JUDGE_EXIT -ne 0 ]; then
|
|
234
|
+
echo "codex exec failed (exit $JUDGE_EXIT); see $JUDGE_OUT"
|
|
235
|
+
exit 1
|
|
236
|
+
fi
|
|
237
|
+
|
|
238
|
+
# Extract JSON (codex wraps with banners; pick the last {...} block)
|
|
239
|
+
python3 - "$JUDGE_OUT" "$RES_DIR/judge.json" "$A_ARM" "$B_ARM" "$C_ARM" "$SEED" "$CODEX_CLI_VER" "$JUDGE_MODEL" <<'PY'
|
|
240
|
+
import sys, re, json, pathlib
|
|
241
|
+
out = pathlib.Path(sys.argv[1]).read_text()
|
|
242
|
+
target = pathlib.Path(sys.argv[2])
|
|
243
|
+
a_arm, b_arm, c_arm, seed, codex_ver, judge_model = sys.argv[3:9]
|
|
244
|
+
|
|
245
|
+
# Extract the last valid judgment JSON. A naive brace-counter breaks on
|
|
246
|
+
# `{`/`}` that appear inside strings (e.g. JS source embedded in the arms'
|
|
247
|
+
# diffs), so use json.JSONDecoder.raw_decode starting at each `{` position
|
|
248
|
+
# and keep the last successful parse with the required keys.
|
|
249
|
+
decoder = json.JSONDecoder()
|
|
250
|
+
brace_positions = [i for i, c in enumerate(out) if c == '{']
|
|
251
|
+
chosen = None
|
|
252
|
+
for pos in reversed(brace_positions):
|
|
253
|
+
try:
|
|
254
|
+
obj, _ = decoder.raw_decode(out[pos:])
|
|
255
|
+
except json.JSONDecodeError:
|
|
256
|
+
continue
|
|
257
|
+
if isinstance(obj, dict) and "a_score" in obj and "b_score" in obj:
|
|
258
|
+
chosen = obj
|
|
259
|
+
break
|
|
260
|
+
if chosen is None:
|
|
261
|
+
raise SystemExit(f"no valid JSON in judge output; see {sys.argv[1]}")
|
|
262
|
+
|
|
263
|
+
# Decode blind labels — record full mapping so summary code can iterate
|
|
264
|
+
mapping = {"A": a_arm, "B": b_arm}
|
|
265
|
+
if c_arm:
|
|
266
|
+
mapping["C"] = c_arm
|
|
267
|
+
chosen["_blind_mapping"] = {**mapping, "seed": int(seed)}
|
|
268
|
+
chosen["_judge_cli"] = codex_ver.strip()
|
|
269
|
+
chosen["_judge_model"] = judge_model.strip()
|
|
270
|
+
|
|
271
|
+
# iter-0023 — axis breakdown validation. Rubric axes are 0-25 (RUBRIC.md
|
|
272
|
+
# "Scoring — 4 axes, 25 points each"). Past runs (iter-0020 F9) recorded
|
|
273
|
+
# `quality: -1` because judge LLM occasionally emits sentinel/negative
|
|
274
|
+
# values; ship-gate then averaged invalid cells. Detect, clamp to [0, 25],
|
|
275
|
+
# and record the invalid cells under `_axis_validation` so downstream
|
|
276
|
+
# consumers can refuse to trust that fixture's margin.
|
|
277
|
+
AXIS_KEYS = ("spec", "constraint", "scope", "quality")
BREAKDOWN_KEYS = ("a_breakdown", "b_breakdown", "c_breakdown")


def _axis_score(value):
    """Return ``(score, ok)`` for one judge-emitted axis cell.

    ``ok`` is True only when ``value`` is a real number already inside
    [0, 25]; it is then returned unchanged. Otherwise the cell is coerced:
    numbers above the range (including +Infinity) become 25, and everything
    else (negatives, -Infinity, NaN, booleans, non-numeric values) becomes 0.
    """
    is_number = isinstance(value, (int, float)) and not isinstance(value, bool)
    # The chained comparison is False for NaN, so NaN is rejected here rather
    # than slipping past separate `< 0` / `> 25` checks.
    if is_number and 0 <= value <= 25:
        return value, True
    if is_number and value > 25:
        return 25, False
    return 0, False


axis_invalid_cells = []
for bk in BREAKDOWN_KEYS:
    breakdown = chosen.get(bk)
    if not isinstance(breakdown, dict):
        continue
    for axis in AXIS_KEYS:
        if axis not in breakdown:
            continue
        raw = breakdown[axis]
        clamped, ok = _axis_score(raw)
        if ok:
            continue
        # Non-finite floats are recorded as text: json.dumps would otherwise
        # write bare NaN/Infinity tokens, which strict JSON parsers reject.
        non_finite = isinstance(raw, float) and (raw != raw or abs(raw) == float("inf"))
        axis_invalid_cells.append(
            {"breakdown": bk, "axis": axis, "value": repr(raw) if non_finite else raw}
        )
        breakdown[axis] = clamped
chosen["_axis_validation"] = {
    "out_of_range_count": len(axis_invalid_cells),
    "out_of_range_cells": axis_invalid_cells,
    "axis_range": [0, 25],
}
if axis_invalid_cells:
    sys.stderr.write(
        f"[judge.sh] WARNING: {len(axis_invalid_cells)} axis cell(s) out of [0,25] "
        f"clamped: {axis_invalid_cells}\n"
    )
|
|
300
|
+
|
|
301
|
+
# scores_by_arm: arm-name → score, computed from the blind A/B/C scores.
|
|
302
|
+
# This is the canonical 3-arm-aware shape the report consumer reads. The
|
|
303
|
+
# legacy variant_score / bare_score / margin fields below are derived from
|
|
304
|
+
# scores_by_arm for backward compatibility with pre-iter-0019 callers.
|
|
305
|
+
scores_by_arm = {}
|
|
306
|
+
slot_keys = ["a_score", "b_score", "c_score"]
|
|
307
|
+
slot_letters = ["A", "B", "C"]
|
|
308
|
+
for letter, key in zip(slot_letters, slot_keys):
|
|
309
|
+
arm = mapping.get(letter)
|
|
310
|
+
if arm is not None and key in chosen:
|
|
311
|
+
scores_by_arm[arm] = chosen[key]
|
|
312
|
+
chosen["scores_by_arm"] = scores_by_arm
|
|
313
|
+
|
|
314
|
+
# Per-letter critical_findings / disqualifiers also rotated to per-arm.
|
|
315
|
+
findings_letters = chosen.get("critical_findings", {}) or {}
|
|
316
|
+
findings_by_arm = {mapping[l]: findings_letters.get(l, []) for l in slot_letters if l in mapping}
|
|
317
|
+
chosen["findings_by_arm"] = findings_by_arm
|
|
318
|
+
|
|
319
|
+
dq_letters = chosen.get("disqualifiers", {}) or {}
|
|
320
|
+
dq_by_arm = {}
|
|
321
|
+
for l in slot_letters:
|
|
322
|
+
if l not in mapping:
|
|
323
|
+
continue
|
|
324
|
+
arm = mapping[l]
|
|
325
|
+
dq_by_arm[arm] = {
|
|
326
|
+
"disqualifier": bool(dq_letters.get(l, False)),
|
|
327
|
+
"reason": str(dq_letters.get(f"{l}_reason", "") or ""),
|
|
328
|
+
}
|
|
329
|
+
chosen["disqualifiers_by_arm"] = dq_by_arm
|
|
330
|
+
|
|
331
|
+
# Pairwise margins (positive = first arm beat second).
|
|
332
|
+
def margin(left: str, right: str):
|
|
333
|
+
if left in scores_by_arm and right in scores_by_arm:
|
|
334
|
+
return scores_by_arm[left] - scores_by_arm[right]
|
|
335
|
+
return None
|
|
336
|
+
|
|
337
|
+
chosen["margins"] = {
|
|
338
|
+
"variant_over_bare": margin("variant", "bare"),
|
|
339
|
+
"solo_over_bare": margin("solo_claude", "bare"),
|
|
340
|
+
"variant_over_solo": margin("variant", "solo_claude"),
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
# Translate winner letter to arm
|
|
344
|
+
# Translate the blind winner letter back to its arm name. Anything that is
# not one of the mapped slot letters (including "tie", a missing key, or a
# non-string value such as a list) is recorded as "tie". The isinstance guard
# keeps an unhashable value from raising TypeError in the membership test.
w = chosen.get("winner")
chosen["winner_arm"] = mapping[w] if isinstance(w, str) and w in mapping else "tie"
|
|
346
|
+
|
|
347
|
+
# Legacy 2-arm fields preserved so older summary code still parses. When
|
|
348
|
+
# solo_claude is present, variant/bare margin is derived from scores_by_arm.
|
|
349
|
+
chosen["variant_score"] = scores_by_arm.get("variant")
|
|
350
|
+
chosen["bare_score"] = scores_by_arm.get("bare")
|
|
351
|
+
if chosen.get("variant_score") is not None and chosen.get("bare_score") is not None:
|
|
352
|
+
chosen["margin"] = chosen["variant_score"] - chosen["bare_score"]
|
|
353
|
+
|
|
354
|
+
target.write_text(json.dumps(chosen, indent=2))
|
|
355
|
+
# One-line console summary: per-arm scores, then any pairwise margins that
# could be computed (margins for absent arms are None and are skipped).
parts = [f"{arm}={s}" for arm, s in scores_by_arm.items()]
mline = chosen.get("margins") or {}
# `+` without a type code formats ints and floats alike ("+5", "-2.5").
# The previous `+d` raised ValueError when the judge returned a fractional
# score, failing the script after judge.json had already been written.
mparts = [f"{k}={v:+}" for k, v in mline.items() if v is not None]
print("[judge] " + " ".join(parts) + (" | " + " ".join(mparts) if mparts else ""))
|
|
359
|
+
PY
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
oracle-scope-tier-a.py — deterministic detector for categorical file-path
|
|
4
|
+
violations. Flags touches of paths that are never legitimately needed by an
|
|
5
|
+
implementation task (planning docs, CI config, dep-install output, runtime
|
|
6
|
+
artifacts, env/secret files) and lockfile deletions.
|
|
7
|
+
|
|
8
|
+
Complementary to oracle-test-fidelity.py, which handles weakening INSIDE
|
|
9
|
+
existing test files. This oracle only cares about WHICH files are touched.
|
|
10
|
+
|
|
11
|
+
Path matching uses fnmatch with normalized, repo-root-relative paths.
|
|
12
|
+
`docs/**` matches `docs/<anything>` but NOT `server/docs/readme.md` — the
|
|
13
|
+
anchoring is left-only, which is what we want. Per-oracle convention
|
|
14
|
+
documented here; step 1's content oracle uses regex instead.
|
|
15
|
+
|
|
16
|
+
Fixtures can waive any Tier A pattern via `expected.json::tier_a_waivers`
|
|
17
|
+
(list of fnmatch globs). Load-bearing case: F9 e2e-ideate-to-resolve
|
|
18
|
+
legitimately creates docs/VISION.md, docs/ROADMAP.md, docs/roadmap/**.
|
|
19
|
+
|
|
20
|
+
Step 2 scope: findings only. Scoring integration is a later step.
|
|
21
|
+
"""
|
|
22
|
+
import argparse
|
|
23
|
+
import fnmatch
|
|
24
|
+
import json
|
|
25
|
+
import os
|
|
26
|
+
import pathlib
|
|
27
|
+
import subprocess
|
|
28
|
+
import sys
|
|
29
|
+
|
|
30
|
+
ORACLE_NAME = "scope-tier-a"

# iter-0022: CATEGORIES is a stable enumeration (printed by --list-categories).
# The edit-discipline rules for it are described in the header comment of
# oracle-test-fidelity.py.
#
# Design note: "tier-a-violation" is intentionally ONE umbrella category. It
# spans the 5 path-glob groups (planning-doc, ci-config, node-modules,
# test-results-or-coverage, env-secrets) and the 2 basename rules (.log
# suffix, .env/secrets. prefix). A 7-way split was rejected during iter-0022
# R0: the oracle emits a single finding-row per touched path either way.
_LOCKFILE_DELETION_CATEGORY = {
    "id": "scope-tier-a:lockfile-deletion",
    "severity": "hard",
    "applies_when": "scaffold contains a lockfile (package-lock.json / yarn.lock / pnpm-lock.yaml / bun.lock / bun.lockb)",
    "operational_check": "variant arm MUST NOT delete a scaffold-present lockfile",
    "evidence_source_files": ["oracle-scope-tier-a.py"],
}

_TIER_A_VIOLATION_CATEGORY = {
    "id": "scope-tier-a:tier-a-violation",
    "severity": "hard",
    "applies_when": "any fixture (the protected paths exist conceptually for every JS/TS repo)",
    "operational_check": "variant arm MUST NOT add or modify paths matching: docs/roadmap/** | docs/VISION.md | docs/ROADMAP.md | .github/** | node_modules/** | **/node_modules/** | test-results/** | coverage/** | .nyc_output/** | basename suffix .log | basename prefix .env or secrets.",
    "evidence_source_files": ["oracle-scope-tier-a.py"],
}

CATEGORIES = [
    _LOCKFILE_DELETION_CATEGORY,
    _TIER_A_VIOLATION_CATEGORY,
]
|
|
55
|
+
|
|
56
|
+
# Protected path globs, matched with fnmatch against repo-root-relative paths.
# Matching is left-anchored. fnmatch gives `**` no recursive meaning (it acts
# like `*`), but since `*` also matches `/`, `docs/roadmap/**` still covers
# `docs/roadmap/anything/nested/here`.
TIER_A_PATH_GLOBS = [
    "docs/roadmap/**",     # planning docs: implementation tasks never edit these
    "docs/VISION.md",      # planning docs
    "docs/ROADMAP.md",     # planning docs
    ".github/**",          # CI / repo-wide config
    "node_modules/**",     # install output at the repo root
    "**/node_modules/**",  # install output in nested packages
    "test-results/**",     # runtime / test artifacts
    "coverage/**",         # runtime / test artifacts
    ".nyc_output/**",      # runtime / test artifacts
]
|
|
74
|
+
|
|
75
|
+
# Basename rules: both sets are compared against os.path.basename(path), so a
# match fires at any directory depth.

# Suffix rule: any file whose name ends in `.log`.
TIER_A_BASENAME_SUFFIXES = {".log"}

# Prefix rules:
#   `.env`     -> `.env`, `.env.local`, `.env.production`
#   `secrets.` -> `secrets.json`, `secrets.yaml`
TIER_A_BASENAME_PREFIXES = {".env", "secrets."}
|
|
86
|
+
|
|
87
|
+
# Lockfile basenames. Editing a lockfile is legitimate whenever dependencies
# change, so only deletion (status D) is flagged, and only when the file was
# already present at the scaffold commit.
LOCKFILE_NAMES = {
    "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
    "bun.lock", "bun.lockb",
}
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
def run_git(args, cwd):
    """Run ``git <args>`` inside *cwd* and hand back the CompletedProcess.

    stdout and stderr are captured as text. ``check`` is not set, so a
    non-zero exit does not raise; callers inspect ``returncode`` themselves.
    """
    command = ["git"]
    command.extend(args)
    return subprocess.run(command, cwd=cwd, capture_output=True, text=True)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def git_diff_status(scaffold_sha, cwd):
    """List (status, path) pairs for everything that differs from *scaffold_sha*.

    Runs ``git diff --name-status -M <scaffold_sha>`` in *cwd* and parses the
    tab-separated output.

    * Ordinary lines (``A``/``M``/``D``/...) yield ``(status, path)``.
    * Rename/copy lines carry a similarity score and two paths
      (``R100<TAB>old<TAB>new``); they yield ``(letter, new_path)`` so the
      entry is treated as touching the new path while keeping the R/C letter
      for reporting.

    ``core.quotePath=false`` is forced so that paths containing non-ASCII
    bytes are printed verbatim. With git's default they are emitted as a
    double-quoted, octal-escaped string, which would never match the
    left-anchored Tier A globs.

    A failing git invocation (bad SHA, not a repository, ...) is reported on
    stderr instead of being silently indistinguishable from "no changes".
    Whatever stdout was produced is still parsed, so the return shape is
    unchanged: a list of 2-tuples, possibly empty.
    """
    result = run_git(
        ["-c", "core.quotePath=false", "diff", "--name-status", "-M", scaffold_sha],
        cwd=cwd,
    )
    if result.returncode != 0:
        sys.stderr.write(
            f"[oracle-scope-tier-a] git diff against {scaffold_sha} failed "
            f"(exit {result.returncode}): {result.stderr.strip()}\n"
        )

    entries = []
    for line in result.stdout.splitlines():
        line = line.strip()
        if not line:
            continue
        status, *paths = line.split("\t")
        if status.startswith(("R", "C")):
            # Rename/copy: fields are <letter+score>, <old path>, <new path>.
            if len(paths) >= 2:
                entries.append((status[0], paths[1]))
        elif paths:
            entries.append((status, paths[0]))
    return entries
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def existed_at_scaffold(scaffold_sha, path, cwd):
    """Tell whether *path* is present in the tree of *scaffold_sha*.

    Probes the object ``<scaffold_sha>:<path>`` with ``git cat-file -e`` and
    reports True exactly when that command exits with status 0.
    """
    object_spec = f"{scaffold_sha}:{path}"
    probe = run_git(["cat-file", "-e", object_spec], cwd=cwd)
    return probe.returncode == 0
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def matches_any_glob(path, patterns):
    """Return the first pattern in *patterns* that fnmatch-es *path*, else None.

    Patterns are tried in iteration order; the winning pattern itself is
    returned so callers can report which rule fired.
    """
    hits = (pattern for pattern in patterns if fnmatch.fnmatch(path, pattern))
    return next(hits, None)
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def matches_basename(path, suffixes, prefixes):
    """Match the final path component against suffix and prefix rules.

    Suffixes are consulted before prefixes. A suffix hit is reported as
    ``*<suffix>`` and a prefix hit as ``<prefix>*``; None means no rule
    applies to the basename of *path*.
    """
    name = os.path.basename(path)

    suffix_hit = next((f"*{sfx}" for sfx in suffixes if name.endswith(sfx)), None)
    if suffix_hit is not None:
        return suffix_hit

    return next((f"{pfx}*" for pfx in prefixes if name.startswith(pfx)), None)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def is_waived(path, waivers):
    """Return True when *path* fnmatch-es at least one glob in *waivers*."""
    return any(fnmatch.fnmatch(path, glob) for glob in waivers)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def analyze(work_dir, scaffold_sha, waivers, fixture_id=None):
    """Collect Tier A scope findings for the diff of *work_dir* vs *scaffold_sha*.

    Each changed path is run through these checks, in order:

    1. Skip it if a per-fixture waiver glob matches.
    2. Skip it if it is the fixture's own spec file (see below).
    3. A deleted lockfile that existed at scaffold is a ``lockfile-deletion``
       finding; other deleted lockfiles are skipped.
    4. Only A/M/R/C entries are considered further. Pure deletion of a
       non-lockfile Tier A path would also be suspicious, but it is rare in
       practice and overlaps with test-fidelity, so it is left for a later
       iteration.
    5. A path matching a Tier A glob, or failing that a basename rule, is a
       ``tier-a-violation`` finding.

    Returns the list of finding dicts, in diff order.
    """
    # Structural exemption: every benchmark fixture has its own spec at
    # docs/roadmap/phase-*/<fixture_id>.md, and auto-resolve's DOCS phase
    # Job 1 legitimately flips that file's frontmatter status. That is a skill
    # feature rather than a scope violation, so it is exempt no matter what
    # the per-fixture waivers say.
    exempt_globs = [f"docs/roadmap/phase-*/{fixture_id}.md"] if fixture_id else []

    results = []
    for status, path in git_diff_status(scaffold_sha, work_dir):
        if is_waived(path, waivers) or is_waived(path, exempt_globs):
            continue

        # Deleted lockfiles are handled entirely here.
        if status == "D" and os.path.basename(path) in LOCKFILE_NAMES:
            if existed_at_scaffold(scaffold_sha, path, work_dir):
                results.append({
                    "file": path,
                    "type": "lockfile-deletion",
                    "severity": "hard",
                    "status": status,
                    "verdict": "Lockfile deleted (existed at scaffold)",
                })
            continue

        # Categorical path rules apply to added/modified/renamed/copied only.
        if status not in ("A", "M", "R", "C"):
            continue

        # Path globs take precedence; basename rules are the fallback.
        pattern = matches_any_glob(path, TIER_A_PATH_GLOBS)
        if pattern is None:
            pattern = matches_basename(
                path, TIER_A_BASENAME_SUFFIXES, TIER_A_BASENAME_PREFIXES
            )
        if pattern is None:
            continue

        results.append({
            "file": path,
            "type": "tier-a-violation",
            "severity": "hard",
            "status": status,
            "matched_pattern": pattern,
            "verdict": "Touched a path categorically outside implementation scope",
        })

    return results
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
def main():
    """Command-line entry point for the scope-tier-a oracle.

    Modes:

    * ``--list-categories``: print the stable CATEGORIES enum as JSON and
      exit (iter-0022).
    * otherwise ``--work`` and ``--scaffold`` are required; the diff of the
      work tree against the scaffold commit is analyzed and a JSON report
      (oracle name, waivers, fixture id, findings) is printed to stdout.

    ``--expected`` optionally points at the fixture's expected.json. Its
    parent directory name is used as the fixture id, and its
    ``tier_a_waivers`` list (string entries only) supplies waiver globs.
    Loading waivers is best-effort: an unreadable, undecodable, malformed, or
    wrongly-shaped file produces a warning on stderr and the run continues
    with no waivers.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--work")
    ap.add_argument("--scaffold")
    ap.add_argument(
        "--expected",
        help="Path to fixture expected.json (for tier_a_waivers)",
        default=None,
    )
    ap.add_argument(
        "--list-categories",
        action="store_true",
        help="Emit the stable oracle CATEGORIES enum as JSON and exit (iter-0022).",
    )
    args = ap.parse_args()

    if args.list_categories:
        print(json.dumps({"oracle": ORACLE_NAME, "categories": CATEGORIES}, indent=2, sort_keys=True))
        return

    if not args.work or not args.scaffold:
        ap.error("--work and --scaffold are required unless --list-categories is set")

    waivers = []
    fixture_id = None
    if args.expected:
        exp_path = pathlib.Path(args.expected)
        # fixture_id = parent directory name of expected.json
        fixture_id = exp_path.parent.name
        try:
            expected = json.loads(exp_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as e:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            sys.stderr.write(
                f"[oracle-scope-tier-a] could not read waivers from {args.expected}: {e}\n"
            )
        else:
            # A top-level JSON value that is not an object has no waivers key;
            # guard so it cannot crash the oracle with an AttributeError.
            raw = expected.get("tier_a_waivers", []) if isinstance(expected, dict) else None
            if isinstance(raw, list):
                waivers = [w for w in raw if isinstance(w, str)]
            else:
                sys.stderr.write(
                    f"[oracle-scope-tier-a] ignoring malformed tier_a_waivers in {args.expected}: "
                    "expected a JSON object with a list of globs\n"
                )

    findings = analyze(args.work, args.scaffold, waivers, fixture_id=fixture_id)
    print(json.dumps({
        "oracle": ORACLE_NAME,
        "waivers": waivers,
        "fixture_id": fixture_id,
        "findings": findings,
    }, indent=2))


if __name__ == "__main__":
    main()
|