devlyn-cli 1.15.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +104 -0
- package/CLAUDE.md +135 -21
- package/README.md +43 -125
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
- package/benchmark/auto-resolve/README.md +114 -0
- package/benchmark/auto-resolve/RUBRIC.md +162 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
- package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
- package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
- package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
- package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
- package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
- package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
- package/benchmark/auto-resolve/scripts/judge.sh +359 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
- package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
- package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
- package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
- package/bin/devlyn.js +175 -17
- package/config/skills/_shared/adapters/README.md +64 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
- package/config/skills/_shared/adapters/opus-4-7.md +29 -0
- package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
- package/config/skills/_shared/codex-config.md +54 -0
- package/config/skills/_shared/codex-monitored.sh +141 -0
- package/config/skills/_shared/engine-preflight.md +35 -0
- package/config/skills/_shared/expected.schema.json +93 -0
- package/config/skills/_shared/pair-plan-schema.md +298 -0
- package/config/skills/_shared/runtime-principles.md +110 -0
- package/config/skills/_shared/spec-verify-check.py +519 -0
- package/config/skills/devlyn:ideate/SKILL.md +99 -429
- package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
- package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
- package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
- package/config/skills/devlyn:resolve/SKILL.md +172 -184
- package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
- package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
- package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
- package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
- package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
- package/package.json +12 -2
- package/scripts/lint-skills.sh +431 -0
- package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
- package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
- package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
- package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
- package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
- package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
- package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
- package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
- package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
- package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
- package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
- package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
- package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
- package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
- package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
- package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
- package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
- package/config/skills/devlyn:clean/SKILL.md +0 -285
- package/config/skills/devlyn:design-ui/SKILL.md +0 -351
- package/config/skills/devlyn:discover-product/SKILL.md +0 -124
- package/config/skills/devlyn:evaluate/SKILL.md +0 -564
- package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
- package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
- package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
- package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
- package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
- package/config/skills/devlyn:preflight/SKILL.md +0 -355
- package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
- package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
- package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
- package/config/skills/devlyn:product-spec/SKILL.md +0 -603
- package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
- package/config/skills/devlyn:review/SKILL.md +0 -161
- package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
- package/config/skills/devlyn:team-review/SKILL.md +0 -493
- package/config/skills/devlyn:update-docs/SKILL.md +0 -463
- package/config/skills/workflow-routing/SKILL.md +0 -73
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# run-iter-0033c.sh — orchestrate the iter-0033c suite (NEW L2 vs NEW L1).
|
|
3
|
+
#
|
|
4
|
+
# Codex R0.5-infra design: bypass run-suite.sh + ship-gate.py + compile-report.py
|
|
5
|
+
# (those enforce variant/bare semantics that don't apply here). Call run-fixture.sh
|
|
6
|
+
# directly per fixture per arm; per-fixture interleaving for fail-early on hard-floor
|
|
7
|
+
# violations (Codex R0.5-infra Q4).
|
|
8
|
+
#
|
|
9
|
+
# Per Mission 1: serial only, no parallel-fleet.
|
|
10
|
+
#
|
|
11
|
+
# Usage:
|
|
12
|
+
# run-iter-0033c.sh --label <label> [--fixtures F1,F2,...] [--c1-summary <path>] [--f9-judge <path>]
|
|
13
|
+
# [--manifest-out <path>] [--results-out-dir <path>] [--skip-judge]
|
|
14
|
+
#
|
|
15
|
+
# Pre-flight: smoke 1b (codex availability) — fail-fast.
|
|
16
|
+
# Arms per fixture:
|
|
17
|
+
# - All fixtures: solo_claude (L1 rerun) + l2_gated (L2 natural triggers)
|
|
18
|
+
# - Pair-eligible (per manifest): also l2_forced (L2 diagnostic)
|
|
19
|
+
# After arms: judge.sh per fixture; manifest build; iter-0033c-compare.py.
|
|
20
|
+
set -euo pipefail
|
|
21
|
+
|
|
22
|
+
usage() {
|
|
23
|
+
cat >&2 <<EOF
|
|
24
|
+
usage: $0 --label <label>
|
|
25
|
+
[--fixtures F1,F2,F3,F4,F5,F6,F7,F8,F9]
|
|
26
|
+
[--c1-summary <path>] # default: benchmark/auto-resolve/results/3bc86dd-iter0033c1-new-20260501T004229Z/summary.json
|
|
27
|
+
[--f9-judge <path>] # default: benchmark/auto-resolve/results/4e3d89a-iter-0033a-f9-smoke3-20260430T232747Z/F9-e2e-ideate-to-resolve/judge.json
|
|
28
|
+
[--results-root <path>] # default: benchmark/auto-resolve/results
|
|
29
|
+
[--skip-judge] # skip judge.sh (re-runnable post-hoc)
|
|
30
|
+
EOF
|
|
31
|
+
exit 1
|
|
32
|
+
}
|
|
33
|
+
|
|
34
|
+
LABEL=""
|
|
35
|
+
FIXTURES_CSV="F1,F2,F3,F4,F5,F6,F7,F8,F9"
|
|
36
|
+
C1_SUMMARY="benchmark/auto-resolve/results/3bc86dd-iter0033c1-new-20260501T004229Z/summary.json"
|
|
37
|
+
F9_JUDGE="benchmark/auto-resolve/results/4e3d89a-iter-0033a-f9-smoke3-20260430T232747Z/F9-e2e-ideate-to-resolve/judge.json"
|
|
38
|
+
RESULTS_ROOT="benchmark/auto-resolve/results"
|
|
39
|
+
SKIP_JUDGE=0
|
|
40
|
+
while [ $# -gt 0 ]; do
|
|
41
|
+
case "$1" in
|
|
42
|
+
--label) LABEL="$2"; shift 2;;
|
|
43
|
+
--fixtures) FIXTURES_CSV="$2"; shift 2;;
|
|
44
|
+
--c1-summary) C1_SUMMARY="$2"; shift 2;;
|
|
45
|
+
--f9-judge) F9_JUDGE="$2"; shift 2;;
|
|
46
|
+
--results-root) RESULTS_ROOT="$2"; shift 2;;
|
|
47
|
+
--skip-judge) SKIP_JUDGE=1; shift;;
|
|
48
|
+
*) usage;;
|
|
49
|
+
esac
|
|
50
|
+
done
|
|
51
|
+
[ -n "$LABEL" ] || usage
|
|
52
|
+
|
|
53
|
+
REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
|
|
54
|
+
cd "$REPO_ROOT"
|
|
55
|
+
|
|
56
|
+
# --- Smoke 1b: codex availability fail-fast ---
|
|
57
|
+
echo "=== Smoke 1b: Codex availability ==="
|
|
58
|
+
if ! command -v codex >/dev/null 2>&1; then
|
|
59
|
+
echo "FAIL: codex not on PATH — iter-0033c L2 arms cannot run" >&2
|
|
60
|
+
exit 1
|
|
61
|
+
fi
|
|
62
|
+
echo "PASS: $(command -v codex) ($(codex --version 2>&1 | head -1))"
|
|
63
|
+
|
|
64
|
+
# --- Mirror committed skills to .claude/skills (parity with run-suite.sh:111-141) ---
|
|
65
|
+
# Iteration commits land in config/skills/; the variant-arm runtime resolves
|
|
66
|
+
# from .claude/skills/. Without this step, edits to SKILL.md / phase prompts /
|
|
67
|
+
# _shared scripts (e.g. archive_run.py iter-0033c fix) silently run against the
|
|
68
|
+
# stale mirror. UNSHIPPED list mirrors bin/devlyn.js:299-304.
|
|
69
|
+
SRC_SKILLS="$REPO_ROOT/config/skills"
|
|
70
|
+
DST_SKILLS="$REPO_ROOT/.claude/skills"
|
|
71
|
+
mkdir -p "$DST_SKILLS"
|
|
72
|
+
mirrored=0
|
|
73
|
+
for src_dir in "$SRC_SKILLS"/*/; do
|
|
74
|
+
[ -d "$src_dir" ] || continue
|
|
75
|
+
name=$(basename "$src_dir")
|
|
76
|
+
case "$name" in
|
|
77
|
+
devlyn:auto-resolve-workspace|devlyn:ideate-workspace|preflight-workspace|roadmap-archival-workspace)
|
|
78
|
+
continue ;;
|
|
79
|
+
esac
|
|
80
|
+
staging="$DST_SKILLS/.${name}.staging"
|
|
81
|
+
rm -rf "$staging"
|
|
82
|
+
cp -R "$src_dir" "$staging"
|
|
83
|
+
rm -rf "$DST_SKILLS/$name"
|
|
84
|
+
mv "$staging" "$DST_SKILLS/$name"
|
|
85
|
+
mirrored=$((mirrored + 1))
|
|
86
|
+
done
|
|
87
|
+
echo "[run-iter-0033c] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
|
|
88
|
+
|
|
89
|
+
# --- Setup ---
|
|
90
|
+
HEAD_SHA=$(git rev-parse --short HEAD)
|
|
91
|
+
TS=$(date -u +%Y%m%dT%H%M%SZ)
|
|
92
|
+
RUN_ID="${HEAD_SHA}-iter0033c-${LABEL}-${TS}"
|
|
93
|
+
RESULTS_DIR="$RESULTS_ROOT/$RUN_ID"
|
|
94
|
+
mkdir -p "$RESULTS_DIR"
|
|
95
|
+
echo "[run-iter-0033c] RUN_ID=$RUN_ID"
|
|
96
|
+
echo "[run-iter-0033c] RESULTS_DIR=$RESULTS_DIR"
|
|
97
|
+
|
|
98
|
+
# --- Determine pair-eligible set from manifest input bundle ---
|
|
99
|
+
# Build a draft manifest using the C1 summary as the L1 placeholder; we'll
|
|
100
|
+
# rebuild with the real L1 rerun summary at the end. For now we just need
|
|
101
|
+
# the pair-eligible set for arm-selection per fixture.
|
|
102
|
+
DRAFT_MANIFEST="$RESULTS_DIR/manifest-draft.json"
|
|
103
|
+
python3 benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
|
|
104
|
+
--c1-summary "$C1_SUMMARY" \
|
|
105
|
+
--f9-judge "$F9_JUDGE" \
|
|
106
|
+
--l1-rerun-summary "$C1_SUMMARY" \
|
|
107
|
+
--output "$DRAFT_MANIFEST"
|
|
108
|
+
PAIR_ELIGIBLE=$(python3 -c "import json;print(' '.join(json.load(open('$DRAFT_MANIFEST'))['fixtures_pair_eligible']))")
|
|
109
|
+
echo "[run-iter-0033c] pair-eligible: $PAIR_ELIGIBLE"
|
|
110
|
+
|
|
111
|
+
# --- Per-fixture interleaved arm loop ---
|
|
112
|
+
IFS=',' read -ra FIXTURES <<< "$FIXTURES_CSV"
|
|
113
|
+
declare -a TIMINGS=()
|
|
114
|
+
for short in "${FIXTURES[@]}"; do
|
|
115
|
+
# Resolve short ID to canonical fixture dir name.
|
|
116
|
+
case "$short" in
|
|
117
|
+
F1) fx="F1-cli-trivial-flag";;
|
|
118
|
+
F2) fx="F2-cli-medium-subcommand";;
|
|
119
|
+
F3) fx="F3-backend-contract-risk";;
|
|
120
|
+
F4) fx="F4-web-browser-design";;
|
|
121
|
+
F5) fx="F5-fix-loop-red-green";;
|
|
122
|
+
F6) fx="F6-dep-audit-native-module";;
|
|
123
|
+
F7) fx="F7-out-of-scope-trap";;
|
|
124
|
+
F8) fx="F8-known-limit-ambiguous";;
|
|
125
|
+
F9) fx="F9-e2e-ideate-to-resolve";;
|
|
126
|
+
*) echo "[run-iter-0033c] unknown fixture short id: $short" >&2; exit 1;;
|
|
127
|
+
esac
|
|
128
|
+
echo ""
|
|
129
|
+
echo "=== Fixture $fx ==="
|
|
130
|
+
ARMS=("solo_claude" "l2_gated")
|
|
131
|
+
if [[ " $PAIR_ELIGIBLE " =~ " $short " ]]; then
|
|
132
|
+
ARMS+=("l2_forced")
|
|
133
|
+
fi
|
|
134
|
+
for arm in "${ARMS[@]}"; do
|
|
135
|
+
echo "[run-iter-0033c] $fx :: $arm START $(date -u +%FT%TZ)"
|
|
136
|
+
arm_t0=$(date +%s)
|
|
137
|
+
if ! bash benchmark/auto-resolve/scripts/run-fixture.sh \
|
|
138
|
+
--fixture "$fx" --arm "$arm" \
|
|
139
|
+
--run-id "$RUN_ID" --resolve-skill new \
|
|
140
|
+
> "$RESULTS_DIR/${fx}-${arm}.log" 2>&1; then
|
|
141
|
+
echo "[run-iter-0033c] $fx :: $arm FAILED — see $RESULTS_DIR/${fx}-${arm}.log"
|
|
142
|
+
# Continue to next arm; full failure surface goes through compare.py gates.
|
|
143
|
+
fi
|
|
144
|
+
arm_t1=$(date +%s)
|
|
145
|
+
elapsed=$((arm_t1 - arm_t0))
|
|
146
|
+
TIMINGS+=("$fx:$arm:${elapsed}s")
|
|
147
|
+
echo "[run-iter-0033c] $fx :: $arm END elapsed=${elapsed}s"
|
|
148
|
+
done
|
|
149
|
+
|
|
150
|
+
# Per-fixture judge (graded across ARMS_PRESENT)
|
|
151
|
+
if [ "$SKIP_JUDGE" -eq 0 ]; then
|
|
152
|
+
echo "[run-iter-0033c] $fx :: judge START"
|
|
153
|
+
if ! bash benchmark/auto-resolve/scripts/judge.sh \
|
|
154
|
+
--fixture "$fx" --run-id "$RUN_ID" \
|
|
155
|
+
> "$RESULTS_DIR/${fx}-judge.log" 2>&1; then
|
|
156
|
+
echo "[run-iter-0033c] $fx :: judge FAILED — see $RESULTS_DIR/${fx}-judge.log"
|
|
157
|
+
fi
|
|
158
|
+
echo "[run-iter-0033c] $fx :: judge END"
|
|
159
|
+
fi
|
|
160
|
+
done
|
|
161
|
+
|
|
162
|
+
# --- Build L1 rerun summary from solo_claude arm result.json + judge.json ---
|
|
163
|
+
L1_RERUN_SUMMARY="$RESULTS_DIR/l1-rerun-summary.json"
|
|
164
|
+
python3 - "$RESULTS_DIR" "$L1_RERUN_SUMMARY" "$RUN_ID" "$HEAD_SHA" <<'PY'
|
|
165
|
+
import json, sys
|
|
166
|
+
from pathlib import Path
|
|
167
|
+
results_dir = Path(sys.argv[1])
|
|
168
|
+
out_path = Path(sys.argv[2])
|
|
169
|
+
run_id = sys.argv[3]
|
|
170
|
+
head_sha = sys.argv[4]
|
|
171
|
+
rows = []
|
|
172
|
+
for fx_dir in sorted(results_dir.iterdir()):
|
|
173
|
+
if not fx_dir.is_dir():
|
|
174
|
+
continue
|
|
175
|
+
judge_p = fx_dir / "judge.json"
|
|
176
|
+
if not judge_p.is_file():
|
|
177
|
+
continue
|
|
178
|
+
judge = json.loads(judge_p.read_text())
|
|
179
|
+
mapping = judge.get("_blind_mapping") or {}
|
|
180
|
+
inv = {v: k for k, v in mapping.items()}
|
|
181
|
+
arms = {}
|
|
182
|
+
for arm_name in ("solo_claude", "l2_gated", "l2_forced", "bare"):
|
|
183
|
+
letter = inv.get(arm_name)
|
|
184
|
+
if not letter:
|
|
185
|
+
continue
|
|
186
|
+
arm_dir = fx_dir / arm_name
|
|
187
|
+
result = {}
|
|
188
|
+
if (arm_dir / "result.json").is_file():
|
|
189
|
+
result = json.loads((arm_dir / "result.json").read_text())
|
|
190
|
+
arms[arm_name] = {
|
|
191
|
+
"score": judge.get(f"{letter}_score"),
|
|
192
|
+
"wall_s": result.get("elapsed_seconds"),
|
|
193
|
+
"verify_score": result.get("verify_score"),
|
|
194
|
+
"files_changed": result.get("files_changed"),
|
|
195
|
+
"timed_out": result.get("timed_out"),
|
|
196
|
+
"disqualifier": result.get("disqualifier"),
|
|
197
|
+
}
|
|
198
|
+
rows.append({"fixture": fx_dir.name, "arms": arms})
|
|
199
|
+
out = {
|
|
200
|
+
"run_id": run_id,
|
|
201
|
+
"git_sha": head_sha,
|
|
202
|
+
"fixtures_total": len(rows),
|
|
203
|
+
"rows": rows,
|
|
204
|
+
}
|
|
205
|
+
out_path.write_text(json.dumps(out, indent=2) + "\n")
|
|
206
|
+
print(f"[l1-rerun-summary] wrote {out_path} (fixtures={len(rows)})")
|
|
207
|
+
PY
|
|
208
|
+
|
|
209
|
+
# --- Build final manifest with real L1 rerun summary ---
|
|
210
|
+
FINAL_MANIFEST="$RESULTS_DIR/iter-0033c-pair-eligible.json"
|
|
211
|
+
python3 benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
|
|
212
|
+
--c1-summary "$C1_SUMMARY" \
|
|
213
|
+
--f9-judge "$F9_JUDGE" \
|
|
214
|
+
--l1-rerun-summary "$L1_RERUN_SUMMARY" \
|
|
215
|
+
--output "$FINAL_MANIFEST"
|
|
216
|
+
|
|
217
|
+
# --- Run iter-0033c gate compare ---
|
|
218
|
+
GATES_JSON="$RESULTS_DIR/gates.json"
|
|
219
|
+
GATES_MD="$RESULTS_DIR/gates.md"
|
|
220
|
+
python3 benchmark/auto-resolve/scripts/iter-0033c-compare.py \
|
|
221
|
+
--manifest "$FINAL_MANIFEST" \
|
|
222
|
+
--results-dir "$RESULTS_DIR" \
|
|
223
|
+
--work-dir-root /tmp \
|
|
224
|
+
--run-id "$RUN_ID" \
|
|
225
|
+
--out-json "$GATES_JSON" \
|
|
226
|
+
--out-md "$GATES_MD" \
|
|
227
|
+
|| true # gates may FAIL — exit non-zero handled by inspecting gates.json
|
|
228
|
+
|
|
229
|
+
echo ""
|
|
230
|
+
echo "=== iter-0033c done ==="
|
|
231
|
+
echo "RESULTS_DIR=$RESULTS_DIR"
|
|
232
|
+
echo "MANIFEST=$FINAL_MANIFEST"
|
|
233
|
+
echo "GATES=$GATES_MD"
|
|
234
|
+
printf '\n--- per-arm wall ---\n%s\n' "$(printf '%s\n' "${TIMINGS[@]}")"
|
|
#!/usr/bin/env bash
# run-suite.sh — the single-command benchmark entry.
#
# Orchestrates: fixture setup + arm invocations + blind judge + report + ship
# gate. Called by `npx devlyn-cli benchmark` as well as directly.
#
# Usage:
#   run-suite.sh                 # all fixtures, n=1 smoke
#   run-suite.sh --n 3           # 3 runs per fixture for ship decisions
#   run-suite.sh F2 F5           # specific fixtures only
#   run-suite.sh --dry-run       # skip model invocations, validate setup
#   run-suite.sh --judge-only --run-id X   # re-judge an existing run
#   run-suite.sh --label v3.6    # tag this run
#   run-suite.sh --bless         # if ship-gate PASS, promote to baselines/shipped.json
#   run-suite.sh --resolve-skill new # invoke /devlyn:resolve --spec (the only supported value post iter-0034 cutover; flag kept as accepted no-op for historical runners)
#
# Exits 0 on PASS, 1 on FAIL.

set -euo pipefail

# Script lives at <bench-root>/scripts/; bench root is its parent, repo root
# is two levels above that.
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"

# Defaults; overridden by the flags parsed below.
N=1
LABEL=""
DRY_RUN=0
JUDGE_ONLY=0
RUN_ID_ARG=""
BLESS=0
ACCEPT_MISSING=0
SUITE="golden"
RESOLVE_SKILL="new"
FIXTURES=()

while [ $# -gt 0 ]; do
  case "$1" in
    --n) N="$2"; shift 2;;
    --label) LABEL="$2"; shift 2;;
    --dry-run) DRY_RUN=1; shift;;
    --judge-only) JUDGE_ONLY=1; shift;;
    --run-id) RUN_ID_ARG="$2"; shift 2;;
    --bless) BLESS=1; shift;;
    --accept-missing) ACCEPT_MISSING=1; shift;;
    --suite) SUITE="$2"; shift 2;;
    --resolve-skill) RESOLVE_SKILL="$2"; shift 2;;
    -h|--help)
      # Print the usage block from this file's own header comment.
      head -22 "$0" | sed -n '3,22p'; exit 0;;
    # Bare positional fixture IDs: F*/S* followed by a digit (golden/shadow).
    [FS][0-9]*) FIXTURES+=("$1"); shift;;
    *)
      echo "unknown arg: $1" >&2; exit 1;;
  esac
done

# iter-0034 Phase 4 cutover (2026-05-03): OLD `/devlyn:auto-resolve` deleted.
# Only `new` (= /devlyn:resolve --spec) is supported. The flag is retained as
# an accepted no-op so historical runners (e.g. run-iter-0033c.sh) keep working
# without edit. `old` is hard-errored with a pointer at the cutover commit.
if [ "$RESOLVE_SKILL" = "old" ]; then
  echo "--resolve-skill old is no longer supported: /devlyn:auto-resolve was deleted in the iter-0034 Phase 4 cutover. Use --resolve-skill new (default) or omit the flag." >&2
  exit 1
fi
[ "$RESOLVE_SKILL" = "new" ] || \
  { echo "--resolve-skill must be 'new' (got '$RESOLVE_SKILL')" >&2; exit 1; }

# Suite → fixtures directory + discovery prefix.
case "$SUITE" in
  golden) FIXTURES_DIR="$BENCH_ROOT/fixtures"; FIXTURES_GLOB="F*";;
  shadow) FIXTURES_DIR="$BENCH_ROOT/shadow-fixtures"; FIXTURES_GLOB="S*";;
  *) echo "error: --suite must be 'golden' or 'shadow' (got '$SUITE')" >&2; exit 1;;
esac

# n must be 1 while iteration semantics aren't wired through judge/report.
# Remove this block when compile-report.py gains multi-iter aggregation.
if [ "$N" -ne 1 ]; then
  echo "error: --n $N not yet supported — judge/report currently expect a single iteration per fixture." >&2
  echo " Track progress in benchmark/auto-resolve/BENCHMARK-DESIGN.md (#multi-iter-roadmap)." >&2
  exit 2
fi

# Auto-discover fixtures if none specified
if [ ${#FIXTURES[@]} -eq 0 ]; then
  # FIXTURES_GLOB intentionally unquoted so the glob expands.
  for d in "$FIXTURES_DIR"/$FIXTURES_GLOB/; do
    [ -d "$d" ] && FIXTURES+=("$(basename "$d")")
  done
fi

if [ ${#FIXTURES[@]} -eq 0 ]; then
  echo "no fixtures found in $FIXTURES_DIR/ — build the suite first" >&2
  exit 1
fi

# RUN_ID: caller-supplied (required for --judge-only re-runs) or derived
# from UTC timestamp + short SHA + optional label.
if [ -n "$RUN_ID_ARG" ]; then
  RUN_ID="$RUN_ID_ARG"
else
  TS=$(date -u +%Y%m%dT%H%M%SZ)
  SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
  RUN_ID="${TS}-${SHA}${LABEL:+-$LABEL}"
fi

RES_DIR="$BENCH_ROOT/results/$RUN_ID"
mkdir -p "$RES_DIR"

# Run banner.
echo ""
echo "═══ Benchmark Suite Run ═══"
echo "Run-id: $RUN_ID"
echo "Label: ${LABEL:-(unlabeled)}"
echo "Suite: $SUITE ($FIXTURES_DIR)"
echo "Fixtures: ${FIXTURES[*]}"
echo "n: $N"
echo "Resolve skill: $RESOLVE_SKILL"
[ $DRY_RUN -eq 1 ] && echo "Mode: DRY RUN (no model invocations)"
[ $JUDGE_ONLY -eq 1 ] && echo "Mode: JUDGE ONLY (re-judging existing artifacts)"
echo ""

# ---- Mirror committed skills into .claude/skills (iter-0017) --------------
# The variant arm reads $REPO_ROOT/.claude/skills/, but iteration commits land
# in config/skills/. Without this step every checkout/revert that touches
# SKILL.md or phase prompts requires a manual `node bin/devlyn.js -y` or
# surgical cp; forgetting it silently runs the suite against stale skills.
# Replicates the clean-then-copy semantics of bin/devlyn.js
# (cleanManagedSkillDirs ~L313 + copyRecursive ~L274). Per-skill staging dir
# + atomic mv keeps a Ctrl-C window from leaving a managed skill missing.
# UNSHIPPED list mirrors bin/devlyn.js:299-304 — keep them in sync.
# Skipped only in --judge-only (no model invocations); runs in --dry-run.
if [ $JUDGE_ONLY -eq 0 ]; then
  SRC_SKILLS="$REPO_ROOT/config/skills"
  DST_SKILLS="$REPO_ROOT/.claude/skills"
  mkdir -p "$DST_SKILLS"
  mirrored=0
  for src_dir in "$SRC_SKILLS"/*/; do
    [ -d "$src_dir" ] || continue
    name=$(basename "$src_dir")
    # Workspace dirs are local scratch, never mirrored.
    case "$name" in
      devlyn:auto-resolve-workspace|devlyn:ideate-workspace|preflight-workspace|roadmap-archival-workspace)
        continue ;;
    esac
    staging="$DST_SKILLS/.${name}.staging"
    rm -rf "$staging"
    cp -R "$src_dir" "$staging"
    rm -rf "$DST_SKILLS/$name"
    mv "$staging" "$DST_SKILLS/$name"
    mirrored=$((mirrored + 1))
  done
  echo "[suite] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
fi

# Prereq checks
if [ $DRY_RUN -eq 0 ] && [ $JUDGE_ONLY -eq 0 ]; then
  command -v claude >/dev/null 2>&1 || { echo "claude CLI missing; install Claude Code first"; exit 1; }
fi
if [ $JUDGE_ONLY -eq 0 ]; then
  # Missing codex is a warning (arms still run); judge.sh will fail later.
  command -v codex >/dev/null 2>&1 || echo "warning: codex CLI missing — judge will fail"
fi
command -v python3 >/dev/null 2>&1 || { echo "python3 missing"; exit 1; }

# Install test-repo deps once per suite run (shared cache)
if [ $DRY_RUN -eq 0 ] && [ $JUDGE_ONLY -eq 0 ]; then
  TEST_REPO="$BENCH_ROOT/fixtures/test-repo"
  if [ ! -d "$TEST_REPO/node_modules" ]; then
    echo "[suite] installing test-repo deps (one-time)"
    # Subshell cd keeps the suite's own CWD untouched.
    if ! (cd "$TEST_REPO" && npm install --no-audit --no-fund --loglevel=error); then
      echo "[suite] ✗ npm install in test-repo failed — check network/npm auth. Aborting." >&2
      exit 1
    fi
  fi
fi

# ---- Run arms ---------------------------------------------------------------
if [ $JUDGE_ONLY -eq 0 ]; then
  for fid in "${FIXTURES[@]}"; do
    [ -d "$FIXTURES_DIR/$fid" ] || { echo "[suite] skip $fid (missing)"; continue; }
    for arm in variant solo_claude bare; do
      echo "[suite] ► $fid / $arm (resolve-skill=$RESOLVE_SKILL)"
      extra=""
      [ $DRY_RUN -eq 1 ] && extra="--dry-run"
      # $extra intentionally unquoted: empty expands to no argument.
      # Arm failures are tolerated so remaining arms/fixtures still run.
      bash "$BENCH_ROOT/scripts/run-fixture.sh" \
        --fixture "$fid" --arm "$arm" --run-id "$RUN_ID" \
        --resolve-skill "$RESOLVE_SKILL" $extra \
        || echo "[suite] ✗ $fid / $arm (arm failure tolerated; artifacts still captured)"
    done
  done
fi

# ---- Judge ------------------------------------------------------------------
for fid in "${FIXTURES[@]}"; do
  if [ ! -d "$BENCH_ROOT/results/$RUN_ID/$fid" ]; then
    echo "[suite] skip judge for $fid (no results)"
    continue
  fi
  if [ $DRY_RUN -eq 1 ]; then
    echo "[suite] DRY RUN — skipping judge for $fid"
    continue
  fi
  echo "[suite] ► judge $fid"
  bash "$BENCH_ROOT/scripts/judge.sh" --fixture "$fid" --run-id "$RUN_ID" \
    || echo "[suite] ✗ judge failed for $fid (will appear as NO_JUDGE in report)"
done

# ---- Compile report + ship gate --------------------------------------------
if [ $DRY_RUN -eq 1 ]; then
  echo ""
  echo "[suite] DRY RUN complete — results in $RES_DIR"
  echo "Run without --dry-run to invoke models."
  exit 0
fi

echo ""
python3 "$BENCH_ROOT/scripts/compile-report.py" --run-id "$RUN_ID" ${LABEL:+--label "$LABEL"}

# ship-gate.py's exit code is the script's exit code (set -e; last command):
# 0 on PASS, 1 on FAIL — matching the contract in the header.
extra_flag=""
[ $BLESS -eq 1 ] && extra_flag="$extra_flag --bless"
[ $ACCEPT_MISSING -eq 1 ] && extra_flag="$extra_flag --accept-missing"
python3 "$BENCH_ROOT/scripts/ship-gate.py" --run-id "$RUN_ID" $extra_flag
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
ship-gate.py — apply RUBRIC.md ship thresholds to a suite run's summary.json.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
ship-gate.py --run-id <ID> # check gates, return 0/1 via exit code
|
|
7
|
+
ship-gate.py --run-id <ID> --bless # if PASS, promote summary to baselines/shipped.json
|
|
8
|
+
|
|
9
|
+
Exits 0 on PASS, 1 on FAIL.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
import argparse, json, pathlib, sys, shutil, datetime
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def main() -> int:
    """Check a suite run's summary.json against the RUBRIC ship thresholds.

    Reads ``results/<run-id>/summary.json``, evaluates hard-floor gates
    (collected in ``failures`` → FAIL) and soft gates (collected in
    ``warnings`` → reported only), prints a PASS/FAIL verdict, and — when
    ``--bless`` is given on a PASS — promotes the summary to
    ``history/baselines/shipped.json``.

    Returns:
        0 on PASS, 1 on FAIL (used directly as the process exit code).
    """
    p = argparse.ArgumentParser()
    p.add_argument("--run-id", required=True)
    p.add_argument("--bless", action="store_true")
    p.add_argument("--accept-missing", action="store_true",
                   help="skip hard-floor gates that require fixtures not yet implemented "
                        "(F9 and the 7-of-9 count) — only for suites in bootstrap")
    args = p.parse_args()

    # Repo-relative layout: this script lives in <root>/scripts/, so the
    # benchmark root is two levels up from the file itself.
    root = pathlib.Path(__file__).resolve().parent.parent
    summary_p = root / "results" / args.run_id / "summary.json"
    if not summary_p.exists():
        print(f"no summary at {summary_p}", file=sys.stderr)
        return 1
    summary = json.loads(summary_p.read_text())

    # The shipped baseline is optional; a corrupt/unreadable file is treated
    # the same as "no baseline" so the regression gates simply do not fire.
    baseline_p = root / "history" / "baselines" / "shipped.json"
    baseline = None
    if baseline_p.exists():
        try:
            baseline = json.loads(baseline_p.read_text())
        except Exception:
            baseline = None

    failures: list[str] = []   # hard-floor violations — any one means FAIL
    warnings: list[str] = []   # soft-gate notes — reported, never fail the run

    # Hard floor 1: no disqualifier in variant
    if summary["hard_floor_violations"] > 0:
        failures.append(f"{summary['hard_floor_violations']} variant disqualifier(s) — see report")

    # Hard floor 2: F9 must pass (skipped during bootstrap via --accept-missing)
    # Variant arm legacy gate kept for L2 baseline comparability.
    # iter-0033a (2026-04-30): renamed F9 dir from -to-preflight to -to-resolve to
    # match the shipped 2-skill contract (no preflight). The OLD pre-rename id
    # is preserved in fixtures/retired/ for replay.
    f9_row = next((r for r in summary["rows"] if r.get("fixture") == "F9-e2e-ideate-to-resolve"), None)
    if f9_row is None:
        if not args.accept_missing:
            failures.append("F9 (E2E novice flow) missing — add fixture or run with --accept-missing")
    else:
        # `or -999` makes a missing/None margin an automatic floor violation.
        if (f9_row.get("margin") or -999) < 5:
            failures.append("F9 (E2E novice flow) must have variant margin ≥ +5")

    # Hard floor 3: ≥ 7 of 9 gated fixtures with margin ≥ +5
    # (skipped during bootstrap via --accept-missing)
    if summary["gated_fixtures"] > 0 and summary["margin_ge_5_count"] < 7:
        if not args.accept_missing:
            failures.append(
                f"only {summary['margin_ge_5_count']} of {summary['gated_fixtures']} "
                f"gated fixtures have variant margin ≥ +5 (need ≥ 7)"
            )

    # iter-0023 — L1 (solo_claude) gates per NORTH-STAR.md ops test #1.
    # Codex R1 (this iter) caught that ship-gate enforced only legacy L2
    # `variant` margin and never read `solo_over_bare`. Now NORTH-STAR's
    # documented L1 floor (≥ +5, ≥ 7/9 fixtures, F9 ≥ +5, no L1
    # disqualifier) is mechanically enforced.
    arms_present = summary.get("arms_present", {})
    margins_avg = summary.get("margins_avg", {})
    if arms_present.get("solo_claude"):
        l1_avg = margins_avg.get("solo_over_bare")
        if l1_avg is not None and l1_avg < 5:
            warnings.append(
                f"L1 (solo_over_bare) suite avg {l1_avg:+.1f} below NORTH-STAR floor +5 "
                "(reporting only — per-fixture L1 gates below are decisive)"
            )
        # F9 L1 floor
        if f9_row is not None:
            f9_l1 = (f9_row.get("margins") or {}).get("solo_over_bare")
            if f9_l1 is None:
                if not args.accept_missing:
                    failures.append("F9 L1 (solo_over_bare) margin missing — measurement invalid")
            elif f9_l1 < 5:
                failures.append(f"F9 L1 (solo_over_bare) margin {f9_l1:+d} < +5 floor")
        # 7-of-9 L1 floor — headroom-aware (added 2026-05-02 per iter-0033 R4
        # Codex collab + NORTH-STAR amendment + RUBRIC hard-floor 3 update).
        # A fixture is excluded from the denominator when 100 - L0_score < 5
        # AND L1_score >= 95 AND the L1 arm has no disqualifier / CRITICAL-HIGH
        # finding / watchdog timeout / regression worse than gate #4. Excluded
        # fixtures become fixture-rotation candidates if RUBRIC's
        # two-shipped-version saturation rule fires.
        l1_ge_5 = 0
        l1_gated = 0
        l1_excluded_headroom = []
        for r in summary.get("rows", []):
            if (r.get("category") or "").lower() == "known-limit":
                continue
            arms = r.get("arms") or {}
            l0 = arms.get("bare") or {}
            l1 = arms.get("solo_claude") or {}
            l0_score = l0.get("score")
            l1_score = l1.get("score")
            m = (r.get("margins") or {}).get("solo_over_bare")
            if m is None:
                # No L1 margin at all: neither gated nor headroom-excluded.
                continue
            # Headroom carve-out — must satisfy ALL conditions:
            # (a) bare ceiling-near (100 - L0 < 5)
            # (b) L1 also ceiling-near (>=95)
            # (c) L1 arm clean (no disqualifier, no axis-invalid, fix-loop didn't fail)
            l1_dq_here = bool(l1.get("disqualifier"))
            l1_axis_inv = (l1.get("_axis_validation_out_of_range_count") or 0) > 0
            if (
                isinstance(l0_score, (int, float)) and isinstance(l1_score, (int, float))
                and (100 - l0_score) < 5 and l1_score >= 95
                and not l1_dq_here and not l1_axis_inv
            ):
                l1_excluded_headroom.append({
                    "fixture": r.get("fixture"),
                    "l0_score": l0_score,
                    "l1_score": l1_score,
                    "margin": m,
                })
                continue
            l1_gated += 1
            if m >= 5:
                l1_ge_5 += 1
        if l1_gated > 0 and l1_ge_5 < 7 and not args.accept_missing:
            failures.append(
                f"L1: only {l1_ge_5} of {l1_gated} headroom-available fixtures have solo_over_bare ≥ +5 (need ≥ 7)"
            )
        if l1_excluded_headroom:
            warnings.append(
                "L1 headroom-excluded (saturation candidates per RUBRIC two-shipped-version rule): "
                + ", ".join(
                    f"{x['fixture']} (L0={x['l0_score']} L1={x['l1_score']} margin={x['margin']:+d})"
                    for x in l1_excluded_headroom
                )
            )
        # L1 disqualifier floor
        l1_dq = sum(
            1 for r in summary.get("rows", [])
            if ((r.get("arms") or {}).get("solo_claude") or {}).get("disqualifier")
        )
        if l1_dq > 0:
            failures.append(f"L1 disqualifier(s): {l1_dq} solo_claude arm(s) hit a disqualifier")
        # L1 axis-validity gate (judge.sh records out-of-range axis cells under
        # `_axis_validation` per fixture). If any L1 row has invalid axis data,
        # the L1 score for that row is not trustworthy.
        l1_axis_invalid = 0
        for r in summary.get("rows", []):
            av = (r.get("arms") or {}).get("solo_claude") or {}
            inv = av.get("_axis_validation_out_of_range_count")
            if inv is not None and inv > 0:
                l1_axis_invalid += 1
        if l1_axis_invalid > 0:
            failures.append(
                f"L1 axis-invalid: {l1_axis_invalid} fixture(s) have out-of-range axis cells — "
                "re-judge before trusting L1 margins"
            )

    # Hard floor 4: no per-fixture regression worse than −5 vs shipped baseline
    if baseline:
        prev_rows = {r["fixture"]: r for r in baseline.get("rows", [])}
        for r in summary["rows"]:
            fid = r.get("fixture")
            prev = prev_rows.get(fid)
            if prev and r.get("variant_score") is not None and prev.get("variant_score") is not None:
                delta = r["variant_score"] - prev["variant_score"]
                if delta < -5:
                    failures.append(f"{fid} regressed {delta:+d} vs shipped (floor: −5)")

    # Soft gate: suite average margin drop > 3
    if baseline:
        margin_delta = summary["margin_avg"] - baseline.get("margin_avg", 0)
        if margin_delta < -3:
            warnings.append(f"suite margin dropped {margin_delta:+.1f} vs shipped (soft gate: > −3)")

    # Soft gate: any fixture that was > +5 before is now ≤ 0
    if baseline:
        prev_rows = {r["fixture"]: r for r in baseline.get("rows", [])}
        for r in summary["rows"]:
            fid = r.get("fixture")
            prev = prev_rows.get(fid)
            # Coalesce the current margin once: the gate treats a missing/None
            # margin as 0, and the warning must format the same coalesced
            # value (formatting r["margin"] directly raised TypeError on None).
            now_m = r.get("margin") or 0
            if prev and (prev.get("margin") or 0) > 5 and now_m <= 0:
                warnings.append(
                    f"{fid} lost its margin: was {prev['margin']:+d}, now {now_m:+d}"
                )

    verdict = "PASS" if not failures else "FAIL"
    print(f"\n═══ SHIP-GATE VERDICT: {verdict} ═══\n")
    if failures:
        print("Hard-floor failures:")
        for f in failures:
            print(f"  ✗ {f}")
        print()
    if warnings:
        print("Soft-gate warnings:")
        for w in warnings:
            print(f"  ⚠ {w}")
        print()
    if not failures and not warnings:
        print("No gate violations. Suite is ship-ready.")

    # Bless if PASS + --bless — opt-in promotion to shipped baseline.
    # Per BENCHMARK-DESIGN.md Karpathy Check, automatic history mutation is
    # deferred until after the suite format stabilizes; `--bless` stays as
    # the explicit promotion path, and `summary.json` inside the run dir
    # is the durable record for ad-hoc inspection.
    if verdict == "PASS" and args.bless:
        baseline_p.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(summary_p, baseline_p)
        print(f"\nBlessed: {baseline_p}")

    return 0 if verdict == "PASS" else 1
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
if __name__ == "__main__":
    # Propagate main()'s 0 (PASS) / 1 (FAIL) result as the process exit code
    # so shell callers (run-suite.sh) can gate on it directly.
    sys.exit(main())
|