devlyn-cli 1.15.0 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/AGENTS.md +104 -0
- package/CLAUDE.md +135 -21
- package/README.md +43 -125
- package/benchmark/auto-resolve/BENCHMARK-DESIGN.md +272 -0
- package/benchmark/auto-resolve/README.md +114 -0
- package/benchmark/auto-resolve/RUBRIC.md +162 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/NOTES.md +30 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +45 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/task.txt +8 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/NOTES.md +54 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected-pair-plan-registry.json +170 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/expected.json +84 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/metadata.json +21 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-fail.json +214 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/pair-plan.sample-pass.json +223 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/setup.sh +5 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/task.txt +14 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected-pair-plan-registry.json +162 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/metadata.json +19 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/NOTES.md +40 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/expected.json +57 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/setup.sh +6 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/expected.json +65 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/setup.sh +55 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/NOTES.md +38 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/expected.json +77 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +49 -0
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/task.txt +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/expected.json +76 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +46 -0
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/NOTES.md +50 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/expected.json +63 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/setup.sh +4 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/spec.md +48 -0
- package/benchmark/auto-resolve/fixtures/F8-known-limit-ambiguous/task.txt +1 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/NOTES.md +93 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/setup.sh +28 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/task.txt +5 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +130 -0
- package/benchmark/auto-resolve/fixtures/test-repo/README.md +27 -0
- package/benchmark/auto-resolve/fixtures/test-repo/bin/cli.js +63 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package-lock.json +823 -0
- package/benchmark/auto-resolve/fixtures/test-repo/package.json +22 -0
- package/benchmark/auto-resolve/fixtures/test-repo/playwright.config.js +17 -0
- package/benchmark/auto-resolve/fixtures/test-repo/server/index.js +37 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/cli.test.js +25 -0
- package/benchmark/auto-resolve/fixtures/test-repo/tests/server.test.js +58 -0
- package/benchmark/auto-resolve/fixtures/test-repo/web/index.html +37 -0
- package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py +174 -0
- package/benchmark/auto-resolve/scripts/check-f9-artifacts.py +256 -0
- package/benchmark/auto-resolve/scripts/compile-report.py +331 -0
- package/benchmark/auto-resolve/scripts/iter-0033c-compare.py +552 -0
- package/benchmark/auto-resolve/scripts/judge-opus-pass.sh +430 -0
- package/benchmark/auto-resolve/scripts/judge.sh +359 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +260 -0
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +274 -0
- package/benchmark/auto-resolve/scripts/oracle-test-fidelity.py +328 -0
- package/benchmark/auto-resolve/scripts/pair-plan-idgen.py +401 -0
- package/benchmark/auto-resolve/scripts/pair-plan-lint.py +468 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +691 -0
- package/benchmark/auto-resolve/scripts/run-iter-0033c.sh +234 -0
- package/benchmark/auto-resolve/scripts/run-suite.sh +214 -0
- package/benchmark/auto-resolve/scripts/ship-gate.py +222 -0
- package/bin/devlyn.js +175 -17
- package/config/skills/_shared/adapters/README.md +64 -0
- package/config/skills/_shared/adapters/gpt-5-5.md +29 -0
- package/config/skills/_shared/adapters/opus-4-7.md +29 -0
- package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py +26 -0
- package/config/skills/_shared/codex-config.md +54 -0
- package/config/skills/_shared/codex-monitored.sh +141 -0
- package/config/skills/_shared/engine-preflight.md +35 -0
- package/config/skills/_shared/expected.schema.json +93 -0
- package/config/skills/_shared/pair-plan-schema.md +298 -0
- package/config/skills/_shared/runtime-principles.md +110 -0
- package/config/skills/_shared/spec-verify-check.py +519 -0
- package/config/skills/devlyn:ideate/SKILL.md +99 -429
- package/config/skills/devlyn:ideate/references/elicitation.md +97 -0
- package/config/skills/devlyn:ideate/references/from-spec-mode.md +54 -0
- package/config/skills/devlyn:ideate/references/project-mode.md +76 -0
- package/config/skills/devlyn:ideate/references/spec-template.md +102 -0
- package/config/skills/devlyn:resolve/SKILL.md +172 -184
- package/config/skills/devlyn:resolve/references/free-form-mode.md +68 -0
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +45 -0
- package/config/skills/devlyn:resolve/references/phases/cleanup.md +39 -0
- package/config/skills/devlyn:resolve/references/phases/implement.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/plan.md +42 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +69 -0
- package/config/skills/devlyn:resolve/references/state-schema.md +106 -0
- package/{config/skills → optional-skills}/devlyn:design-system/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:reap/SKILL.md +1 -0
- package/{config/skills → optional-skills}/devlyn:team-design-ui/SKILL.md +5 -0
- package/package.json +12 -2
- package/scripts/lint-skills.sh +431 -0
- package/config/skills/devlyn:auto-resolve/SKILL.md +0 -252
- package/config/skills/devlyn:auto-resolve/evals/evals.json +0 -21
- package/config/skills/devlyn:auto-resolve/evals/task-doctor-subcommand.md +0 -42
- package/config/skills/devlyn:auto-resolve/references/build-gate.md +0 -130
- package/config/skills/devlyn:auto-resolve/references/engine-routing.md +0 -82
- package/config/skills/devlyn:auto-resolve/references/findings-schema.md +0 -103
- package/config/skills/devlyn:auto-resolve/references/phases/phase-1-build.md +0 -54
- package/config/skills/devlyn:auto-resolve/references/phases/phase-2-evaluate.md +0 -45
- package/config/skills/devlyn:auto-resolve/references/phases/phase-3-critic.md +0 -84
- package/config/skills/devlyn:auto-resolve/references/pipeline-routing.md +0 -114
- package/config/skills/devlyn:auto-resolve/references/pipeline-state.md +0 -201
- package/config/skills/devlyn:auto-resolve/scripts/terminal_verdict.py +0 -96
- package/config/skills/devlyn:browser-validate/SKILL.md +0 -164
- package/config/skills/devlyn:browser-validate/references/flow-testing.md +0 -118
- package/config/skills/devlyn:browser-validate/references/tier1-chrome.md +0 -137
- package/config/skills/devlyn:browser-validate/references/tier2-playwright.md +0 -195
- package/config/skills/devlyn:browser-validate/references/tier3-curl.md +0 -57
- package/config/skills/devlyn:clean/SKILL.md +0 -285
- package/config/skills/devlyn:design-ui/SKILL.md +0 -351
- package/config/skills/devlyn:discover-product/SKILL.md +0 -124
- package/config/skills/devlyn:evaluate/SKILL.md +0 -564
- package/config/skills/devlyn:feature-spec/SKILL.md +0 -630
- package/config/skills/devlyn:ideate/references/challenge-rubric.md +0 -122
- package/config/skills/devlyn:ideate/references/codex-critic-template.md +0 -42
- package/config/skills/devlyn:ideate/references/templates/item-spec.md +0 -90
- package/config/skills/devlyn:implement-ui/SKILL.md +0 -466
- package/config/skills/devlyn:preflight/SKILL.md +0 -355
- package/config/skills/devlyn:preflight/references/auditors/browser-auditor.md +0 -32
- package/config/skills/devlyn:preflight/references/auditors/code-auditor.md +0 -86
- package/config/skills/devlyn:preflight/references/auditors/docs-auditor.md +0 -38
- package/config/skills/devlyn:product-spec/SKILL.md +0 -603
- package/config/skills/devlyn:recommend-features/SKILL.md +0 -286
- package/config/skills/devlyn:review/SKILL.md +0 -161
- package/config/skills/devlyn:team-resolve/SKILL.md +0 -631
- package/config/skills/devlyn:team-review/SKILL.md +0 -493
- package/config/skills/devlyn:update-docs/SKILL.md +0 -463
- package/config/skills/workflow-routing/SKILL.md +0 -73
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/reap.sh +0 -0
- /package/{config/skills → optional-skills}/devlyn:reap/scripts/scan.sh +0 -0
|
@@ -0,0 +1,691 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# run-fixture.sh — run ONE fixture, ONE arm, end-to-end. Self-contained.
|
|
3
|
+
#
|
|
4
|
+
# Prepares a fresh work dir, applies setup, invokes the arm via `claude -p`
|
|
5
|
+
# subprocess (isolated session), then captures artifacts + runs verification.
|
|
6
|
+
#
|
|
7
|
+
# Usage:
|
|
8
|
+
# run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new]
|
|
9
|
+
# run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new] --dry-run
|
|
10
|
+
#
|
|
11
|
+
# Outputs to benchmark/auto-resolve/results/<run-id>/<fixture>/<arm>/:
|
|
12
|
+
# input.md, transcript.txt, diff.patch, changed-files.txt, verify.json,
|
|
13
|
+
# timing.json, result.json, setup.log (if setup ran)
|
|
14
|
+
|
|
15
|
+
set -euo pipefail
|
|
16
|
+
|
|
17
|
+
usage() {
|
|
18
|
+
echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]"
|
|
19
|
+
exit 1
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
FIXTURE=""; ARM=""; RUN_ID=""; DRY_RUN=0
|
|
23
|
+
RESOLVE_SKILL="new"
|
|
24
|
+
while [ $# -gt 0 ]; do
|
|
25
|
+
case "$1" in
|
|
26
|
+
--fixture) FIXTURE="$2"; shift 2;;
|
|
27
|
+
--arm) ARM="$2"; shift 2;;
|
|
28
|
+
--run-id) RUN_ID="$2"; shift 2;;
|
|
29
|
+
--resolve-skill) RESOLVE_SKILL="$2"; shift 2;;
|
|
30
|
+
--dry-run) DRY_RUN=1; shift;;
|
|
31
|
+
*) usage;;
|
|
32
|
+
esac
|
|
33
|
+
done
|
|
34
|
+
[ -n "$FIXTURE" ] && [ -n "$ARM" ] && [ -n "$RUN_ID" ] || usage
|
|
35
|
+
# iter-0019: original 3 arms — variant (L2-old: Claude orchestrator + Codex BUILD pair via --engine auto),
|
|
36
|
+
# solo_claude (L1: Claude orchestrator, codex blocked by shim+wrapper enforcement),
|
|
37
|
+
# bare (L0: direct claude -p, no skill, no codex).
|
|
38
|
+
# iter-0033c (Codex R0-infra adoption, 2026-05-02): two new arms for NEW L2 measurement on /devlyn:resolve —
|
|
39
|
+
# l2_gated (--engine claude, no --pair-verify; pair fires only on natural triggers),
|
|
40
|
+
# l2_forced (--engine claude --pair-verify; diagnostic). Both require --resolve-skill new.
|
|
41
|
+
[ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] || [ "$ARM" = "bare" ] \
|
|
42
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ] || \
|
|
43
|
+
{ echo "arm must be variant|solo_claude|bare|l2_gated|l2_forced"; exit 1; }
|
|
44
|
+
# iter-0033c (Codex R0-infra Q2): l2_* arms require NEW skill surface (only NEW
|
|
45
|
+
# `/devlyn:resolve` honors --pair-verify; OLD `/devlyn:auto-resolve` would silently
|
|
46
|
+
# ignore the flag and produce mis-attributed L2 numbers).
|
|
47
|
+
if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
|
|
48
|
+
echo "l2_* arms require --resolve-skill new (got '$RESOLVE_SKILL')"; exit 1
|
|
49
|
+
fi
|
|
50
|
+
# iter-0034 Phase 4 cutover (2026-05-03): OLD `/devlyn:auto-resolve` was
|
|
51
|
+
# deleted. Only `new` (= /devlyn:resolve --spec) is supported. The flag stays
|
|
52
|
+
# an accepted no-op so historical runners (run-iter-0033c.sh:137) keep working
|
|
53
|
+
# unchanged. `old` is hard-errored — silently downgrading to `new` would
|
|
54
|
+
# produce mis-attributed results in any pre-cutover replay attempt.
|
|
55
|
+
if [ "$RESOLVE_SKILL" = "old" ]; then
|
|
56
|
+
echo "--resolve-skill old is no longer supported: /devlyn:auto-resolve was deleted in the iter-0034 Phase 4 cutover. Use --resolve-skill new (default) or omit the flag." >&2
|
|
57
|
+
exit 1
|
|
58
|
+
fi
|
|
59
|
+
[ "$RESOLVE_SKILL" = "new" ] || \
|
|
60
|
+
{ echo "--resolve-skill must be 'new' (got '$RESOLVE_SKILL')"; exit 1; }
|
|
61
|
+
|
|
62
|
+
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
|
|
63
|
+
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"
|
|
64
|
+
|
|
65
|
+
FIX_DIR=""
|
|
66
|
+
for candidate in "$BENCH_ROOT/fixtures/$FIXTURE" "$BENCH_ROOT/shadow-fixtures/$FIXTURE"; do
|
|
67
|
+
if [ -d "$candidate" ]; then FIX_DIR="$candidate"; break; fi
|
|
68
|
+
done
|
|
69
|
+
[ -n "$FIX_DIR" ] || { echo "fixture not found in fixtures/ or shadow-fixtures/: $FIXTURE"; exit 1; }
|
|
70
|
+
|
|
71
|
+
META="$FIX_DIR/metadata.json"
|
|
72
|
+
EXPECTED="$FIX_DIR/expected.json"
|
|
73
|
+
SPEC="$FIX_DIR/spec.md"
|
|
74
|
+
TASK="$FIX_DIR/task.txt"
|
|
75
|
+
SETUP="$FIX_DIR/setup.sh"
|
|
76
|
+
for f in "$META" "$EXPECTED" "$SPEC" "$TASK"; do
|
|
77
|
+
[ -f "$f" ] || { echo "fixture missing required file: $f (see SCHEMA.md)"; exit 1; }
|
|
78
|
+
done
|
|
79
|
+
|
|
80
|
+
# Read the fixture's timeout_seconds from metadata.json. The path is passed via argv rather than spliced into the Python source, so a path containing quotes or backslashes cannot break (or inject code into) the snippet.
TIMEOUT=$(python3 -c 'import json, sys; print(json.load(open(sys.argv[1]))["timeout_seconds"])' "$META")
|
|
81
|
+
|
|
82
|
+
RESULT_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE/$ARM"
|
|
83
|
+
mkdir -p "$RESULT_DIR"
|
|
84
|
+
|
|
85
|
+
# Fresh copy of test-repo — order matters. We copy arm-env files (skills,
|
|
86
|
+
# CLAUDE.md) BEFORE the baseline commit so they do NOT appear in the diff
|
|
87
|
+
# the arm produces. That keeps diff.patch focused on the arm's actual code
|
|
88
|
+
# changes, so forbidden-pattern scans and judge rubrics see only real work.
|
|
89
|
+
WORK_DIR="/tmp/bench-${RUN_ID}-${FIXTURE}-${ARM}"
|
|
90
|
+
rm -rf "$WORK_DIR"
|
|
91
|
+
cp -R "$BENCH_ROOT/fixtures/test-repo" "$WORK_DIR"
|
|
92
|
+
|
|
93
|
+
# All skill-driven arms (variant / solo_claude / l2_gated / l2_forced) get
|
|
94
|
+
# devlyn skills + project CLAUDE.md pre-baseline + codex shim + monitored
|
|
95
|
+
# wrapper. Bare gets nothing (no skill, no shim, no env).
|
|
96
|
+
#
|
|
97
|
+
# iter-0019: solo_claude (L1) shares variant-arm staging because the L1 arm
|
|
98
|
+
# runs the same orchestrator on the same skills — only difference is codex
|
|
99
|
+
# is blocked. Shim catches PATH resolution; wrapper catches direct-path
|
|
100
|
+
# invocations.
|
|
101
|
+
# iter-0033c (Codex R0-infra Q6): l2_gated/l2_forced share variant staging
|
|
102
|
+
# (codex unblocked, shim+wrapper routing). Difference vs variant is the
|
|
103
|
+
# ENGINE_CLAUSE branch below — l2_* run --engine claude (Claude IMPLEMENT)
|
|
104
|
+
# while variant uses --engine auto (Codex IMPLEMENT). Pair-mode in
|
|
105
|
+
# /devlyn:resolve VERIFY phase pulls Codex via the OTHER-engine rule.
|
|
106
|
+
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
107
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
|
|
108
|
+
mkdir -p "$WORK_DIR/.claude"
|
|
109
|
+
if [ -d "$REPO_ROOT/.claude/skills" ]; then
|
|
110
|
+
cp -R "$REPO_ROOT/.claude/skills" "$WORK_DIR/.claude/skills"
|
|
111
|
+
else
|
|
112
|
+
echo "warning: $REPO_ROOT/.claude/skills missing — $ARM may lack project skills" >&2
|
|
113
|
+
fi
|
|
114
|
+
if [ -f "$REPO_ROOT/CLAUDE.md" ]; then
|
|
115
|
+
cp "$REPO_ROOT/CLAUDE.md" "$WORK_DIR/CLAUDE.md"
|
|
116
|
+
fi
|
|
117
|
+
# Stage the codex PATH shim. Required for both variant (route to monitored
|
|
118
|
+
# wrapper) and solo_claude (CODEX_BLOCKED enforcement at PATH layer).
|
|
119
|
+
if command -v codex >/dev/null 2>&1; then
|
|
120
|
+
CODEX_REAL_BIN="$(command -v codex)"
|
|
121
|
+
SHIM_SRC="$REPO_ROOT/scripts/codex-shim/codex"
|
|
122
|
+
WRAPPER_SRC="$REPO_ROOT/config/skills/_shared/codex-monitored.sh"
|
|
123
|
+
if [ ! -x "$SHIM_SRC" ] || [ ! -r "$WRAPPER_SRC" ]; then
|
|
124
|
+
echo "fatal: iter-0009 shim/wrapper missing at $SHIM_SRC / $WRAPPER_SRC" >&2
|
|
125
|
+
exit 1
|
|
126
|
+
fi
|
|
127
|
+
mkdir -p "$WORK_DIR/.devlyn-bin"
|
|
128
|
+
cp "$SHIM_SRC" "$WORK_DIR/.devlyn-bin/codex"
|
|
129
|
+
chmod +x "$WORK_DIR/.devlyn-bin/codex"
|
|
130
|
+
CODEX_MONITORED_PATH="$WORK_DIR/.claude/skills/_shared/codex-monitored.sh"
|
|
131
|
+
[ -r "$CODEX_MONITORED_PATH" ] || {
|
|
132
|
+
echo "fatal: codex-monitored.sh not present in staged skills at $CODEX_MONITORED_PATH" >&2
|
|
133
|
+
exit 1
|
|
134
|
+
}
|
|
135
|
+
export CODEX_REAL_BIN CODEX_MONITORED_PATH
|
|
136
|
+
SNAPSHOT_PATH=$(grep -m1 '^export PATH=' \
|
|
137
|
+
"$HOME/.claude/shell-snapshots/snapshot-zsh-"*.sh 2>/dev/null \
|
|
138
|
+
| head -1 | sed 's/^[^=]*=//' | tr -d '"' || true)
|
|
139
|
+
[ -n "$SNAPSHOT_PATH" ] || SNAPSHOT_PATH="$PATH"
|
|
140
|
+
INJECTED_PATH="$WORK_DIR/.devlyn-bin:$SNAPSHOT_PATH"
|
|
141
|
+
# iter-0019: arm-specific env. variant gets the codex routing pair;
|
|
142
|
+
# solo_claude gets CODEX_BLOCKED=1 (shim + wrapper both refuse).
|
|
143
|
+
# iter-0019.5 (Codex R1, 2026-04-28): when ARM=solo_claude, OMIT
|
|
144
|
+
# CODEX_REAL_BIN + CODEX_MONITORED_PATH from the env block. Reason: the
|
|
145
|
+
# absolute path to the real codex binary in CODEX_REAL_BIN is itself
|
|
146
|
+
# the bypass weapon — the orchestrator could `Bash("$CODEX_REAL_BIN
|
|
147
|
+
# exec ...")` and reach real codex, defeating both shim (PATH lookup
|
|
148
|
+
# bypassed) and wrapper (codex-monitored.sh not invoked). With the
|
|
149
|
+
# var absent: shim's BLOCKED check fires before it needs CODEX_REAL_BIN
|
|
150
|
+
# so removing it doesn't break the shim path; if CODEX_BLOCKED is
|
|
151
|
+
# somehow unset later, the shim now FAILS CLOSED (no real binary to
|
|
152
|
+
# delegate to) rather than silently delegating. The earlier "diagnostic
|
|
153
|
+
# visibility" justification was inverted — it handed the orchestrator
|
|
154
|
+
# the bypass weapon. Across iter-0019 paid 5-fixture run the bypass
|
|
155
|
+
# was OPEN but never exercised; this closes the surface preemptively
|
|
156
|
+
# before iter-0020's 9-fixture L0/L1/L2 run.
|
|
157
|
+
# iter-0033c (Codex R0-infra Q5): l2_gated/l2_forced are codex-UNBLOCKED
|
|
158
|
+
# (codex must be reachable for VERIFY pair-JUDGE). They take the variant
|
|
159
|
+
# path: ARM_CODEX_BLOCKED=0 → python writer omits CODEX_BLOCKED from env
|
|
160
|
+
# entirely (the shim refuses on any non-empty value, so 0 ≠ unset).
|
|
161
|
+
if [ "$ARM" = "solo_claude" ]; then
|
|
162
|
+
ARM_CODEX_BLOCKED=1
|
|
163
|
+
else
|
|
164
|
+
ARM_CODEX_BLOCKED=0
|
|
165
|
+
fi
|
|
166
|
+
python3 - "$WORK_DIR/.claude/settings.json" \
|
|
167
|
+
"$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" <<'PY'
|
|
168
|
+
import json, sys
|
|
169
|
+
out_path, path_val, real_bin, monitored, codex_blocked = sys.argv[1:6]
|
|
170
|
+
env = {
|
|
171
|
+
"CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1",
|
|
172
|
+
"PATH": path_val,
|
|
173
|
+
}
|
|
174
|
+
if codex_blocked == "1":
|
|
175
|
+
# iter-0019.5: solo_claude (L1 arm) — codex blocked at binary layer.
|
|
176
|
+
# Do NOT export CODEX_REAL_BIN / CODEX_MONITORED_PATH to the
|
|
177
|
+
# orchestrator subshell; those vars become bypass weapons under any
|
|
178
|
+
# CODEX_BLOCKED enforcement gap.
|
|
179
|
+
env["CODEX_BLOCKED"] = "1"
|
|
180
|
+
else:
|
|
181
|
+
# variant arm (L2) — codex routes through wrapper as part of pair-mode
|
|
182
|
+
# BUILD; both vars are required by the shim/wrapper handshake.
|
|
183
|
+
env["CODEX_REAL_BIN"] = real_bin
|
|
184
|
+
env["CODEX_MONITORED_PATH"] = monitored
|
|
185
|
+
data = {"env": env}
|
|
186
|
+
with open(out_path, "w") as f:
|
|
187
|
+
json.dump(data, f, indent=2)
|
|
188
|
+
f.write("\n")
|
|
189
|
+
PY
|
|
190
|
+
else
|
|
191
|
+
echo "warning: codex not on PATH — $ARM cannot exercise iter-0009 wrapper / iter-0019 BLOCKED enforcement" >&2
|
|
192
|
+
CODEX_REAL_BIN=""
|
|
193
|
+
CODEX_MONITORED_PATH=""
|
|
194
|
+
fi
|
|
195
|
+
fi
|
|
196
|
+
|
|
197
|
+
(cd "$WORK_DIR" \
|
|
198
|
+
&& git init -q \
|
|
199
|
+
&& git add -A \
|
|
200
|
+
&& git -c user.email=b@b -c user.name=b commit -q -m baseline) \
|
|
201
|
+
|| { echo "baseline git init failed"; exit 1; }
|
|
202
|
+
|
|
203
|
+
# Native security-review Skill expects `refs/remotes/origin/HEAD` to identify
|
|
204
|
+
# the diff surface. Fresh `git init` has no remote, which made a prior F8 run
|
|
205
|
+
# spend ~56 minutes inside CRITIC recovering this manually. Configure a
|
|
206
|
+
# synthetic origin pointing at the work dir itself (no network I/O) and
|
|
207
|
+
# wire origin/HEAD → origin/<current-branch> so security-review resolves
|
|
208
|
+
# immediately.
|
|
209
|
+
(
|
|
210
|
+
cd "$WORK_DIR"
|
|
211
|
+
git remote add origin "$WORK_DIR" 2>/dev/null || true
|
|
212
|
+
BRANCH=$(git branch --show-current 2>/dev/null || echo master)
|
|
213
|
+
git update-ref "refs/remotes/origin/$BRANCH" HEAD 2>/dev/null || true
|
|
214
|
+
git symbolic-ref refs/remotes/origin/HEAD "refs/remotes/origin/$BRANCH" 2>/dev/null || true
|
|
215
|
+
) >/dev/null 2>&1 || true
|
|
216
|
+
|
|
217
|
+
# Fixture-specific setup (applied post-baseline so the diff shows fixture
|
|
218
|
+
# framing as part of the arm's environment, not its work product). Commit
|
|
219
|
+
# failures here break arm-only diff isolation, so fail loudly.
|
|
220
|
+
if [ -f "$SETUP" ] && [ -s "$SETUP" ]; then
|
|
221
|
+
chmod +x "$SETUP"
|
|
222
|
+
if ! (cd "$WORK_DIR" && "$SETUP") > "$RESULT_DIR/setup.log" 2>&1; then
|
|
223
|
+
echo "setup.sh failed; see $RESULT_DIR/setup.log"
|
|
224
|
+
exit 1
|
|
225
|
+
fi
|
|
226
|
+
if ! (cd "$WORK_DIR" \
|
|
227
|
+
&& git add -A \
|
|
228
|
+
&& git -c user.email=b@b -c user.name=b commit -q --allow-empty -m "fixture-setup"); then
|
|
229
|
+
echo "fixture-setup commit failed — arm diff isolation broken"
|
|
230
|
+
exit 1
|
|
231
|
+
fi
|
|
232
|
+
fi
|
|
233
|
+
|
|
234
|
+
# iter-0019.6: stage normalized .devlyn/spec-verify.json containing ONLY
|
|
235
|
+
# verification_commands from expected.json (no tier_a_waivers, no
|
|
236
|
+
# forbidden_patterns, no scope oracles — those have separate enforcement
|
|
237
|
+
# layers). BUILD_GATE's spec-verify-check.py reads this generic path so
|
|
238
|
+
# the orchestrator stays benchmark-agnostic; future /devlyn:ideate could
|
|
239
|
+
# generate the same shape from a spec.md "## Verification" section for
|
|
240
|
+
# real-user runs (Codex R5, 2026-04-28). This stages the skill-driven arms only (variant / solo_claude / l2_gated / l2_forced) — bare's
|
|
241
|
+
# .devlyn/ is created lazily by spec-verify-check.py if absent.
|
|
242
|
+
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
243
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
|
|
244
|
+
python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" <<'PY'
|
|
245
|
+
import json, os, sys
|
|
246
|
+
expected = json.load(open(sys.argv[1]))
|
|
247
|
+
out_path = sys.argv[2]
|
|
248
|
+
normalized = {"verification_commands": expected.get("verification_commands", [])}
|
|
249
|
+
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
|
250
|
+
with open(out_path, "w") as f:
|
|
251
|
+
json.dump(normalized, f, indent=2)
|
|
252
|
+
f.write("\n")
|
|
253
|
+
PY
|
|
254
|
+
fi
|
|
255
|
+
|
|
256
|
+
# Build arm-specific prompt + place arm-specific environment files. Anything
|
|
257
|
+
# that's "benchmark scaffolding" (spec path placement, prompt wrapper) is
|
|
258
|
+
# committed to the work repo as a separate pre-model commit so the model's
|
|
259
|
+
# diff shows only its own work.
|
|
260
|
+
#
|
|
261
|
+
# Per-arm prompt selection is:
|
|
262
|
+
# 1. Fixture-id-aware for F9 (end-to-end novice fixture, no pre-placed spec).
|
|
263
|
+
# 2. Spec-mode `/devlyn:resolve --spec <path>` for the rest (post iter-0034
|
|
264
|
+
# Phase 4 cutover the OLD `/devlyn:auto-resolve` route was deleted).
|
|
265
|
+
PROMPT_FILE="$RESULT_DIR/input.md"
|
|
266
|
+
# Variant uses --engine auto (experimental dual-engine: codex BUILD + claude
|
|
267
|
+
# critique pair); solo_claude uses --engine claude explicitly so the orchestrator
|
|
268
|
+
# routes every phase to Claude and never tries to invoke codex. The CODEX_BLOCKED
|
|
269
|
+
# shim enforces this at the binary layer if the orchestrator misroutes. Both
|
|
270
|
+
# arms pass the engine flag explicitly so they survive future runtime-default
|
|
271
|
+
# changes (post iter-0020 close-out: default flipped to claude).
|
|
272
|
+
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
273
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
|
|
274
|
+
# Per-arm engine selection. ENGINE_CLAUSE is the literal flag string spliced
# into the skill invocation; ENGINE_PROMPT_HINT is the matching sentence that
# tells the model which flags to use.
case "$ARM" in
  variant)
    # Dual-engine routing arm.
    ENGINE_CLAUSE="--engine auto"
    ENGINE_PROMPT_HINT="Run with \`--engine auto\` so the experimental dual-engine routing fires (Codex BUILD/FIX, Claude EVAL/CRITIC) — do not override it."
    ;;
  solo_claude)
    # Claude-only arm; the hint tells the model codex is blocked for this run.
    ENGINE_CLAUSE="--engine claude"
    ENGINE_PROMPT_HINT="Run with \`--engine claude\` for every phase. Codex must not be invoked — the harness has blocked it at the binary layer for this run."
    ;;
  l2_gated)
    # iter-0033c: Claude implements; pair-mode in VERIFY is left to fire on
    # the orchestrator's own policy, so --pair-verify is deliberately absent.
    ENGINE_CLAUSE="--engine claude"
    ENGINE_PROMPT_HINT="Run with \`--engine claude\` and let the orchestrator's pair-mode (VERIFY) trigger naturally per its policy. Codex is available as the OTHER-engine pair-JUDGE — the harness has not blocked it. Do NOT pass \`--pair-verify\`; this arm measures gated triggering."
    ;;
  l2_forced)
    # iter-0033c: same as l2_gated but with pair-mode forced on (diagnostic arm).
    ENGINE_CLAUSE="--engine claude --pair-verify"
    ENGINE_PROMPT_HINT="Run with \`--engine claude --pair-verify\` so VERIFY pair-mode fires unconditionally. Codex is the OTHER-engine pair-JUDGE."
    ;;
esac
|
|
298
|
+
if [ "$FIXTURE" != "F9-e2e-ideate-to-resolve" ]; then
  # Spec-mode /devlyn:resolve (every fixture except F9): copy the fixture spec
  # to docs/roadmap/phase-1/<fixture>.md inside the work dir and point the
  # skill at that relative path.
  SPEC_DEST_DIR="$WORK_DIR/docs/roadmap/phase-1"
  mkdir -p "$SPEC_DEST_DIR"
  cp "$SPEC" "$SPEC_DEST_DIR/$FIXTURE.md"
  cat > "$PROMPT_FILE" <<EOF
Use the \`/devlyn:resolve --spec docs/roadmap/phase-1/$FIXTURE.md ${ENGINE_CLAUSE}\` skill to implement the spec. ${ENGINE_PROMPT_HINT}

The 2-skill design folds verification into resolve's VERIFY phase — there is no separate \`/devlyn:preflight\`, \`/devlyn:auto-resolve\`, or other 3-skill orchestrator at HEAD.

After the pipeline finishes, report the terminal verdict and list of files changed so the benchmark runner can capture state.
EOF
else
  # F9 end-to-end chain (iter-0033a): no spec is pre-placed. The prompt asks
  # for /devlyn:ideate --quick followed by /devlyn:resolve --spec <emitted>.
  # --quick is required because this is a non-interactive run: default ideate
  # asks questions nobody can answer (the original note records a smoke run
  # that produced empty diffs for that reason).
  cat > "$PROMPT_FILE" <<EOF
You are a first-time devlyn-cli user. You have a vague idea and want the 2-skill harness to take it from unstructured ask to shipped, verified feature. Run the chain:

1. Invoke \`/devlyn:ideate --quick ${ENGINE_CLAUSE}\` to turn the idea into a verifiable spec. \`--quick\` is mandatory: this is an autonomous run with no human to answer interactive questions, so ideate must synthesize the spec single-turn from the goal text and emit assumptions explicitly. The skill announces \`spec ready — /devlyn:resolve --spec <emitted-path>\` when done. The emitted spec lives at \`docs/specs/<id>-<slug>/spec.md\` with a sibling \`spec.expected.json\`.
2. Take the emitted spec path verbatim from the announce line and invoke \`/devlyn:resolve --spec <that-path> ${ENGINE_CLAUSE}\` to run PLAN → IMPLEMENT → BUILD_GATE → CLEANUP → VERIFY (VERIFY is the fresh-subagent final phase — there is no separate preflight skill in the 2-skill design).

${ENGINE_PROMPT_HINT}

Follow the skills to completion. Do not short-circuit. Do not invoke \`/devlyn:auto-resolve\` or \`/devlyn:preflight\` — they are not part of the 2-skill chain. Do not stop after ideate; the chain only counts as complete after \`/devlyn:resolve\` returns a terminal verdict.

After the whole chain, briefly report: (a) the spec path ideate produced, (b) the resolve terminal verdict, (c) whether VERIFY surfaced any findings.

RAW IDEA:
$(cat "$TASK")
EOF
fi
|
|
340
|
+
else
|
|
341
|
+
# Bare arm: one prompt for every fixture (F9 included) — the anti-skill rules
# followed by the raw task text. The fixed preamble and the task body are
# emitted separately; "$(cat ...)" strips the task's trailing newlines and
# printf adds exactly one back, matching a single-heredoc expansion.
{
  cat <<EOF
You are acting as a smart engineer implementing the following request directly. No skill pipeline.

HARD RULES:
- Do NOT invoke any \`/devlyn:*\` skill (no auto-resolve, evaluate, review, clean, update-docs, team-*, etc.).
- Do NOT invoke native \`simplify\` or \`security-review\` skills.
- Use only direct tools: Read, Write, Edit, Grep, Glob, Bash.
- Write code to satisfy the request. Run the verification commands the user implies. Fix failures until they pass.

REQUEST:
EOF
  printf '%s\n' "$(cat "$TASK")"
} > "$PROMPT_FILE"
|
|
354
|
+
fi
|
|
355
|
+
|
|
356
|
+
# Snapshot all harness-placed scaffolding in its own "bench-scaffold" commit
# before the arm runs. If this commit cannot be made, scaffolding and arm work
# would be indistinguishable in the later diff, so the run aborts instead of
# recording corrupted data.
(
  cd "$WORK_DIR" || exit 1
  git add -A || exit 1
  git -c user.email=b@b -c user.name=b commit -q --allow-empty -m "bench-scaffold"
) || {
  echo "bench-scaffold commit failed — arm diff isolation broken"
  exit 1
}

# Remember the scaffold commit: the arm-only diff is later taken against this
# SHA rather than HEAD, because an arm may create commits of its own.
SCAFFOLD_SHA=$(git -C "$WORK_DIR" rev-parse HEAD)
|
|
369
|
+
|
|
370
|
+
# Timing start. timing.json is written through json.dump rather than by
# interpolating shell variables into a JSON template: a quote or backslash in
# RUN_ID / FIXTURE / ARM / WORK_DIR would otherwise produce a file the
# aggregator's json.load cannot parse. Keys and value types are unchanged
# (start_epoch stays an integer).
T_START=$(date +%s)
python3 - "$RESULT_DIR/timing.json" "$RUN_ID" "$FIXTURE" "$ARM" "$WORK_DIR" "$T_START" <<'PY'
import json, sys

out_path, run_id, fixture, arm, work_dir, start_epoch = sys.argv[1:7]
timing = {
    "run_id": run_id,
    "fixture": fixture,
    "arm": arm,
    "work_dir": work_dir,
    "start_epoch": int(start_epoch),
}
with open(out_path, "w") as fh:
    json.dump(timing, fh, indent=2)
    fh.write("\n")
PY
|
|
381
|
+
|
|
382
|
+
# --- Invocation -------------------------------------------------------------
|
|
383
|
+
# Exit code is captured so infrastructure failures don't silently look like
|
|
384
|
+
# a weak diff. See invoke_exit in result.json.
|
|
385
|
+
INVOKE_EXIT=0
|
|
386
|
+
# iter-0012: WATCHDOG_FIRED is the truth source for `timed_out` in result.json.
|
|
387
|
+
# Set to 1 only when the watchdog flag file existed at post-wait check
|
|
388
|
+
# (the `[ -f "$TIMEOUT_FLAG" ]` block after `wait`). Initialized here so the `set -u` `export` below at the
|
|
389
|
+
# Python aggregator works in both branches (dry-run never sets it).
|
|
390
|
+
WATCHDOG_FIRED=0
|
|
391
|
+
if [ $DRY_RUN -eq 1 ]; then
|
|
392
|
+
echo "[run-fixture] DRY RUN — prepared $WORK_DIR, skipping model invocation" \
|
|
393
|
+
> "$RESULT_DIR/transcript.txt"
|
|
394
|
+
else
|
|
395
|
+
command -v claude >/dev/null 2>&1 || {
|
|
396
|
+
echo "claude CLI not on PATH — cannot invoke arm"; exit 1;
|
|
397
|
+
}
|
|
398
|
+
# Arm uses real HOME so Claude auth (macOS Keychain + ~/.claude session
|
|
399
|
+
# state) works. Fixtures that need HOME isolation override it inline in
|
|
400
|
+
# their verification commands (e.g. F2 uses `HOME=/nonexistent` per command).
|
|
401
|
+
# Variant-arm skills are resolved from $WORK_DIR/.claude/skills (project
|
|
402
|
+
# scope), so bare-arm runs never see them regardless of HOME.
|
|
403
|
+
#
|
|
404
|
+
# Portable wall-clock watchdog. macOS lacks GNU `timeout` by default; the
|
|
405
|
+
# earlier fallback ran arms unbounded, which produced a multi-hour F7 hang
|
|
406
|
+
# when the inner `codex exec` raced against a lingering codex-mcp-server.
|
|
407
|
+
# We background the arm in its own process group (`set -m` + `exec`) so the
|
|
408
|
+
# watchdog can `kill -- -PGID` and reap codex/codex-mcp-server descendants
|
|
409
|
+
# together with the parent. A flag file disambiguates timeout from natural
|
|
410
|
+
# exit; on timeout we set INVOKE_EXIT=124 (GNU timeout convention) so the
|
|
411
|
+
# downstream `invoke_failure` logic routes the run into BLOCKED. iter-0012:
|
|
412
|
+
# the same flag also flips WATCHDOG_FIRED=1, which is exported and consumed
|
|
413
|
+
# by the Python aggregator below to derive result.json.timed_out — so a
|
|
414
|
+
# natural exit at or past the budget is no longer mislabeled as timeout.
|
|
415
|
+
#
|
|
416
|
+
# MCP/config isolation (iter 0004). The harness's `claude -p` subprocess
|
|
417
|
+
# must not load the operator's user-level MCP plugins (pencil, codex-cli,
|
|
418
|
+
# telegram, vercel, …). Project policy is "MCP is not in the loop"; loading
|
|
419
|
+
# user MCP inside the variant arm is uncontrolled environment leaking into
|
|
420
|
+
# the experiment, and it is the most plausible cause of the F7 0-byte-
|
|
421
|
+
# transcript hang. `--strict-mcp-config` + an empty `mcpServers` object
|
|
422
|
+
# forces a hermetic subprocess. Skills still resolve via `/skill-name`.
|
|
423
|
+
# `--debug-file` records per-arm init/runtime so the next hang has a
|
|
424
|
+
# location, not a guess.
|
|
425
|
+
TIMEOUT_FLAG="$RESULT_DIR/.timed_out"
|
|
426
|
+
rm -f "$TIMEOUT_FLAG"
|
|
427
|
+
|
|
428
|
+
# Launch the arm in the background. errexit is off from here on so the arm's
# exit status can be collected with `wait`; `set -m` is on only while the job
# is started so the subshell gets its own process group (the watchdog signals
# the whole group via a negative PID).
set +e
set -m
(
  # With errexit off a failed cd would otherwise be ignored and the model
  # would run, permissions skipped, in the harness's own directory. Refuse.
  cd "$WORK_DIR" || {
    echo "[run-fixture] cannot cd to $WORK_DIR — refusing to launch arm"
    exit 1
  }

  # Codex shim (iter-0009 / iter-0019 / iter-0033c): every skill-driven arm
  # that staged an executable .devlyn-bin/codex gets that directory first on
  # PATH. solo_claude additionally exports CODEX_BLOCKED=1. The bare arm is
  # never given the shim PATH.
  case "$ARM" in
    variant|solo_claude|l2_gated|l2_forced)
      if [ -x "$WORK_DIR/.devlyn-bin/codex" ]; then
        export PATH="$WORK_DIR/.devlyn-bin:$PATH"
        if [ "$ARM" = "solo_claude" ]; then
          export CODEX_BLOCKED=1
        fi
      fi
      ;;
  esac

  # iter-0019.6: expose the work dir to commands that leave it (the original
  # note cites F9's outside-repo check). The post-run verifier sets the same
  # variable in its own environment, so gate and verifier see the same shape.
  export BENCH_WORKDIR="$WORK_DIR"

  # exec replaces the subshell so CHILD_PID is the claude process itself.
  # --strict-mcp-config plus an empty mcpServers object keeps user-level MCP
  # servers out of the run; --debug-file records per-arm diagnostics.
  exec claude \
    -p "$(cat "$PROMPT_FILE")" \
    --dangerously-skip-permissions \
    --effort xhigh \
    --strict-mcp-config \
    --mcp-config '{"mcpServers":{}}' \
    --debug-file "$RESULT_DIR/claude-debug.log"
) > "$RESULT_DIR/transcript.txt" 2>&1 &
CHILD_PID=$!
set +m
|
|
460
|
+
|
|
461
|
+
# Wall-clock watchdog. After $TIMEOUT seconds, if the arm is still alive, it
# drops the flag file, sends TERM to the arm's whole process group, waits 5s,
# then sends KILL to the same group.
(
  sleep "$TIMEOUT"
  if kill -0 "$CHILD_PID" 2>/dev/null; then
    : > "$TIMEOUT_FLAG"
    kill -TERM -- "-$CHILD_PID" 2>/dev/null
    sleep 5
    kill -KILL -- "-$CHILD_PID" 2>/dev/null
  fi
) &
WATCHDOG_PID=$!

wait "$CHILD_PID"
INVOKE_EXIT=$?

# Stop the watchdog only if it has NOT fired. Once the flag exists the
# watchdog is in its TERM -> sleep 5 -> KILL sequence; signalling it now (the
# group leader may exit on TERM while descendants ignore it) would cancel the
# KILL and leave those descendants running. In that case just wait for it to
# finish — at most about 5 seconds.
if [ ! -f "$TIMEOUT_FLAG" ]; then
  kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
fi
wait "$WATCHDOG_PID" 2>/dev/null || true

# The flag file, not elapsed time, decides whether this run timed out.
# 124 follows the GNU `timeout` exit-code convention.
if [ -f "$TIMEOUT_FLAG" ]; then
  INVOKE_EXIT=124
  WATCHDOG_FIRED=1
  rm -f "$TIMEOUT_FLAG"
  echo "[run-fixture] arm timed out after ${TIMEOUT}s — INVOKE_EXIT=124" >&2
fi
|
|
484
|
+
set -e
|
|
485
|
+
fi
|
|
486
|
+
|
|
487
|
+
T_END=$(date +%s)
|
|
488
|
+
ELAPSED=$((T_END - T_START))
|
|
489
|
+
|
|
490
|
+
# Capture the ARM-ONLY diff against the scaffold commit. The arm may have
# committed on its own, so the baseline is SCAFFOLD_SHA rather than HEAD;
# `git add -A` first so untracked and unstaged leftovers are included too.
#
# git's stderr goes to a separate log rather than into the data files: a
# warning line inside diff.patch would inflate diff_bytes and could match a
# forbidden pattern, and a warning line inside changed-files.txt would be
# counted as a changed file. Capture stays best-effort (`|| true`).
(cd "$WORK_DIR" \
  && git add -A 2>/dev/null \
  && git diff "$SCAFFOLD_SHA") \
  > "$RESULT_DIR/diff.patch" 2> "$RESULT_DIR/diff.stderr.log" || true
(cd "$WORK_DIR" \
  && git diff "$SCAFFOLD_SHA" --name-only) \
  > "$RESULT_DIR/changed-files.txt" 2>> "$RESULT_DIR/diff.stderr.log" || true
|
|
500
|
+
|
|
501
|
+
# Deterministic oracles (findings only at this stage; per the original note,
# scoring integration comes later). Each oracle script writes JSON on stdout to
# $RESULT_DIR/oracle-<name>.json. If the script exits non-zero, a stub with an
# empty findings list and an error marker is written in its place.
#
# run_oracle <name> [extra oracle args...]
run_oracle() {
  _oracle_name="$1"
  shift
  _oracle_out="$RESULT_DIR/oracle-$_oracle_name.json"
  # ${1+"$@"}: forward extra args without tripping `set -u` when none remain.
  python3 "$BENCH_ROOT/scripts/oracle-$_oracle_name.py" \
    --work "$WORK_DIR" --scaffold "$SCAFFOLD_SHA" ${1+"$@"} \
    > "$_oracle_out" 2>/dev/null || \
    printf '%s\n' "{\"oracle\":\"$_oracle_name\",\"findings\":[],\"error\":\"oracle invocation failed\"}" \
    > "$_oracle_out"
}

run_oracle test-fidelity
run_oracle scope-tier-a --expected "$EXPECTED"
run_oracle scope-tier-b --expected "$EXPECTED"
|
|
520
|
+
|
|
521
|
+
# Run verification commands + forbidden pattern scan + deps check. Uses
|
|
522
|
+
# the operator's real HOME (same as the arm saw). Fixtures that need HOME
|
|
523
|
+
# isolation override it inline per verification command.
|
|
524
|
+
python3 - "$EXPECTED" "$RESULT_DIR" "$WORK_DIR" <<'PY'
|
|
525
|
+
import json, os, re, subprocess, sys
|
|
526
|
+
|
|
527
|
+
expected = json.load(open(sys.argv[1]))
|
|
528
|
+
result_dir = sys.argv[2]
|
|
529
|
+
work = sys.argv[3]
|
|
530
|
+
|
|
531
|
+
verify_env = os.environ.copy()
|
|
532
|
+
# Expose the work-dir path so fixtures whose verification needs to reference
|
|
533
|
+
# the work root can do so portably (e.g. F9's out-of-repo check).
|
|
534
|
+
verify_env["BENCH_WORKDIR"] = work
|
|
535
|
+
|
|
536
|
+
verify = {"commands": [], "forbidden_pattern_hits": [], "deps_added": 0,
|
|
537
|
+
"max_deps_added": expected.get("max_deps_added", 0),
|
|
538
|
+
"missing_required_files": [], "forbidden_files_present": []}
|
|
539
|
+
|
|
540
|
+
# Run each verification command from expected.json through the shell, inside
# the work dir, with a 60s limit. A command passes when its exit code equals
# the expected one (default 0), every `stdout_contains` string appears in the
# combined stdout+stderr, and no `stdout_not_contains` string does. Timeouts
# and any other error are recorded as failures rather than raised.
for vc in expected.get("verification_commands", []):
    try:
        proc = subprocess.run(vc["cmd"], cwd=work, shell=True, env=verify_env,
                              capture_output=True, text=True, timeout=60)
        combined = (proc.stdout or "") + (proc.stderr or "")
        want_exit = vc.get("exit_code", 0)

        exit_matches = proc.returncode == want_exit
        has_required = all(s in combined for s in vc.get("stdout_contains", []))
        has_banned = any(s in combined for s in vc.get("stdout_not_contains", []))

        # First failing check names the reason: exit, then missing, then banned.
        if not exit_matches:
            reason = "exit"
        elif not has_required:
            reason = "missing_contains"
        elif has_banned:
            reason = "unexpected_text"
        else:
            reason = None

        verify["commands"].append({
            "cmd": vc["cmd"],
            "expected_exit": want_exit,
            "actual_exit": proc.returncode,
            "pass": reason is None,
            "reason": reason,
            "stdout_tail": combined[-500:],  # last 500 chars for diagnosis
        })
    except subprocess.TimeoutExpired:
        verify["commands"].append({"cmd": vc["cmd"], "pass": False, "reason": "timeout"})
    except Exception as e:
        verify["commands"].append({"cmd": vc["cmd"], "pass": False,
                                   "reason": f"error:{e.__class__.__name__}:{e}"})
|
|
563
|
+
|
|
564
|
+
# Forbidden pattern scan over diff.patch. Each pattern may declare a `files`
|
|
565
|
+
# allowlist; when present, we slice the diff to only those files' hunks.
|
|
566
|
+
diff_text = ""
|
|
567
|
+
try:
|
|
568
|
+
with open(os.path.join(result_dir, "diff.patch")) as fh:
|
|
569
|
+
diff_text = fh.read()
|
|
570
|
+
except Exception:
|
|
571
|
+
pass
|
|
572
|
+
|
|
573
|
+
def slice_diff_to_files(diff, files):
    """Keep only the per-file sections of a unified diff that match `files`.

    A section begins at a ``diff --git`` header line and runs until the next
    one. It is kept when any entry of `files` occurs as a substring of that
    header line. Text before the first header is dropped. An empty or missing
    `files` returns `diff` unchanged.
    """
    if not files:
        return diff

    kept_lines = []
    in_allowed_file = False
    for raw in diff.splitlines(keepends=True):
        if raw.startswith("diff --git "):
            # New file section: re-decide whether it is on the allowlist.
            in_allowed_file = False
            for name in files:
                if name in raw:
                    in_allowed_file = True
                    break
        if in_allowed_file:
            kept_lines.append(raw)
    return "".join(kept_lines)
|
|
585
|
+
|
|
586
|
+
# Search the captured diff for each forbidden regex. A rule with a `files`
# allowlist is searched only within those files' diff sections; a rule without
# one is searched across the whole diff. Each matching rule is recorded once.
for rule in expected.get("forbidden_patterns", []):
    allowlist = rule.get("files") or []
    haystack = slice_diff_to_files(diff_text, allowlist)
    if not re.search(rule["pattern"], haystack):
        continue
    hit = {
        "pattern": rule["pattern"],
        "severity": rule.get("severity", "warning"),
        "description": rule.get("description", ""),
        "scoped_to": allowlist or "all",
    }
    verify["forbidden_pattern_hits"].append(hit)
|
|
595
|
+
|
|
596
|
+
# Deps added count (naive: count added `"name": "version"` lines that sit
# under a dependencies / devDependencies key in the root package.json).
#
# The count is taken from diff_text, the diff already captured against the
# scaffold commit, so dependencies the arm committed itself are included; a
# fresh `git diff HEAD` would not show them. Only removed lines and diff
# headers are skipped: added lines are indented, so treating every line that
# starts with "+ " as noise would discard exactly the lines being counted.
#
# Still naive by design: a version bump shows up as one added line and is
# counted, and an added entry whose hunk does not include the enclosing
# "dependencies" key line is missed.
dep_entry_re = re.compile(r'"[^"]+"\s*:\s*"[^"]+"')
in_pkg_json = False
in_deps = False
for line in diff_text.splitlines():
    if line.startswith("diff --git "):
        # Root-level package.json only (nested packages are out of scope).
        in_pkg_json = line.rstrip().endswith(" b/package.json")
        in_deps = False
        continue
    if not in_pkg_json:
        continue
    if line.startswith(("+++", "@@")) or line.startswith("-"):
        # File headers, hunk headers and removed lines never add a dependency.
        continue
    is_added = line.startswith("+")
    text = line[1:].strip()  # drop the diff marker column, then indentation
    if '"dependencies"' in text or '"devDependencies"' in text:
        in_deps = True
    elif text.startswith("}"):
        in_deps = False
    elif in_deps and is_added and dep_entry_re.search(text):
        verify["deps_added"] += 1
|
|
613
|
+
|
|
614
|
+
# Required / forbidden files. Required files are checked for existence in the
# work dir; forbidden files are checked against the list of changed paths.
try:
    with open(os.path.join(result_dir, "changed-files.txt")) as fh:
        changed = [entry.strip() for entry in fh.read().splitlines() if entry.strip()]
except Exception:
    changed = []
changed_paths = set(changed)

missing_required = []
for rel_path in expected.get("required_files", []):
    if not os.path.exists(os.path.join(work, rel_path)):
        missing_required.append(rel_path)
verify["missing_required_files"] = missing_required

verify["forbidden_files_present"] = [
    rel_path for rel_path in expected.get("forbidden_files", [])
    if rel_path in changed_paths
]

# Score: fraction of verification commands that passed (1.0 when there are none).
command_results = verify["commands"]
n_total = len(command_results)
n_passed = len([entry for entry in command_results if entry.get("pass")])
verify["commands_passed"] = n_passed
verify["commands_total"] = n_total
verify["verify_score"] = n_passed / n_total if n_total else 1.0

# Any one of these conditions disqualifies the run.
has_disqualifying_hit = False
for hit in verify["forbidden_pattern_hits"]:
    if hit["severity"] == "disqualifier":
        has_disqualifying_hit = True
        break
too_many_deps = verify["deps_added"] > verify["max_deps_added"]
verify["disqualifier"] = (
    has_disqualifying_hit
    or too_many_deps
    or len(verify["missing_required_files"]) > 0
    or len(verify["forbidden_files_present"]) > 0
)

with open(os.path.join(result_dir, "verify.json"), "w") as out_fh:
    json.dump(verify, out_fh, indent=2)
|
|
642
|
+
PY
|
|
643
|
+
|
|
644
|
+
# Timing + aggregate
|
|
645
|
+
export INVOKE_EXIT WATCHDOG_FIRED
|
|
646
|
+
python3 - "$RESULT_DIR" "$FIXTURE" "$ARM" "$RUN_ID" "$T_END" "$ELAPSED" "$TIMEOUT" <<'PY'
|
|
647
|
+
import json, os, sys

# argv: result_dir fixture arm run_id t_end elapsed timeout
result_dir, fixture, arm, run_id = sys.argv[1:5]
t_end, elapsed, timeout = (int(raw) for raw in sys.argv[5:8])


def _artifact(name):
    """Path of a file inside this run's result directory."""
    return os.path.join(result_dir, name)


# Complete timing.json with the end-of-run fields.
with open(_artifact("timing.json")) as fh:
    timing = json.load(fh)
# iter-0012: timed_out comes from the exported WATCHDOG_FIRED variable, not
# from comparing elapsed time with the budget, so a run that finished by
# itself at or just past the budget is not labelled a timeout.
watchdog_fired = os.environ.get("WATCHDOG_FIRED", "0") == "1"
timing.update({
    "end_epoch": t_end,
    "elapsed_seconds": elapsed,
    "timeout_seconds": timeout,
    "timed_out": watchdog_fired,
})
with open(_artifact("timing.json"), "w") as fh:
    json.dump(timing, fh, indent=2)

with open(_artifact("verify.json")) as fh:
    verify = json.load(fh)

# Diff size is counted in decoded characters; unreadable artifacts count as empty.
try:
    with open(_artifact("diff.patch")) as fh:
        diff_size = len(fh.read())
except Exception:
    diff_size = 0
try:
    with open(_artifact("changed-files.txt")) as fh:
        changed = [entry for entry in fh.read().splitlines() if entry.strip()]
except Exception:
    changed = []

invoke_exit = int(os.environ.get("INVOKE_EXIT", "0"))

result = {
    "fixture": fixture,
    "arm": arm,
    "run_id": run_id,
    "disqualifier": verify.get("disqualifier", False),
    "verify_score": verify.get("verify_score", 0.0),
    "commands_passed": verify.get("commands_passed", 0),
    "commands_total": verify.get("commands_total", 0),
    "diff_bytes": diff_size,
    "files_changed": len(changed),
    "elapsed_seconds": elapsed,
    "timed_out": watchdog_fired,
    "invoke_exit": invoke_exit,
    # A non-zero exit counts as an invocation failure only when it was not
    # caused by the watchdog.
    "invoke_failure": invoke_exit != 0 and not watchdog_fired,
}
with open(_artifact("result.json"), "w") as fh:
    json.dump(result, fh, indent=2)
print(json.dumps(result, indent=2))
|
|
689
|
+
PY
|
|
690
|
+
|
|
691
|
+
echo "[run-fixture] done: $RESULT_DIR"
|