devlyn-cli 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/benchmark/auto-resolve/README.md +321 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +363 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:resolve/SKILL.md +69 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +69 -20
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# run-headroom-candidate.sh — calibrate candidate fixtures for L2/pair headroom.
|
|
3
|
+
#
|
|
4
|
+
# Runs only the arms needed by headroom-gate.py: bare and solo_claude.
|
|
5
|
+
# Then blind-judges those two arms and applies the mechanical gate.
|
|
6
|
+
|
|
7
|
+
set -euo pipefail
|
|
8
|
+
|
|
9
|
+
# Print the one-line synopsis to stderr and terminate the script.
#   $1 - exit status to use (defaults to 1 when omitted).
usage() {
  printf '%s\n' "usage: $0 [--run-id ID] <fixture> [<fixture> ...]" >&2
  exit "${1:-1}"
}
|
|
14
|
+
|
|
15
|
+
# Parse the command line.
#   --run-id ID    label for this run (a default is generated later when empty)
#   -h | --help    print the synopsis and exit 0
#   F<digit>...    fixture ids, collected in the order given
# Anything else is rejected with the synopsis and a non-zero exit.
RUN_ID=""
FIXTURES=()
while [ $# -gt 0 ]; do
  case "$1" in
    --run-id)
      # With `set -u` active, a trailing --run-id with no value would abort on
      # the unbound "$2"; report the actual problem instead.
      [ $# -ge 2 ] || { echo "missing value for --run-id" >&2; usage; }
      RUN_ID="$2"; shift 2;;
    -h|--help) usage 0;;
    F[0-9]*) FIXTURES+=("$1"); shift;;
    *) echo "unknown arg: $1" >&2; usage;;
  esac
done

# At least one fixture id is required.
[ ${#FIXTURES[@]} -gt 0 ] || usage
|
|
27
|
+
|
|
28
|
+
# BENCH_ROOT is the parent of this script's directory; REPO_ROOT is two levels
# above BENCH_ROOT.
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"

# When no --run-id was given, derive one from the UTC timestamp and the short
# HEAD hash of REPO_ROOT ("nogit" when git cannot resolve HEAD).
[ -n "$RUN_ID" ] || {
  TS=$(date -u +%Y%m%dT%H%M%SZ)
  SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
  RUN_ID="${TS}-${SHA}-headroom"
}

# Run banner.
printf '%s\n' \
  "" \
  "═══ Headroom Candidate Run ═══" \
  "Run-id: $RUN_ID" \
  "Fixtures: ${FIXTURES[*]}" \
  "Arms: bare solo_claude"
# Warn up front when fewer than two fixtures were supplied.
[ ${#FIXTURES[@]} -ge 2 ] \
  || echo "Gate: will FAIL set gate unless at least 2 fixtures are supplied"
echo ""
|
|
46
|
+
|
|
47
|
+
# Mirror every skill directory under config/skills/ into .claude/skills/,
# skipping the listed workspace directories. Each skill is copied to a hidden
# staging directory first and then moved into place, replacing any previous
# copy of that skill.
SRC_SKILLS="$REPO_ROOT/config/skills"
DST_SKILLS="$REPO_ROOT/.claude/skills"
mkdir -p "$DST_SKILLS"
mirrored=0
for src_dir in "$SRC_SKILLS"/*/; do
  # An unmatched glob stays literal; skip anything that is not a directory.
  if [ ! -d "$src_dir" ]; then
    continue
  fi
  name=$(basename "$src_dir")
  case "$name" in
    devlyn:auto-resolve-workspace) continue ;;
    devlyn:ideate-workspace) continue ;;
    preflight-workspace) continue ;;
    roadmap-archival-workspace) continue ;;
  esac
  target="$DST_SKILLS/$name"
  staging="$DST_SKILLS/.${name}.staging"
  rm -rf "$staging"
  cp -R "$src_dir" "$staging"
  rm -rf "$target"
  mv "$staging" "$target"
  mirrored=$((mirrored + 1))
done
echo "[headroom] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
|
|
66
|
+
|
|
67
|
+
# For each fixture: run the bare arm, then the solo_claude arm, then the judge.
# A failing arm or judge is reported and tolerated so the remaining work still
# runs.
for fid in "${FIXTURES[@]}"; do
  for arm in bare solo_claude; do
    echo "[headroom] ► $fid / $arm"
    if ! bash "$BENCH_ROOT/scripts/run-fixture.sh" \
      --fixture "$fid" --arm "$arm" --run-id "$RUN_ID"; then
      echo "[headroom] ✗ $fid / $arm (arm failure tolerated; artifacts may still exist)"
    fi
  done

  echo "[headroom] ► judge $fid"
  if ! bash "$BENCH_ROOT/scripts/judge.sh" --fixture "$fid" --run-id "$RUN_ID"; then
    echo "[headroom] ✗ judge failed for $fid"
  fi
done
|
|
82
|
+
|
|
83
|
+
# Apply the mechanical gate, print its markdown report, and exit with the
# gate's status.
echo ""
gate_report="$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"

# `|| GATE_EXIT=$?` captures a failing gate without tripping `set -e`.
GATE_EXIT=0
python3 "$BENCH_ROOT/scripts/headroom-gate.py" \
  --run-id "$RUN_ID" \
  --out-json "$BENCH_ROOT/results/$RUN_ID/headroom-gate.json" \
  --out-md "$gate_report" \
  || GATE_EXIT=$?

if [ -f "$gate_report" ]; then
  cat "$gate_report"
else
  # The gate did not write its report. Say so rather than letting a failing
  # `cat` replace the gate's own exit status under `set -e`.
  echo "[headroom] gate report not written: $gate_report" >&2
  # A missing report is still a failure even if the gate claimed success.
  [ "$GATE_EXIT" -ne 0 ] || GATE_EXIT=1
fi
exit "$GATE_EXIT"
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Run a prepared SWE-bench frozen VERIFY corpus and gate the result set.
|
|
3
|
+
set -euo pipefail
|
|
4
|
+
|
|
5
|
+
# Print the full usage text to stderr and terminate the script.
#   $1 - exit status to use (defaults to 1 when omitted).
usage() {
  local status="${1:-1}"
  cat <<EOF >&2
usage: $0 --manifest <path> [--run-prefix ID] [--pair-mode forced|gated]
[--min-runs N] [--out-json <path>] [--out-md <path>]
[--max-pair-solo-wall-ratio N] [--timeout-seconds N]
[--run-ids-out <path>] [--resume-completed-arms]
[--prepare-only] [--gate-only-run-ids <path>]

Reads the manifest from prepare-swebench-frozen-corpus.py, runs each prepared
case through run-frozen-verify-pair.sh, then applies frozen-verify-gate.py to
the resulting run ids. --prepare-only validates patch application without
provider calls and skips the gate. --gate-only-run-ids reruns the gate over an
existing newline-delimited run-id file without invoking providers.
EOF
  exit "$status"
}
|
|
21
|
+
|
|
22
|
+
# Option defaults. Empty strings mean "not supplied".
MANIFEST=""
RUN_PREFIX=""
PAIR_MODE="gated"
MIN_RUNS=2
OUT_JSON=""
OUT_MD=""
MAX_PAIR_SOLO_WALL_RATIO=""
PREPARE_ONLY=0
GATE_ONLY_RUN_IDS=""
TIMEOUT_SECONDS=""
RUN_IDS_OUT=""
RESUME_COMPLETED_ARMS=0

# Parse the command line; unknown arguments print the usage text and exit 1.
while [ $# -gt 0 ]; do
  # Options that take a value must be followed by one. Without this check a
  # trailing option aborts on the unbound "$2" under `set -u`, with an error
  # that does not name the offending option.
  case "$1" in
    --manifest|--run-prefix|--pair-mode|--min-runs|--out-json|--out-md|\
    --max-pair-solo-wall-ratio|--timeout-seconds|--run-ids-out|--gate-only-run-ids)
      [ $# -ge 2 ] || { echo "missing value for $1" >&2; usage 1; }
      ;;
  esac
  case "$1" in
    --manifest) MANIFEST="$2"; shift 2;;
    --run-prefix) RUN_PREFIX="$2"; shift 2;;
    --pair-mode) PAIR_MODE="$2"; shift 2;;
    --min-runs) MIN_RUNS="$2"; shift 2;;
    --out-json) OUT_JSON="$2"; shift 2;;
    --out-md) OUT_MD="$2"; shift 2;;
    --max-pair-solo-wall-ratio) MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
    --timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
    --run-ids-out) RUN_IDS_OUT="$2"; shift 2;;
    --resume-completed-arms) RESUME_COMPLETED_ARMS=1; shift;;
    --prepare-only) PREPARE_ONLY=1; shift;;
    --gate-only-run-ids) GATE_ONLY_RUN_IDS="$2"; shift 2;;
    -h|--help) usage 0;;
    *) echo "unknown arg: $1" >&2; usage 1;;
  esac
done
|
|
52
|
+
|
|
53
|
+
# Validate the parsed options; every failure prints a message to stderr and
# exits 1.
[ -n "$MANIFEST" ] || usage 1
[ -f "$MANIFEST" ] || { echo "manifest not found: $MANIFEST" >&2; exit 1; }
[ "$PAIR_MODE" = "forced" ] || [ "$PAIR_MODE" = "gated" ] || { echo "--pair-mode must be forced|gated" >&2; exit 1; }

# Integer options: digits only, then strictly positive.
case "$MIN_RUNS" in ''|*[!0-9]*) echo "--min-runs must be an integer" >&2; exit 1;; esac
[ "$MIN_RUNS" -gt 0 ] || { echo "--min-runs must be > 0" >&2; exit 1; }
if [ -n "$TIMEOUT_SECONDS" ]; then
  case "$TIMEOUT_SECONDS" in ''|*[!0-9]*) echo "--timeout-seconds must be an integer" >&2; exit 1;; esac
  [ "$TIMEOUT_SECONDS" -gt 0 ] || { echo "--timeout-seconds must be > 0" >&2; exit 1; }
fi

# The wall-ratio limit may be fractional, so it is checked with Python.
if [ -n "$MAX_PAIR_SOLO_WALL_RATIO" ]; then
  python3 - "$MAX_PAIR_SOLO_WALL_RATIO" <<'PY' || { echo "--max-pair-solo-wall-ratio must be a positive number" >&2; exit 1; }
import sys

try:
    value = float(sys.argv[1])
except ValueError:
    raise SystemExit(1)
# `not value > 0` also rejects NaN, which float() parses and which would slip
# through a `value <= 0` test because every comparison with NaN is false.
if not value > 0:
    raise SystemExit(1)
PY
fi

# --gate-only-run-ids must name an existing file and cannot be combined with
# --prepare-only.
[ -z "$GATE_ONLY_RUN_IDS" ] || [ -f "$GATE_ONLY_RUN_IDS" ] || { echo "run ids file not found: $GATE_ONLY_RUN_IDS" >&2; exit 1; }
[ "$PREPARE_ONLY" -eq 0 ] || [ -z "$GATE_ONLY_RUN_IDS" ] || { echo "--prepare-only and --gate-only-run-ids are mutually exclusive" >&2; exit 1; }
|
|
75
|
+
|
|
76
|
+
# Resolve script locations and the default run prefix.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
BENCH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
if [ -z "$RUN_PREFIX" ]; then
  RUN_PREFIX="$(date -u +%Y%m%dT%H%M%SZ)-swebench-frozen"
fi

# TMP_RUN_IDS accumulates one run id per line for the gate step.
# TMP_ROWS holds the tab-separated rows extracted from the manifest.
# Both are removed on exit; the trap is installed before the second mktemp so
# a failure there cannot leak the first file.
TMP_RUN_IDS="$(mktemp)"
TMP_ROWS=""
trap 'rm -f "$TMP_RUN_IDS" ${TMP_ROWS:+"$TMP_ROWS"}' EXIT
TMP_ROWS="$(mktemp)"
ROW_FAILURES=0

if [ -n "$GATE_ONLY_RUN_IDS" ]; then
  # Gate-only mode: reuse the caller's run-id list and run nothing.
  cp "$GATE_ONLY_RUN_IDS" "$TMP_RUN_IDS"
else
  # Flatten the manifest's "prepared" rows into
  #   index <TAB> instance_id <TAB> cases_root <TAB> repo_dir <TAB> diff_path
  # The reader writes to a file rather than feeding the loop via process
  # substitution, so an unreadable or malformed manifest stops the script here
  # instead of silently producing zero rows (which --prepare-only would
  # otherwise report as success).
  python3 - "$MANIFEST" > "$TMP_ROWS" <<'PY' || { echo "failed to read manifest: $MANIFEST" >&2; exit 1; }
import json, pathlib, sys
manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
for index, row in enumerate(manifest.get("prepared") or [], start=1):
    instance_id = row["instance_id"]
    case_dir = pathlib.Path(row["case_dir"])
    repo_dir = pathlib.Path(row["repo_dir"])
    print("\t".join([
        str(index),
        instance_id,
        str(case_dir.parent),
        str(repo_dir),
        str(case_dir / "model.patch"),
    ]))
PY

  while IFS=$'\t' read -r index instance_id cases_root repo_dir diff_path; do
    [ -n "$instance_id" ] || continue
    run_id="${RUN_PREFIX}-${index}-${instance_id}"
    # Replace every character outside [A-Za-z0-9_.-] with '-'.
    safe_run_id="$(printf '%s' "$run_id" | tr -c 'A-Za-z0-9_.-' '-')"
    echo "[swebench-frozen-corpus] ${index}: ${instance_id} -> ${safe_run_id}"
    cmd=(
      bash "$SCRIPT_DIR/run-frozen-verify-pair.sh"
      --fixture "$instance_id"
      --fixtures-root "$cases_root"
      --base-repo "$repo_dir"
      --diff "$diff_path"
      --run-id "$safe_run_id"
      --pair-mode "$PAIR_MODE"
    )
    if [ -n "$TIMEOUT_SECONDS" ]; then
      cmd+=(--timeout-seconds "$TIMEOUT_SECONDS")
    fi
    if [ "$PREPARE_ONLY" -eq 1 ]; then
      cmd+=(--prepare-only)
    fi
    if [ "$RESUME_COMPLETED_ARMS" -eq 1 ]; then
      cmd+=(--resume-completed-arms)
    fi

    # Run the row with stdin detached so it cannot consume the loop's input,
    # and capture its status without tripping `set -e`.
    set +e
    "${cmd[@]}" </dev/null
    row_exit=$?
    set -e

    if [ "$row_exit" -ne 0 ]; then
      echo "[swebench-frozen-corpus] row failed: ${safe_run_id} exit=${row_exit}" >&2
      ROW_FAILURES=$((ROW_FAILURES + 1))
      # Fill in whatever the failed row did not write: a per-arm input.md and
      # a compare.json recording the row's exit status. Existing files are
      # left untouched.
      python3 - "$BENCH_ROOT/results/$safe_run_id" "$instance_id" "$row_exit" <<'PY'
import json
import pathlib
import sys

run_root = pathlib.Path(sys.argv[1])
instance_id = sys.argv[2]
row_exit = int(sys.argv[3])
run_root.mkdir(parents=True, exist_ok=True)
for arm in ("solo", "pair"):
    arm_root = run_root / arm
    arm_root.mkdir(parents=True, exist_ok=True)
    input_path = arm_root / "input.md"
    if not input_path.exists():
        input_path.write_text(
            f"Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/{instance_id}.md.\n",
            encoding="utf8",
        )
compare_path = run_root / "compare.json"
if not compare_path.exists():
    compare_path.write_text(
        json.dumps(
            {
                "solo": {"invoke_exit": row_exit, "timed_out": False},
                "pair": {"invoke_exit": row_exit, "timed_out": False, "pair_mode": False},
                "comparison": {
                    "pair_trigger_missed": False,
                    "pair_verdict_lift": False,
                    "pair_internal_verdict_lift": False,
                    "row_failed_before_compare": True,
                    "row_exit": row_exit,
                },
            },
            indent=2,
        )
        + "\n",
        encoding="utf8",
    )
PY
    fi
    # Failed rows are still recorded in the run-id list.
    printf '%s\n' "$safe_run_id" >> "$TMP_RUN_IDS"
  done < "$TMP_ROWS"
fi
|
|
177
|
+
|
|
178
|
+
# Optionally persist the run-id list for a later --gate-only-run-ids pass.
[ -z "$RUN_IDS_OUT" ] || {
  mkdir -p "$(dirname "$RUN_IDS_OUT")"
  cp "$TMP_RUN_IDS" "$RUN_IDS_OUT"
}

# In prepare-only mode the gate is skipped; exit 1 if any row failed, else 0.
if [ "$PREPARE_ONLY" -eq 1 ]; then
  echo "[swebench-frozen-corpus] prepare-only complete; gate skipped"
  [ "$ROW_FAILURES" -gt 0 ] || exit 0
  echo "[swebench-frozen-corpus] row failures: $ROW_FAILURES" >&2
  exit 1
fi
|
|
191
|
+
|
|
192
|
+
# Load the run ids. Blank lines are skipped, and `|| [ -n "$run_id" ]` keeps a
# final line that has no trailing newline. A hand-written --gate-only-run-ids
# file can have either; counting with `wc -l` would undercount it and a plain
# `read` loop would drop its last id.
run_ids=()
run_count=0
while IFS= read -r run_id || [ -n "$run_id" ]; do
  [ -n "$run_id" ] || continue
  run_ids+=("$run_id")
  run_count=$((run_count + 1))
done < "$TMP_RUN_IDS"
[ "$run_count" -gt 0 ] || { echo "manifest prepared no runs" >&2; exit 1; }

# The gate needs the corpus-level cases root recorded in the manifest.
fixtures_root="$(python3 - "$MANIFEST" <<'PY'
import json, pathlib, sys
manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
print(manifest["cases_root"])
PY
)"

# Assemble the gate command; optional flags are forwarded only when supplied.
gate_args=(python3 "$SCRIPT_DIR/frozen-verify-gate.py" --fixtures-root "$fixtures_root" --min-runs "$MIN_RUNS")
[ -z "$OUT_JSON" ] || gate_args+=(--out-json "$OUT_JSON")
[ -z "$OUT_MD" ] || gate_args+=(--out-md "$OUT_MD")
[ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || gate_args+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
for run_id in "${run_ids[@]}"; do
  gate_args+=(--run-id "$run_id")
done

# As the last command, the gate's exit status becomes the script's.
"${gate_args[@]}"
|
|
@@ -0,0 +1,239 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# Prepare SWE-bench solver worktrees, run a direct solver, and collect patches.
|
|
3
|
+
set -euo pipefail
|
|
4
|
+
|
|
5
|
+
# Print the full usage text to stderr and terminate the script.
#   $1 - exit status to use (defaults to 1 when omitted).
usage() {
  local status="${1:-1}"
  cat <<EOF >&2
usage: $0 --instances-jsonl <path> --predictions-out <path>
[--instance-id ID ...] [--limit N] [--model-name NAME]
[--repos-root <path>] [--worktrees-root <path>]
[--timeout-seconds N] [--copy-devlyn-context] [--resume]

Runs Claude Code directly against each selected SWE-bench instance without
reading gold patch/test_patch fields. Each worktree receives patch.diff plus
direct-transcript.txt and claude-direct-debug.log. At the end, patch.diff files
are collected into a SWE-bench predictions JSONL.
EOF
  exit "$status"
}
|
|
19
|
+
|
|
20
|
+
# Option defaults. Empty strings mean "not supplied".
INSTANCES_JSONL=""
PREDICTIONS_OUT=""
MODEL_NAME="claude-direct"
REPOS_ROOT="benchmark/auto-resolve/external/swebench/repos-solver"
WORKTREES_ROOT="benchmark/auto-resolve/external/swebench/worktrees"
TIMEOUT_SECONDS=2400
COPY_DEVLYN_CONTEXT=0
RESUME=0
LIMIT=""
INSTANCE_IDS=()

# Parse the command line; unknown arguments print the usage text and exit 1.
# --instance-id may be repeated; each value is appended to INSTANCE_IDS.
while [ $# -gt 0 ]; do
  # Options that take a value must be followed by one. Without this check a
  # trailing option aborts on the unbound "$2" under `set -u`, with an error
  # that does not name the offending option.
  case "$1" in
    --instances-jsonl|--predictions-out|--model-name|--repos-root|\
    --worktrees-root|--timeout-seconds|--limit|--instance-id)
      [ $# -ge 2 ] || { echo "missing value for $1" >&2; usage 1; }
      ;;
  esac
  case "$1" in
    --instances-jsonl) INSTANCES_JSONL="$2"; shift 2;;
    --predictions-out) PREDICTIONS_OUT="$2"; shift 2;;
    --model-name) MODEL_NAME="$2"; shift 2;;
    --repos-root) REPOS_ROOT="$2"; shift 2;;
    --worktrees-root) WORKTREES_ROOT="$2"; shift 2;;
    --timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
    --copy-devlyn-context) COPY_DEVLYN_CONTEXT=1; shift;;
    --resume) RESUME=1; shift;;
    --limit) LIMIT="$2"; shift 2;;
    --instance-id) INSTANCE_IDS+=("$2"); shift 2;;
    -h|--help) usage 0;;
    *) echo "unknown arg: $1" >&2; usage 1;;
  esac
done
|
|
47
|
+
|
|
48
|
+
# Both --instances-jsonl and --predictions-out are mandatory.
if [ -z "$INSTANCES_JSONL" ]; then
  usage 1
fi
if [ -z "$PREDICTIONS_OUT" ]; then
  usage 1
fi
if [ ! -f "$INSTANCES_JSONL" ]; then
  echo "instances JSONL not found: $INSTANCES_JSONL" >&2
  exit 1
fi

# Integer options: digits only, then strictly positive.
case "$TIMEOUT_SECONDS" in
  ''|*[!0-9]*) echo "--timeout-seconds must be an integer" >&2; exit 1;;
esac
if [ "$TIMEOUT_SECONDS" -le 0 ]; then
  echo "--timeout-seconds must be > 0" >&2
  exit 1
fi
if [ -n "$LIMIT" ]; then
  case "$LIMIT" in
    ''|*[!0-9]*) echo "--limit must be an integer" >&2; exit 1;;
  esac
  if [ "$LIMIT" -le 0 ]; then
    echo "--limit must be > 0" >&2
    exit 1
  fi
fi

# The solver is the `claude` CLI; stop early when it is not on PATH.
if ! command -v claude >/dev/null 2>&1; then
  echo "claude command not found" >&2
  exit 1
fi
mkdir -p "$REPOS_ROOT" "$WORKTREES_ROOT" "$(dirname "$PREDICTIONS_OUT")"

# Scratch files: the selected instance ids (one per line) and the selected
# JSONL rows. Both are removed when the script exits.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
TMP_IDS="$(mktemp)"
TMP_SELECTED_INSTANCES="$(mktemp)"
trap 'rm -f "$TMP_IDS" "$TMP_SELECTED_INSTANCES"' EXIT
|
|
64
|
+
|
|
65
|
+
# Select the instances to solve. The selected ids go to TMP_IDS (one per line)
# and the matching JSONL rows to TMP_SELECTED_INSTANCES.
#
# ${INSTANCE_IDS[@]+...} expands to nothing when no --instance-id was given.
# A bare "${INSTANCE_IDS[@]}" on an empty array is an "unbound variable" error
# under `set -u` on bash releases before 4.4.
python3 - "$INSTANCES_JSONL" "$TMP_SELECTED_INSTANCES" "$LIMIT" ${INSTANCE_IDS[@]+"${INSTANCE_IDS[@]}"} > "$TMP_IDS" <<'PY'
import json
import sys
from pathlib import Path

instances_path = Path(sys.argv[1])
selected_path = Path(sys.argv[2])
limit = int(sys.argv[3]) if sys.argv[3] else None
requested_set = set(sys.argv[4:])

rows = []    # selected rows, in file order, capped at `limit`
seen = set()  # ids encountered that passed the --instance-id filter
with instances_path.open(encoding="utf8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue
        row = json.loads(line)
        instance_id = row.get("instance_id")
        if not isinstance(instance_id, str) or not instance_id:
            raise SystemExit(f"{instances_path}:{line_no}: missing instance_id")
        if requested_set and instance_id not in requested_set:
            continue
        seen.add(instance_id)
        if limit is None or len(rows) < limit:
            rows.append(row)
        # Stop once the limit is reached. When ids were requested, keep
        # scanning until each one has been seen, so an id that exists but was
        # cut off by --limit is not reported as "not found" below.
        if limit is not None and len(rows) >= limit:
            if not requested_set or requested_set <= seen:
                break

if requested_set:
    missing = sorted(requested_set - seen)
    if missing:
        raise SystemExit(f"requested instance ids not found: {', '.join(missing)}")

for row in rows:
    print(row["instance_id"])
with selected_path.open("w", encoding="utf8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
PY
|
|
99
|
+
|
|
100
|
+
# Run the claude CLI against one prepared worktree with a wall-clock limit.
#   $1 - worktree directory; must contain solve-prompt.txt
#   $2 - timeout in seconds
# Writes direct-transcript.txt (claude's stdout and stderr) and passes
# claude-direct-debug.log as --debug-file, both inside the worktree.
# Returns claude's exit status, 124 when the timeout fired, or 1 when the
# worktree or prompt file is unusable.
run_solver() {
  local worktree
  # Explicit `|| return`: errexit is suspended while this function runs as
  # the condition of an `if`, so a failed cd would otherwise continue with an
  # empty path.
  worktree="$(cd "$1" && pwd -P)" || return 1
  local timeout_seconds="$2"
  local prompt_file="$worktree/solve-prompt.txt"
  local transcript="$worktree/direct-transcript.txt"
  local debug_log="$worktree/claude-direct-debug.log"
  local timeout_flag="$worktree/.solver-timed-out"

  rm -f "$transcript" "$debug_log" "$timeout_flag"

  # Without the prompt file the solver would be started with an empty prompt.
  if [ ! -f "$prompt_file" ]; then
    echo "solve prompt not found: $prompt_file" >&2
    echo "solve prompt not found: $prompt_file" > "$transcript"
    return 1
  fi

  set +e
  # Job control is enabled only while the solver is launched, so it becomes
  # the leader of its own process group and the watchdog can signal the whole
  # group via the negative pid.
  set -m
  (
    cd "$worktree"
    exec claude \
      -p "$(cat "$prompt_file")" \
      --dangerously-skip-permissions \
      --effort xhigh \
      --strict-mcp-config \
      --mcp-config '{"mcpServers":{}}' \
      --debug-file "$debug_log" \
      </dev/null
  ) > "$transcript" 2>&1 &
  local child_pid=$!
  set +m

  # Watchdog: after the timeout, mark the run as timed out and terminate the
  # solver's process group (TERM, then KILL five seconds later).
  (
    # The timer runs in the background and is awaited with `wait`, which a
    # trapped signal interrupts. When the parent cancels the watchdog, the
    # trap kills the timer instead of leaving a `sleep` running (and holding
    # the script's stdout/stderr open) for the rest of the timeout.
    trap 'kill "${sleep_pid:-}" 2>/dev/null; exit 0' TERM
    sleep "$timeout_seconds" &
    sleep_pid=$!
    wait "$sleep_pid"
    # Timer finished: restore default TERM handling for the escalation phase.
    trap - TERM
    if kill -0 "$child_pid" 2>/dev/null; then
      : > "$timeout_flag"
      kill -TERM -- "-$child_pid" 2>/dev/null
      sleep 5
      kill -KILL -- "-$child_pid" 2>/dev/null
    fi
  ) &
  local watchdog_pid=$!

  wait "$child_pid"
  local invoke_exit=$?
  # The solver is done; cancel the watchdog and reap it.
  kill -TERM "$watchdog_pid" 2>/dev/null || true
  wait "$watchdog_pid" 2>/dev/null || true

  # The flag file is how the watchdog reports that the timeout fired.
  if [ -f "$timeout_flag" ]; then
    rm -f "$timeout_flag"
    invoke_exit=124
  fi
  set -e
  return "$invoke_exit"
}
|
|
149
|
+
|
|
150
|
+
# Write the worktree's changes to <worktree>/patch.diff.
#   $1 - worktree directory
# New files are registered with `git add -N` (errors ignored) so they show up
# in the diff. The same pathspec list, which excludes the listed harness and
# context paths, is used for both the add and the diff.
write_patch() {
  local worktree
  worktree="$(cd "$1" && pwd -P)"
  local -a pathspec=(
    .
    ':(exclude).claude/**'
    ':(exclude)CLAUDE.md'
    ':(exclude)benchmark/**'
    ':(exclude)docs/roadmap/phase-1/*.md'
    ':(exclude)solve-prompt.txt'
    ':(exclude)direct-transcript.txt'
    ':(exclude)claude-direct-debug.log'
    ':(exclude)latest'
    ':(exclude).solver-timed-out'
  )
  (
    cd "$worktree"
    git add -N -- "${pathspec[@]}" >/dev/null 2>&1 || true
    git diff --binary -- "${pathspec[@]}" > patch.diff
  )
}
|
|
177
|
+
|
|
178
|
+
# Per-instance pipeline: prepare the worktree, run the solver, write
# patch.diff, then record and print a small JSON report.
while IFS= read -r instance_id; do
  if [ -z "$instance_id" ]; then
    continue
  fi
  worktree="$WORKTREES_ROOT/$instance_id"

  # --resume skips instances that already have a non-empty patch.diff.
  if [ "$RESUME" -eq 1 ] && [ -s "$worktree/patch.diff" ]; then
    echo "[swebench-solver] skip existing patch: $instance_id"
    continue
  fi

  echo "[swebench-solver] prepare: $instance_id"
  prepare_cmd=(
    python3 "$SCRIPT_DIR/prepare-swebench-solver-worktree.py"
    --instances-jsonl "$INSTANCES_JSONL"
    --instance-id "$instance_id"
    --repos-root "$REPOS_ROOT"
    --worktrees-root "$WORKTREES_ROOT"
  )
  [ "$COPY_DEVLYN_CONTEXT" -ne 1 ] || prepare_cmd+=(--copy-devlyn-context)
  # The prepare script's stdout is kept next to the worktree.
  "${prepare_cmd[@]}" > "$worktree.prepare.json"

  echo "[swebench-solver] solve: $instance_id"
  # A failing solver is recorded, not fatal: the patch and report are still
  # written.
  invoke_exit=0
  run_solver "$worktree" "$TIMEOUT_SECONDS" || invoke_exit=$?
  write_patch "$worktree"

  # Write solver-result.json into the worktree and echo the same JSON.
  python3 - "$worktree" "$instance_id" "$invoke_exit" <<'PY'
import json
import subprocess
import sys
from pathlib import Path

worktree = Path(sys.argv[1])
instance_id = sys.argv[2]
invoke_exit = int(sys.argv[3])

patch = worktree / "patch.diff"
patch_bytes = patch.stat().st_size if patch.exists() else 0
stat = subprocess.run(
    ["git", "-C", str(worktree), "diff", "--stat", "--", "."],
    text=True,
    capture_output=True,
    check=False,
)
report = {
    "instance_id": instance_id,
    "invoke_exit": invoke_exit,
    "patch_path": str(patch),
    "patch_bytes": patch_bytes,
    "diff_stat": stat.stdout.strip(),
}
rendered = json.dumps(report, indent=2)
(worktree / "solver-result.json").write_text(rendered + "\n", encoding="utf8")
print(rendered)
PY
done < "$TMP_IDS"
|
|
233
|
+
|
|
234
|
+
# Gather the per-worktree patches for the selected instances into the
# predictions file. As the last command, its exit status becomes the script's.
collect_cmd=(
  python3 "$SCRIPT_DIR/collect-swebench-predictions.py"
  --patch-root "$WORKTREES_ROOT"
  --instances-jsonl "$TMP_SELECTED_INSTANCES"
  --model-name "$MODEL_NAME"
  --out "$PREDICTIONS_OUT"
  --allow-empty
)
"${collect_cmd[@]}"
|