devlyn-cli 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/README.md +1 -1
- package/benchmark/auto-resolve/README.md +318 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/bin/devlyn.js +56 -10
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +312 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:ideate/SKILL.md +1 -1
- package/config/skills/devlyn:resolve/SKILL.md +62 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +32 -0
|
@@ -0,0 +1,511 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# run-frozen-verify-pair.sh — compare solo VERIFY vs pair VERIFY on one frozen diff.
|
|
3
|
+
#
|
|
4
|
+
# This isolates VERIFY/JUDGE from IMPLEMENT: the implementation diff is applied
|
|
5
|
+
# before /devlyn:resolve starts, then both arms run verify-only against the same
|
|
6
|
+
# committed code and `.devlyn/external-diff.patch`.
|
|
7
|
+
|
|
8
|
+
# Strict mode: abort on a failing command, on use of an unset variable, and
# when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail
|
|
9
|
+
|
|
10
|
+
# Print the command-line synopsis on stderr and terminate the script.
# Arguments: $1 - exit status to terminate with (defaults to 1).
# Outputs:   usage text on stderr; nothing on stdout.
usage() {
  local status="${1:-1}"
  cat <<EOF >&2
usage: $0 --fixture <FID> --diff <path> [--run-id ID] [--pair-mode forced|gated]
[--fixtures-root <path>] [--base-repo <path>]
[--timeout-seconds N] [--prepare-only] [--resume-completed-arms]

Runs two verify-only arms:
solo = /devlyn:resolve --verify-only ... --engine claude
pair = forced: /devlyn:resolve --verify-only ... --engine claude --pair-verify
gated: /devlyn:resolve --verify-only ... --engine claude

By default fixtures come from benchmark/auto-resolve/fixtures and the base repo
is fixtures/test-repo. External corpora such as SWE-bench can pass their own
case root and checked-out base repo.
EOF
  exit "$status"
}
|
|
27
|
+
|
|
28
|
+
# Option defaults; parse_args overwrites them from the command line.
FIXTURE=""
DIFF_PATH=""
RUN_ID=""
PAIR_MODE="forced"
FIXTURES_ROOT=""
BASE_REPO=""
PREPARE_ONLY=0
TIMEOUT_OVERRIDE=""
RESUME_COMPLETED_ARMS=0

# Parse command-line flags into the option globals declared above.
# Arguments: the script's argument vector ("$@").
# Exits (through usage) with status 0 on -h/--help, and with status 1 on an
# unknown flag or when a value-taking flag is the last argument.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --fixture | --diff | --run-id | --pair-mode | --fixtures-root | --base-repo | --timeout-seconds)
        # Value-taking flags. Without this guard a trailing flag reads an
        # unset $2 and dies under `set -u` with "unbound variable" instead of
        # telling the user which flag is incomplete.
        if [ $# -lt 2 ]; then
          echo "missing value for $1" >&2
          usage 1
        fi
        case "$1" in
          --fixture) FIXTURE="$2" ;;
          --diff) DIFF_PATH="$2" ;;
          --run-id) RUN_ID="$2" ;;
          --pair-mode) PAIR_MODE="$2" ;;
          --fixtures-root) FIXTURES_ROOT="$2" ;;
          --base-repo) BASE_REPO="$2" ;;
          --timeout-seconds) TIMEOUT_OVERRIDE="$2" ;;
        esac
        shift 2
        ;;
      --prepare-only)
        PREPARE_ONLY=1
        shift
        ;;
      --resume-completed-arms)
        RESUME_COMPLETED_ARMS=1
        shift
        ;;
      -h | --help)
        usage 0
        ;;
      *)
        echo "unknown arg: $1" >&2
        usage 1
        ;;
    esac
  done
}

parse_args "$@"
|
|
52
|
+
|
|
53
|
+
# --fixture and --diff are both mandatory.
if [ -z "$FIXTURE" ] || [ -z "$DIFF_PATH" ]; then
  usage 1
fi

# The frozen diff must exist and carry at least one byte.
if [ ! -f "$DIFF_PATH" ]; then
  echo "diff not found: $DIFF_PATH" >&2
  exit 1
fi
if [ ! -s "$DIFF_PATH" ]; then
  echo "diff is empty: $DIFF_PATH" >&2
  exit 1
fi

# Only the two documented pair modes are accepted.
case "$PAIR_MODE" in
  forced | gated) ;;
  *)
    echo "--pair-mode must be forced|gated (got '$PAIR_MODE')" >&2
    exit 1
    ;;
esac
|
|
57
|
+
|
|
58
|
+
# Locate the benchmark directory (parent of this script's directory) and the
# repository root two levels above it.
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"

# Fall back to the bundled fixtures and test repo when not overridden.
[ -n "$FIXTURES_ROOT" ] || FIXTURES_ROOT="$BENCH_ROOT/fixtures"
[ -n "$BASE_REPO" ] || BASE_REPO="$BENCH_ROOT/fixtures/test-repo"

# Check the directories BEFORE canonicalising them: under `set -e` a failing
# `cd` inside the command substitution aborts the script with only cd's own
# error, so checks placed after it could never print their message.
[ -d "$FIXTURES_ROOT" ] || { echo "fixtures root not found: $FIXTURES_ROOT" >&2; exit 1; }
[ -d "$BASE_REPO" ] || { echo "base repo not found: $BASE_REPO" >&2; exit 1; }
FIXTURES_ROOT="$(cd "$FIXTURES_ROOT" && pwd)"
BASE_REPO="$(cd "$BASE_REPO" && pwd)"

FIX_DIR="$FIXTURES_ROOT/$FIXTURE"
[ -d "$FIX_DIR" ] || { echo "fixture not found: $FIXTURE" >&2; exit 1; }

# Every fixture must ship these five files.
META="$FIX_DIR/metadata.json"
EXPECTED="$FIX_DIR/expected.json"
SPEC="$FIX_DIR/spec.md"
TASK="$FIX_DIR/task.txt"
SETUP="$FIX_DIR/setup.sh"
for f in "$META" "$EXPECTED" "$SPEC" "$TASK" "$SETUP"; do
  [ -f "$f" ] || { echo "fixture missing required file: $f" >&2; exit 1; }
done
|
|
76
|
+
|
|
77
|
+
# Print the `timeout_seconds` value from a fixture metadata file.
# Arguments: $1 - path to metadata.json
# Outputs:   the value on stdout
# The path travels as argv rather than being pasted into the Python source, so
# a path containing a quote can neither break the program nor inject code.
read_fixture_timeout() {
  python3 - "$1" <<'PY'
import json
import sys

with open(sys.argv[1]) as handle:
    print(json.load(handle)["timeout_seconds"])
PY
}

# Per-arm timeout: fixture default, optionally replaced by --timeout-seconds.
TIMEOUT=$(read_fixture_timeout "$META")
if [ -n "$TIMEOUT_OVERRIDE" ]; then
  case "$TIMEOUT_OVERRIDE" in ''|*[!0-9]*) echo "--timeout-seconds must be an integer" >&2; exit 1;; esac
  [ "$TIMEOUT_OVERRIDE" -gt 0 ] || { echo "--timeout-seconds must be > 0" >&2; exit 1; }
  TIMEOUT="$TIMEOUT_OVERRIDE"
fi

# Derive a run id from the UTC timestamp and the repo's short HEAD sha
# ("nogit" when git cannot resolve HEAD) unless --run-id supplied one.
if [ -z "$RUN_ID" ]; then
  TS=$(date -u +%Y%m%dT%H%M%SZ)
  SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
  RUN_ID="${TS}-${SHA}-frozen-verify"
fi
|
|
88
|
+
|
|
89
|
+
# All artifacts of this run are collected under results/<run-id>.
RESULT_ROOT="$BENCH_ROOT/results/$RUN_ID"
mkdir -p "$RESULT_ROOT"

# Announce the effective configuration before any arm starts.
printf '%s\n' \
  "" \
  "═══ Frozen Verify Pair Run ═══" \
  "Run-id: $RUN_ID" \
  "Fixture: $FIXTURE" \
  "Cases: $FIXTURES_ROOT" \
  "Base: $BASE_REPO" \
  "Diff: $DIFF_PATH" \
  "Pair: $PAIR_MODE" \
  "Timeout: ${TIMEOUT}s per arm"
if [ "$PREPARE_ONLY" -ne 0 ]; then
  printf '%s\n' "Mode: prepare-only"
fi
printf '\n'
|
|
103
|
+
|
|
104
|
+
# Refresh $REPO_ROOT/.claude/skills from the skills under
# $REPO_ROOT/config/skills, one directory at a time.
# Globals: REPO_ROOT (read)
# Outputs: one summary line on stdout with the number of mirrored skills.
mirror_skills() {
  local source_root="$REPO_ROOT/config/skills"
  local target_root="$REPO_ROOT/.claude/skills"
  local count=0
  local entry skill tmp_copy

  mkdir -p "$target_root"
  for entry in "$source_root"/*/; do
    # An unmatched glob stays literal; skip anything that is not a directory.
    if [ ! -d "$entry" ]; then
      continue
    fi
    skill="${entry%/}"
    skill="${skill##*/}"
    case "$skill" in
      devlyn:auto-resolve-workspace | devlyn:ideate-workspace | preflight-workspace | roadmap-archival-workspace)
        # Workspace directories are never mirrored.
        continue
        ;;
    esac
    # Copy into a hidden staging directory first, then swap it into place so
    # the destination never holds a half-copied skill.
    tmp_copy="$target_root/.${skill}.staging"
    rm -rf "$tmp_copy"
    cp -R "$entry" "$tmp_copy"
    rm -rf "$target_root/$skill"
    mv "$tmp_copy" "$target_root/$skill"
    count=$((count + 1))
  done
  echo "[frozen-verify] mirrored $count committed skill(s): config/skills/ -> .claude/skills/"
}
|
|
126
|
+
|
|
127
|
+
# Stage the Claude/Codex environment inside an arm's work directory: copy the
# mirrored skills and CLAUDE.md, install the codex shim, and write
# .claude/settings.json with the environment for the run.
# Globals:   REPO_ROOT, HOME, PATH (read)
# Arguments: $1 - work directory
#            $2 - arm name; "solo" marks Codex as blocked in settings.json
# Exits 1 when codex is on PATH but the shim or monitored wrapper is missing.
# When codex is not on PATH only the skills/CLAUDE.md copy happens and a
# warning goes to stderr.
stage_codex_env() {
  local work_dir="$1"
  local arm="$2"

  mkdir -p "$work_dir/.claude"
  cp -R "$REPO_ROOT/.claude/skills" "$work_dir/.claude/skills"
  if [ -f "$REPO_ROOT/CLAUDE.md" ]; then
    cp "$REPO_ROOT/CLAUDE.md" "$work_dir/CLAUDE.md"
  fi

  if ! command -v codex >/dev/null 2>&1; then
    echo "warning: codex not on PATH — pair arm cannot exercise Codex pair-JUDGE" >&2
    return
  fi

  local codex_bin shim wrapper_src wrapper_dst snapshot_path settings_path_value codex_blocked
  codex_bin="$(command -v codex)"
  shim="$REPO_ROOT/scripts/codex-shim/codex"
  wrapper_src="$REPO_ROOT/config/skills/_shared/codex-monitored.sh"
  if [ ! -x "$shim" ]; then
    echo "missing codex shim: $shim" >&2
    exit 1
  fi
  if [ ! -r "$wrapper_src" ]; then
    echo "missing codex wrapper: $wrapper_src" >&2
    exit 1
  fi

  # The shim is installed under the name "codex" in a work-dir-local bin dir
  # that is placed first on the PATH written to settings.json.
  mkdir -p "$work_dir/.devlyn-bin"
  cp "$shim" "$work_dir/.devlyn-bin/codex"
  chmod +x "$work_dir/.devlyn-bin/codex"
  wrapper_dst="$work_dir/.claude/skills/_shared/codex-monitored.sh"

  # Take PATH from the first `export PATH=` line found in the Claude zsh shell
  # snapshots; fall back to the current PATH when none is found.
  snapshot_path=$(
    grep -m1 '^export PATH=' "$HOME/.claude/shell-snapshots/snapshot-zsh-"*.sh 2>/dev/null \
      | head -1 \
      | sed 's/^[^=]*=//' \
      | tr -d '"' \
      || true
  )
  if [ -z "$snapshot_path" ]; then
    snapshot_path="$PATH"
  fi
  settings_path_value="$work_dir/.devlyn-bin:$snapshot_path"

  codex_blocked=0
  if [ "$arm" = "solo" ]; then
    codex_blocked=1
  fi

  python3 - "$work_dir/.claude/settings.json" "$settings_path_value" "$codex_bin" "$wrapper_dst" "$codex_blocked" <<'PY'
import json
import sys

settings_file, path_value, codex_bin, wrapper_path, solo_blocked = sys.argv[1:6]

env = {"CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1", "PATH": path_value}
# Blocked arms only get the block marker; other arms get the real codex
# binary and the monitored wrapper location.
env.update(
    {"CODEX_BLOCKED": "1"}
    if solo_blocked == "1"
    else {"CODEX_REAL_BIN": codex_bin, "CODEX_MONITORED_PATH": wrapper_path}
)
with open(settings_file, "w") as out:
    out.write(json.dumps({"env": env}, indent=2) + "\n")
PY
}
|
|
167
|
+
|
|
168
|
+
# Signal every process group that has a member whose command line mentions the
# work directory (logical or physical path), except this script's own process
# and process group.
# Arguments: $1 - work directory
#            $2 - signal name without the leading dash (e.g. TERM, KILL)
cleanup_workdir_processes() {
  local work_dir="$1"
  local signal="$2"
  local resolved_dir own_pgid

  # Symlink-resolved form of the directory; fall back to the given path when
  # it cannot be entered.
  if ! resolved_dir="$(cd "$work_dir" 2>/dev/null && pwd -P)"; then
    resolved_dir="$work_dir"
  fi
  own_pgid="$(ps -o pgid= -p "$$" | tr -d ' ')"

  ps -axo pid=,pgid=,command= \
    | awk -v logical="$work_dir" -v physical="$resolved_dir" -v me="$$" -v mine="$own_pgid" '
        $1 == me || $2 == mine { next }
        index($0, logical) || index($0, physical) { print $2 }
      ' \
    | sort -u \
    | while IFS= read -r target_pgid; do
        if [ -z "$target_pgid" ]; then
          continue
        fi
        # A negative pid addresses the whole process group; failures (group
        # already gone) are ignored on purpose.
        kill "-$signal" -- "-$target_pgid" 2>/dev/null || true
      done
}
|
|
184
|
+
|
|
185
|
+
# Succeed when <work_dir>/.devlyn/runs/<any>/pipeline.state.json exists.
# Arguments: $1 - work directory
# Returns:   0 when such a file is present, 1 otherwise.
archive_ready() {
  local work_dir="$1"
  python3 - "$work_dir" <<'PY'
import sys
from pathlib import Path

runs_dir = Path(sys.argv[1], ".devlyn", "runs")
found = runs_dir.is_dir() and next(runs_dir.glob("*/pipeline.state.json"), None) is not None
sys.exit(0 if found else 1)
PY
}
|
|
193
|
+
|
|
194
|
+
# Build <result_dir>/summary.json for one arm from its archived run state,
# findings files and transcript, and echo the same JSON to stdout.
# Arguments: $1 - arm result directory (reads run-archive/ and transcript.txt)
#            $2 - elapsed wall-clock seconds (integer)
#            $3 - exit status of the invocation (124 is reported as timeout)
# A pipeline.state.json that is unreadable, truncated or not a JSON object is
# treated as empty state with a warning on stderr, so an arm that was killed
# mid-write still gets a summary instead of aborting the run under `set -e`.
summarize_arm() {
  local result_dir="$1"
  local elapsed="$2"
  local invoke_exit="$3"
  python3 - "$result_dir" "$elapsed" "$invoke_exit" <<'PY'
import json, pathlib, sys


def as_dict(value):
    """Return value when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def load_state(path):
    """Return the pipeline state dict, or {} if missing, unreadable or malformed."""
    if not path.is_file():
        return {}
    try:
        data = json.loads(path.read_text())
    except (OSError, ValueError) as exc:
        print(f"warning: ignoring unreadable {path.name}: {exc}", file=sys.stderr)
        return {}
    return as_dict(data)


result_dir = pathlib.Path(sys.argv[1])
elapsed = int(sys.argv[2])
invoke_exit = int(sys.argv[3])
archive = result_dir / "run-archive"
state = load_state(archive / "pipeline.state.json")
phases = as_dict(state.get("phases"))
verify = as_dict(phases.get("verify"))
sub_verdicts = verify.get("sub_verdicts")
pair_trigger = verify.get("pair_trigger") or as_dict(state.get("verify")).get("pair_trigger")

# Findings come from the merged file when present; otherwise from every
# verify findings file except the mechanical one, de-duplicated by path.
findings = []
finding_paths = []
merged_path = archive / "verify-merged.findings.jsonl"
if merged_path.is_file():
    finding_paths.append(merged_path)
else:
    candidates = []
    for name in ("verify.findings.jsonl", "verify.pair-judge.findings.jsonl"):
        candidates.append(archive / name)
    candidates.extend(sorted(archive.glob("verify.findings*.jsonl")))
    candidates.extend(sorted(archive.glob("verify.*findings*.jsonl")))
    seen = set()
    for candidate_path in candidates:
        if candidate_path.name == "verify-mechanical.findings.jsonl":
            continue
        if candidate_path in seen or not candidate_path.is_file():
            continue
        seen.add(candidate_path)
        finding_paths.append(candidate_path)
findings_source = "+".join(path.name for path in finding_paths) if finding_paths else "missing"

# Keep only JSON-object lines whose severity/level is one of the known values.
finding_severities = {"CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"}
for findings_path in finding_paths:
    for line in findings_path.read_text().splitlines():
        if line.strip():
            try:
                parsed = json.loads(line)
            except json.JSONDecodeError:
                continue
            if not isinstance(parsed, dict):
                continue
            sev = str(parsed.get("severity") or parsed.get("level") or "").upper()
            if sev not in finding_severities:
                continue
            findings.append(parsed)

# When no finding lines were collected, fall back to the counts in the state.
merged = as_dict(verify.get("merged"))
merged_findings_count = sum(
    int(merged.get(k) or 0) for k in ("critical", "high", "medium", "low")
)
findings_count = len(findings) if findings else merged_findings_count
severity_counts = {}
for finding in findings:
    if isinstance(finding, dict):
        sev = str(finding.get("severity") or finding.get("level") or "unknown").upper()
        severity_counts[sev] = severity_counts.get(sev, 0) + 1

transcript_path = result_dir / "transcript.txt"
transcript = transcript_path.read_text(errors="replace") if transcript_path.is_file() else ""
invoke_failure_reason = None
if invoke_exit == 124:
    invoke_failure_reason = "timeout"
elif "You've hit your limit" in transcript:
    invoke_failure_reason = "provider_limit"

summary = {
    "elapsed_seconds": elapsed,
    "invoke_exit": invoke_exit,
    "timed_out": invoke_exit == 124,
    "invoke_failure_reason": invoke_failure_reason,
    "terminal_verdict": as_dict(phases.get("final_report")).get("verdict"),
    "verify_verdict": verify.get("verdict"),
    "sub_verdicts": sub_verdicts,
    "pair_trigger": pair_trigger,
    # Pair mode counts as active when a second-judge verdict was recorded or
    # the state says so explicitly.
    "pair_mode": bool(isinstance(sub_verdicts, dict) and (
        sub_verdicts.get("judge_codex") is not None
        or sub_verdicts.get("pair_judge") is not None
    ))
    or bool(verify.get("pair_mode")),
    "verify_findings_count": findings_count,
    "verify_findings_source": findings_source if finding_paths else (
        "state.merged" if merged_findings_count else "missing"
    ),
    "merged_findings_counts": merged,
    "severity_counts": severity_counts,
    "verify_findings_severities": [f.get("severity") for f in findings if isinstance(f, dict)],
}
(result_dir / "summary.json").write_text(json.dumps(summary, indent=2) + "\n")
print(json.dumps(summary, indent=2))
PY
}
|
|
286
|
+
|
|
287
|
+
# Recreate the work directory as a git-free copy of the base repo.
# Globals:   BASE_REPO (read)
# Arguments: $1 - work directory; removed and recreated
# When BASE_REPO has a .git directory the copy is the committed HEAD tree
# (via `git archive`); otherwise it is a plain recursive copy with any .git
# entry removed afterwards.
copy_base_repo() {
  local dest="$1"

  rm -rf "$dest"
  mkdir -p "$dest"
  if [ ! -d "$BASE_REPO/.git" ]; then
    cp -R "$BASE_REPO"/. "$dest"/
    rm -rf "$dest/.git"
  else
    git -C "$BASE_REPO" archive --format=tar HEAD | LC_ALL=C tar -xf - -C "$dest"
  fi
}
|
|
298
|
+
|
|
299
|
+
# Prepare a fresh work tree for one arm, apply the frozen diff, run
# `claude` in verify-only mode under a watchdog, archive the run artifacts and
# write the arm summary.
# Globals:   RESULT_ROOT, RUN_ID, FIXTURE, FIX_DIR, SETUP, SPEC, EXPECTED,
#            DIFF_PATH, TIMEOUT, PREPARE_ONLY, RESUME_COMPLETED_ARMS (read)
# Arguments: $1 - arm name ("solo" blocks Codex)
#            $2 - extra flag appended to the /devlyn:resolve invocation
#                 (may be empty)
# Returns:   0 when the arm was reused, prepared or summarized; 1 when the
#            frozen diff does not apply.
run_arm() {
  local arm="$1"
  local pair_flag="$2"
  local result_dir="$RESULT_ROOT/$arm"
  local work_dir="/tmp/bench-${RUN_ID}-${FIXTURE}-${arm}"

  # With --resume-completed-arms, keep an existing summary whose invocation
  # exited 0 instead of re-running the arm.
  if [ "$RESUME_COMPLETED_ARMS" -eq 1 ] && [ "$PREPARE_ONLY" -eq 0 ] && [ -f "$result_dir/summary.json" ]; then
    if python3 - "$result_dir/summary.json" <<'PY'
import json
import sys

summary = json.load(open(sys.argv[1]))
raise SystemExit(0 if summary.get("invoke_exit") == 0 else 1)
PY
    then
      echo "[frozen-verify] $arm: reuse completed summary"
      return 0
    fi
  fi
  mkdir -p "$result_dir"
  copy_base_repo "$work_dir"

  stage_codex_env "$work_dir" "$arm"

  # Commit 1: base repo plus staged environment.
  (cd "$work_dir" && git init -q && git add -A && git -c user.email=b@b -c user.name=b commit -q -m baseline)

  # Commit 2: whatever the fixture's setup script changes (may be nothing).
  if [ -s "$SETUP" ]; then
    chmod +x "$SETUP"
    (cd "$work_dir" && "$SETUP") > "$result_dir/setup.log" 2>&1
    (cd "$work_dir" && git add -A && git -c user.email=b@b -c user.name=b commit -q --allow-empty -m fixture-setup)
  fi

  mkdir -p "$work_dir/docs/roadmap/phase-1" "$work_dir/.devlyn"
  cp "$SPEC" "$work_dir/docs/roadmap/phase-1/$FIXTURE.md"
  cp "$DIFF_PATH" "$work_dir/.devlyn/external-diff.patch"
  # Write .devlyn/spec-verify.json only when the fixture lists verification
  # commands.
  python3 - "$EXPECTED" "$work_dir/.devlyn/spec-verify.json" <<'PY'
import json, os, sys
expected = json.load(open(sys.argv[1]))
out_path = sys.argv[2]
commands = expected.get("verification_commands", [])
if not commands:
    raise SystemExit(0)
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w") as f:
    json.dump({"verification_commands": commands}, f, indent=2)
    f.write("\n")
PY

  # Commit 3: the frozen external implementation.
  if ! (cd "$work_dir" && git apply .devlyn/external-diff.patch); then
    echo "[frozen-verify] $arm: diff failed to apply" >&2
    return 1
  fi
  (cd "$work_dir" && git add -A && git -c user.email=b@b -c user.name=b commit -q -m external-implementation)

  cat > "$result_dir/input.md" <<EOF
Use the \`/devlyn:resolve --verify-only .devlyn/external-diff.patch --spec docs/roadmap/phase-1/$FIXTURE.md --engine claude ${pair_flag}\` skill to run VERIFY-ONLY mode.

The diff at .devlyn/external-diff.patch represents an external implementation already applied to the work tree. Run PHASE 5 (VERIFY) only — skip PLAN, IMPLEMENT, BUILD_GATE, CLEANUP per the skill's verify-only mode contract.

Important: \`--engine claude\` selects the primary VERIFY judge only. It must not suppress gated VERIFY pair-mode. If the spec/phase trigger makes pair-mode eligible with non-empty reasons, the skill must spawn the OTHER-engine judge unless Codex is blocked/unavailable at the invocation layer.

Report the terminal verdict, list of files in the diff, and any findings.
EOF

  if [ "$PREPARE_ONLY" -eq 1 ]; then
    echo "[frozen-verify] $arm prepared at $work_dir"
    return 0
  fi

  local start end elapsed invoke_exit watchdog timeout_flag complete_flag
  start=$(date +%s)
  timeout_flag="$result_dir/.timed_out"
  complete_flag="$result_dir/.completed"
  rm -f "$timeout_flag" "$complete_flag"
  # From here until `set -e` below, failures are handled by hand.
  set +e
  # Job control on: the child becomes its own process-group leader, so the
  # watchdog can signal the whole group through the negative pid.
  set -m
  (
    # errexit is off here, so a failed cd must stop the launch explicitly
    # rather than run claude in the caller's directory.
    cd "$work_dir" || exit 1
    export PATH="$work_dir/.devlyn-bin:$PATH"
    [ "$arm" = "solo" ] && export CODEX_BLOCKED=1
    export BENCH_WORKDIR="$work_dir"
    export BENCH_FIXTURE_DIR="$FIX_DIR"
    exec claude \
      -p "$(cat "$result_dir/input.md")" \
      --dangerously-skip-permissions \
      --effort xhigh \
      --strict-mcp-config \
      --mcp-config '{"mcpServers":{}}' \
      --debug-file "$result_dir/claude-debug.log" \
      </dev/null
  ) > "$result_dir/transcript.txt" 2>&1 &
  local child_pid=$!
  set +m
  # Watchdog: stop the child once a run archive with pipeline.state.json
  # appears (completion) or once the deadline passes (timeout). Either way it
  # drops a flag file first, then escalates TERM -> KILL.
  (
    local deadline now
    deadline=$(($(date +%s) + TIMEOUT))
    while kill -0 "$child_pid" 2>/dev/null; do
      if archive_ready "$work_dir"; then
        : > "$complete_flag"
        kill -TERM -- "-$child_pid" 2>/dev/null
        cleanup_workdir_processes "$work_dir" TERM
        sleep 2
        kill -KILL -- "-$child_pid" 2>/dev/null
        cleanup_workdir_processes "$work_dir" KILL
        exit 0
      fi
      now=$(date +%s)
      [ "$now" -lt "$deadline" ] || break
      sleep 5
    done
    if kill -0 "$child_pid" 2>/dev/null; then
      : > "$timeout_flag"
      kill -TERM -- "-$child_pid" 2>/dev/null
      cleanup_workdir_processes "$work_dir" TERM
      sleep 5
      kill -KILL -- "-$child_pid" 2>/dev/null
      cleanup_workdir_processes "$work_dir" KILL
    fi
  ) &
  watchdog=$!
  wait "$child_pid"
  invoke_exit=$?
  if [ -f "$timeout_flag" ] || [ -f "$complete_flag" ]; then
    # The watchdog started this shutdown. Let it finish: killing it now (the
    # child usually dies right after TERM) would skip the KILL escalation and
    # leave TERM-resistant work-dir processes running.
    wait "$watchdog" 2>/dev/null || true
  else
    # The child ended on its own; the watchdog is no longer needed.
    kill -TERM "$watchdog" 2>/dev/null || true
    wait "$watchdog" 2>/dev/null || true
  fi
  # Flags override the raw wait status: timeout is reported as 124, a
  # watchdog-detected completion as 0.
  if [ -f "$timeout_flag" ]; then
    invoke_exit=124
    rm -f "$timeout_flag"
  elif [ -f "$complete_flag" ]; then
    invoke_exit=0
    rm -f "$complete_flag"
  fi
  set -e
  end=$(date +%s)
  elapsed=$((end - start))

  # Archive the last run directory in sort order; when there is none, fall
  # back to the loose files directly under .devlyn.
  local run_dir
  run_dir=$(find "$work_dir/.devlyn/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -1 || true)
  if [ -n "$run_dir" ]; then
    rm -rf "$result_dir/run-archive"
    cp -R "$run_dir" "$result_dir/run-archive"
    # Backfill the state file from .devlyn when the run dir lacks one.
    [ -f "$result_dir/run-archive/pipeline.state.json" ] \
      || [ ! -f "$work_dir/.devlyn/pipeline.state.json" ] \
      || cp "$work_dir/.devlyn/pipeline.state.json" "$result_dir/run-archive/pipeline.state.json"
  elif [ -d "$work_dir/.devlyn" ]; then
    rm -rf "$result_dir/run-archive"
    mkdir -p "$result_dir/run-archive"
    find "$work_dir/.devlyn" -maxdepth 1 -type f -exec cp {} "$result_dir/run-archive/" \;
  fi
  # Also collect verify findings left directly under .devlyn, excluding the
  # mechanical findings file.
  if [ -d "$work_dir/.devlyn" ] && [ -d "$result_dir/run-archive" ]; then
    find "$work_dir/.devlyn" -maxdepth 1 -type f \
      \( -name 'verify.findings*.jsonl' -o -name 'verify.*findings*.jsonl' -o -name 'verify-merged.findings.jsonl' \) \
      ! -name 'verify-mechanical.findings.jsonl' \
      -exec cp {} "$result_dir/run-archive/" \;
  fi
  summarize_arm "$result_dir" "$elapsed" "$invoke_exit"
}
|
|
454
|
+
|
|
455
|
+
# Refresh the mirrored skills once, then run both arms against the same diff.
mirror_skills

echo "[frozen-verify] ► solo"
run_arm solo ""

# The pair arm only gets --pair-verify in forced mode; in gated mode it is
# invoked exactly like the solo arm.
echo "[frozen-verify] ► pair"
PAIR_VERIFY_FLAG=""
case "$PAIR_MODE" in
  forced) PAIR_VERIFY_FLAG="--pair-verify" ;;
esac
run_arm pair "$PAIR_VERIFY_FLAG"
|
|
464
|
+
|
|
465
|
+
python3 - "$RESULT_ROOT" "$PAIR_MODE" <<'PY'
# Combine <root>/solo/summary.json and <root>/pair/summary.json into
# <root>/compare.json (also printed to stdout), adding a "comparison" section.
#   argv[1] - result root directory
#   argv[2] - requested pair mode (compared against "gated" below)
import json, pathlib, sys

root = pathlib.Path(sys.argv[1])
pair_mode_requested = sys.argv[2]

# Verdict ordering used for the "lift" checks; an absent or unrecognised
# verdict ranks the same as PASS.
RANK = {
    None: 0,
    "PASS": 0,
    "PASS_WITH_ISSUES": 1,
    "NEEDS_WORK": 2,
    "BLOCKED": 3,
}
SEVERITIES = ("LOW", "MEDIUM", "HIGH", "CRITICAL")


def load_summary(path):
    """Return the arm summary as a dict.

    A summary that is absent, unreadable, not valid JSON, or not a JSON object
    is reported as {"missing": True, ...} so one damaged arm cannot abort the
    whole comparison with a traceback.
    """
    if not path.is_file():
        return {"missing": True}
    try:
        # JSON is UTF-8; do not depend on the locale's default encoding.
        data = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as exc:  # ValueError covers JSON and decode errors
        return {"missing": True, "unreadable": str(exc)}
    if not isinstance(data, dict):
        return {"missing": True, "unreadable": "summary.json is not a JSON object"}
    return data


def rank_of(verdict):
    """Map a verdict string (or None) to its numeric rank."""
    return RANK.get(verdict, 0)


def low_or_worse(summary):
    """Total the LOW/MEDIUM/HIGH/CRITICAL entries of severity_counts."""
    counts = summary.get("severity_counts") or {}
    return sum(counts.get(level, 0) for level in SEVERITIES)


out = {arm: load_summary(root / arm / "summary.json") for arm in ("solo", "pair")}
solo = out["solo"]
pair = out["pair"]

solo_rank = rank_of(solo.get("verify_verdict"))
pair_rank = rank_of(pair.get("verify_verdict"))
pair_sub = pair.get("sub_verdicts") or {}
pair_primary_verdict = pair_sub.get("judge")
pair_judge_verdict = pair_sub.get("pair_judge")
pair_primary_rank = rank_of(pair_primary_verdict)
pair_judge_rank = rank_of(pair_judge_verdict)
pair_trigger = pair.get("pair_trigger") or {}
pair_ran = bool(pair.get("pair_mode"))
# A lift only counts when the pair arm's final verdict is NEEDS_WORK or worse.
pair_is_blocking = pair_rank >= RANK["NEEDS_WORK"]

out["comparison"] = {
    "pair_mode_requested": pair_mode_requested,
    # Gated run whose trigger was eligible with reasons, yet pair mode is not set.
    "pair_trigger_missed": bool(
        pair_mode_requested == "gated"
        and pair_trigger.get("eligible") is True
        and pair_trigger.get("reasons")
        and not pair.get("pair_mode")
    ),
    "pair_found_more_findings": (pair.get("verify_findings_count") or 0)
    > (solo.get("verify_findings_count") or 0),
    "pair_found_more_low_or_worse": low_or_worse(pair) > low_or_worse(solo),
    "pair_verdict_lift": pair_ran and pair_rank > solo_rank and pair_is_blocking,
    "pair_internal_verdict_lift": pair_ran
    and pair_judge_rank > pair_primary_rank
    and pair_is_blocking,
    "solo_verdict": solo.get("verify_verdict"),
    "pair_verdict": pair.get("verify_verdict"),
    "pair_primary_verdict": pair_primary_verdict,
    "pair_judge_verdict": pair_judge_verdict,
}

rendered = json.dumps(out, indent=2)
(root / "compare.json").write_text(rendered + "\n", encoding="utf-8")
print(rendered)
PY
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# run-full-pipeline-pair-candidate.sh — measure full-pipeline L2/pair candidates.
|
|
3
|
+
#
|
|
4
|
+
# Runs bare + solo_claude first and applies headroom-gate.py. Only if the set
|
|
5
|
+
# leaves room for L2 does it run l2_gated, rejudge, and apply
|
|
6
|
+
# full-pipeline-pair-gate.py.
|
|
7
|
+
|
|
8
|
+
set -euo pipefail
|
|
9
|
+
|
|
10
|
+
# Print the command-line synopsis to stderr and terminate the script.
#   $1 - exit status to use; defaults to 1 (usage error). --help passes 0.
usage() {
  local code="${1:-1}"
  cat >&2 <<'EOF'
usage: run-full-pipeline-pair-candidate.sh [options] <fixture> [<fixture> ...]

Fixture names must start with "F" followed by a digit
(e.g. F10-persist-write-collision); anything else is rejected as an unknown arg.

Options:
  --run-id ID
  --bare-max N
  --solo-max N
  --min-fixtures N
  --min-pair-margin N
  --max-pair-solo-wall-ratio N
  --pair-arm ARM
  --reuse-calibrated-from RUN_ID
  -h, --help
EOF
  exit "$code"
}
|
|
27
|
+
|
|
28
|
+
# Defaults; each is overridden by the matching command-line option.
RUN_ID=""
BARE_MAX=60
SOLO_MAX=80
MIN_FIXTURES=2
MIN_PAIR_MARGIN=5
MAX_PAIR_SOLO_WALL_RATIO=""   # empty: the option is not forwarded to the pair gate
PAIR_ARM="l2_gated"
REUSE_CALIBRATED_FROM=""
FIXTURES=()

# Fail with a usage error when a value-taking option is the last argument.
# Without this check "$2" is unset and the script dies with an opaque
# "unbound variable" message under `set -u`.
#   "$@" - the remaining command-line arguments, option first.
need_value() {
  [ $# -ge 2 ] || { echo "missing value for $1" >&2; usage; }
}

# Parse the command line into the globals above.
#   "$@" - the script's arguments.
# Arguments matching F[0-9]* are collected as fixtures; anything unrecognised
# prints "unknown arg" and exits through usage.
parse_args() {
  while [ $# -gt 0 ]; do
    case "$1" in
      --run-id) need_value "$@"; RUN_ID="$2"; shift 2 ;;
      --bare-max) need_value "$@"; BARE_MAX="$2"; shift 2 ;;
      --solo-max) need_value "$@"; SOLO_MAX="$2"; shift 2 ;;
      --min-fixtures) need_value "$@"; MIN_FIXTURES="$2"; shift 2 ;;
      --min-pair-margin) need_value "$@"; MIN_PAIR_MARGIN="$2"; shift 2 ;;
      --max-pair-solo-wall-ratio) need_value "$@"; MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2 ;;
      --pair-arm) need_value "$@"; PAIR_ARM="$2"; shift 2 ;;
      --reuse-calibrated-from) need_value "$@"; REUSE_CALIBRATED_FROM="$2"; shift 2 ;;
      -h|--help) usage 0 ;;
      F[0-9]*) FIXTURES+=("$1"); shift ;;
      *) echo "unknown arg: $1" >&2; usage ;;
    esac
  done
}

parse_args "$@"
# At least one fixture is required.
[ ${#FIXTURES[@]} -gt 0 ] || usage
|
|
53
|
+
|
|
54
|
+
# BENCH_ROOT is the parent of the directory holding this script; REPO_ROOT is
# two levels above BENCH_ROOT. Both are resolved to absolute paths.
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"

# Without an explicit --run-id, derive one from the UTC timestamp and the
# repository's short HEAD SHA ("nogit" when git cannot provide one).
if [ -z "$RUN_ID" ]; then
  TS=$(date -u +%Y%m%dT%H%M%SZ)
  SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
  RUN_ID="${TS}-${SHA}-full-pipeline-pair"
fi

# Run banner.
printf '\n'
printf '%s\n' "═══ Full-Pipeline Pair Candidate Run ═══"
printf '%s\n' "Run-id: $RUN_ID"
printf '%s\n' "Fixtures: ${FIXTURES[*]}"
printf '%s\n' "Arms: bare solo_claude $PAIR_ARM"
if [ -n "$REUSE_CALIBRATED_FROM" ]; then
  printf '%s\n' "Reuse: bare+solo from $REUSE_CALIBRATED_FROM"
fi
printf '\n'
|
|
70
|
+
|
|
71
|
+
# Mirror each skill directory under config/skills/ into .claude/skills/.
# A skill is first copied to a hidden ".<name>.staging" directory and then
# moved over the destination, so the old copy is only removed once the new
# copy is complete.
SRC_SKILLS="$REPO_ROOT/config/skills"
DST_SKILLS="$REPO_ROOT/.claude/skills"
mkdir -p "$DST_SKILLS"

mirrored=0
for src_dir in "$SRC_SKILLS"/*/; do
  # An unmatched glob is left as a literal pattern; skip non-directories.
  if [ ! -d "$src_dir" ]; then
    continue
  fi
  name=$(basename "$src_dir")

  # These workspace directories are excluded from the mirror.
  case "$name" in
    devlyn:auto-resolve-workspace) continue ;;
    devlyn:ideate-workspace) continue ;;
    preflight-workspace) continue ;;
    roadmap-archival-workspace) continue ;;
  esac

  staging="$DST_SKILLS/.${name}.staging"
  rm -rf "$staging"
  cp -R "$src_dir" "$staging"
  rm -rf "$DST_SKILLS/$name"
  mv "$staging" "$DST_SKILLS/$name"
  mirrored=$((mirrored + 1))
done

echo "[full-pipeline-pair] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
|
|
90
|
+
|
|
91
|
+
# Reuse one arm's results from an earlier run instead of re-running it.
# Globals:   BENCH_ROOT, REUSE_CALIBRATED_FROM, RUN_ID (read)
# Arguments: $1 - fixture id
#            $2 - arm name
# Outputs:   a progress line on stdout; errors on stderr
# Returns:   0 when the arm was copied or already has a result.json in this
#            run; exits the script with 1 when the source arm directory or its
#            result.json is missing.
copy_calibrated_arm() {
  # The :? guards keep the rm -rf below from ever seeing a path with an
  # empty fixture or arm component.
  local fid="${1:?fixture id required}"
  local arm="${2:?arm name required}"
  local src="$BENCH_ROOT/results/$REUSE_CALIBRATED_FROM/$fid/$arm"
  local dst="$BENCH_ROOT/results/$RUN_ID/$fid/$arm"
  local dst_parent staging

  if [ -f "$dst/result.json" ]; then
    echo "[full-pipeline-pair] reuse skip: $fid / $arm already exists in $RUN_ID"
    return 0
  fi
  [ -d "$src" ] || { echo "reuse source missing: $src" >&2; exit 1; }
  [ -f "$src/result.json" ] || { echo "reuse source missing result.json: $src" >&2; exit 1; }

  dst_parent="$(dirname "$dst")"
  mkdir -p "$dst_parent"

  # Copy into a staging directory and swap it in. A plain `cp -R src dst`
  # would nest the copy as dst/<arm>/ whenever dst already exists as a
  # directory (e.g. a partial earlier run without result.json).
  staging="$dst_parent/.${arm}.staging"
  rm -rf "$staging"
  cp -R "$src" "$staging"
  rm -rf "$dst"
  mv "$staging" "$dst"
  echo "[full-pipeline-pair] reused $fid / $arm from $REUSE_CALIBRATED_FROM"
}
|
|
106
|
+
|
|
107
|
+
# Phase 1: obtain bare and solo_claude results for every fixture (copied from
# a previous run when --reuse-calibrated-from is set, otherwise executed now),
# judge each fixture, then apply the headroom gate.
for fid in "${FIXTURES[@]}"; do
  for arm in bare solo_claude; do
    if [ -n "$REUSE_CALIBRATED_FROM" ]; then
      copy_calibrated_arm "$fid" "$arm"
    else
      echo "[full-pipeline-pair] ► $fid / $arm"
      # A failing arm does not stop the run; it is only reported here.
      if ! bash "$BENCH_ROOT/scripts/run-fixture.sh" \
        --fixture "$fid" --arm "$arm" --run-id "$RUN_ID"; then
        echo "[full-pipeline-pair] ✗ $fid / $arm (arm failure tolerated; gate will fail if dirty)"
      fi
    fi
  done

  echo "[full-pipeline-pair] ► headroom judge $fid"
  if ! bash "$BENCH_ROOT/scripts/judge.sh" --fixture "$fid" --run-id "$RUN_ID"; then
    echo "[full-pipeline-pair] ✗ headroom judge failed for $fid"
  fi
done

# Not guarded: under `set -e` a non-zero exit from the headroom gate ends the
# script here, before any pair arm is run.
headroom_args=(
  --run-id "$RUN_ID"
  --bare-max "$BARE_MAX"
  --solo-max "$SOLO_MAX"
  --min-fixtures "$MIN_FIXTURES"
  --out-json "$BENCH_ROOT/results/$RUN_ID/headroom-gate.json"
  --out-md "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
)
python3 "$BENCH_ROOT/scripts/headroom-gate.py" "${headroom_args[@]}"
|
|
137
|
+
|
|
138
|
+
# Phase 2: run the pair arm for every fixture, re-judge, apply the
# full-pipeline pair gate, and print its markdown report.
for fid in "${FIXTURES[@]}"; do
  echo "[full-pipeline-pair] ► $fid / $PAIR_ARM"
  # A failing arm does not stop the run; it is only reported here.
  bash "$BENCH_ROOT/scripts/run-fixture.sh" \
    --fixture "$fid" --arm "$PAIR_ARM" --run-id "$RUN_ID" \
    || echo "[full-pipeline-pair] ✗ $fid / $PAIR_ARM (arm failure tolerated; gate will fail if dirty)"

  echo "[full-pipeline-pair] ► final judge $fid"
  bash "$BENCH_ROOT/scripts/judge.sh" --fixture "$fid" --run-id "$RUN_ID" \
    || echo "[full-pipeline-pair] ✗ final judge failed for $fid"
done

pair_gate_md="$BENCH_ROOT/results/$RUN_ID/full-pipeline-pair-gate.md"
pair_args=(
  --run-id "$RUN_ID"
  --bare-max "$BARE_MAX"
  --solo-max "$SOLO_MAX"
  --min-fixtures "$MIN_FIXTURES"
  --min-pair-margin "$MIN_PAIR_MARGIN"
  --pair-arm "$PAIR_ARM"
  --out-json "$BENCH_ROOT/results/$RUN_ID/full-pipeline-pair-gate.json"
  --out-md "$pair_gate_md"
)
# The wall-ratio limit is only forwarded when the caller supplied one.
[ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || pair_args+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")

# Capture the gate's status instead of letting `set -e` abort immediately, so
# the report is still printed when the gate exits non-zero.
pair_gate_rc=0
python3 "$BENCH_ROOT/scripts/full-pipeline-pair-gate.py" "${pair_args[@]}" || pair_gate_rc=$?

if [ -f "$pair_gate_md" ]; then
  cat "$pair_gate_md"
elif [ "$pair_gate_rc" -eq 0 ]; then
  # The gate reported success but left no report: still an error, as before.
  echo "[full-pipeline-pair] gate report missing: $pair_gate_md" >&2
  pair_gate_rc=1
fi
# Propagate the gate's own exit status.
[ "$pair_gate_rc" -eq 0 ] || exit "$pair_gate_rc"
|