devlyn-cli 2.0.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/README.md +1 -1
- package/benchmark/auto-resolve/README.md +318 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/bin/devlyn.js +56 -10
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +312 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:ideate/SKILL.md +1 -1
- package/config/skills/devlyn:resolve/SKILL.md +62 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +32 -0
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Regression test for the SWE-bench frozen VERIFY case importer.
#
# Builds a throwaway git repository plus a one-line model patch, then drives
# the importer, prediction collector, corpus preparer, frozen-verify pair
# runner, corpus runner and matrix reporter against it, asserting on the files
# and log lines each step produces. Any failed assertion aborts the script
# (set -e); the EXIT trap removes every artifact on all exit paths.
#
# NOTE(review): the fetch-swebench-instances.py step presumably needs network
# access to download a dataset row — confirm before running offline.
set -euo pipefail

ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
SCRIPTS_DIR="$ROOT/benchmark/auto-resolve/scripts"
RESULTS_DIR="$ROOT/benchmark/auto-resolve/results"

# Run IDs are declared up front so the cleanup trap can reference them even
# when the script aborts before the section that uses them (set -u).
RESUME_RUN_ID="swebench-resume-arm-test-local__repo-1"
PROVIDER_LIMIT_RUN_ID="swebench-provider-limit-test-local__repo-1"
RUN_ID="swebench-gate-only-test-local__repo-1"

TMP="$(mktemp -d)"

# Remove everything this test creates, including the artifacts that live
# outside $TMP (bench work trees under /tmp and run directories under the
# repository's results directory). Running this from the EXIT trap means a
# failed assertion no longer leaves stale run directories behind.
# NOTE(review): the resume and fail-path runs may create further /tmp/bench-*
# work trees that were never cleaned by this test — confirm against the runner.
cleanup() {
  rm -rf -- "$TMP"
  rm -rf -- \
    /tmp/bench-swebench-frozen-case-test-local__repo-1-solo \
    /tmp/bench-swebench-frozen-case-test-local__repo-1-pair \
    /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo \
    /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair \
    "$RESULTS_DIR/swebench-frozen-case-test" \
    "$RESULTS_DIR/swebench-frozen-corpus-test-1-local__repo-1" \
    "$RESULTS_DIR/swebench-frozen-corpus-fail-test-1-local__repo-1" \
    "$RESULTS_DIR/$RESUME_RUN_ID" \
    "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID" \
    "$RESULTS_DIR/$RUN_ID"
}
trap cleanup EXIT

# --- Fixture repository: one commit containing "hello", patch flips it to "goodbye".
REPO="$TMP/repo"
mkdir -p "$REPO"
git -C "$REPO" init -q
git -C "$REPO" config user.email bench@example.com
git -C "$REPO" config user.name bench
printf 'hello\n' > "$REPO/app.txt"
git -C "$REPO" add app.txt
git -C "$REPO" commit -q -m base
BASE_SHA="$(git -C "$REPO" rev-parse HEAD)"

printf 'goodbye\n' > "$REPO/app.txt"
git -C "$REPO" diff > "$TMP/model.patch"
# Restore the work tree so the repository is back at the base commit content.
git -C "$REPO" checkout -q -- app.txt

# Unquoted heredoc delimiter: $BASE_SHA must expand into the JSON.
cat > "$TMP/instance.json" <<JSON
{
  "instance_id": "local__repo-1",
  "repo": "local/repo",
  "base_commit": "$BASE_SHA",
  "problem_statement": "Change app.txt so it says goodbye instead of hello.",
  "version": "test",
  "issue_url": "https://example.test/issue",
  "pr_url": "https://example.test/pr"
}
JSON

# --- Single-case importer.
python3 "$SCRIPTS_DIR/prepare-swebench-frozen-case.py" \
  --instance-json "$TMP/instance.json" \
  --model-patch "$TMP/model.patch" \
  --cases-root "$TMP/cases" \
  --repos-root "$TMP/repos" \
  --repo-dir "$REPO" \
  --timeout-seconds 60 > "$TMP/prepare.json"

CASE_DIR="$TMP/cases/local__repo-1"
BASE_REPO="$TMP/repos/local__repo-${BASE_SHA:0:12}"
test -f "$CASE_DIR/spec.md"
test -f "$CASE_DIR/model.patch"
test -x "$CASE_DIR/setup.sh"
grep -q 'SWE-bench local__repo-1' "$CASE_DIR/spec.md"
grep -q -- '--pair-mode gated' "$CASE_DIR/run-command.txt"

# --- Instance fetcher: one row with the required keys populated.
python3 "$SCRIPTS_DIR/fetch-swebench-instances.py" \
  --dataset lite \
  --limit 1 \
  --out "$TMP/fetched-lite.jsonl" > "$TMP/fetch.json"
grep -q '"rows_written": 1' "$TMP/fetch.json"
python3 - "$TMP/fetched-lite.jsonl" <<'PY'
import json, pathlib, sys
row = json.loads(pathlib.Path(sys.argv[1]).read_text().splitlines()[0])
for key in ("instance_id", "repo", "base_commit", "problem_statement"):
    assert row.get(key), key
PY

# --- Build the instances/predictions JSONL inputs for the batch tools.
python3 - "$TMP/instance.json" "$TMP/instances.jsonl" "$TMP/model.patch" "$TMP/predictions.jsonl" <<'PY'
import json, pathlib, sys
instance = json.loads(pathlib.Path(sys.argv[1]).read_text())
pathlib.Path(sys.argv[2]).write_text(json.dumps(instance) + "\n")
patch = pathlib.Path(sys.argv[3]).read_text()
pathlib.Path(sys.argv[4]).write_text(json.dumps({
    "instance_id": "local__repo-1",
    "model_name_or_path": "local-test",
    "model_patch": patch,
}) + "\n")
PY

# --- Prediction collector: patch-root layout -> predictions JSONL.
mkdir -p "$TMP/patch-root/local__repo-1"
cp "$TMP/model.patch" "$TMP/patch-root/local__repo-1/patch.diff"
python3 "$SCRIPTS_DIR/collect-swebench-predictions.py" \
  --patch-root "$TMP/patch-root" \
  --instances-jsonl "$TMP/instances.jsonl" \
  --model-name local-patch-root \
  --out "$TMP/collected-predictions.jsonl" > "$TMP/collect.json"
grep -q '"predictions_written": 1' "$TMP/collect.json"
python3 - "$TMP/collected-predictions.jsonl" <<'PY'
import json, pathlib, sys
row = json.loads(pathlib.Path(sys.argv[1]).read_text())
assert row["instance_id"] == "local__repo-1"
assert row["model_name_or_path"] == "local-patch-root"
assert row["model_patch"].endswith("\n")
PY

# --- Corpus preparer: writes the manifest used by the corpus runner below.
rm -rf "$TMP/cases-batch" "$TMP/repos-batch"
python3 "$SCRIPTS_DIR/prepare-swebench-frozen-corpus.py" \
  --instances-jsonl "$TMP/instances.jsonl" \
  --predictions-jsonl "$TMP/predictions.jsonl" \
  --cases-root "$TMP/cases-batch" \
  --repos-root "$TMP/repos-batch" \
  --repo-dir "$REPO" \
  --out-manifest "$TMP/manifest.json" > "$TMP/batch.json"
grep -q '"prepared_count": 1' "$TMP/manifest.json"
test -f "$TMP/cases-batch/local__repo-1/model.patch"

# --- Pair runner, prepare-only: both arm work trees carry the patched file
# and no spec-verify carrier file.
bash "$SCRIPTS_DIR/run-frozen-verify-pair.sh" \
  --fixture local__repo-1 \
  --fixtures-root "$TMP/cases" \
  --base-repo "$BASE_REPO" \
  --diff "$CASE_DIR/model.patch" \
  --run-id swebench-frozen-case-test \
  --pair-mode gated \
  --timeout-seconds 7 \
  --prepare-only > "$TMP/runner.log"

grep -q 'Timeout: 7s per arm' "$TMP/runner.log"
grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/app.txt
grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/app.txt
test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/.devlyn/spec-verify.json
test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/.devlyn/spec-verify.json

# --- Pair runner, resume: a pre-seeded solo summary must be reused while the
# pair arm is actually invoked (against a fake `claude` placed first on PATH).
mkdir -p "$RESULTS_DIR/$RESUME_RUN_ID/solo" "$TMP/fakebin"
cat > "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json" <<'EOF'
{
  "elapsed_seconds": 1,
  "invoke_exit": 0,
  "timed_out": false,
  "verify_verdict": "PASS",
  "terminal_verdict": "PASS"
}
EOF
cat > "$TMP/fakebin/claude" <<'EOF'
#!/usr/bin/env bash
echo "fake claude invoked"
exit 1
EOF
chmod +x "$TMP/fakebin/claude"
PATH="$TMP/fakebin:$PATH" bash "$SCRIPTS_DIR/run-frozen-verify-pair.sh" \
  --fixture local__repo-1 \
  --fixtures-root "$TMP/cases" \
  --base-repo "$BASE_REPO" \
  --diff "$CASE_DIR/model.patch" \
  --run-id "$RESUME_RUN_ID" \
  --pair-mode gated \
  --timeout-seconds 3 \
  --resume-completed-arms > "$TMP/resume-arm.log" 2>&1
grep -Fq '[frozen-verify] solo: reuse completed summary' "$TMP/resume-arm.log"
grep -Fq 'fake claude invoked' "$RESULTS_DIR/$RESUME_RUN_ID/pair/transcript.txt"
grep -q '"invoke_exit": 0' "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json"

# --- Corpus runner, prepare-only: run IDs are recorded, gate outputs are not.
bash "$SCRIPTS_DIR/run-swebench-frozen-corpus.sh" \
  --manifest "$TMP/manifest.json" \
  --run-prefix swebench-frozen-corpus-test \
  --timeout-seconds 7 \
  --run-ids-out "$TMP/prepare-run-ids.txt" \
  --out-json "$TMP/gate.json" \
  --out-md "$TMP/gate.md" \
  --prepare-only > "$TMP/corpus-runner.log"
grep -q 'prepare-only complete; gate skipped' "$TMP/corpus-runner.log"
grep -q 'Timeout: 7s per arm' "$TMP/corpus-runner.log"
grep -q '^swebench-frozen-corpus-test-1-local__repo-1$' "$TMP/prepare-run-ids.txt"
grep -q '^goodbye$' /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo/app.txt
grep -q '^goodbye$' /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair/app.txt
test ! -e /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo/.devlyn/spec-verify.json
test ! -e /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair/.devlyn/spec-verify.json
test ! -e "$TMP/gate.json"
test ! -e "$TMP/gate.md"

# --- Corpus runner, failing row: point the manifest at a missing case dir.
python3 - "$TMP/manifest.json" "$TMP/manifest-bad-diff.json" <<'PY'
import json, pathlib, sys
manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
manifest["prepared"][0]["case_dir"] = str(pathlib.Path(manifest["prepared"][0]["case_dir"]).parent / "missing-case")
pathlib.Path(sys.argv[2]).write_text(json.dumps(manifest, indent=2) + "\n")
PY
# Capture the expected non-zero status without tripping set -e.
fail_status=0
bash "$SCRIPTS_DIR/run-swebench-frozen-corpus.sh" \
  --manifest "$TMP/manifest-bad-diff.json" \
  --run-prefix swebench-frozen-corpus-fail-test \
  --run-ids-out "$TMP/fail-run-ids.txt" \
  --prepare-only > "$TMP/corpus-fail.log" 2>&1 || fail_status=$?
[ "$fail_status" -ne 0 ]
grep -q 'row failed: swebench-frozen-corpus-fail-test-1-local__repo-1' "$TMP/corpus-fail.log"
grep -q '^swebench-frozen-corpus-fail-test-1-local__repo-1$' "$TMP/fail-run-ids.txt"
test -f "$RESULTS_DIR/swebench-frozen-corpus-fail-test-1-local__repo-1/compare.json"

# --- Matrix reporter over the failed row.
python3 "$SCRIPTS_DIR/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Failed Matrix" \
  --verdict FAIL \
  --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
  --out-json "$TMP/fail-matrix.json" \
  --out-md "$TMP/fail-matrix.md" > "$TMP/fail-matrix.log"
grep -q '"classification": "failed attempt: row runner exit=1"' "$TMP/fail-matrix.json"
grep -q '"trailing_non_gate_rows": 1' "$TMP/fail-matrix.json"
grep -q '"failed attempt: row runner exit=1": 1' "$TMP/fail-matrix.json"
grep -Fq 'failed attempt: row runner exit=1' "$TMP/fail-matrix.md"
grep -Fq 'Trailing non-gate rows: 1' "$TMP/fail-matrix.md"

# Same row with --max-trailing-non-gate 0: the reporter must exit with status 2.
yield_status=0
python3 "$SCRIPTS_DIR/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Failed Matrix" \
  --verdict FAIL \
  --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
  --max-trailing-non-gate 0 \
  --out-json "$TMP/fail-yield-matrix.json" \
  --out-md "$TMP/fail-yield-matrix.md" > "$TMP/fail-yield-matrix.log" || yield_status=$?
[ "$yield_status" -eq 2 ]
grep -q '"yield_verdict": "FAIL"' "$TMP/fail-yield-matrix.json"
grep -q '"trailing non-gate rows 1 > maximum 0"' "$TMP/fail-yield-matrix.json"
grep -Fq 'Yield verdict: **FAIL**' "$TMP/fail-yield-matrix.md"

# --- Matrix reporter: a provider-limit transcript is classified as such.
mkdir -p "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo" "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair"
cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo/input.md" <<'EOF'
Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
EOF
cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair/transcript.txt" <<'EOF'
You've hit your limit · resets 3am (Asia/Seoul)
EOF
cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/compare.json" <<'EOF'
{
  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS", "elapsed_seconds": 1},
  "pair": {"invoke_exit": 1, "timed_out": false, "verify_verdict": null, "elapsed_seconds": 1},
  "comparison": {
    "pair_trigger_missed": false,
    "pair_verdict_lift": false,
    "solo_verdict": "PASS",
    "pair_verdict": null
  }
}
EOF
python3 "$SCRIPTS_DIR/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Provider Limit Matrix" \
  --verdict FAIL \
  --run-id "$PROVIDER_LIMIT_RUN_ID" \
  --out-json "$TMP/provider-limit-matrix.json" \
  --out-md "$TMP/provider-limit-matrix.md" > "$TMP/provider-limit-matrix.log"
grep -q '"classification": "failed attempt: provider limit"' "$TMP/provider-limit-matrix.json"
grep -Fq 'failed attempt: provider limit' "$TMP/provider-limit-matrix.md"

# --- Corpus runner, gate-only: gate a hand-written compare.json
# (solo 100s vs pair 200s, asserted below as a 2.0 wall ratio).
mkdir -p "$RESULTS_DIR/$RUN_ID/pair"
cat > "$RESULTS_DIR/$RUN_ID/pair/input.md" <<'EOF'
Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
EOF
cat > "$RESULTS_DIR/$RUN_ID/compare.json" <<'EOF'
{
  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
  "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": 200},
  "comparison": {
    "pair_trigger_missed": false,
    "pair_verdict_lift": true,
    "solo_verdict": "PASS_WITH_ISSUES",
    "pair_verdict": "NEEDS_WORK"
  }
}
EOF
printf '%s\n' "$RUN_ID" > "$TMP/run-ids.txt"
bash "$SCRIPTS_DIR/run-swebench-frozen-corpus.sh" \
  --manifest "$TMP/manifest.json" \
  --gate-only-run-ids "$TMP/run-ids.txt" \
  --min-runs 1 \
  --max-pair-solo-wall-ratio 3 \
  --run-ids-out "$TMP/gate-run-ids.txt" \
  --out-json "$TMP/gate.json" \
  --out-md "$TMP/gate.md" > "$TMP/gate-only.log"
grep -q '"verdict": "PASS"' "$TMP/gate.json"
grep -q '"avg_pair_solo_wall_ratio": 2.0' "$TMP/gate.json"
grep -Fq 'Verdict: **PASS**' "$TMP/gate.md"
grep -Fq 'Max pair/solo wall ratio: 3.00x' "$TMP/gate.md"
cmp "$TMP/run-ids.txt" "$TMP/gate-run-ids.txt"

# --- Matrix reporter over the passing gate run.
python3 "$SCRIPTS_DIR/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Matrix" \
  --verdict PASS \
  --gate-json "$TMP/gate.json" \
  --run-id "$RUN_ID" \
  --min-gate-rate 1 \
  --max-trailing-non-gate 0 \
  --out-json "$TMP/matrix.json" \
  --out-md "$TMP/matrix.md" > "$TMP/matrix.log"
grep -q '"runs_total": 1' "$TMP/matrix.json"
grep -q '"gate_rows": 1' "$TMP/matrix.json"
grep -q '"gate_rate": 1.0' "$TMP/matrix.json"
grep -q '"trailing_non_gate_rows": 0' "$TMP/matrix.json"
grep -q '"yield_verdict": "PASS"' "$TMP/matrix.json"
grep -Fq 'Local SWE-bench Matrix' "$TMP/matrix.md"
grep -Fq 'Gate rate: 1.000' "$TMP/matrix.md"
grep -Fq 'Yield verdict: **PASS**' "$TMP/matrix.md"

# Artifact removal happens in the EXIT trap (cleanup) above.
echo "PASS test-swebench-frozen-case"
|
package/bin/devlyn.js
CHANGED
|
@@ -19,6 +19,10 @@ const CLI_TARGETS = {
|
|
|
19
19
|
instructionsFile: 'AGENTS.md',
|
|
20
20
|
baseInstructionsFile: 'AGENTS.md',
|
|
21
21
|
configDir: null, // Codex uses AGENTS.md at project root
|
|
22
|
+
// Codex auto-loads skills from ~/.codex/skills/ (user-global). Same
|
|
23
|
+
// SKILL.md format as Claude Code; descriptions must stay ≤1024 chars.
|
|
24
|
+
skillsDir: path.join(os.homedir(), '.codex', 'skills'),
|
|
25
|
+
skillsToInstall: ['devlyn:resolve', 'devlyn:ideate', '_shared'],
|
|
22
26
|
detect: () => fs.existsSync(path.join(process.cwd(), 'AGENTS.md')) || fs.existsSync(path.join(process.cwd(), '.codex')),
|
|
23
27
|
},
|
|
24
28
|
gemini: {
|
|
@@ -509,6 +513,37 @@ function detectOtherCLIs() {
|
|
|
509
513
|
return detected;
|
|
510
514
|
}
|
|
511
515
|
|
|
516
|
+
// Install /devlyn:resolve + /devlyn:ideate + _shared skills into a CLI's
// global skills directory (e.g. ~/.codex/skills/). Returns count of skills
// copied. Skipped silently for CLIs without a skillsDir (e.g. cursor, copilot
// at the time of writing — they don't have an analogous skill-loader).
//
// cliKey: key into CLI_TARGETS. Returns 0 when the key is unknown, when the
// target declares no skillsDir/skillsToInstall, or when the packaged skills
// source directory is missing.
function installSkillsForCLI(cliKey) {
  const cli = CLI_TARGETS[cliKey];
  if (!cli || !cli.skillsDir || !cli.skillsToInstall) return 0;

  const sourceSkillsDir = path.join(CONFIG_SOURCE, 'skills');
  if (!fs.existsSync(sourceSkillsDir)) return 0;

  // `recursive: true` makes mkdirSync a no-op when the directory already
  // exists, so no separate existence check is needed.
  fs.mkdirSync(cli.skillsDir, { recursive: true });

  let copied = 0;
  for (const skillName of cli.skillsToInstall) {
    const src = path.join(sourceSkillsDir, skillName);
    const dest = path.join(cli.skillsDir, skillName);
    // Skills listed for this CLI but absent from the package are skipped.
    if (!fs.existsSync(src)) continue;
    // Full replace per cleanManagedSkillDirs semantics: stale files in the
    // installed mirror would otherwise persist forever. `force: true` ignores
    // a missing dest, and removing unconditionally also clears a dangling
    // symlink, which an existsSync() guard would report as absent.
    fs.rmSync(dest, { recursive: true, force: true });
    // NOTE(review): third argument presumably bounds the copy to skillsDir —
    // confirm against copyRecursive's definition.
    copyRecursive(src, dest, cli.skillsDir);
    copied++;
    log(`    → ${cli.skillsDir.replace(os.homedir(), '~')}/${skillName}`, 'dim');
  }
  return copied;
}
|
|
546
|
+
|
|
512
547
|
function installAgentsForCLI(cliKey) {
|
|
513
548
|
const cli = CLI_TARGETS[cliKey];
|
|
514
549
|
if (!cli) return false;
|
|
@@ -561,6 +596,14 @@ function installAgentsForCLI(cliKey) {
|
|
|
561
596
|
log(` → ${cli.instructionsFile} (agent instructions appended)`, 'dim');
|
|
562
597
|
}
|
|
563
598
|
|
|
599
|
+
// If this CLI also supports a global skill-loader (currently Codex), install
|
|
600
|
+
// /devlyn:resolve + /devlyn:ideate + _shared so the same slash commands work
|
|
601
|
+
// there. Skipped for CLIs without a skillsDir entry.
|
|
602
|
+
const skillsCopied = installSkillsForCLI(cliKey);
|
|
603
|
+
if (skillsCopied > 0) {
|
|
604
|
+
log(` → ${skillsCopied} skill${skillsCopied > 1 ? 's' : ''} installed (devlyn:resolve / devlyn:ideate / _shared)`, 'dim');
|
|
605
|
+
}
|
|
606
|
+
|
|
564
607
|
return true;
|
|
565
608
|
}
|
|
566
609
|
|
|
@@ -695,7 +738,7 @@ async function init(skipPrompts = false) {
|
|
|
695
738
|
// Skip prompts if -y flag or non-interactive
|
|
696
739
|
if (skipPrompts || !process.stdin.isTTY) {
|
|
697
740
|
log('\n💡 Add optional addons later: run `npx devlyn-cli` without -y', 'dim');
|
|
698
|
-
log(' Add Codex instructions later: run `npx devlyn-cli agents codex`', 'dim');
|
|
741
|
+
log(' Add Codex instructions + skills later: run `npx devlyn-cli agents codex`', 'dim');
|
|
699
742
|
log(`\n${COLORS.dim} Enjoying devlyn? Star it on GitHub — it helps others find it:${COLORS.reset}`);
|
|
700
743
|
log(` ${COLORS.purple}→ https://github.com/fysoul17/devlyn-cli${COLORS.reset}\n`);
|
|
701
744
|
return;
|
|
@@ -703,14 +746,17 @@ async function init(skipPrompts = false) {
|
|
|
703
746
|
|
|
704
747
|
// Ask which non-Claude CLIs should receive instruction files.
|
|
705
748
|
log('\n🤖 Optional AI CLI instructions:\n', 'blue');
|
|
706
|
-
const cliOptions = Object.entries(CLI_TARGETS).map(([key, cli]) =>
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
749
|
+
const cliOptions = Object.entries(CLI_TARGETS).map(([key, cli]) => {
|
|
750
|
+
let desc;
|
|
751
|
+
if (cli.configDir) {
|
|
752
|
+
desc = `Install agents into ${cli.configDir}/`;
|
|
753
|
+
} else if (cli.skillsDir) {
|
|
754
|
+
desc = `Install ${cli.instructionsFile} + /devlyn:resolve + /devlyn:ideate skills (~/.codex/skills/)`;
|
|
755
|
+
} else {
|
|
756
|
+
desc = `Install ${cli.instructionsFile}`;
|
|
757
|
+
}
|
|
758
|
+
return { key, name: cli.name, desc, type: 'cli' };
|
|
759
|
+
});
|
|
714
760
|
const selectedClis = await multiSelect(cliOptions);
|
|
715
761
|
if (selectedClis.length > 0) {
|
|
716
762
|
let agentsInstalled = 0;
|
|
@@ -720,7 +766,7 @@ async function init(skipPrompts = false) {
|
|
|
720
766
|
log(` ✅ Agent instructions installed for ${agentsInstalled} CLI${agentsInstalled !== 1 ? 's' : ''}`, 'green');
|
|
721
767
|
} else {
|
|
722
768
|
log('💡 No additional CLI instructions selected', 'dim');
|
|
723
|
-
log(' Run `npx devlyn-cli agents codex` later to install Codex AGENTS.md', 'dim');
|
|
769
|
+
log(' Run `npx devlyn-cli agents codex` later to install Codex AGENTS.md + /devlyn skills', 'dim');
|
|
724
770
|
}
|
|
725
771
|
|
|
726
772
|
// Ask about optional addons (local skills + external packs)
|
|
@@ -26,6 +26,7 @@ PER_RUN_PATTERNS = (
|
|
|
26
26
|
"*.log.md",
|
|
27
27
|
"fix-batch.round-*.json",
|
|
28
28
|
"criteria.generated.md",
|
|
29
|
+
"risk-probes.jsonl",
|
|
29
30
|
# iter-0019.8: spec-verify carrier artifacts get archived alongside
|
|
30
31
|
# other per-run state. Killed mid-run cleanup is enforced separately
|
|
31
32
|
# by spec-verify-check.py main() — when source markdown has no json
|
|
@@ -35,6 +36,7 @@ PER_RUN_PATTERNS = (
|
|
|
35
36
|
"spec-verify.json",
|
|
36
37
|
"spec-verify.results.json",
|
|
37
38
|
"spec-verify-findings.jsonl",
|
|
39
|
+
"verify-merge.summary.json",
|
|
38
40
|
# iter-0033a/2026-04-30 archive-fix iter: NEW /devlyn:resolve emits
|
|
39
41
|
# plan.md (PLAN output) + final-report.md (PHASE 6 render) +
|
|
40
42
|
# cumulative.patch (cumulative diff). Smoke 2's archive listing
|
|
@@ -52,6 +54,7 @@ PER_RUN_PATTERNS = (
|
|
|
52
54
|
# ("pair_judge findings archive distinguishable") would false-fail on
|
|
53
55
|
# every paired fixture without this glob.
|
|
54
56
|
"verify-judge-*.md",
|
|
57
|
+
"codex-judge.*",
|
|
55
58
|
)
|
|
56
59
|
|
|
57
60
|
|
|
@@ -6,7 +6,7 @@ Single source of truth for how every skill calls Codex. **MCP is not used.** Ski
|
|
|
6
6
|
|
|
7
7
|
All long-running Codex calls go through `codex-monitored.sh` — a thin wrapper that closes stdin (codex 0.124.0 hangs when both stdin is open and a prompt arg is given), streams Codex stdout fully (no `tail -n` truncation), and prints a `[codex-monitored] heartbeat` line every 30s so the outer `claude -p` byte-watchdog stays fed during long reasoning gaps. The wrapper passes its arguments through verbatim to the underlying CLI, so the canonical flag set is unchanged from a raw call — only the launcher differs.
|
|
8
8
|
|
|
9
|
-
**Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex.
|
|
9
|
+
**Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex. Read-only critique returns findings on stdout; the orchestrator writes any files.
|
|
10
10
|
|
|
11
11
|
```bash
|
|
12
12
|
bash .claude/skills/_shared/codex-monitored.sh \
|
|
@@ -51,4 +51,4 @@ The local Codex CLI (fronted by `codex-monitored.sh`) is the primary (and only)
|
|
|
51
51
|
|
|
52
52
|
Skills write the invocation as a Bash command the runtime executes. Example shape from `/devlyn:resolve` PHASE 2 IMPLEMENT when routed to Codex:
|
|
53
53
|
|
|
54
|
-
> Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout.
|
|
54
|
+
> Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout. Do not pipe the wrapper; direct capture or file redirection preserves streaming and avoids the pipe-refusal exit.
|
|
@@ -41,7 +41,10 @@
|
|
|
41
41
|
#
|
|
42
42
|
# ENV OVERRIDES:
|
|
43
43
|
# CODEX_MONITORED_HEARTBEAT — heartbeat interval seconds (default 30).
|
|
44
|
-
#
|
|
44
|
+
# CODEX_MONITORED_TIMEOUT_SEC — optional hard timeout. When >0, kill the
|
|
45
|
+
# codex process group and exit 124.
|
|
46
|
+
# CODEX_BIN — real codex binary path. Default:
|
|
47
|
+
# CODEX_REAL_BIN when set, else `codex`.
|
|
45
48
|
# Set this when the shim has put us first
|
|
46
49
|
# on PATH.
|
|
47
50
|
# CODEX_MONITORED_ALLOW_PIPED — set non-empty to skip the pipe-stdout
|
|
@@ -63,8 +66,10 @@ if [ -n "${CODEX_BLOCKED:-}" ]; then
|
|
|
63
66
|
fi
|
|
64
67
|
|
|
65
68
|
HEARTBEAT_SEC="${CODEX_MONITORED_HEARTBEAT:-30}"
|
|
66
|
-
|
|
69
|
+
TIMEOUT_SEC="${CODEX_MONITORED_TIMEOUT_SEC:-0}"
|
|
70
|
+
CODEX_BIN="${CODEX_BIN:-${CODEX_REAL_BIN:-codex}}"
|
|
67
71
|
START=$(date +%s)
|
|
72
|
+
TIMEOUT_FLAG=""
|
|
68
73
|
|
|
69
74
|
# --- Pipe-stdout refusal (iter-0009 R2 finding #1) -------------------------
|
|
70
75
|
# `[ -p /dev/stdout ]` is the POSIX test for "is fd 1 a FIFO/pipe". Verified
|
|
@@ -106,35 +111,95 @@ heartbeat_loop() {
|
|
|
106
111
|
done
|
|
107
112
|
}
|
|
108
113
|
|
|
114
|
+
timeout_loop() {
|
|
115
|
+
local pid="$1"
|
|
116
|
+
local seconds="$2"
|
|
117
|
+
local flag="$3"
|
|
118
|
+
[ "$seconds" -gt 0 ] || return 0
|
|
119
|
+
sleep "$seconds"
|
|
120
|
+
if kill -0 "$pid" 2>/dev/null; then
|
|
121
|
+
: > "$flag"
|
|
122
|
+
printf '[codex-monitored] timeout: elapsed=%ds limit=%ds\n' \
|
|
123
|
+
"$(( $(date +%s) - START ))" "$seconds" >&2
|
|
124
|
+
kill -TERM -- "-$pid" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
|
|
125
|
+
sleep 5
|
|
126
|
+
kill -KILL -- "-$pid" 2>/dev/null || kill -KILL "$pid" 2>/dev/null || true
|
|
127
|
+
fi
|
|
128
|
+
}
|
|
129
|
+
|
|
130
|
+
terminate_process_group() {
|
|
131
|
+
local pgid="$1"
|
|
132
|
+
local reason="$2"
|
|
133
|
+
if ! kill -0 -- "-$pgid" 2>/dev/null; then
|
|
134
|
+
return 0
|
|
135
|
+
fi
|
|
136
|
+
printf '[codex-monitored] reap: reason=%s pgid=%s\n' "$reason" "$pgid" >&2
|
|
137
|
+
kill -TERM -- "-$pgid" 2>/dev/null || true
|
|
138
|
+
local i
|
|
139
|
+
for i in 1 2 3 4 5; do
|
|
140
|
+
sleep 1
|
|
141
|
+
if ! kill -0 -- "-$pgid" 2>/dev/null; then
|
|
142
|
+
return 0
|
|
143
|
+
fi
|
|
144
|
+
done
|
|
145
|
+
kill -KILL -- "-$pgid" 2>/dev/null || true
|
|
146
|
+
}
|
|
147
|
+
|
|
109
148
|
forward_signal() {
|
|
110
149
|
local sig="$1"
|
|
111
150
|
if [ -n "${CODEX_PID:-}" ] && kill -0 "$CODEX_PID" 2>/dev/null; then
|
|
112
|
-
kill -"$sig" "$CODEX_PID" 2>/dev/null || true
|
|
151
|
+
kill -"$sig" -- "-$CODEX_PID" 2>/dev/null || kill -"$sig" "$CODEX_PID" 2>/dev/null || true
|
|
113
152
|
fi
|
|
114
153
|
if [ -n "${HB_PID:-}" ] && kill -0 "$HB_PID" 2>/dev/null; then
|
|
115
154
|
kill -TERM "$HB_PID" 2>/dev/null || true
|
|
116
155
|
fi
|
|
156
|
+
if [ -n "${WATCHDOG_PID:-}" ] && kill -0 "$WATCHDOG_PID" 2>/dev/null; then
|
|
157
|
+
kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
|
|
158
|
+
fi
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
cleanup() {
|
|
162
|
+
forward_signal TERM
|
|
163
|
+
[ -z "$TIMEOUT_FLAG" ] || rm -f "$TIMEOUT_FLAG"
|
|
117
164
|
}
|
|
118
165
|
|
|
119
|
-
trap 'forward_signal TERM' TERM
|
|
120
|
-
trap 'forward_signal INT' INT
|
|
166
|
+
trap 'forward_signal TERM; exit 143' TERM
|
|
167
|
+
trap 'forward_signal INT; exit 130' INT
|
|
168
|
+
trap cleanup EXIT
|
|
121
169
|
|
|
122
|
-
printf '[codex-monitored] start: ts=%s heartbeat=%ds bin=%s\n' \
|
|
123
|
-
"$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$CODEX_BIN" >&2
|
|
170
|
+
printf '[codex-monitored] start: ts=%s heartbeat=%ds timeout=%ss bin=%s\n' \
|
|
171
|
+
"$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$TIMEOUT_SEC" "$CODEX_BIN" >&2
|
|
124
172
|
|
|
125
173
|
# Launch codex with stdin closed; output streams directly to OUR stdout/stderr.
|
|
174
|
+
set -m
|
|
126
175
|
"$CODEX_BIN" exec "$@" < /dev/null &
|
|
127
176
|
CODEX_PID=$!
|
|
177
|
+
set +m
|
|
128
178
|
printf '[codex-monitored] codex pid=%d\n' "$CODEX_PID" >&2
|
|
129
179
|
|
|
130
180
|
heartbeat_loop "$CODEX_PID" &
|
|
131
181
|
HB_PID=$!
|
|
132
182
|
|
|
183
|
+
if [ "$TIMEOUT_SEC" -gt 0 ]; then
|
|
184
|
+
TIMEOUT_FLAG=$(mktemp "${TMPDIR:-/tmp}/codex-monitored-timeout.XXXXXX")
|
|
185
|
+
rm -f "$TIMEOUT_FLAG"
|
|
186
|
+
timeout_loop "$CODEX_PID" "$TIMEOUT_SEC" "$TIMEOUT_FLAG" &
|
|
187
|
+
WATCHDOG_PID=$!
|
|
188
|
+
fi
|
|
189
|
+
|
|
133
190
|
wait "$CODEX_PID"
|
|
134
191
|
EXIT=$?
|
|
192
|
+
terminate_process_group "$CODEX_PID" "post-exit-descendants"
|
|
135
193
|
|
|
136
194
|
kill -TERM "$HB_PID" 2>/dev/null || true
|
|
137
195
|
wait "$HB_PID" 2>/dev/null || true
|
|
196
|
+
if [ -n "${WATCHDOG_PID:-}" ]; then
|
|
197
|
+
kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
|
|
198
|
+
wait "$WATCHDOG_PID" 2>/dev/null || true
|
|
199
|
+
fi
|
|
200
|
+
if [ -n "$TIMEOUT_FLAG" ] && [ -f "$TIMEOUT_FLAG" ]; then
|
|
201
|
+
EXIT=124
|
|
202
|
+
fi
|
|
138
203
|
|
|
139
204
|
printf '[codex-monitored] codex exited: code=%d elapsed=%ds\n' \
|
|
140
205
|
"$EXIT" $(( $(date +%s) - START )) >&2
|