devlyn-cli 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/benchmark/auto-resolve/README.md +321 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +363 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:resolve/SKILL.md +69 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +69 -20
|
@@ -0,0 +1,302 @@
|
|
|
1
|
+
#!/usr/bin/env bash
# Regression test for the SWE-bench frozen VERIFY case importer.
#
# Builds a throwaway git repo plus a one-line model patch, then drives the
# prepare / collect / corpus / runner / matrix scripts against it and asserts
# on the files and log lines they produce.
#
# Besides $TMP, the scripts under test write into the repository's results
# directory and into fixed /tmp/bench-* worktrees. All of those are removed
# from the EXIT trap (not only on the success path), and once up front, so a
# failed run neither leaks artifacts nor feeds stale files to the next run.
set -euo pipefail

ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
SCRIPTS="$ROOT/benchmark/auto-resolve/scripts"
RESULTS_DIR="$ROOT/benchmark/auto-resolve/results"

# Run ids whose result directories are seeded by hand further down.
RESUME_RUN_ID="swebench-resume-arm-test-local__repo-1"
PROVIDER_LIMIT_RUN_ID="swebench-provider-limit-test-local__repo-1"
RUN_ID="swebench-gate-only-test-local__repo-1"

# Worktree path prefixes asserted on after the --prepare-only runs.
CASE_WORKTREE="/tmp/bench-swebench-frozen-case-test-local__repo-1"
CORPUS_WORKTREE="/tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1"

TMP="$(mktemp -d)"

# Remove everything this test creates outside of $TMP.
cleanup_artifacts() {
  rm -rf \
    "$CASE_WORKTREE-solo" \
    "$CASE_WORKTREE-pair" \
    "$CORPUS_WORKTREE-solo" \
    "$CORPUS_WORKTREE-pair" \
    "$RESULTS_DIR/swebench-frozen-case-test" \
    "$RESULTS_DIR/swebench-frozen-corpus-test-1-local__repo-1" \
    "$RESULTS_DIR/swebench-frozen-corpus-fail-test-1-local__repo-1" \
    "$RESULTS_DIR/$RESUME_RUN_ID" \
    "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID" \
    "$RESULTS_DIR/$RUN_ID"
}

# EXIT handler: runs on success and on any `set -e` failure alike.
cleanup() {
  cleanup_artifacts
  rm -rf "$TMP"
}
trap cleanup EXIT

# Start from a clean slate in case an earlier run was killed before its trap ran.
cleanup_artifacts

# --- Local fixture repo and model patch (hello -> goodbye) -------------------
REPO="$TMP/repo"
mkdir -p "$REPO"
git -C "$REPO" init -q
git -C "$REPO" config user.email bench@example.com
git -C "$REPO" config user.name bench
printf 'hello\n' > "$REPO/app.txt"
git -C "$REPO" add app.txt
git -C "$REPO" commit -q -m base
BASE_SHA="$(git -C "$REPO" rev-parse HEAD)"

printf 'goodbye\n' > "$REPO/app.txt"
git -C "$REPO" diff > "$TMP/model.patch"
git -C "$REPO" checkout -q -- app.txt

# Unquoted heredoc delimiter on purpose: $BASE_SHA must expand.
cat > "$TMP/instance.json" <<JSON
{
  "instance_id": "local__repo-1",
  "repo": "local/repo",
  "base_commit": "$BASE_SHA",
  "problem_statement": "Change app.txt so it says goodbye instead of hello.",
  "version": "test",
  "issue_url": "https://example.test/issue",
  "pr_url": "https://example.test/pr"
}
JSON

# --- Single-case importer ----------------------------------------------------
python3 "$SCRIPTS/prepare-swebench-frozen-case.py" \
  --instance-json "$TMP/instance.json" \
  --model-patch "$TMP/model.patch" \
  --cases-root "$TMP/cases" \
  --repos-root "$TMP/repos" \
  --repo-dir "$REPO" \
  --timeout-seconds 60 > "$TMP/prepare.json"

CASE_DIR="$TMP/cases/local__repo-1"
BASE_REPO="$TMP/repos/local__repo-${BASE_SHA:0:12}"
test -f "$CASE_DIR/spec.md"
test -f "$CASE_DIR/model.patch"
test -x "$CASE_DIR/setup.sh"
grep -q 'SWE-bench local__repo-1' "$CASE_DIR/spec.md"
grep -q -- '--pair-mode gated' "$CASE_DIR/run-command.txt"

# --- Instance fetcher --------------------------------------------------------
# NOTE(review): presumably this downloads from a remote dataset and therefore
# needs network access — confirm before running in a sandboxed CI job.
python3 "$SCRIPTS/fetch-swebench-instances.py" \
  --dataset lite \
  --limit 1 \
  --out "$TMP/fetched-lite.jsonl" > "$TMP/fetch.json"
grep -q '"rows_written": 1' "$TMP/fetch.json"
python3 - "$TMP/fetched-lite.jsonl" <<'PY'
import json, pathlib, sys
row = json.loads(pathlib.Path(sys.argv[1]).read_text().splitlines()[0])
for key in ("instance_id", "repo", "base_commit", "problem_statement"):
    assert row.get(key), key
PY

# --- instances.jsonl / predictions.jsonl for the batch tools -----------------
python3 - "$TMP/instance.json" "$TMP/instances.jsonl" "$TMP/model.patch" "$TMP/predictions.jsonl" <<'PY'
import json, pathlib, sys
instance = json.loads(pathlib.Path(sys.argv[1]).read_text())
pathlib.Path(sys.argv[2]).write_text(json.dumps(instance) + "\n")
patch = pathlib.Path(sys.argv[3]).read_text()
pathlib.Path(sys.argv[4]).write_text(json.dumps({
    "instance_id": "local__repo-1",
    "model_name_or_path": "local-test",
    "model_patch": patch,
}) + "\n")
PY

# --- Prediction collector ----------------------------------------------------
mkdir -p "$TMP/patch-root/local__repo-1"
cp "$TMP/model.patch" "$TMP/patch-root/local__repo-1/patch.diff"
python3 "$SCRIPTS/collect-swebench-predictions.py" \
  --patch-root "$TMP/patch-root" \
  --instances-jsonl "$TMP/instances.jsonl" \
  --model-name local-patch-root \
  --out "$TMP/collected-predictions.jsonl" > "$TMP/collect.json"
grep -q '"predictions_written": 1' "$TMP/collect.json"
python3 - "$TMP/collected-predictions.jsonl" <<'PY'
import json, pathlib, sys
row = json.loads(pathlib.Path(sys.argv[1]).read_text())
assert row["instance_id"] == "local__repo-1"
assert row["model_name_or_path"] == "local-patch-root"
assert row["model_patch"].endswith("\n")
PY

# --- Corpus importer ---------------------------------------------------------
rm -rf "$TMP/cases-batch" "$TMP/repos-batch"
python3 "$SCRIPTS/prepare-swebench-frozen-corpus.py" \
  --instances-jsonl "$TMP/instances.jsonl" \
  --predictions-jsonl "$TMP/predictions.jsonl" \
  --cases-root "$TMP/cases-batch" \
  --repos-root "$TMP/repos-batch" \
  --repo-dir "$REPO" \
  --out-manifest "$TMP/manifest.json" > "$TMP/batch.json"
grep -q '"prepared_count": 1' "$TMP/manifest.json"
test -f "$TMP/cases-batch/local__repo-1/model.patch"

# --- Pair runner, --prepare-only ---------------------------------------------
bash "$SCRIPTS/run-frozen-verify-pair.sh" \
  --fixture local__repo-1 \
  --fixtures-root "$TMP/cases" \
  --base-repo "$BASE_REPO" \
  --diff "$CASE_DIR/model.patch" \
  --run-id swebench-frozen-case-test \
  --pair-mode gated \
  --timeout-seconds 7 \
  --prepare-only > "$TMP/runner.log"

grep -q 'Timeout: 7s per arm' "$TMP/runner.log"
grep -q '^goodbye$' "$CASE_WORKTREE-solo/app.txt"
grep -q '^goodbye$' "$CASE_WORKTREE-pair/app.txt"
test ! -e "$CASE_WORKTREE-solo/.devlyn/spec-verify.json"
test ! -e "$CASE_WORKTREE-pair/.devlyn/spec-verify.json"

# --- Pair runner, --resume-completed-arms ------------------------------------
# Seed a finished solo summary, then put a stub `claude` first on PATH. The
# assertions below expect the solo arm to be reused and the stub's output to
# land in the pair transcript.
mkdir -p "$RESULTS_DIR/$RESUME_RUN_ID/solo" "$TMP/fakebin"
cat > "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json" <<'EOF'
{
  "elapsed_seconds": 1,
  "invoke_exit": 0,
  "timed_out": false,
  "verify_verdict": "PASS",
  "terminal_verdict": "PASS"
}
EOF
cat > "$TMP/fakebin/claude" <<'EOF'
#!/usr/bin/env bash
echo "fake claude invoked"
exit 1
EOF
chmod +x "$TMP/fakebin/claude"
PATH="$TMP/fakebin:$PATH" bash "$SCRIPTS/run-frozen-verify-pair.sh" \
  --fixture local__repo-1 \
  --fixtures-root "$TMP/cases" \
  --base-repo "$BASE_REPO" \
  --diff "$CASE_DIR/model.patch" \
  --run-id "$RESUME_RUN_ID" \
  --pair-mode gated \
  --timeout-seconds 3 \
  --resume-completed-arms > "$TMP/resume-arm.log" 2>&1
grep -Fq '[frozen-verify] solo: reuse completed summary' "$TMP/resume-arm.log"
grep -Fq 'fake claude invoked' "$RESULTS_DIR/$RESUME_RUN_ID/pair/transcript.txt"
grep -q '"invoke_exit": 0' "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json"

# --- Corpus runner, --prepare-only (gate must be skipped) --------------------
bash "$SCRIPTS/run-swebench-frozen-corpus.sh" \
  --manifest "$TMP/manifest.json" \
  --run-prefix swebench-frozen-corpus-test \
  --timeout-seconds 7 \
  --run-ids-out "$TMP/prepare-run-ids.txt" \
  --out-json "$TMP/gate.json" \
  --out-md "$TMP/gate.md" \
  --prepare-only > "$TMP/corpus-runner.log"
grep -q 'prepare-only complete; gate skipped' "$TMP/corpus-runner.log"
grep -q 'Timeout: 7s per arm' "$TMP/corpus-runner.log"
grep -q '^swebench-frozen-corpus-test-1-local__repo-1$' "$TMP/prepare-run-ids.txt"
grep -q '^goodbye$' "$CORPUS_WORKTREE-solo/app.txt"
grep -q '^goodbye$' "$CORPUS_WORKTREE-pair/app.txt"
test ! -e "$CORPUS_WORKTREE-solo/.devlyn/spec-verify.json"
test ! -e "$CORPUS_WORKTREE-pair/.devlyn/spec-verify.json"
test ! -e "$TMP/gate.json"
test ! -e "$TMP/gate.md"

# --- Corpus runner, failing row ----------------------------------------------
# Point the manifest at a case directory that does not exist.
python3 - "$TMP/manifest.json" "$TMP/manifest-bad-diff.json" <<'PY'
import json, pathlib, sys
manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
manifest["prepared"][0]["case_dir"] = str(pathlib.Path(manifest["prepared"][0]["case_dir"]).parent / "missing-case")
pathlib.Path(sys.argv[2]).write_text(json.dumps(manifest, indent=2) + "\n")
PY
# The runner is expected to fail here; suspend errexit to capture its status.
set +e
bash "$SCRIPTS/run-swebench-frozen-corpus.sh" \
  --manifest "$TMP/manifest-bad-diff.json" \
  --run-prefix swebench-frozen-corpus-fail-test \
  --run-ids-out "$TMP/fail-run-ids.txt" \
  --prepare-only > "$TMP/corpus-fail.log" 2>&1
fail_status=$?
set -e
[ "$fail_status" -ne 0 ]
grep -q 'row failed: swebench-frozen-corpus-fail-test-1-local__repo-1' "$TMP/corpus-fail.log"
grep -q '^swebench-frozen-corpus-fail-test-1-local__repo-1$' "$TMP/fail-run-ids.txt"
test -f "$RESULTS_DIR/swebench-frozen-corpus-fail-test-1-local__repo-1/compare.json"

# --- Matrix: failed row is classified and counted ----------------------------
python3 "$SCRIPTS/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Failed Matrix" \
  --verdict FAIL \
  --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
  --out-json "$TMP/fail-matrix.json" \
  --out-md "$TMP/fail-matrix.md" > "$TMP/fail-matrix.log"
grep -q '"classification": "failed attempt: row runner exit=1"' "$TMP/fail-matrix.json"
grep -q '"trailing_non_gate_rows": 1' "$TMP/fail-matrix.json"
grep -q '"failed attempt: row runner exit=1": 1' "$TMP/fail-matrix.json"
grep -Fq 'failed attempt: row runner exit=1' "$TMP/fail-matrix.md"
grep -Fq 'Trailing non-gate rows: 1' "$TMP/fail-matrix.md"

# --- Matrix: yield threshold violated -> exit status 2 -----------------------
set +e
python3 "$SCRIPTS/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Failed Matrix" \
  --verdict FAIL \
  --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
  --max-trailing-non-gate 0 \
  --out-json "$TMP/fail-yield-matrix.json" \
  --out-md "$TMP/fail-yield-matrix.md" > "$TMP/fail-yield-matrix.log"
yield_status=$?
set -e
[ "$yield_status" -eq 2 ]
grep -q '"yield_verdict": "FAIL"' "$TMP/fail-yield-matrix.json"
grep -q '"trailing non-gate rows 1 > maximum 0"' "$TMP/fail-yield-matrix.json"
grep -Fq 'Yield verdict: **FAIL**' "$TMP/fail-yield-matrix.md"

# --- Matrix: provider-limit transcript is classified -------------------------
mkdir -p "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo" "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair"
cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo/input.md" <<'EOF'
Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
EOF
cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair/transcript.txt" <<'EOF'
You've hit your limit · resets 3am (Asia/Seoul)
EOF
cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/compare.json" <<'EOF'
{
  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS", "elapsed_seconds": 1},
  "pair": {"invoke_exit": 1, "timed_out": false, "verify_verdict": null, "elapsed_seconds": 1},
  "comparison": {
    "pair_trigger_missed": false,
    "pair_verdict_lift": false,
    "solo_verdict": "PASS",
    "pair_verdict": null
  }
}
EOF
python3 "$SCRIPTS/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Provider Limit Matrix" \
  --verdict FAIL \
  --run-id "$PROVIDER_LIMIT_RUN_ID" \
  --out-json "$TMP/provider-limit-matrix.json" \
  --out-md "$TMP/provider-limit-matrix.md" > "$TMP/provider-limit-matrix.log"
grep -q '"classification": "failed attempt: provider limit"' "$TMP/provider-limit-matrix.json"
grep -Fq 'failed attempt: provider limit' "$TMP/provider-limit-matrix.md"

# --- Corpus runner, --gate-only-run-ids --------------------------------------
# Hand-written compare.json: pair took 200s vs solo 100s, hence the 2.0 ratio
# asserted below.
mkdir -p "$RESULTS_DIR/$RUN_ID/pair"
cat > "$RESULTS_DIR/$RUN_ID/pair/input.md" <<'EOF'
Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
EOF
cat > "$RESULTS_DIR/$RUN_ID/compare.json" <<'EOF'
{
  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
  "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": 200},
  "comparison": {
    "pair_trigger_missed": false,
    "pair_verdict_lift": true,
    "solo_verdict": "PASS_WITH_ISSUES",
    "pair_verdict": "NEEDS_WORK"
  }
}
EOF
printf '%s\n' "$RUN_ID" > "$TMP/run-ids.txt"
bash "$SCRIPTS/run-swebench-frozen-corpus.sh" \
  --manifest "$TMP/manifest.json" \
  --gate-only-run-ids "$TMP/run-ids.txt" \
  --min-runs 1 \
  --max-pair-solo-wall-ratio 3 \
  --run-ids-out "$TMP/gate-run-ids.txt" \
  --out-json "$TMP/gate.json" \
  --out-md "$TMP/gate.md" > "$TMP/gate-only.log"
grep -q '"verdict": "PASS"' "$TMP/gate.json"
grep -q '"avg_pair_solo_wall_ratio": 2.0' "$TMP/gate.json"
grep -Fq 'Verdict: **PASS**' "$TMP/gate.md"
grep -Fq 'Max pair/solo wall ratio: 3.00x' "$TMP/gate.md"
cmp "$TMP/run-ids.txt" "$TMP/gate-run-ids.txt"

# --- Matrix: passing gate ----------------------------------------------------
python3 "$SCRIPTS/swebench-frozen-matrix.py" \
  --title "Local SWE-bench Matrix" \
  --verdict PASS \
  --gate-json "$TMP/gate.json" \
  --run-id "$RUN_ID" \
  --min-gate-rate 1 \
  --max-trailing-non-gate 0 \
  --out-json "$TMP/matrix.json" \
  --out-md "$TMP/matrix.md" > "$TMP/matrix.log"
grep -q '"runs_total": 1' "$TMP/matrix.json"
grep -q '"gate_rows": 1' "$TMP/matrix.json"
grep -q '"gate_rate": 1.0' "$TMP/matrix.json"
grep -q '"trailing_non_gate_rows": 0' "$TMP/matrix.json"
grep -q '"yield_verdict": "PASS"' "$TMP/matrix.json"
grep -Fq 'Local SWE-bench Matrix' "$TMP/matrix.md"
grep -Fq 'Gate rate: 1.000' "$TMP/matrix.md"
grep -Fq 'Yield verdict: **PASS**' "$TMP/matrix.md"

# Artifact removal happens in the EXIT trap installed at the top.
echo "PASS test-swebench-frozen-case"
|
|
@@ -26,6 +26,7 @@ PER_RUN_PATTERNS = (
|
|
|
26
26
|
"*.log.md",
|
|
27
27
|
"fix-batch.round-*.json",
|
|
28
28
|
"criteria.generated.md",
|
|
29
|
+
"risk-probes.jsonl",
|
|
29
30
|
# iter-0019.8: spec-verify carrier artifacts get archived alongside
|
|
30
31
|
# other per-run state. Killed mid-run cleanup is enforced separately
|
|
31
32
|
# by spec-verify-check.py main() — when source markdown has no json
|
|
@@ -35,6 +36,7 @@ PER_RUN_PATTERNS = (
|
|
|
35
36
|
"spec-verify.json",
|
|
36
37
|
"spec-verify.results.json",
|
|
37
38
|
"spec-verify-findings.jsonl",
|
|
39
|
+
"verify-merge.summary.json",
|
|
38
40
|
# iter-0033a/2026-04-30 archive-fix iter: NEW /devlyn:resolve emits
|
|
39
41
|
# plan.md (PLAN output) + final-report.md (PHASE 6 render) +
|
|
40
42
|
# cumulative.patch (cumulative diff). Smoke 2's archive listing
|
|
@@ -52,6 +54,7 @@ PER_RUN_PATTERNS = (
|
|
|
52
54
|
# ("pair_judge findings archive distinguishable") would false-fail on
|
|
53
55
|
# every paired fixture without this glob.
|
|
54
56
|
"verify-judge-*.md",
|
|
57
|
+
"codex-judge.*",
|
|
55
58
|
)
|
|
56
59
|
|
|
57
60
|
|
|
@@ -6,7 +6,7 @@ Single source of truth for how every skill calls Codex. **MCP is not used.** Ski
|
|
|
6
6
|
|
|
7
7
|
All long-running Codex calls go through `codex-monitored.sh` — a thin wrapper that closes stdin (codex 0.124.0 hangs when both stdin is open and a prompt arg is given), streams Codex stdout fully (no `tail -n` truncation), and prints a `[codex-monitored] heartbeat` line every 30s so the outer `claude -p` byte-watchdog stays fed during long reasoning gaps. The wrapper passes its arguments through verbatim to the underlying CLI, so the canonical flag set is unchanged from a raw call — only the launcher differs.
|
|
8
8
|
|
|
9
|
-
**Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex.
|
|
9
|
+
**Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex. Read-only critique returns findings on stdout; the orchestrator writes any files.
|
|
10
10
|
|
|
11
11
|
```bash
|
|
12
12
|
bash .claude/skills/_shared/codex-monitored.sh \
|
|
@@ -51,4 +51,4 @@ The local Codex CLI (fronted by `codex-monitored.sh`) is the primary (and only)
|
|
|
51
51
|
|
|
52
52
|
Skills write the invocation as a Bash command the runtime executes. Example shape from `/devlyn:resolve` PHASE 2 IMPLEMENT when routed to Codex:
|
|
53
53
|
|
|
54
|
-
> Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout.
|
|
54
|
+
> Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout. Do not pipe the wrapper; direct capture or file redirection preserves streaming and avoids the pipe-refusal exit.
|
|
@@ -41,7 +41,10 @@
|
|
|
41
41
|
#
|
|
42
42
|
# ENV OVERRIDES:
|
|
43
43
|
# CODEX_MONITORED_HEARTBEAT — heartbeat interval seconds (default 30).
|
|
44
|
-
#
|
|
44
|
+
# CODEX_MONITORED_TIMEOUT_SEC — optional hard timeout. When >0, kill the
|
|
45
|
+
# codex process group and exit 124.
|
|
46
|
+
# CODEX_BIN — real codex binary path. Default:
|
|
47
|
+
# CODEX_REAL_BIN when set, else `codex`.
|
|
45
48
|
# Set this when the shim has put us first
|
|
46
49
|
# on PATH.
|
|
47
50
|
# CODEX_MONITORED_ALLOW_PIPED — set non-empty to skip the pipe-stdout
|
|
@@ -63,8 +66,10 @@ if [ -n "${CODEX_BLOCKED:-}" ]; then
|
|
|
63
66
|
fi
|
|
64
67
|
|
|
65
68
|
HEARTBEAT_SEC="${CODEX_MONITORED_HEARTBEAT:-30}"
|
|
66
|
-
|
|
69
|
+
TIMEOUT_SEC="${CODEX_MONITORED_TIMEOUT_SEC:-0}"
|
|
70
|
+
CODEX_BIN="${CODEX_BIN:-${CODEX_REAL_BIN:-codex}}"
|
|
67
71
|
START=$(date +%s)
|
|
72
|
+
TIMEOUT_FLAG=""
|
|
68
73
|
|
|
69
74
|
# --- Pipe-stdout refusal (iter-0009 R2 finding #1) -------------------------
|
|
70
75
|
# `[ -p /dev/stdout ]` is the POSIX test for "is fd 1 a FIFO/pipe". Verified
|
|
@@ -106,35 +111,95 @@ heartbeat_loop() {
|
|
|
106
111
|
done
|
|
107
112
|
}
|
|
108
113
|
|
|
114
|
+
# timeout_loop PID SECONDS FLAG
#
# Hard-timeout watchdog, meant to be launched in the background.
#   PID     - process to watch; also tried as a process-group id ("-PID").
#   SECONDS - time limit; a value that is not greater than 0 disables the watchdog.
#   FLAG    - path created (empty) when the limit fires, so the caller can
#             tell a timeout apart from a normal exit.
#
# When the limit passes and PID is still alive: create FLAG, log a
# `[codex-monitored] timeout` line on stderr, send TERM to the group (falling
# back to the single pid), wait 5 seconds, then send KILL the same way.
# Reads the global START (epoch seconds) to report elapsed time.
timeout_loop() {
  local pid="$1"
  local seconds="$2"
  local flag="$3"

  [ "$seconds" -gt 0 ] || return 0

  # The sleeps run with stdout/stderr detached. The caller stops this watchdog
  # with SIGTERM; NOTE(review): that is expected to end the background subshell
  # without ending an in-flight `sleep`, and an orphaned sleep that still held
  # the wrapper's descriptors could keep a caller's capture pipe open until the
  # full limit elapsed. Detaching the descriptors makes the orphan harmless.
  sleep "$seconds" >/dev/null 2>&1

  if kill -0 "$pid" 2>/dev/null; then
    # Record the timeout before signalling so the flag exists by the time the
    # caller's `wait` on the watched process returns.
    : > "$flag"
    printf '[codex-monitored] timeout: elapsed=%ds limit=%ds\n' \
      "$(( $(date +%s) - START ))" "$seconds" >&2
    kill -TERM -- "-$pid" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
    # Grace period between TERM and KILL.
    sleep 5 >/dev/null 2>&1
    kill -KILL -- "-$pid" 2>/dev/null || kill -KILL "$pid" 2>/dev/null || true
  fi
}
|
|
129
|
+
|
|
130
|
+
# terminate_process_group PGID REASON
#
# Reap whatever is still alive in process group PGID.
#   PGID   - process-group id to signal (addressed as "-PGID").
#   REASON - free-form tag echoed in the `[codex-monitored] reap` log line.
#
# Does nothing when the group has no live member. Otherwise logs to stderr,
# sends TERM, re-checks once per second for up to 5 seconds, and sends KILL if
# the group is still present after that. Always returns 0.
terminate_process_group() {
  local pgid="$1" reason="$2"
  local attempt=0

  # Nothing left to reap.
  kill -0 -- "-$pgid" 2>/dev/null || return 0

  printf '[codex-monitored] reap: reason=%s pgid=%s\n' "$reason" "$pgid" >&2
  kill -TERM -- "-$pgid" 2>/dev/null || true

  # Give the group up to five one-second chances to exit on its own.
  while [ "$attempt" -lt 5 ]; do
    sleep 1
    kill -0 -- "-$pgid" 2>/dev/null || return 0
    attempt=$((attempt + 1))
  done

  kill -KILL -- "-$pgid" 2>/dev/null || true
}
|
|
147
|
+
|
|
109
148
|
# forward_signal SIG
#
# Relay signal SIG to the codex process and stop the helper loops.
#   SIG - signal name without the dash (e.g. TERM, INT).
#
# If CODEX_PID is set and alive, SIG goes to its process group first and to
# the single pid as a fallback. The heartbeat (HB_PID) and timeout watchdog
# (WATCHDOG_PID) always receive TERM, regardless of SIG. Unset or already-dead
# pids are skipped and kill failures are ignored.
forward_signal() {
  local sig="$1"
  local helper

  if [ -n "${CODEX_PID:-}" ] && kill -0 "$CODEX_PID" 2>/dev/null; then
    kill -"$sig" -- "-$CODEX_PID" 2>/dev/null \
      || kill -"$sig" "$CODEX_PID" 2>/dev/null \
      || true
  fi

  # Helper loops are wrapper-internal, so they are simply terminated.
  for helper in "${HB_PID:-}" "${WATCHDOG_PID:-}"; do
    if [ -n "$helper" ] && kill -0 "$helper" 2>/dev/null; then
      kill -TERM "$helper" 2>/dev/null || true
    fi
  done
}
|
|
160
|
+
|
|
161
|
+
# cleanup
#
# EXIT-trap handler: terminate codex and the helper loops via forward_signal,
# then remove the timeout flag file if a path was assigned to TIMEOUT_FLAG.
cleanup() {
  forward_signal TERM
  if [ -n "$TIMEOUT_FLAG" ]; then
    rm -f "$TIMEOUT_FLAG"
  fi
}
|
|
118
165
|
|
|
119
|
-
trap 'forward_signal TERM' TERM
|
|
120
|
-
trap 'forward_signal INT' INT
|
|
166
|
+
trap 'forward_signal TERM; exit 143' TERM
|
|
167
|
+
trap 'forward_signal INT; exit 130' INT
|
|
168
|
+
trap cleanup EXIT
|
|
121
169
|
|
|
122
|
-
printf '[codex-monitored] start: ts=%s heartbeat=%ds bin=%s\n' \
|
|
123
|
-
"$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$CODEX_BIN" >&2
|
|
170
|
+
printf '[codex-monitored] start: ts=%s heartbeat=%ds timeout=%ss bin=%s\n' \
|
|
171
|
+
"$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$TIMEOUT_SEC" "$CODEX_BIN" >&2
|
|
124
172
|
|
|
125
173
|
# Launch codex with stdin closed; output streams directly to OUR stdout/stderr.
|
|
174
|
+
set -m
|
|
126
175
|
"$CODEX_BIN" exec "$@" < /dev/null &
|
|
127
176
|
CODEX_PID=$!
|
|
177
|
+
set +m
|
|
128
178
|
printf '[codex-monitored] codex pid=%d\n' "$CODEX_PID" >&2
|
|
129
179
|
|
|
130
180
|
heartbeat_loop "$CODEX_PID" &
|
|
131
181
|
HB_PID=$!
|
|
132
182
|
|
|
183
|
+
if [ "$TIMEOUT_SEC" -gt 0 ]; then
|
|
184
|
+
TIMEOUT_FLAG=$(mktemp "${TMPDIR:-/tmp}/codex-monitored-timeout.XXXXXX")
|
|
185
|
+
rm -f "$TIMEOUT_FLAG"
|
|
186
|
+
timeout_loop "$CODEX_PID" "$TIMEOUT_SEC" "$TIMEOUT_FLAG" &
|
|
187
|
+
WATCHDOG_PID=$!
|
|
188
|
+
fi
|
|
189
|
+
|
|
133
190
|
wait "$CODEX_PID"
|
|
134
191
|
EXIT=$?
|
|
192
|
+
terminate_process_group "$CODEX_PID" "post-exit-descendants"
|
|
135
193
|
|
|
136
194
|
kill -TERM "$HB_PID" 2>/dev/null || true
|
|
137
195
|
wait "$HB_PID" 2>/dev/null || true
|
|
196
|
+
if [ -n "${WATCHDOG_PID:-}" ]; then
|
|
197
|
+
kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
|
|
198
|
+
wait "$WATCHDOG_PID" 2>/dev/null || true
|
|
199
|
+
fi
|
|
200
|
+
if [ -n "$TIMEOUT_FLAG" ] && [ -f "$TIMEOUT_FLAG" ]; then
|
|
201
|
+
EXIT=124
|
|
202
|
+
fi
|
|
138
203
|
|
|
139
204
|
printf '[codex-monitored] codex exited: code=%d elapsed=%ds\n' \
|
|
140
205
|
"$EXIT" $(( $(date +%s) - START )) >&2
|
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Normalize raw Codex pair-JUDGE stdout into canonical VERIFY JSONL."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
import pathlib
|
|
9
|
+
import sys
|
|
10
|
+
import tempfile
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
FINDING_SEVERITIES = {"CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def atomic_write(path: pathlib.Path, text: str) -> None:
    """Write ``text`` to ``path`` via a temporary file and a rename.

    The parent directory is created if missing. The text is written (UTF-8) to
    a temporary file in that same directory, which is then renamed over
    ``path``, so readers never observe a partially written file.

    Args:
        path: Destination file.
        text: Full content to store.

    Raises:
        OSError: If the directory cannot be created, the temporary file cannot
            be written, or the rename fails. The temporary file is removed
            before the error propagates, so failures leave no stray files.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(
            "w", encoding="utf-8", dir=path.parent, delete=False
        ) as handle:
            # Remember the name first so a failing write can still be cleaned up.
            tmp_path = pathlib.Path(handle.name)
            handle.write(text)
        tmp_path.replace(path)
    except BaseException:
        # delete=False means nothing else will remove the temporary file.
        if tmp_path is not None:
            tmp_path.unlink(missing_ok=True)
        raise
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def collect(stdout_path: pathlib.Path) -> tuple[list[dict[str, Any]], dict[str, Any] | None]:
    """Parse raw judge stdout into findings and an optional summary.

    Line handling, after stripping surrounding whitespace:
      * blank lines are skipped;
      * ``# SUMMARY <json>`` lines set the summary (the last one wins);
      * any other line starting with ``#`` is ignored;
      * every remaining line must be a JSON object whose ``severity``,
        compared case-insensitively, is one of ``FINDING_SEVERITIES``.

    Args:
        stdout_path: File holding the captured stdout.

    Returns:
        ``(findings, summary)``; ``summary`` is ``None`` when no SUMMARY line
        was present. Findings are returned as parsed, without modification.

    Raises:
        SystemExit: With an ``error: ...`` message when a line is not valid
            JSON, is not an object, lacks a valid severity, when the file
            yields neither findings nor a summary, or when the summary carries
            a non-PASS verdict but no findings were emitted.
    """
    findings: list[dict[str, Any]] = []
    summary: dict[str, Any] | None = None
    with stdout_path.open(encoding="utf-8") as handle:
        for line_no, line in enumerate(handle, 1):
            raw = line.strip()
            if not raw:
                continue
            if raw.startswith("# SUMMARY "):
                try:
                    item = json.loads(raw.removeprefix("# SUMMARY ").strip())
                except json.JSONDecodeError as exc:
                    raise SystemExit(
                        f"error: invalid SUMMARY JSON at {stdout_path}:{line_no}: {exc}"
                    ) from exc
                if not isinstance(item, dict):
                    raise SystemExit(f"error: SUMMARY is not an object at {stdout_path}:{line_no}")
                summary = item
                continue
            if raw.startswith("#"):
                continue
            try:
                item = json.loads(raw)
            except json.JSONDecodeError as exc:
                raise SystemExit(f"error: invalid JSONL at {stdout_path}:{line_no}: {exc}") from exc
            if not isinstance(item, dict):
                raise SystemExit(f"error: JSONL item is not an object at {stdout_path}:{line_no}")
            severity = str(item.get("severity") or "").upper()
            if severity not in FINDING_SEVERITIES:
                raise SystemExit(f"error: finding missing valid severity at {stdout_path}:{line_no}")
            findings.append(item)
    if not findings and summary is None:
        raise SystemExit("error: Codex pair-JUDGE stdout contained no JSONL findings or PASS line")
    # Compare the verdict the same way severities are compared: case-insensitively.
    # Non-string verdicts are treated as "no verdict" instead of being hashed,
    # so a list/dict value cannot crash the set-membership test.
    verdict = summary.get("verdict") if summary else None
    normalized = verdict.strip().upper() if isinstance(verdict, str) else ""
    if normalized in {"NEEDS_WORK", "FAIL", "BLOCKED"} and not findings:
        raise SystemExit("error: non-PASS SUMMARY without JSONL findings")
    return findings, summary
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def self_test() -> int:
    """Run the built-in smoke test in a throwaway directory.

    Checks two things: a stdout holding one HIGH finding plus a NEEDS_WORK
    summary round-trips into a one-line findings file and a summary file, and
    an empty stdout is rejected by ``collect`` instead of being accepted.

    Returns:
        0 when every check passes.

    Raises:
        AssertionError: When a check fails.
    """
    with tempfile.TemporaryDirectory() as scratch:
        base = pathlib.Path(scratch)
        raw_file = base / "codex-judge.stdout"
        findings_file = base / "verify.pair.findings.jsonl"
        summary_file = base / "codex-judge.summary.json"

        # Case 1: one finding followed by a non-PASS summary.
        sample = json.dumps({"id": "a", "severity": "HIGH"}) + "\n" + '# SUMMARY {"verdict":"NEEDS_WORK"}\n'
        raw_file.write_text(sample, encoding="utf-8")
        parsed_findings, parsed_summary = collect(raw_file)
        write_outputs(parsed_findings, parsed_summary, findings_file, summary_file)
        written_findings = findings_file.read_text(encoding="utf-8")
        assert written_findings.count("\n") == 1
        written_summary = json.loads(summary_file.read_text(encoding="utf-8"))
        assert written_summary["verdict"] == "NEEDS_WORK"

        # Case 2: empty stdout has to be refused.
        raw_file.write_text("", encoding="utf-8")
        rejected = False
        try:
            collect(raw_file)
        except SystemExit as exc:
            assert "no JSONL findings" in str(exc)
            rejected = True
        if not rejected:
            raise AssertionError("empty Codex stdout must not normalize to PASS")
    return 0
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def write_outputs(
    findings: list[dict[str, Any]],
    summary: dict[str, Any] | None,
    out_path: pathlib.Path,
    summary_path: pathlib.Path,
) -> None:
    """Persist findings as JSONL and, when present, the summary as JSON.

    Args:
        findings: Finding objects; each becomes one compact, key-sorted line.
        summary: Summary object, or ``None`` to leave ``summary_path`` untouched.
        out_path: Destination of the findings file (always written, possibly empty).
        summary_path: Destination of the pretty-printed, key-sorted summary.
    """
    encoded = [json.dumps(entry, sort_keys=True, separators=(",", ":")) for entry in findings]
    jsonl_text = "".join(f"{row}\n" for row in encoded)
    atomic_write(out_path, jsonl_text)

    if summary is None:
        return
    summary_text = json.dumps(summary, indent=2, sort_keys=True) + "\n"
    atomic_write(summary_path, summary_text)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def main() -> int:
    """Command-line entry point.

    Reads ``<devlyn-dir>/<stdout-file>``, writes the normalized findings and
    summary next to it, and prints a one-line JSON report with the findings
    count and the summary. With ``--self-test`` it runs ``self_test`` instead.

    Returns:
        0 on success, 1 when the stdout file does not exist.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    path_options = (
        ("--devlyn-dir", ".devlyn"),
        ("--stdout-file", "codex-judge.stdout"),
        ("--out", "verify.pair.findings.jsonl"),
        ("--summary-out", "codex-judge.summary.json"),
    )
    for flag, default in path_options:
        parser.add_argument(flag, default=default)
    parser.add_argument("--self-test", action="store_true")
    args = parser.parse_args()

    if args.self_test:
        return self_test()

    base_dir = pathlib.Path(args.devlyn_dir)
    source = base_dir / args.stdout_file
    if not source.is_file():
        sys.stderr.write(f"error: {source} not found\n")
        return 1

    findings, summary = collect(source)
    write_outputs(findings, summary, base_dir / args.out, base_dir / args.summary_out)
    report = {"findings_count": len(findings), "summary": summary}
    print(json.dumps(report, sort_keys=True))
    return 0
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
if __name__ == "__main__":
|
|
125
|
+
raise SystemExit(main())
|
|
@@ -14,7 +14,7 @@ When the resolved engine is `auto` or `codex`, on entry (before spawning any pha
|
|
|
14
14
|
|
|
15
15
|
Never prompt the user. Never abort the run on missing CLI.
|
|
16
16
|
|
|
17
|
-
Per-skill defaults: `/devlyn:resolve` defaults to `claude` (post iter-0020 close-out — Codex BUILD/IMPLEMENT below quality floor; iter-0033g + iter-0034 close-out — PLAN-pair research-only until container/sandbox infra justifies a measurement)
|
|
17
|
+
Per-skill defaults: `/devlyn:resolve` defaults to `claude` for PLAN/IMPLEMENT (post iter-0020 close-out — Codex BUILD/IMPLEMENT below quality floor; iter-0033g + iter-0034 close-out — PLAN-pair research-only until container/sandbox infra justifies a measurement). `/devlyn:resolve` VERIFY is the exception: gated pair-JUDGE may invoke the OTHER engine when its SKILL.md trigger policy fires. `/devlyn:ideate` defaults to `auto` for the CHALLENGE phase's cross-model GAN-critic dynamic. Each skill's SKILL.md flag block is the source of truth for that skill's default.
|
|
18
18
|
|
|
19
19
|
## Why this is the one permitted silent fallback
|
|
20
20
|
|
|
@@ -35,6 +35,12 @@
|
|
|
35
35
|
"description": "None of these substrings may appear in (stdout + stderr) for pass.",
|
|
36
36
|
"items": { "type": "string", "minLength": 1 },
|
|
37
37
|
"default": []
|
|
38
|
+
},
|
|
39
|
+
"contract_refs": {
|
|
40
|
+
"type": "array",
|
|
41
|
+
"description": "For hidden BENCH_FIXTURE_DIR commands, exact substrings from spec.md that this oracle verifies. Hidden oracles may test only visible spec clauses.",
|
|
42
|
+
"items": { "type": "string", "minLength": 1 },
|
|
43
|
+
"default": []
|
|
38
44
|
}
|
|
39
45
|
}
|
|
40
46
|
}
|
|
@@ -83,6 +89,18 @@
|
|
|
83
89
|
"items": { "type": "string", "minLength": 1 },
|
|
84
90
|
"default": []
|
|
85
91
|
},
|
|
92
|
+
"tier_a_waivers": {
|
|
93
|
+
"type": "array",
|
|
94
|
+
"description": "Optional fnmatch globs exempted from Tier A scope-oracle path checks when the spec explicitly authorizes those files.",
|
|
95
|
+
"items": { "type": "string", "minLength": 1 },
|
|
96
|
+
"default": []
|
|
97
|
+
},
|
|
98
|
+
"spec_output_files": {
|
|
99
|
+
"type": "array",
|
|
100
|
+
"description": "Files or globs that define the spec-authorized output surface for scope oracles. Touched files outside this set must be reachable from it via static imports or separately waived.",
|
|
101
|
+
"items": { "type": "string", "minLength": 1 },
|
|
102
|
+
"default": []
|
|
103
|
+
},
|
|
86
104
|
"max_deps_added": {
|
|
87
105
|
"type": "integer",
|
|
88
106
|
"description": "Hard cap on new entries under dependencies/devDependencies in package.json. Exceeds → DQ.",
|