devlyn-cli 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/benchmark/auto-resolve/README.md +321 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +363 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:resolve/SKILL.md +69 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +69 -20
|
@@ -15,10 +15,27 @@
|
|
|
15
15
|
set -euo pipefail
|
|
16
16
|
|
|
17
17
|
usage() {
|
|
18
|
-
echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]"
|
|
18
|
+
echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]"
|
|
19
19
|
exit 1
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
+
kill_worktree_processes() {
|
|
23
|
+
local work_dir="$1"
|
|
24
|
+
local signal="$2"
|
|
25
|
+
local physical_work_dir current_pgid
|
|
26
|
+
physical_work_dir="$(cd "$work_dir" 2>/dev/null && pwd -P || printf '%s' "$work_dir")"
|
|
27
|
+
current_pgid="$(ps -o pgid= -p "$$" | tr -d ' ')"
|
|
28
|
+
ps -axo pid=,pgid=,command= \
|
|
29
|
+
| awk -v p1="$work_dir" -v p2="$physical_work_dir" -v self="$$" -v current_pgid="$current_pgid" '
|
|
30
|
+
$1 != self && $2 != current_pgid && (index($0, p1) || index($0, p2)) { print $2 }
|
|
31
|
+
' \
|
|
32
|
+
| sort -u \
|
|
33
|
+
| while IFS= read -r pgid; do
|
|
34
|
+
[ -n "$pgid" ] || continue
|
|
35
|
+
kill "-$signal" -- "-$pgid" 2>/dev/null || true
|
|
36
|
+
done
|
|
37
|
+
}
|
|
38
|
+
|
|
22
39
|
FIXTURE=""; ARM=""; RUN_ID=""; DRY_RUN=0
|
|
23
40
|
RESOLVE_SKILL="new"
|
|
24
41
|
while [ $# -gt 0 ]; do
|
|
@@ -35,18 +52,23 @@ done
|
|
|
35
52
|
# iter-0019: original 3 arms — variant (L2-old: Claude orchestrator + Codex BUILD pair via --engine auto),
|
|
36
53
|
# solo_claude (L1: Claude orchestrator, codex blocked by shim+wrapper enforcement),
|
|
37
54
|
# bare (L0: direct claude -p, no skill, no codex).
|
|
38
|
-
# iter-0033c (Codex R0-infra adoption, 2026-05-02): two
|
|
55
|
+
# iter-0033c (Codex R0-infra adoption, 2026-05-02): two L2 diagnostic arms for /devlyn:resolve —
|
|
39
56
|
# l2_gated (--engine claude, no --pair-verify; pair fires only on natural triggers),
|
|
40
|
-
#
|
|
57
|
+
# l2_risk_probes (--engine claude --risk-probes; pair converts visible Verification bullets to executable probes before IMPLEMENT),
|
|
58
|
+
# l2_forced (--engine claude --pair-verify; retired because it leaks pair-awareness before IMPLEMENT).
|
|
41
59
|
[ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] || [ "$ARM" = "bare" ] \
|
|
42
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ] || \
|
|
43
|
-
{ echo "arm must be variant|solo_claude|bare|l2_gated|l2_forced"; exit 1; }
|
|
60
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ] || \
|
|
61
|
+
{ echo "arm must be variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced"; exit 1; }
|
|
44
62
|
# iter-0033c (Codex R0-infra Q2): l2_* arms require NEW skill surface (only NEW
|
|
45
63
|
# `/devlyn:resolve` honors --pair-verify; OLD `/devlyn:auto-resolve` would silently
|
|
46
64
|
# ignore the flag and produce mis-attributed L2 numbers).
|
|
47
|
-
if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
|
|
65
|
+
if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
|
|
48
66
|
echo "l2_* arms require --resolve-skill new (got '$RESOLVE_SKILL')"; exit 1
|
|
49
67
|
fi
|
|
68
|
+
if [ "$ARM" = "l2_forced" ]; then
|
|
69
|
+
echo "l2_forced is retired: it puts --pair-verify in the initial prompt, so IMPLEMENT can become pair-aware before the diff is frozen. Use scripts/run-frozen-verify-pair.sh for leak-free VERIFY-pair measurement." >&2
|
|
70
|
+
exit 1
|
|
71
|
+
fi
|
|
50
72
|
# iter-0034 Phase 4 cutover (2026-05-03): OLD `/devlyn:auto-resolve` was
|
|
51
73
|
# deleted. Only `new` (= /devlyn:resolve --spec) is supported. The flag stays
|
|
52
74
|
# an accepted no-op so historical runners (run-iter-0033c.sh:137) keep working
|
|
@@ -78,6 +100,13 @@ for f in "$META" "$EXPECTED" "$SPEC" "$TASK"; do
|
|
|
78
100
|
done
|
|
79
101
|
|
|
80
102
|
TIMEOUT=$(python3 -c "import json; print(json.load(open('$META'))['timeout_seconds'])")
|
|
103
|
+
if [ "$ARM" = "l2_risk_probes" ]; then
|
|
104
|
+
# This arm adds a bounded Codex probe-derive phase before IMPLEMENT and a
|
|
105
|
+
# bounded Codex pair-JUDGE during VERIFY. The full-pipeline gate still
|
|
106
|
+
# enforces wall-time efficiency by pair/solo ratio; this budget prevents a
|
|
107
|
+
# false timeout before the mandatory second judge can emit its contract line.
|
|
108
|
+
TIMEOUT=$((TIMEOUT + 600))
|
|
109
|
+
fi
|
|
81
110
|
|
|
82
111
|
RESULT_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE/$ARM"
|
|
83
112
|
mkdir -p "$RESULT_DIR"
|
|
@@ -104,7 +133,7 @@ cp -R "$BENCH_ROOT/fixtures/test-repo" "$WORK_DIR"
|
|
|
104
133
|
# while variant uses --engine auto (Codex IMPLEMENT). Pair-mode in
|
|
105
134
|
# /devlyn:resolve VERIFY phase pulls Codex via the OTHER-engine rule.
|
|
106
135
|
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
107
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
|
|
136
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
|
|
108
137
|
mkdir -p "$WORK_DIR/.claude"
|
|
109
138
|
if [ -d "$REPO_ROOT/.claude/skills" ]; then
|
|
110
139
|
cp -R "$REPO_ROOT/.claude/skills" "$WORK_DIR/.claude/skills"
|
|
@@ -164,11 +193,13 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
|
164
193
|
ARM_CODEX_BLOCKED=0
|
|
165
194
|
fi
|
|
166
195
|
python3 - "$WORK_DIR/.claude/settings.json" \
|
|
167
|
-
"$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" <<'PY'
|
|
196
|
+
"$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" "$ARM" <<'PY'
|
|
168
197
|
import json, sys
|
|
169
|
-
out_path, path_val, real_bin, monitored, codex_blocked = sys.argv[1:6]
|
|
198
|
+
out_path, path_val, real_bin, monitored, codex_blocked, arm = sys.argv[1:7]
|
|
170
199
|
env = {
|
|
171
200
|
"CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1",
|
|
201
|
+
"CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
|
|
202
|
+
"DISABLE_AUTOUPDATER": "1",
|
|
172
203
|
"PATH": path_val,
|
|
173
204
|
}
|
|
174
205
|
if codex_blocked == "1":
|
|
@@ -182,6 +213,10 @@ else:
|
|
|
182
213
|
# BUILD; both vars are required by the shim/wrapper handshake.
|
|
183
214
|
env["CODEX_REAL_BIN"] = real_bin
|
|
184
215
|
env["CODEX_MONITORED_PATH"] = monitored
|
|
216
|
+
if arm == "l2_risk_probes":
|
|
217
|
+
# Risk-probe derivation is a bounded contract-conversion step. A long
|
|
218
|
+
# Codex run is a harness failure, not useful extra quality signal.
|
|
219
|
+
env["CODEX_MONITORED_TIMEOUT_SEC"] = "300"
|
|
185
220
|
data = {"env": env}
|
|
186
221
|
with open(out_path, "w") as f:
|
|
187
222
|
json.dump(data, f, indent=2)
|
|
@@ -231,22 +266,25 @@ if [ -f "$SETUP" ] && [ -s "$SETUP" ]; then
|
|
|
231
266
|
fi
|
|
232
267
|
fi
|
|
233
268
|
|
|
234
|
-
# iter-0019.6: stage normalized .devlyn/spec-verify.json
|
|
235
|
-
#
|
|
236
|
-
#
|
|
237
|
-
#
|
|
238
|
-
#
|
|
239
|
-
# generate the same shape from a spec.md "## Verification" section for
|
|
240
|
-
# real-user runs (Codex R5, 2026-04-28). This stages all 3 arms — bare's
|
|
241
|
-
# .devlyn/ is created lazily by spec-verify-check.py if absent.
|
|
269
|
+
# iter-0019.6: stage normalized .devlyn/spec-verify.json for BUILD_GATE.
|
|
270
|
+
# Only commands safe to reveal before IMPLEMENT may be staged here. Commands
|
|
271
|
+
# that reference BENCH_FIXTURE_DIR are hidden post-run oracles; staging their
|
|
272
|
+
# path leaks verifier names into the arm and lets agents search for answer-key
|
|
273
|
+
# files. Those commands still run in the post-run verifier below.
|
|
242
274
|
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
243
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
|
|
275
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
|
|
244
276
|
python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" <<'PY'
|
|
245
277
|
import json, os, sys
|
|
246
278
|
expected = json.load(open(sys.argv[1]))
|
|
247
279
|
out_path = sys.argv[2]
|
|
248
|
-
|
|
280
|
+
visible_commands = [
|
|
281
|
+
cmd for cmd in expected.get("verification_commands", [])
|
|
282
|
+
if "BENCH_FIXTURE_DIR" not in str(cmd.get("cmd", ""))
|
|
283
|
+
]
|
|
284
|
+
normalized = {"verification_commands": visible_commands}
|
|
249
285
|
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
|
286
|
+
if not visible_commands:
|
|
287
|
+
raise SystemExit(0)
|
|
250
288
|
with open(out_path, "w") as f:
|
|
251
289
|
json.dump(normalized, f, indent=2)
|
|
252
290
|
f.write("\n")
|
|
@@ -270,7 +308,7 @@ PROMPT_FILE="$RESULT_DIR/input.md"
|
|
|
270
308
|
# arms pass the engine flag explicitly so they survive future runtime-default
|
|
271
309
|
# changes (post iter-0020 close-out: default flipped to claude).
|
|
272
310
|
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
273
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
|
|
311
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
|
|
274
312
|
case "$ARM" in
|
|
275
313
|
solo_claude)
|
|
276
314
|
ENGINE_CLAUSE="--engine claude"
|
|
@@ -281,13 +319,22 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
|
281
319
|
ENGINE_PROMPT_HINT="Run with \`--engine auto\` so the experimental dual-engine routing fires (Codex BUILD/FIX, Claude EVAL/CRITIC) — do not override it."
|
|
282
320
|
;;
|
|
283
321
|
l2_gated)
|
|
284
|
-
#
|
|
285
|
-
#
|
|
286
|
-
#
|
|
287
|
-
#
|
|
322
|
+
# NEW L2 with natural pair-mode triggers. Claude does IMPLEMENT;
|
|
323
|
+
# pair-JUDGE in VERIFY fires per /devlyn:resolve PHASE 5 policy
|
|
324
|
+
# (high complexity, coverage_failed, or warning-level mechanical
|
|
325
|
+
# findings; never after HIGH/CRITICAL mechanical blockers). Codex
|
|
326
|
+
# remains available as the OTHER-engine pair-JUDGE candidate.
|
|
288
327
|
ENGINE_CLAUSE="--engine claude"
|
|
289
328
|
ENGINE_PROMPT_HINT="Run with \`--engine claude\` and let the orchestrator's pair-mode (VERIFY) trigger naturally per its policy. Codex is available as the OTHER-engine pair-JUDGE — the harness has not blocked it. Do NOT pass \`--pair-verify\`; this arm measures gated triggering."
|
|
290
329
|
;;
|
|
330
|
+
l2_risk_probes)
|
|
331
|
+
# NEW L2 probe-derive arm. Claude plans/implements; Codex is used before
|
|
332
|
+
# IMPLEMENT only to derive bounded executable probes from visible
|
|
333
|
+
# Verification bullets. BUILD_GATE and VERIFY execute those probes
|
|
334
|
+
# mechanically via spec-verify-check.py.
|
|
335
|
+
ENGINE_CLAUSE="--engine claude --risk-probes"
|
|
336
|
+
ENGINE_PROMPT_HINT="Run with \`--engine claude --risk-probes\`. Codex is available as the OTHER-engine probe derivation and pair-JUDGE engine. The probe phase may only derive executable checks from visible \`## Verification\` text; it must not read hidden fixture/verifier paths."
|
|
337
|
+
;;
|
|
291
338
|
l2_forced)
|
|
292
339
|
# iter-0033c: NEW L2 forced — pair-JUDGE always fires. Diagnostic arm
|
|
293
340
|
# for Gate 6 fixture-level cross-check + Gate 7 attribution causality.
|
|
@@ -414,12 +461,17 @@ else
|
|
|
414
461
|
# natural exit at or past the budget is no longer mislabeled as timeout.
|
|
415
462
|
#
|
|
416
463
|
# MCP/config isolation (iter 0004). The harness's `claude -p` subprocess
|
|
417
|
-
# must not load the operator's user-level MCP
|
|
418
|
-
# telegram, vercel,
|
|
419
|
-
# user
|
|
420
|
-
#
|
|
421
|
-
#
|
|
422
|
-
#
|
|
464
|
+
# must not load the operator's user-level MCP/plugins/settings (pencil,
|
|
465
|
+
# codex-cli, telegram, vercel, ...). Project policy is "MCP/plugins are not in
|
|
466
|
+
# the loop"; loading user config inside the arm is uncontrolled environment
|
|
467
|
+
# leaking into the experiment. `--setting-sources project,local` keeps user
|
|
468
|
+
# plugin enablement out of the run but Claude Code still reads the installed
|
|
469
|
+
# plugin registry for autoupdate. Official Claude Code settings document
|
|
470
|
+
# `DISABLE_AUTOUPDATER=1` / `CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1` as the
|
|
471
|
+
# supported way to disable that background traffic, while preserving OAuth
|
|
472
|
+
# auth from the real HOME. `--strict-mcp-config` + an empty `mcpServers` object
|
|
473
|
+
# forces a hermetic MCP set. Skills still resolve via the project
|
|
474
|
+
# `.claude/skills` staged into the worktree.
|
|
423
475
|
# `--debug-file` records per-arm init/runtime so the next hang has a
|
|
424
476
|
# location, not a guess.
|
|
425
477
|
TIMEOUT_FLAG="$RESULT_DIR/.timed_out"
|
|
@@ -436,7 +488,7 @@ else
|
|
|
436
488
|
# PATH — they route Claude IMPLEMENT but Codex pair-JUDGE in VERIFY hits
|
|
437
489
|
# `codex exec` through the wrapper for starvation safety.
|
|
438
490
|
if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
439
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } \
|
|
491
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; } \
|
|
440
492
|
&& [ -x "$WORK_DIR/.devlyn-bin/codex" ]; then
|
|
441
493
|
export PATH="$WORK_DIR/.devlyn-bin:$PATH"
|
|
442
494
|
[ "$ARM" = "solo_claude" ] && export CODEX_BLOCKED=1
|
|
@@ -447,10 +499,19 @@ else
|
|
|
447
499
|
# what the post-run verifier (run-fixture.sh:431-434) sets so the gate
|
|
448
500
|
# sees the same environment shape.
|
|
449
501
|
export BENCH_WORKDIR="$WORK_DIR"
|
|
502
|
+
# Python helper scripts run inside the benchmark worktree. Do not let them
|
|
503
|
+
# rewrite tracked __pycache__ artifacts and pollute the arm-only diff.
|
|
504
|
+
export PYTHONDONTWRITEBYTECODE=1
|
|
505
|
+
# Official Claude Code setting: disable background plugin/autoupdate traffic
|
|
506
|
+
# before process startup. Project settings env is not early enough for all
|
|
507
|
+
# startup paths.
|
|
508
|
+
export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1
|
|
509
|
+
export DISABLE_AUTOUPDATER=1
|
|
450
510
|
exec claude \
|
|
451
511
|
-p "$(cat "$PROMPT_FILE")" \
|
|
452
512
|
--dangerously-skip-permissions \
|
|
453
513
|
--effort xhigh \
|
|
514
|
+
--setting-sources project,local \
|
|
454
515
|
--strict-mcp-config \
|
|
455
516
|
--mcp-config '{"mcpServers":{}}' \
|
|
456
517
|
--debug-file "$RESULT_DIR/claude-debug.log"
|
|
@@ -459,13 +520,21 @@ else
|
|
|
459
520
|
set +m
|
|
460
521
|
|
|
461
522
|
(
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
523
|
+
deadline=$((T_START + TIMEOUT))
|
|
524
|
+
while kill -0 "$CHILD_PID" 2>/dev/null; do
|
|
525
|
+
now=$(date +%s)
|
|
526
|
+
if [ "$now" -ge "$deadline" ]; then
|
|
527
|
+
: > "$TIMEOUT_FLAG"
|
|
528
|
+
kill -TERM -- "-$CHILD_PID" 2>/dev/null
|
|
529
|
+
kill_worktree_processes "$WORK_DIR" TERM
|
|
530
|
+
sleep 5
|
|
531
|
+
kill -KILL -- "-$CHILD_PID" 2>/dev/null
|
|
532
|
+
kill_worktree_processes "$WORK_DIR" KILL
|
|
533
|
+
exit 0
|
|
534
|
+
fi
|
|
535
|
+
remaining=$((deadline - now))
|
|
536
|
+
[ "$remaining" -gt 30 ] && sleep 30 || sleep "$remaining"
|
|
537
|
+
done
|
|
469
538
|
) &
|
|
470
539
|
WATCHDOG_PID=$!
|
|
471
540
|
|
|
@@ -479,7 +548,16 @@ else
|
|
|
479
548
|
INVOKE_EXIT=124
|
|
480
549
|
WATCHDOG_FIRED=1
|
|
481
550
|
rm -f "$TIMEOUT_FLAG"
|
|
551
|
+
kill_worktree_processes "$WORK_DIR" TERM
|
|
552
|
+
sleep 1
|
|
553
|
+
kill_worktree_processes "$WORK_DIR" KILL
|
|
482
554
|
echo "[run-fixture] arm timed out after ${TIMEOUT}s — INVOKE_EXIT=124" >&2
|
|
555
|
+
else
|
|
556
|
+
# A clean `claude -p` exit can still leave OTHER-engine pair-JUDGE
|
|
557
|
+
# descendants alive; reap any process group rooted in this arm worktree.
|
|
558
|
+
kill_worktree_processes "$WORK_DIR" TERM
|
|
559
|
+
sleep 1
|
|
560
|
+
kill_worktree_processes "$WORK_DIR" KILL
|
|
483
561
|
fi
|
|
484
562
|
set -e
|
|
485
563
|
fi
|
|
@@ -487,6 +565,25 @@ fi
|
|
|
487
565
|
T_END=$(date +%s)
|
|
488
566
|
ELAPSED=$((T_END - T_START))
|
|
489
567
|
|
|
568
|
+
# Restore tracked Python bytecode to the scaffold commit and remove only
|
|
569
|
+
# untracked bytecode. Helper invocations must not count as model work, but
|
|
570
|
+
# deleting tracked scaffold files would also pollute changed-files.txt.
|
|
571
|
+
(cd "$WORK_DIR" \
|
|
572
|
+
&& git restore --source "$SCAFFOLD_SHA" -- .claude/skills/_shared/__pycache__ 2>/dev/null || true)
|
|
573
|
+
cleanup_roots=()
|
|
574
|
+
[ -d "$WORK_DIR/.claude" ] && cleanup_roots+=("$WORK_DIR/.claude")
|
|
575
|
+
[ -d "$WORK_DIR/.devlyn" ] && cleanup_roots+=("$WORK_DIR/.devlyn")
|
|
576
|
+
if [ ${#cleanup_roots[@]} -gt 0 ]; then
|
|
577
|
+
find "${cleanup_roots[@]}" -type f \( -name '*.pyc' -o -name '*.pyo' \) -print0 \
|
|
578
|
+
| while IFS= read -r -d '' py_file; do
|
|
579
|
+
rel="${py_file#$WORK_DIR/}"
|
|
580
|
+
if ! (cd "$WORK_DIR" && git ls-files --error-unmatch "$rel" >/dev/null 2>&1); then
|
|
581
|
+
rm -f "$py_file"
|
|
582
|
+
fi
|
|
583
|
+
done
|
|
584
|
+
find "${cleanup_roots[@]}" -type d -name __pycache__ -empty -delete || true
|
|
585
|
+
fi
|
|
586
|
+
|
|
490
587
|
# Capture the ARM-ONLY diff against the scaffold commit. Variant's
|
|
491
588
|
# auto-resolve pipeline commits internally after each phase, so diffing
|
|
492
589
|
# against HEAD would miss committed work. Diffing against SCAFFOLD_SHA after
|
|
@@ -498,8 +595,7 @@ ELAPSED=$((T_END - T_START))
|
|
|
498
595
|
(cd "$WORK_DIR" \
|
|
499
596
|
&& git diff "$SCAFFOLD_SHA" --name-only) > "$RESULT_DIR/changed-files.txt" 2>&1 || true
|
|
500
597
|
|
|
501
|
-
# Deterministic oracles
|
|
502
|
-
# Findings-only at this stage; scoring integration is step 5.
|
|
598
|
+
# Deterministic oracles. Hard/flag findings are merged into verify.json below.
|
|
503
599
|
python3 "$BENCH_ROOT/scripts/oracle-test-fidelity.py" \
|
|
504
600
|
--work "$WORK_DIR" --scaffold "$SCAFFOLD_SHA" \
|
|
505
601
|
> "$RESULT_DIR/oracle-test-fidelity.json" 2>/dev/null || \
|
|
@@ -518,6 +614,41 @@ python3 "$BENCH_ROOT/scripts/oracle-scope-tier-b.py" \
|
|
|
518
614
|
echo '{"oracle":"scope-tier-b","findings":[],"error":"oracle invocation failed"}' \
|
|
519
615
|
> "$RESULT_DIR/oracle-scope-tier-b.json"
|
|
520
616
|
|
|
617
|
+
if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
618
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ]; } \
|
|
619
|
+
&& [ -f "$WORK_DIR/.devlyn/pipeline.state.json" ] \
|
|
620
|
+
&& [ -f "$WORK_DIR/.claude/skills/_shared/verify-merge-findings.py" ]; then
|
|
621
|
+
if [ -f "$WORK_DIR/.devlyn/codex-judge.stdout" ] \
|
|
622
|
+
&& [ -f "$WORK_DIR/.claude/skills/_shared/collect-codex-findings.py" ]; then
|
|
623
|
+
if ! python3 "$WORK_DIR/.claude/skills/_shared/collect-codex-findings.py" \
|
|
624
|
+
--devlyn-dir "$WORK_DIR/.devlyn" \
|
|
625
|
+
> "$RESULT_DIR/collect-codex-findings.log" 2>&1; then
|
|
626
|
+
echo "[run-fixture] Codex pair findings collection failed; see $RESULT_DIR/collect-codex-findings.log" >&2
|
|
627
|
+
fi
|
|
628
|
+
fi
|
|
629
|
+
if ! python3 "$WORK_DIR/.claude/skills/_shared/verify-merge-findings.py" \
|
|
630
|
+
--devlyn-dir "$WORK_DIR/.devlyn" --write-state \
|
|
631
|
+
> "$RESULT_DIR/verify-merge-normalize.log" 2>&1; then
|
|
632
|
+
echo "[run-fixture] verify merge normalization failed; see $RESULT_DIR/verify-merge-normalize.log" >&2
|
|
633
|
+
fi
|
|
634
|
+
fi
|
|
635
|
+
|
|
636
|
+
if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
637
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ]; } && [ -d "$WORK_DIR/.devlyn" ]; then
|
|
638
|
+
run_dir=$(find "$WORK_DIR/.devlyn/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -1 || true)
|
|
639
|
+
if [ -n "$run_dir" ]; then
|
|
640
|
+
rm -rf "$RESULT_DIR/run-archive"
|
|
641
|
+
cp -R "$run_dir" "$RESULT_DIR/run-archive"
|
|
642
|
+
[ -f "$RESULT_DIR/run-archive/pipeline.state.json" ] \
|
|
643
|
+
|| [ ! -f "$WORK_DIR/.devlyn/pipeline.state.json" ] \
|
|
644
|
+
|| cp "$WORK_DIR/.devlyn/pipeline.state.json" "$RESULT_DIR/run-archive/pipeline.state.json"
|
|
645
|
+
else
|
|
646
|
+
rm -rf "$RESULT_DIR/run-archive"
|
|
647
|
+
mkdir -p "$RESULT_DIR/run-archive"
|
|
648
|
+
find "$WORK_DIR/.devlyn" -maxdepth 1 -type f -exec cp {} "$RESULT_DIR/run-archive/" \;
|
|
649
|
+
fi
|
|
650
|
+
fi
|
|
651
|
+
|
|
521
652
|
# Run verification commands + forbidden pattern scan + deps check. Uses
|
|
522
653
|
# the operator's real HOME (same as the arm saw). Fixtures that need HOME
|
|
523
654
|
# isolation override it inline per verification command.
|
|
@@ -532,10 +663,14 @@ verify_env = os.environ.copy()
|
|
|
532
663
|
# Expose the work-dir path so fixtures whose verification needs to reference
|
|
533
664
|
# the work root can do so portably (e.g. F9's out-of-repo check).
|
|
534
665
|
verify_env["BENCH_WORKDIR"] = work
|
|
666
|
+
# Hidden benchmark verifiers live in the fixture directory, outside the arm's
|
|
667
|
+
# work tree. This keeps oracle code from becoming implementation context.
|
|
668
|
+
verify_env["BENCH_FIXTURE_DIR"] = os.path.dirname(os.path.abspath(sys.argv[1]))
|
|
535
669
|
|
|
536
670
|
verify = {"commands": [], "forbidden_pattern_hits": [], "deps_added": 0,
|
|
537
671
|
"max_deps_added": expected.get("max_deps_added", 0),
|
|
538
|
-
"missing_required_files": [], "forbidden_files_present": []}
|
|
672
|
+
"missing_required_files": [], "forbidden_files_present": [],
|
|
673
|
+
"oracle_findings": [], "oracle_disqualifier": False}
|
|
539
674
|
|
|
540
675
|
for vc in expected.get("verification_commands", []):
|
|
541
676
|
try:
|
|
@@ -631,11 +766,29 @@ verify["commands_passed"] = passed
|
|
|
631
766
|
verify["commands_total"] = total
|
|
632
767
|
verify["verify_score"] = (passed / total) if total else 1.0
|
|
633
768
|
|
|
769
|
+
for oracle_file in (
|
|
770
|
+
"oracle-scope-tier-a.json",
|
|
771
|
+
"oracle-scope-tier-b.json",
|
|
772
|
+
"oracle-test-fidelity.json",
|
|
773
|
+
):
|
|
774
|
+
try:
|
|
775
|
+
data = json.load(open(os.path.join(result_dir, oracle_file)))
|
|
776
|
+
except Exception:
|
|
777
|
+
continue
|
|
778
|
+
oracle_name = data.get("oracle") or oracle_file.removesuffix(".json")
|
|
779
|
+
for finding in data.get("findings", []) or []:
|
|
780
|
+
item = dict(finding)
|
|
781
|
+
item["oracle"] = oracle_name
|
|
782
|
+
verify["oracle_findings"].append(item)
|
|
783
|
+
if item.get("severity") in ("disqualifier", "hard", "flag"):
|
|
784
|
+
verify["oracle_disqualifier"] = True
|
|
785
|
+
|
|
634
786
|
verify["disqualifier"] = (
|
|
635
787
|
any(h["severity"] == "disqualifier" for h in verify["forbidden_pattern_hits"])
|
|
636
788
|
or verify["deps_added"] > verify["max_deps_added"]
|
|
637
789
|
or bool(verify["missing_required_files"])
|
|
638
790
|
or bool(verify["forbidden_files_present"])
|
|
791
|
+
or verify["oracle_disqualifier"]
|
|
639
792
|
)
|
|
640
793
|
|
|
641
794
|
json.dump(verify, open(os.path.join(result_dir, "verify.json"), "w"), indent=2)
|
|
@@ -669,11 +822,65 @@ try:
|
|
|
669
822
|
except Exception:
|
|
670
823
|
changed = []
|
|
671
824
|
|
|
825
|
+
state = {}
|
|
826
|
+
state_path = os.path.join(result_dir, "run-archive", "pipeline.state.json")
|
|
827
|
+
if os.path.isfile(state_path):
|
|
828
|
+
with open(state_path) as f:
|
|
829
|
+
state = json.load(f)
|
|
830
|
+
verify_phase = (state.get("phases") or {}).get("verify") or {}
|
|
831
|
+
sub_verdicts = verify_phase.get("sub_verdicts")
|
|
832
|
+
pair_trigger = verify_phase.get("pair_trigger") or ((state.get("verify") or {}).get("pair_trigger"))
|
|
833
|
+
pair_mode = bool(
|
|
834
|
+
isinstance(sub_verdicts, dict)
|
|
835
|
+
and (sub_verdicts.get("judge_codex") is not None or sub_verdicts.get("pair_judge") is not None)
|
|
836
|
+
) or bool(verify_phase.get("pair_mode"))
|
|
837
|
+
|
|
838
|
+
invoke_exit = int(os.environ.get("INVOKE_EXIT", "0"))
|
|
839
|
+
plugin_contamination = False
|
|
840
|
+
plugin_contamination_reason = None
|
|
841
|
+
debug_path = os.path.join(result_dir, "claude-debug.log")
|
|
842
|
+
try:
|
|
843
|
+
with open(debug_path, errors="replace") as f:
|
|
844
|
+
debug_text = f.read()
|
|
845
|
+
except OSError:
|
|
846
|
+
debug_text = ""
|
|
847
|
+
if (
|
|
848
|
+
"Plugin autoupdate: checking installed plugins" in debug_text
|
|
849
|
+
or "Caching plugin from source:" in debug_text
|
|
850
|
+
or "Cloned repository from " in debug_text
|
|
851
|
+
or "Successfully cached plugin " in debug_text
|
|
852
|
+
or "Found 8 plugins (8 enabled" in debug_text
|
|
853
|
+
):
|
|
854
|
+
if "Plugin autoupdate: skipped (auto-updater disabled)" not in debug_text:
|
|
855
|
+
plugin_contamination = True
|
|
856
|
+
plugin_contamination_reason = "plugin_contamination"
|
|
857
|
+
|
|
858
|
+
invoke_failure = (
|
|
859
|
+
(invoke_exit not in (0,) and not timing["timed_out"])
|
|
860
|
+
or plugin_contamination
|
|
861
|
+
)
|
|
862
|
+
invoke_failure_reason = None
|
|
863
|
+
if plugin_contamination:
|
|
864
|
+
invoke_failure_reason = plugin_contamination_reason
|
|
865
|
+
elif invoke_failure:
|
|
866
|
+
transcript_path = os.path.join(result_dir, "transcript.txt")
|
|
867
|
+
haystack = ""
|
|
868
|
+
for path in (transcript_path, debug_path):
|
|
869
|
+
try:
|
|
870
|
+
with open(path, errors="replace") as f:
|
|
871
|
+
haystack += "\n" + f.read()
|
|
872
|
+
except OSError:
|
|
873
|
+
pass
|
|
874
|
+
if "You've hit your limit" in haystack or "rate_limit_error" in haystack:
|
|
875
|
+
invoke_failure_reason = "provider_limit"
|
|
876
|
+
|
|
672
877
|
result = {
|
|
673
878
|
"fixture": fixture,
|
|
674
879
|
"arm": arm,
|
|
675
880
|
"run_id": run_id,
|
|
676
881
|
"disqualifier": verify.get("disqualifier", False),
|
|
882
|
+
"oracle_disqualifier": verify.get("oracle_disqualifier", False),
|
|
883
|
+
"oracle_findings_count": len(verify.get("oracle_findings", [])),
|
|
677
884
|
"verify_score": verify.get("verify_score", 0.0),
|
|
678
885
|
"commands_passed": verify.get("commands_passed", 0),
|
|
679
886
|
"commands_total": verify.get("commands_total", 0),
|
|
@@ -681,8 +888,15 @@ result = {
|
|
|
681
888
|
"files_changed": len(changed),
|
|
682
889
|
"elapsed_seconds": elapsed,
|
|
683
890
|
"timed_out": timing["timed_out"],
|
|
684
|
-
"
|
|
685
|
-
"
|
|
891
|
+
"environment_contamination": plugin_contamination,
|
|
892
|
+
"environment_contamination_reason": plugin_contamination_reason,
|
|
893
|
+
"invoke_exit": invoke_exit,
|
|
894
|
+
"invoke_failure": invoke_failure,
|
|
895
|
+
"invoke_failure_reason": invoke_failure_reason,
|
|
896
|
+
"terminal_verdict": ((state.get("phases") or {}).get("final_report") or {}).get("verdict"),
|
|
897
|
+
"verify_verdict": verify_phase.get("verdict"),
|
|
898
|
+
"pair_trigger": pair_trigger,
|
|
899
|
+
"pair_mode": pair_mode,
|
|
686
900
|
}
|
|
687
901
|
json.dump(result, open(os.path.join(result_dir, "result.json"), "w"), indent=2)
|
|
688
902
|
print(json.dumps(result, indent=2))
|