devlyn-cli 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +321 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  12. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
  18. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  25. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
  31. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  40. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  41. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  48. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  49. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  56. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  57. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  64. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  65. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  73. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  74. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  82. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  83. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  84. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
  85. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
  86. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
  87. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
  88. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
  89. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
  90. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  91. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  92. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  93. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  94. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  95. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  96. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  97. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
  98. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
  99. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  100. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  101. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  102. package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
  103. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  104. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  105. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  106. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  107. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  108. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  109. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  110. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  111. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  112. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  113. package/config/skills/_shared/archive_run.py +3 -0
  114. package/config/skills/_shared/codex-config.md +2 -2
  115. package/config/skills/_shared/codex-monitored.sh +72 -7
  116. package/config/skills/_shared/collect-codex-findings.py +125 -0
  117. package/config/skills/_shared/engine-preflight.md +1 -1
  118. package/config/skills/_shared/expected.schema.json +18 -0
  119. package/config/skills/_shared/spec-verify-check.py +363 -10
  120. package/config/skills/_shared/verify-merge-findings.py +327 -0
  121. package/config/skills/devlyn:resolve/SKILL.md +69 -8
  122. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  123. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
  124. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  125. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  126. package/package.json +1 -1
  127. package/scripts/lint-skills.sh +69 -20
@@ -15,10 +15,27 @@
15
15
  set -euo pipefail
16
16
 
17
17
  usage() {
18
- echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]"
18
+ echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]"
19
19
  exit 1
20
20
  }
21
21
 
22
+ kill_worktree_processes() {
23
+ local work_dir="$1"
24
+ local signal="$2"
25
+ local physical_work_dir current_pgid
26
+ physical_work_dir="$(cd "$work_dir" 2>/dev/null && pwd -P || printf '%s' "$work_dir")"
27
+ current_pgid="$(ps -o pgid= -p "$$" | tr -d ' ')"
28
+ ps -axo pid=,pgid=,command= \
29
+ | awk -v p1="$work_dir" -v p2="$physical_work_dir" -v self="$$" -v current_pgid="$current_pgid" '
30
+ $1 != self && $2 != current_pgid && (index($0, p1) || index($0, p2)) { print $2 }
31
+ ' \
32
+ | sort -u \
33
+ | while IFS= read -r pgid; do
34
+ [ -n "$pgid" ] || continue
35
+ kill "-$signal" -- "-$pgid" 2>/dev/null || true
36
+ done
37
+ }
38
+
22
39
  FIXTURE=""; ARM=""; RUN_ID=""; DRY_RUN=0
23
40
  RESOLVE_SKILL="new"
24
41
  while [ $# -gt 0 ]; do
@@ -35,18 +52,23 @@ done
35
52
  # iter-0019: original 3 arms — variant (L2-old: Claude orchestrator + Codex BUILD pair via --engine auto),
36
53
  # solo_claude (L1: Claude orchestrator, codex blocked by shim+wrapper enforcement),
37
54
  # bare (L0: direct claude -p, no skill, no codex).
38
- # iter-0033c (Codex R0-infra adoption, 2026-05-02): two new arms for NEW L2 measurement on /devlyn:resolve —
55
+ # iter-0033c (Codex R0-infra adoption, 2026-05-02): two L2 diagnostic arms for /devlyn:resolve —
39
56
  # l2_gated (--engine claude, no --pair-verify; pair fires only on natural triggers),
40
- # l2_forced (--engine claude --pair-verify; diagnostic). Both require --resolve-skill new.
57
+ # l2_risk_probes (--engine claude --risk-probes; pair converts visible Verification bullets to executable probes before IMPLEMENT),
58
+ # l2_forced (--engine claude --pair-verify; retired because it leaks pair-awareness before IMPLEMENT).
41
59
  [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] || [ "$ARM" = "bare" ] \
42
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ] || \
43
- { echo "arm must be variant|solo_claude|bare|l2_gated|l2_forced"; exit 1; }
60
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ] || \
61
+ { echo "arm must be variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced"; exit 1; }
44
62
  # iter-0033c (Codex R0-infra Q2): l2_* arms require NEW skill surface (only NEW
45
63
  # `/devlyn:resolve` honors --pair-verify; OLD `/devlyn:auto-resolve` would silently
46
64
  # ignore the flag and produce mis-attributed L2 numbers).
47
- if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
65
+ if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
48
66
  echo "l2_* arms require --resolve-skill new (got '$RESOLVE_SKILL')"; exit 1
49
67
  fi
68
+ if [ "$ARM" = "l2_forced" ]; then
69
+ echo "l2_forced is retired: it puts --pair-verify in the initial prompt, so IMPLEMENT can become pair-aware before the diff is frozen. Use scripts/run-frozen-verify-pair.sh for leak-free VERIFY-pair measurement." >&2
70
+ exit 1
71
+ fi
50
72
  # iter-0034 Phase 4 cutover (2026-05-03): OLD `/devlyn:auto-resolve` was
51
73
  # deleted. Only `new` (= /devlyn:resolve --spec) is supported. The flag stays
52
74
  # an accepted no-op so historical runners (run-iter-0033c.sh:137) keep working
@@ -78,6 +100,13 @@ for f in "$META" "$EXPECTED" "$SPEC" "$TASK"; do
78
100
  done
79
101
 
80
102
  TIMEOUT=$(python3 -c "import json; print(json.load(open('$META'))['timeout_seconds'])")
103
+ if [ "$ARM" = "l2_risk_probes" ]; then
104
+ # This arm adds a bounded Codex probe-derive phase before IMPLEMENT and a
105
+ # bounded Codex pair-JUDGE during VERIFY. The full-pipeline gate still
106
+ # enforces wall-time efficiency by pair/solo ratio; this budget prevents a
107
+ # false timeout before the mandatory second judge can emit its contract line.
108
+ TIMEOUT=$((TIMEOUT + 600))
109
+ fi
81
110
 
82
111
  RESULT_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE/$ARM"
83
112
  mkdir -p "$RESULT_DIR"
@@ -104,7 +133,7 @@ cp -R "$BENCH_ROOT/fixtures/test-repo" "$WORK_DIR"
104
133
  # while variant uses --engine auto (Codex IMPLEMENT). Pair-mode in
105
134
  # /devlyn:resolve VERIFY phase pulls Codex via the OTHER-engine rule.
106
135
  if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
107
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
136
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
108
137
  mkdir -p "$WORK_DIR/.claude"
109
138
  if [ -d "$REPO_ROOT/.claude/skills" ]; then
110
139
  cp -R "$REPO_ROOT/.claude/skills" "$WORK_DIR/.claude/skills"
@@ -164,11 +193,13 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
164
193
  ARM_CODEX_BLOCKED=0
165
194
  fi
166
195
  python3 - "$WORK_DIR/.claude/settings.json" \
167
- "$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" <<'PY'
196
+ "$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" "$ARM" <<'PY'
168
197
  import json, sys
169
- out_path, path_val, real_bin, monitored, codex_blocked = sys.argv[1:6]
198
+ out_path, path_val, real_bin, monitored, codex_blocked, arm = sys.argv[1:7]
170
199
  env = {
171
200
  "CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1",
201
+ "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
202
+ "DISABLE_AUTOUPDATER": "1",
172
203
  "PATH": path_val,
173
204
  }
174
205
  if codex_blocked == "1":
@@ -182,6 +213,10 @@ else:
182
213
  # BUILD; both vars are required by the shim/wrapper handshake.
183
214
  env["CODEX_REAL_BIN"] = real_bin
184
215
  env["CODEX_MONITORED_PATH"] = monitored
216
+ if arm == "l2_risk_probes":
217
+ # Risk-probe derivation is a bounded contract-conversion step. A long
218
+ # Codex run is a harness failure, not useful extra quality signal.
219
+ env["CODEX_MONITORED_TIMEOUT_SEC"] = "300"
185
220
  data = {"env": env}
186
221
  with open(out_path, "w") as f:
187
222
  json.dump(data, f, indent=2)
@@ -231,22 +266,25 @@ if [ -f "$SETUP" ] && [ -s "$SETUP" ]; then
231
266
  fi
232
267
  fi
233
268
 
234
- # iter-0019.6: stage normalized .devlyn/spec-verify.json containing ONLY
235
- # verification_commands from expected.json (no tier_a_waivers, no
236
- # forbidden_patterns, no scope oracles — those have separate enforcement
237
- # layers). BUILD_GATE's spec-verify-check.py reads this generic path so
238
- # the orchestrator stays benchmark-agnostic; future /devlyn:ideate could
239
- # generate the same shape from a spec.md "## Verification" section for
240
- # real-user runs (Codex R5, 2026-04-28). This stages all 3 arms — bare's
241
- # .devlyn/ is created lazily by spec-verify-check.py if absent.
269
+ # iter-0019.6: stage normalized .devlyn/spec-verify.json for BUILD_GATE.
270
+ # Only commands safe to reveal before IMPLEMENT may be staged here. Commands
271
+ # that reference BENCH_FIXTURE_DIR are hidden post-run oracles; staging their
272
+ # path leaks verifier names into the arm and lets agents search for answer-key
273
+ # files. Those commands still run in the post-run verifier below.
242
274
  if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
243
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
275
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
244
276
  python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" <<'PY'
245
277
  import json, os, sys
246
278
  expected = json.load(open(sys.argv[1]))
247
279
  out_path = sys.argv[2]
248
- normalized = {"verification_commands": expected.get("verification_commands", [])}
280
+ visible_commands = [
281
+ cmd for cmd in expected.get("verification_commands", [])
282
+ if "BENCH_FIXTURE_DIR" not in str(cmd.get("cmd", ""))
283
+ ]
284
+ normalized = {"verification_commands": visible_commands}
249
285
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
286
+ if not visible_commands:
287
+ raise SystemExit(0)
250
288
  with open(out_path, "w") as f:
251
289
  json.dump(normalized, f, indent=2)
252
290
  f.write("\n")
@@ -270,7 +308,7 @@ PROMPT_FILE="$RESULT_DIR/input.md"
270
308
  # arms pass the engine flag explicitly so they survive future runtime-default
271
309
  # changes (post iter-0020 close-out: default flipped to claude).
272
310
  if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
273
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
311
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
274
312
  case "$ARM" in
275
313
  solo_claude)
276
314
  ENGINE_CLAUSE="--engine claude"
@@ -281,13 +319,22 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
281
319
  ENGINE_PROMPT_HINT="Run with \`--engine auto\` so the experimental dual-engine routing fires (Codex BUILD/FIX, Claude EVAL/CRITIC) — do not override it."
282
320
  ;;
283
321
  l2_gated)
284
- # iter-0033c: NEW L2 with natural pair-mode triggers. Claude does
285
- # IMPLEMENT; pair-JUDGE in VERIFY fires only on coverage_failed OR
286
- # MECHANICAL warning per /devlyn:resolve PHASE 5. Codex remains
287
- # available as the OTHER-engine pair-JUDGE candidate.
322
+ # NEW L2 with natural pair-mode triggers. Claude does IMPLEMENT;
323
+ # pair-JUDGE in VERIFY fires per /devlyn:resolve PHASE 5 policy
324
+ # (high complexity, coverage_failed, or warning-level mechanical
325
+ # findings; never after HIGH/CRITICAL mechanical blockers). Codex
326
+ # remains available as the OTHER-engine pair-JUDGE candidate.
288
327
  ENGINE_CLAUSE="--engine claude"
289
328
  ENGINE_PROMPT_HINT="Run with \`--engine claude\` and let the orchestrator's pair-mode (VERIFY) trigger naturally per its policy. Codex is available as the OTHER-engine pair-JUDGE — the harness has not blocked it. Do NOT pass \`--pair-verify\`; this arm measures gated triggering."
290
329
  ;;
330
+ l2_risk_probes)
331
+ # NEW L2 probe-derive arm. Claude plans/implements; Codex is used before
332
+ # IMPLEMENT only to derive bounded executable probes from visible
333
+ # Verification bullets. BUILD_GATE and VERIFY execute those probes
334
+ # mechanically via spec-verify-check.py.
335
+ ENGINE_CLAUSE="--engine claude --risk-probes"
336
+ ENGINE_PROMPT_HINT="Run with \`--engine claude --risk-probes\`. Codex is available as the OTHER-engine probe derivation and pair-JUDGE engine. The probe phase may only derive executable checks from visible \`## Verification\` text; it must not read hidden fixture/verifier paths."
337
+ ;;
291
338
  l2_forced)
292
339
  # iter-0033c: NEW L2 forced — pair-JUDGE always fires. Diagnostic arm
293
340
  # for Gate 6 fixture-level cross-check + Gate 7 attribution causality.
@@ -414,12 +461,17 @@ else
414
461
  # natural exit at or past the budget is no longer mislabeled as timeout.
415
462
  #
416
463
  # MCP/config isolation (iter 0004). The harness's `claude -p` subprocess
417
- # must not load the operator's user-level MCP plugins (pencil, codex-cli,
418
- # telegram, vercel, ...). Project policy is "MCP is not in the loop"; loading
419
- # user MCP inside the variant arm is uncontrolled environment leaking into
420
- # the experiment, and it is the most plausible cause of the F7 0-byte-
421
- # transcript hang. `--strict-mcp-config` + an empty `mcpServers` object
422
- # forces a hermetic subprocess. Skills still resolve via `/skill-name`.
464
+ # must not load the operator's user-level MCP/plugins/settings (pencil,
465
+ # codex-cli, telegram, vercel, ...). Project policy is "MCP/plugins are not in
466
+ # the loop"; loading user config inside the arm is uncontrolled environment
467
+ # leaking into the experiment. `--setting-sources project,local` keeps user
468
+ # plugin enablement out of the run but Claude Code still reads the installed
469
+ # plugin registry for autoupdate. Official Claude Code settings document
470
+ # `DISABLE_AUTOUPDATER=1` / `CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1` as the
471
+ # supported way to disable that background traffic, while preserving OAuth
472
+ # auth from the real HOME. `--strict-mcp-config` + an empty `mcpServers` object
473
+ # forces a hermetic MCP set. Skills still resolve via the project
474
+ # `.claude/skills` staged into the worktree.
423
475
  # `--debug-file` records per-arm init/runtime so the next hang has a
424
476
  # location, not a guess.
425
477
  TIMEOUT_FLAG="$RESULT_DIR/.timed_out"
@@ -436,7 +488,7 @@ else
436
488
  # PATH — they route Claude IMPLEMENT but Codex pair-JUDGE in VERIFY hits
437
489
  # `codex exec` through the wrapper for starvation safety.
438
490
  if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
439
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } \
491
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; } \
440
492
  && [ -x "$WORK_DIR/.devlyn-bin/codex" ]; then
441
493
  export PATH="$WORK_DIR/.devlyn-bin:$PATH"
442
494
  [ "$ARM" = "solo_claude" ] && export CODEX_BLOCKED=1
@@ -447,10 +499,19 @@ else
447
499
  # what the post-run verifier (run-fixture.sh:431-434) sets so the gate
448
500
  # sees the same environment shape.
449
501
  export BENCH_WORKDIR="$WORK_DIR"
502
+ # Python helper scripts run inside the benchmark worktree. Do not let them
503
+ # rewrite tracked __pycache__ artifacts and pollute the arm-only diff.
504
+ export PYTHONDONTWRITEBYTECODE=1
505
+ # Official Claude Code setting: disable background plugin/autoupdate traffic
506
+ # before process startup. Project settings env is not early enough for all
507
+ # startup paths.
508
+ export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1
509
+ export DISABLE_AUTOUPDATER=1
450
510
  exec claude \
451
511
  -p "$(cat "$PROMPT_FILE")" \
452
512
  --dangerously-skip-permissions \
453
513
  --effort xhigh \
514
+ --setting-sources project,local \
454
515
  --strict-mcp-config \
455
516
  --mcp-config '{"mcpServers":{}}' \
456
517
  --debug-file "$RESULT_DIR/claude-debug.log"
@@ -459,13 +520,21 @@ else
459
520
  set +m
460
521
 
461
522
  (
462
- sleep "$TIMEOUT"
463
- if kill -0 "$CHILD_PID" 2>/dev/null; then
464
- : > "$TIMEOUT_FLAG"
465
- kill -TERM -- "-$CHILD_PID" 2>/dev/null
466
- sleep 5
467
- kill -KILL -- "-$CHILD_PID" 2>/dev/null
468
- fi
523
+ deadline=$((T_START + TIMEOUT))
524
+ while kill -0 "$CHILD_PID" 2>/dev/null; do
525
+ now=$(date +%s)
526
+ if [ "$now" -ge "$deadline" ]; then
527
+ : > "$TIMEOUT_FLAG"
528
+ kill -TERM -- "-$CHILD_PID" 2>/dev/null
529
+ kill_worktree_processes "$WORK_DIR" TERM
530
+ sleep 5
531
+ kill -KILL -- "-$CHILD_PID" 2>/dev/null
532
+ kill_worktree_processes "$WORK_DIR" KILL
533
+ exit 0
534
+ fi
535
+ remaining=$((deadline - now))
536
+ [ "$remaining" -gt 30 ] && sleep 30 || sleep "$remaining"
537
+ done
469
538
  ) &
470
539
  WATCHDOG_PID=$!
471
540
 
@@ -479,7 +548,16 @@ else
479
548
  INVOKE_EXIT=124
480
549
  WATCHDOG_FIRED=1
481
550
  rm -f "$TIMEOUT_FLAG"
551
+ kill_worktree_processes "$WORK_DIR" TERM
552
+ sleep 1
553
+ kill_worktree_processes "$WORK_DIR" KILL
482
554
  echo "[run-fixture] arm timed out after ${TIMEOUT}s — INVOKE_EXIT=124" >&2
555
+ else
556
+ # A clean `claude -p` exit can still leave OTHER-engine pair-JUDGE
557
+ # descendants alive; reap any process group rooted in this arm worktree.
558
+ kill_worktree_processes "$WORK_DIR" TERM
559
+ sleep 1
560
+ kill_worktree_processes "$WORK_DIR" KILL
483
561
  fi
484
562
  set -e
485
563
  fi
@@ -487,6 +565,25 @@ fi
487
565
  T_END=$(date +%s)
488
566
  ELAPSED=$((T_END - T_START))
489
567
 
568
+ # Restore tracked Python bytecode to the scaffold commit and remove only
569
+ # untracked bytecode. Helper invocations must not count as model work, but
570
+ # deleting tracked scaffold files would also pollute changed-files.txt.
571
+ (cd "$WORK_DIR" \
572
+ && git restore --source "$SCAFFOLD_SHA" -- .claude/skills/_shared/__pycache__ 2>/dev/null || true)
573
+ cleanup_roots=()
574
+ [ -d "$WORK_DIR/.claude" ] && cleanup_roots+=("$WORK_DIR/.claude")
575
+ [ -d "$WORK_DIR/.devlyn" ] && cleanup_roots+=("$WORK_DIR/.devlyn")
576
+ if [ ${#cleanup_roots[@]} -gt 0 ]; then
577
+ find "${cleanup_roots[@]}" -type f \( -name '*.pyc' -o -name '*.pyo' \) -print0 \
578
+ | while IFS= read -r -d '' py_file; do
579
+ rel="${py_file#$WORK_DIR/}"
580
+ if ! (cd "$WORK_DIR" && git ls-files --error-unmatch "$rel" >/dev/null 2>&1); then
581
+ rm -f "$py_file"
582
+ fi
583
+ done
584
+ find "${cleanup_roots[@]}" -type d -name __pycache__ -empty -delete || true
585
+ fi
586
+
490
587
  # Capture the ARM-ONLY diff against the scaffold commit. Variant's
491
588
  # auto-resolve pipeline commits internally after each phase, so diffing
492
589
  # against HEAD would miss committed work. Diffing against SCAFFOLD_SHA after
@@ -498,8 +595,7 @@ ELAPSED=$((T_END - T_START))
498
595
  (cd "$WORK_DIR" \
499
596
  && git diff "$SCAFFOLD_SHA" --name-only) > "$RESULT_DIR/changed-files.txt" 2>&1 || true
500
597
 
501
- # Deterministic oracles (step 1+ of the benchmark-extension plan).
502
- # Findings-only at this stage; scoring integration is step 5.
598
+ # Deterministic oracles. Hard/flag findings are merged into verify.json below.
503
599
  python3 "$BENCH_ROOT/scripts/oracle-test-fidelity.py" \
504
600
  --work "$WORK_DIR" --scaffold "$SCAFFOLD_SHA" \
505
601
  > "$RESULT_DIR/oracle-test-fidelity.json" 2>/dev/null || \
@@ -518,6 +614,41 @@ python3 "$BENCH_ROOT/scripts/oracle-scope-tier-b.py" \
518
614
  echo '{"oracle":"scope-tier-b","findings":[],"error":"oracle invocation failed"}' \
519
615
  > "$RESULT_DIR/oracle-scope-tier-b.json"
520
616
 
617
+ if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
618
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ]; } \
619
+ && [ -f "$WORK_DIR/.devlyn/pipeline.state.json" ] \
620
+ && [ -f "$WORK_DIR/.claude/skills/_shared/verify-merge-findings.py" ]; then
621
+ if [ -f "$WORK_DIR/.devlyn/codex-judge.stdout" ] \
622
+ && [ -f "$WORK_DIR/.claude/skills/_shared/collect-codex-findings.py" ]; then
623
+ if ! python3 "$WORK_DIR/.claude/skills/_shared/collect-codex-findings.py" \
624
+ --devlyn-dir "$WORK_DIR/.devlyn" \
625
+ > "$RESULT_DIR/collect-codex-findings.log" 2>&1; then
626
+ echo "[run-fixture] Codex pair findings collection failed; see $RESULT_DIR/collect-codex-findings.log" >&2
627
+ fi
628
+ fi
629
+ if ! python3 "$WORK_DIR/.claude/skills/_shared/verify-merge-findings.py" \
630
+ --devlyn-dir "$WORK_DIR/.devlyn" --write-state \
631
+ > "$RESULT_DIR/verify-merge-normalize.log" 2>&1; then
632
+ echo "[run-fixture] verify merge normalization failed; see $RESULT_DIR/verify-merge-normalize.log" >&2
633
+ fi
634
+ fi
635
+
636
+ if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
637
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ]; } && [ -d "$WORK_DIR/.devlyn" ]; then
638
+ run_dir=$(find "$WORK_DIR/.devlyn/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -1 || true)
639
+ if [ -n "$run_dir" ]; then
640
+ rm -rf "$RESULT_DIR/run-archive"
641
+ cp -R "$run_dir" "$RESULT_DIR/run-archive"
642
+ [ -f "$RESULT_DIR/run-archive/pipeline.state.json" ] \
643
+ || [ ! -f "$WORK_DIR/.devlyn/pipeline.state.json" ] \
644
+ || cp "$WORK_DIR/.devlyn/pipeline.state.json" "$RESULT_DIR/run-archive/pipeline.state.json"
645
+ else
646
+ rm -rf "$RESULT_DIR/run-archive"
647
+ mkdir -p "$RESULT_DIR/run-archive"
648
+ find "$WORK_DIR/.devlyn" -maxdepth 1 -type f -exec cp {} "$RESULT_DIR/run-archive/" \;
649
+ fi
650
+ fi
651
+
521
652
  # Run verification commands + forbidden pattern scan + deps check. Uses
522
653
  # the operator's real HOME (same as the arm saw). Fixtures that need HOME
523
654
  # isolation override it inline per verification command.
@@ -532,10 +663,14 @@ verify_env = os.environ.copy()
532
663
  # Expose the work-dir path so fixtures whose verification needs to reference
533
664
  # the work root can do so portably (e.g. F9's out-of-repo check).
534
665
  verify_env["BENCH_WORKDIR"] = work
666
+ # Hidden benchmark verifiers live in the fixture directory, outside the arm's
667
+ # work tree. This keeps oracle code from becoming implementation context.
668
+ verify_env["BENCH_FIXTURE_DIR"] = os.path.dirname(os.path.abspath(sys.argv[1]))
535
669
 
536
670
  verify = {"commands": [], "forbidden_pattern_hits": [], "deps_added": 0,
537
671
  "max_deps_added": expected.get("max_deps_added", 0),
538
- "missing_required_files": [], "forbidden_files_present": []}
672
+ "missing_required_files": [], "forbidden_files_present": [],
673
+ "oracle_findings": [], "oracle_disqualifier": False}
539
674
 
540
675
  for vc in expected.get("verification_commands", []):
541
676
  try:
@@ -631,11 +766,29 @@ verify["commands_passed"] = passed
631
766
  verify["commands_total"] = total
632
767
  verify["verify_score"] = (passed / total) if total else 1.0
633
768
 
769
+ for oracle_file in (
770
+ "oracle-scope-tier-a.json",
771
+ "oracle-scope-tier-b.json",
772
+ "oracle-test-fidelity.json",
773
+ ):
774
+ try:
775
+ data = json.load(open(os.path.join(result_dir, oracle_file)))
776
+ except Exception:
777
+ continue
778
+ oracle_name = data.get("oracle") or oracle_file.removesuffix(".json")
779
+ for finding in data.get("findings", []) or []:
780
+ item = dict(finding)
781
+ item["oracle"] = oracle_name
782
+ verify["oracle_findings"].append(item)
783
+ if item.get("severity") in ("disqualifier", "hard", "flag"):
784
+ verify["oracle_disqualifier"] = True
785
+
634
786
  verify["disqualifier"] = (
635
787
  any(h["severity"] == "disqualifier" for h in verify["forbidden_pattern_hits"])
636
788
  or verify["deps_added"] > verify["max_deps_added"]
637
789
  or bool(verify["missing_required_files"])
638
790
  or bool(verify["forbidden_files_present"])
791
+ or verify["oracle_disqualifier"]
639
792
  )
640
793
 
641
794
  json.dump(verify, open(os.path.join(result_dir, "verify.json"), "w"), indent=2)
@@ -669,11 +822,65 @@ try:
669
822
  except Exception:
670
823
  changed = []
671
824
 
825
+ state = {}
826
+ state_path = os.path.join(result_dir, "run-archive", "pipeline.state.json")
827
+ if os.path.isfile(state_path):
828
+ with open(state_path) as f:
829
+ state = json.load(f)
830
+ verify_phase = (state.get("phases") or {}).get("verify") or {}
831
+ sub_verdicts = verify_phase.get("sub_verdicts")
832
+ pair_trigger = verify_phase.get("pair_trigger") or ((state.get("verify") or {}).get("pair_trigger"))
833
+ pair_mode = bool(
834
+ isinstance(sub_verdicts, dict)
835
+ and (sub_verdicts.get("judge_codex") is not None or sub_verdicts.get("pair_judge") is not None)
836
+ ) or bool(verify_phase.get("pair_mode"))
837
+
838
+ invoke_exit = int(os.environ.get("INVOKE_EXIT", "0"))
839
+ plugin_contamination = False
840
+ plugin_contamination_reason = None
841
+ debug_path = os.path.join(result_dir, "claude-debug.log")
842
+ try:
843
+ with open(debug_path, errors="replace") as f:
844
+ debug_text = f.read()
845
+ except OSError:
846
+ debug_text = ""
847
+ if (
848
+ "Plugin autoupdate: checking installed plugins" in debug_text
849
+ or "Caching plugin from source:" in debug_text
850
+ or "Cloned repository from " in debug_text
851
+ or "Successfully cached plugin " in debug_text
852
+ or "Found 8 plugins (8 enabled" in debug_text
853
+ ):
854
+ if "Plugin autoupdate: skipped (auto-updater disabled)" not in debug_text:
855
+ plugin_contamination = True
856
+ plugin_contamination_reason = "plugin_contamination"
857
+
858
+ invoke_failure = (
859
+ (invoke_exit not in (0,) and not timing["timed_out"])
860
+ or plugin_contamination
861
+ )
862
+ invoke_failure_reason = None
863
+ if plugin_contamination:
864
+ invoke_failure_reason = plugin_contamination_reason
865
+ elif invoke_failure:
866
+ transcript_path = os.path.join(result_dir, "transcript.txt")
867
+ haystack = ""
868
+ for path in (transcript_path, debug_path):
869
+ try:
870
+ with open(path, errors="replace") as f:
871
+ haystack += "\n" + f.read()
872
+ except OSError:
873
+ pass
874
+ if "You've hit your limit" in haystack or "rate_limit_error" in haystack:
875
+ invoke_failure_reason = "provider_limit"
876
+
672
877
  result = {
673
878
  "fixture": fixture,
674
879
  "arm": arm,
675
880
  "run_id": run_id,
676
881
  "disqualifier": verify.get("disqualifier", False),
882
+ "oracle_disqualifier": verify.get("oracle_disqualifier", False),
883
+ "oracle_findings_count": len(verify.get("oracle_findings", [])),
677
884
  "verify_score": verify.get("verify_score", 0.0),
678
885
  "commands_passed": verify.get("commands_passed", 0),
679
886
  "commands_total": verify.get("commands_total", 0),
@@ -681,8 +888,15 @@ result = {
681
888
  "files_changed": len(changed),
682
889
  "elapsed_seconds": elapsed,
683
890
  "timed_out": timing["timed_out"],
684
- "invoke_exit": int(os.environ.get("INVOKE_EXIT", "0")),
685
- "invoke_failure": int(os.environ.get("INVOKE_EXIT", "0")) not in (0,) and not timing["timed_out"],
891
+ "environment_contamination": plugin_contamination,
892
+ "environment_contamination_reason": plugin_contamination_reason,
893
+ "invoke_exit": invoke_exit,
894
+ "invoke_failure": invoke_failure,
895
+ "invoke_failure_reason": invoke_failure_reason,
896
+ "terminal_verdict": ((state.get("phases") or {}).get("final_report") or {}).get("verdict"),
897
+ "verify_verdict": verify_phase.get("verdict"),
898
+ "pair_trigger": pair_trigger,
899
+ "pair_mode": pair_mode,
686
900
  }
687
901
  json.dump(result, open(os.path.join(result_dir, "result.json"), "w"), indent=2)
688
902
  print(json.dumps(result, indent=2))