devlyn-cli 2.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (138) hide show
  1. package/CLAUDE.md +1 -1
  2. package/README.md +1 -1
  3. package/benchmark/auto-resolve/README.md +318 -2
  4. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  12. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  18. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  25. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  31. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  40. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  48. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  56. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  64. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  73. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  82. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  91. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  100. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  101. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  102. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  103. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  104. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  105. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  106. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  107. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  110. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  111. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  112. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  113. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  114. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  116. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  117. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  118. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  119. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  120. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  121. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  122. package/bin/devlyn.js +56 -10
  123. package/config/skills/_shared/archive_run.py +3 -0
  124. package/config/skills/_shared/codex-config.md +2 -2
  125. package/config/skills/_shared/codex-monitored.sh +72 -7
  126. package/config/skills/_shared/collect-codex-findings.py +125 -0
  127. package/config/skills/_shared/engine-preflight.md +1 -1
  128. package/config/skills/_shared/expected.schema.json +18 -0
  129. package/config/skills/_shared/spec-verify-check.py +312 -10
  130. package/config/skills/_shared/verify-merge-findings.py +327 -0
  131. package/config/skills/devlyn:ideate/SKILL.md +1 -1
  132. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  133. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  134. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  135. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  136. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  137. package/package.json +1 -1
  138. package/scripts/lint-skills.sh +32 -0
@@ -0,0 +1,192 @@
1
+ #!/usr/bin/env python3
2
+ """Prepare a SWE-bench instance worktree for producing a candidate patch."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import re
9
+ import shutil
10
+ import subprocess
11
+ from pathlib import Path
12
+ from typing import Any
13
+
14
+
15
+ SAFE_ID = re.compile(r"^[A-Za-z0-9_.-]+$")
16
+
17
+
18
def run(cmd: list[str], cwd: Path | None = None) -> None:
    """Execute ``cmd`` and fail loudly if it exits non-zero.

    Args:
        cmd: Argument vector handed to ``subprocess.run`` (no shell involved).
        cwd: Working directory for the child process; ``None`` inherits the
            caller's current directory.

    Raises:
        subprocess.CalledProcessError: The command returned a non-zero status.
    """
    completed = subprocess.run(cmd, cwd=cwd)
    # Same exception as ``check=True`` would raise, just made explicit.
    completed.check_returncode()
20
+
21
+
22
def read_instances(path: Path) -> list[dict[str, Any]]:
    """Load every non-blank line of a JSONL file as a JSON object.

    Args:
        path: UTF-8 encoded JSON Lines file, one JSON object per line.

    Returns:
        The parsed objects in file order. Blank and whitespace-only lines are
        skipped.

    Raises:
        json.JSONDecodeError: A line is not valid JSON. The message is prefixed
            with ``path:line_no`` so the offending row can be located.
        ValueError: A line parses to something other than a JSON object.
    """
    rows: list[dict[str, Any]] = []
    with path.open(encoding="utf8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                value = json.loads(line)
            except json.JSONDecodeError as err:
                # Keep the exception type (callers may catch it) but add the
                # file/line context that the non-object error below already has.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {err.msg}", err.doc, err.pos
                ) from err
            if not isinstance(value, dict):
                raise ValueError(f"{path}:{line_no}: expected JSON object")
            rows.append(value)
    return rows
33
+
34
+
35
def require_text(instance: dict[str, Any], key: str) -> str:
    """Return the stripped string stored under ``key``.

    Args:
        instance: One SWE-bench instance row.
        key: Field that must hold a non-blank string.

    Returns:
        The field value with surrounding whitespace removed.

    Raises:
        ValueError: The field is absent, not a string, or blank.
    """
    raw = instance.get(key)
    if isinstance(raw, str):
        trimmed = raw.strip()
        if trimmed:
            return trimmed
    raise ValueError(f"SWE-bench instance missing non-empty {key!r}")
40
+
41
+
42
def pick_instance(path: Path, instance_id: str) -> dict[str, Any]:
    """Return the single row of ``path`` whose ``instance_id`` matches.

    Args:
        path: JSONL file of SWE-bench instances.
        instance_id: Identifier to look up.

    Raises:
        ValueError: Zero rows or more than one row carry ``instance_id``.
    """
    found: list[dict[str, Any]] = []
    for row in read_instances(path):
        if row.get("instance_id") == instance_id:
            found.append(row)
    count = len(found)
    if count != 1:
        raise ValueError(f"expected exactly one {instance_id!r} row in {path}, found {count}")
    (only,) = found
    return only
47
+
48
+
49
def repo_cache_name(repo: str, base_commit: str) -> str:
    """Build the cache directory name for a repo pinned at a commit.

    Slashes in ``repo`` become double underscores and the commit is shortened
    to its first 12 characters.
    """
    flattened = "__".join(repo.split("/"))
    short_sha = base_commit[:12]
    return f"{flattened}-{short_sha}"
51
+
52
+
53
def prepare_repo(instance: dict[str, Any], repos_root: Path) -> Path:
    """Ensure a pristine cached clone of the instance's repo at its base commit.

    The clone lives at ``repos_root / repo_cache_name(repo, base_commit)`` and
    is reused across runs. It is created from GitHub on first use.

    Args:
        instance: SWE-bench row providing ``repo`` and ``base_commit``.
        repos_root: Directory holding the cached clones; created if missing.

    Returns:
        Path of the cached clone, checked out at ``base_commit`` with no local
        modifications or untracked/ignored files.

    Raises:
        ValueError: ``repo`` or ``base_commit`` is missing or blank.
        subprocess.CalledProcessError: A git command failed.
    """
    repo = require_text(instance, "repo")
    base_commit = require_text(instance, "base_commit")
    repos_root.mkdir(parents=True, exist_ok=True)
    dest = repos_root / repo_cache_name(repo, base_commit)

    if not dest.exists():
        run(["git", "clone", "--quiet", f"https://github.com/{repo}.git", str(dest)])

    run(["git", "fetch", "--quiet", "--all", "--tags"], cwd=dest)
    # --force: a reused cache left dirty by an interrupted run would otherwise
    # make checkout abort ("local changes would be overwritten") before the
    # reset/clean below get a chance to restore it.
    run(["git", "checkout", "--quiet", "--force", base_commit], cwd=dest)
    run(["git", "reset", "--hard", "--quiet"], cwd=dest)
    run(["git", "clean", "-ffdqx"], cwd=dest)
    return dest
67
+
68
+
69
def copy_worktree(repo_path: Path, worktree: Path) -> None:
    """Replace ``worktree`` with a fresh, independent clone of ``repo_path``.

    Any existing ``worktree`` directory is deleted first. The clone is made
    with ``--no-hardlinks``, then checked out, hard-reset and cleaned.

    Raises:
        subprocess.CalledProcessError: A git command failed.
    """
    if worktree.exists():
        shutil.rmtree(worktree)
    run(["git", "clone", "--quiet", "--no-hardlinks", str(repo_path), str(worktree)])
    tidy_steps = (
        ["checkout", "--quiet", "HEAD"],
        ["reset", "--hard", "--quiet"],
        ["clean", "-ffdqx"],
    )
    for git_args in tidy_steps:
        run(["git", *git_args], cwd=worktree)
76
+
77
+
78
+ def write_spec(instance: dict[str, Any], worktree: Path) -> Path:
79
+ instance_id = require_text(instance, "instance_id")
80
+ repo = require_text(instance, "repo")
81
+ base_commit = require_text(instance, "base_commit")
82
+ problem = require_text(instance, "problem_statement")
83
+ spec_path = worktree / "docs" / "roadmap" / "phase-1" / f"{instance_id}.md"
84
+ spec_path.parent.mkdir(parents=True, exist_ok=True)
85
+ spec_path.write_text(
86
+ f"""---
87
+ id: "{instance_id}"
88
+ title: "SWE-bench {instance_id}"
89
+ status: planned
90
+ complexity: high
91
+ depends-on: []
92
+ ---
93
+
94
+ # SWE-bench {instance_id}
95
+
96
+ Repository: `{repo}`
97
+ Base commit: `{base_commit}`
98
+
99
+ ## Requirements
100
+
101
+ - [ ] Resolve the issue described in the problem statement.
102
+ - [ ] Preserve existing behavior outside the issue's scope.
103
+ - [ ] Keep the implementation consistent with the repository's local style and
104
+ dependency policy.
105
+ - [ ] Add focused regression coverage when practical.
106
+
107
+ ## Problem Statement
108
+
109
+ {problem}
110
+
111
+ ## Constraints
112
+
113
+ - Do not inspect or rely on the SWE-bench gold `patch` or `test_patch` fields.
114
+ - Do not add broad rewrites, unrelated formatting churn, or new dependencies
115
+ unless the visible problem statement strictly requires them.
116
+
117
+ ## Verification
118
+
119
+ - Run the most focused practical verification for the changed behavior.
120
+ """,
121
+ encoding="utf8",
122
+ )
123
+ return spec_path
124
+
125
+
126
def copy_devlyn_context(worktree: Path) -> None:
    """Copy the devlyn skills tree and ``CLAUDE.md`` into ``worktree``.

    Both sources are looked up relative to the current working directory
    (``config/skills`` and ``CLAUDE.md``); a source that does not exist is
    skipped. An existing ``.claude/skills`` directory in the worktree is
    removed before the copy so the result mirrors the source exactly.
    """
    skills_source = Path("config/skills")
    claude_source = Path("CLAUDE.md")

    if skills_source.exists():
        skills_target = worktree / ".claude" / "skills"
        if skills_target.exists():
            shutil.rmtree(skills_target)
        shutil.copytree(skills_source, skills_target)

    if claude_source.exists():
        shutil.copy2(claude_source, worktree / "CLAUDE.md")
136
+
137
+
138
def main() -> int:
    """Prepare one SWE-bench instance worktree and print a JSON summary.

    Parses the command line, selects the requested instance, refreshes the
    cached repo clone, clones it into a per-instance worktree, writes the spec
    file and ``solve-prompt.txt`` there, and optionally copies the devlyn
    skills/``CLAUDE.md`` context.

    Returns:
        ``0`` on success.

    Raises:
        ValueError: The instance is missing/duplicated, lacks a required field,
            or its ``instance_id`` contains characters outside ``SAFE_ID``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--instances-jsonl", required=True, type=Path)
    parser.add_argument("--instance-id", required=True)
    parser.add_argument(
        "--repos-root",
        type=Path,
        default=Path("benchmark/auto-resolve/external/swebench/repos-solver"),
    )
    parser.add_argument(
        "--worktrees-root",
        type=Path,
        default=Path("benchmark/auto-resolve/external/swebench/worktrees"),
    )
    parser.add_argument("--copy-devlyn-context", action="store_true")
    args = parser.parse_args()

    instance = pick_instance(args.instances_jsonl, args.instance_id)
    instance_id = require_text(instance, "instance_id")
    # The id becomes a directory name and a spec file name, so reject anything
    # that is not a plain identifier before touching the filesystem.
    if SAFE_ID.match(instance_id) is None:
        raise ValueError(f"unsafe instance_id for path/spec use: {instance_id!r}")

    repo_path = prepare_repo(instance, args.repos_root)
    worktree = args.worktrees_root / instance_id
    args.worktrees_root.mkdir(parents=True, exist_ok=True)
    copy_worktree(repo_path, worktree)
    spec_path = write_spec(instance, worktree)
    if args.copy_devlyn_context:
        copy_devlyn_context(worktree)

    relative_spec = spec_path.relative_to(worktree)
    prompt = (
        f"You are solving SWE-bench instance {instance_id} in this checked-out repository at "
        "the base commit. Do not inspect any gold SWE-bench patch or test_patch. Read the "
        f"local code and the spec at {relative_spec}. Make the smallest "
        "correct source/test change for the visible issue. Run a focused verification "
        "command. At the end, report changed files, verification command, and verdict."
    )
    prompt_file = worktree / "solve-prompt.txt"
    prompt_file.write_text(prompt + "\n", encoding="utf8")

    summary = {
        "instance_id": instance_id,
        "repo_dir": str(repo_path),
        "worktree": str(worktree),
        "spec_path": str(spec_path),
        "prompt_file": str(prompt_file),
    }
    print(json.dumps(summary, indent=2))
    return 0
189
+
190
+
191
+ if __name__ == "__main__":
192
+ raise SystemExit(main())
@@ -15,10 +15,27 @@
15
15
  set -euo pipefail
16
16
 
17
17
  usage() {
18
- echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]"
18
+ echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]"
19
19
  exit 1
20
20
  }
21
21
 
22
# Send a signal to every process group that has a process whose command line
# mentions the given worktree directory (as passed, or its physical path).
#   $1  worktree directory
#   $2  signal name or number, without the leading dash (e.g. TERM, KILL)
# This shell and every process sharing its process group are never signalled.
# Failures from kill (e.g. the group already exited) are ignored.
kill_worktree_processes() {
  local target_dir="$1"
  local sig="$2"
  local resolved_dir own_pgid
  # Fall back to the literal argument when the directory cannot be entered.
  resolved_dir="$(cd "$target_dir" 2>/dev/null && pwd -P || printf '%s' "$target_dir")"
  own_pgid="$(ps -o pgid= -p "$$" | tr -d ' ')"
  ps -axo pid=,pgid=,command= \
    | awk -v p1="$target_dir" -v p2="$resolved_dir" -v self="$$" -v current_pgid="$own_pgid" '
        $1 == self || $2 == current_pgid { next }
        index($0, p1) || index($0, p2) { print $2 }
      ' \
    | sort -u \
    | while IFS= read -r group; do
        if [ -n "$group" ]; then
          kill "-$sig" -- "-$group" 2>/dev/null || true
        fi
      done
}
38
+
22
39
  FIXTURE=""; ARM=""; RUN_ID=""; DRY_RUN=0
23
40
  RESOLVE_SKILL="new"
24
41
  while [ $# -gt 0 ]; do
@@ -35,18 +52,23 @@ done
35
52
  # iter-0019: original 3 arms — variant (L2-old: Claude orchestrator + Codex BUILD pair via --engine auto),
36
53
  # solo_claude (L1: Claude orchestrator, codex blocked by shim+wrapper enforcement),
37
54
  # bare (L0: direct claude -p, no skill, no codex).
38
- # iter-0033c (Codex R0-infra adoption, 2026-05-02): two new arms for NEW L2 measurement on /devlyn:resolve —
55
+ # iter-0033c (Codex R0-infra adoption, 2026-05-02): two L2 diagnostic arms for /devlyn:resolve —
39
56
  # l2_gated (--engine claude, no --pair-verify; pair fires only on natural triggers),
40
- # l2_forced (--engine claude --pair-verify; diagnostic). Both require --resolve-skill new.
57
+ # l2_risk_probes (--engine claude --risk-probes; pair converts visible Verification bullets to executable probes before IMPLEMENT),
58
+ # l2_forced (--engine claude --pair-verify; retired because it leaks pair-awareness before IMPLEMENT).
41
59
  [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] || [ "$ARM" = "bare" ] \
42
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ] || \
43
- { echo "arm must be variant|solo_claude|bare|l2_gated|l2_forced"; exit 1; }
60
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ] || \
61
+ { echo "arm must be variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced"; exit 1; }
44
62
  # iter-0033c (Codex R0-infra Q2): l2_* arms require NEW skill surface (only NEW
45
63
  # `/devlyn:resolve` honors --pair-verify; OLD `/devlyn:auto-resolve` would silently
46
64
  # ignore the flag and produce mis-attributed L2 numbers).
47
- if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
65
+ if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
48
66
  echo "l2_* arms require --resolve-skill new (got '$RESOLVE_SKILL')"; exit 1
49
67
  fi
68
+ if [ "$ARM" = "l2_forced" ]; then
69
+ echo "l2_forced is retired: it puts --pair-verify in the initial prompt, so IMPLEMENT can become pair-aware before the diff is frozen. Use scripts/run-frozen-verify-pair.sh for leak-free VERIFY-pair measurement." >&2
70
+ exit 1
71
+ fi
50
72
  # iter-0034 Phase 4 cutover (2026-05-03): OLD `/devlyn:auto-resolve` was
51
73
  # deleted. Only `new` (= /devlyn:resolve --spec) is supported. The flag stays
52
74
  # an accepted no-op so historical runners (run-iter-0033c.sh:137) keep working
@@ -78,6 +100,13 @@ for f in "$META" "$EXPECTED" "$SPEC" "$TASK"; do
78
100
  done
79
101
 
80
102
  TIMEOUT=$(python3 -c "import json; print(json.load(open('$META'))['timeout_seconds'])")
103
+ if [ "$ARM" = "l2_risk_probes" ]; then
104
+ # This arm adds a bounded Codex probe-derive phase before IMPLEMENT and a
105
+ # bounded Codex pair-JUDGE during VERIFY. The full-pipeline gate still
106
+ # enforces wall-time efficiency by pair/solo ratio; this budget prevents a
107
+ # false timeout before the mandatory second judge can emit its contract line.
108
+ TIMEOUT=$((TIMEOUT + 600))
109
+ fi
81
110
 
82
111
  RESULT_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE/$ARM"
83
112
  mkdir -p "$RESULT_DIR"
@@ -104,7 +133,7 @@ cp -R "$BENCH_ROOT/fixtures/test-repo" "$WORK_DIR"
104
133
  # while variant uses --engine auto (Codex IMPLEMENT). Pair-mode in
105
134
  # /devlyn:resolve VERIFY phase pulls Codex via the OTHER-engine rule.
106
135
  if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
107
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
136
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
108
137
  mkdir -p "$WORK_DIR/.claude"
109
138
  if [ -d "$REPO_ROOT/.claude/skills" ]; then
110
139
  cp -R "$REPO_ROOT/.claude/skills" "$WORK_DIR/.claude/skills"
@@ -164,11 +193,13 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
164
193
  ARM_CODEX_BLOCKED=0
165
194
  fi
166
195
  python3 - "$WORK_DIR/.claude/settings.json" \
167
- "$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" <<'PY'
196
+ "$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" "$ARM" <<'PY'
168
197
  import json, sys
169
- out_path, path_val, real_bin, monitored, codex_blocked = sys.argv[1:6]
198
+ out_path, path_val, real_bin, monitored, codex_blocked, arm = sys.argv[1:7]
170
199
  env = {
171
200
  "CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1",
201
+ "CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
202
+ "DISABLE_AUTOUPDATER": "1",
172
203
  "PATH": path_val,
173
204
  }
174
205
  if codex_blocked == "1":
@@ -182,6 +213,10 @@ else:
182
213
  # BUILD; both vars are required by the shim/wrapper handshake.
183
214
  env["CODEX_REAL_BIN"] = real_bin
184
215
  env["CODEX_MONITORED_PATH"] = monitored
216
+ if arm == "l2_risk_probes":
217
+ # Risk-probe derivation is a bounded contract-conversion step. A long
218
+ # Codex run is a harness failure, not useful extra quality signal.
219
+ env["CODEX_MONITORED_TIMEOUT_SEC"] = "300"
185
220
  data = {"env": env}
186
221
  with open(out_path, "w") as f:
187
222
  json.dump(data, f, indent=2)
@@ -231,22 +266,25 @@ if [ -f "$SETUP" ] && [ -s "$SETUP" ]; then
231
266
  fi
232
267
  fi
233
268
 
234
- # iter-0019.6: stage normalized .devlyn/spec-verify.json containing ONLY
235
- # verification_commands from expected.json (no tier_a_waivers, no
236
- # forbidden_patterns, no scope oracles — those have separate enforcement
237
- # layers). BUILD_GATE's spec-verify-check.py reads this generic path so
238
- # the orchestrator stays benchmark-agnostic; future /devlyn:ideate could
239
- # generate the same shape from a spec.md "## Verification" section for
240
- # real-user runs (Codex R5, 2026-04-28). This stages all 3 arms — bare's
241
- # .devlyn/ is created lazily by spec-verify-check.py if absent.
269
+ # iter-0019.6: stage normalized .devlyn/spec-verify.json for BUILD_GATE.
270
+ # Only commands safe to reveal before IMPLEMENT may be staged here. Commands
271
+ # that reference BENCH_FIXTURE_DIR are hidden post-run oracles; staging their
272
+ # path leaks verifier names into the arm and lets agents search for answer-key
273
+ # files. Those commands still run in the post-run verifier below.
242
274
  if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
243
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
275
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
244
276
  python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" <<'PY'
245
277
  import json, os, sys
246
278
  expected = json.load(open(sys.argv[1]))
247
279
  out_path = sys.argv[2]
248
- normalized = {"verification_commands": expected.get("verification_commands", [])}
280
+ visible_commands = [
281
+ cmd for cmd in expected.get("verification_commands", [])
282
+ if "BENCH_FIXTURE_DIR" not in str(cmd.get("cmd", ""))
283
+ ]
284
+ normalized = {"verification_commands": visible_commands}
249
285
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
286
+ if not visible_commands:
287
+ raise SystemExit(0)
250
288
  with open(out_path, "w") as f:
251
289
  json.dump(normalized, f, indent=2)
252
290
  f.write("\n")
@@ -270,7 +308,7 @@ PROMPT_FILE="$RESULT_DIR/input.md"
270
308
  # arms pass the engine flag explicitly so they survive future runtime-default
271
309
  # changes (post iter-0020 close-out: default flipped to claude).
272
310
  if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
273
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
311
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
274
312
  case "$ARM" in
275
313
  solo_claude)
276
314
  ENGINE_CLAUSE="--engine claude"
@@ -281,13 +319,22 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
281
319
  ENGINE_PROMPT_HINT="Run with \`--engine auto\` so the experimental dual-engine routing fires (Codex BUILD/FIX, Claude EVAL/CRITIC) — do not override it."
282
320
  ;;
283
321
  l2_gated)
284
- # iter-0033c: NEW L2 with natural pair-mode triggers. Claude does
285
- # IMPLEMENT; pair-JUDGE in VERIFY fires only on coverage_failed OR
286
- # MECHANICAL warning per /devlyn:resolve PHASE 5. Codex remains
287
- # available as the OTHER-engine pair-JUDGE candidate.
322
+ # NEW L2 with natural pair-mode triggers. Claude does IMPLEMENT;
323
+ # pair-JUDGE in VERIFY fires per /devlyn:resolve PHASE 5 policy
324
+ # (high complexity, coverage_failed, or warning-level mechanical
325
+ # findings; never after HIGH/CRITICAL mechanical blockers). Codex
326
+ # remains available as the OTHER-engine pair-JUDGE candidate.
288
327
  ENGINE_CLAUSE="--engine claude"
289
328
  ENGINE_PROMPT_HINT="Run with \`--engine claude\` and let the orchestrator's pair-mode (VERIFY) trigger naturally per its policy. Codex is available as the OTHER-engine pair-JUDGE — the harness has not blocked it. Do NOT pass \`--pair-verify\`; this arm measures gated triggering."
290
329
  ;;
330
+ l2_risk_probes)
331
+ # NEW L2 probe-derive arm. Claude plans/implements; Codex is used before
332
+ # IMPLEMENT only to derive bounded executable probes from visible
333
+ # Verification bullets. BUILD_GATE and VERIFY execute those probes
334
+ # mechanically via spec-verify-check.py.
335
+ ENGINE_CLAUSE="--engine claude --risk-probes"
336
+ ENGINE_PROMPT_HINT="Run with \`--engine claude --risk-probes\`. Codex is available as the OTHER-engine probe derivation and pair-JUDGE engine. The probe phase may only derive executable checks from visible \`## Verification\` text; it must not read hidden fixture/verifier paths."
337
+ ;;
291
338
  l2_forced)
292
339
  # iter-0033c: NEW L2 forced — pair-JUDGE always fires. Diagnostic arm
293
340
  # for Gate 6 fixture-level cross-check + Gate 7 attribution causality.
@@ -414,12 +461,17 @@ else
414
461
  # natural exit at or past the budget is no longer mislabeled as timeout.
415
462
  #
416
463
  # MCP/config isolation (iter 0004). The harness's `claude -p` subprocess
417
- # must not load the operator's user-level MCP plugins (pencil, codex-cli,
418
- # telegram, vercel, …). Project policy is "MCP is not in the loop"; loading
419
- # user MCP inside the variant arm is uncontrolled environment leaking into
420
- # the experiment, and it is the most plausible cause of the F7 0-byte-
421
- # transcript hang. `--strict-mcp-config` + an empty `mcpServers` object
422
- # forces a hermetic subprocess. Skills still resolve via `/skill-name`.
464
+ # must not load the operator's user-level MCP/plugins/settings (pencil,
465
+ # codex-cli, telegram, vercel, ...). Project policy is "MCP/plugins are not in
466
+ # the loop"; loading user config inside the arm is uncontrolled environment
467
+ # leaking into the experiment. `--setting-sources project,local` keeps user
468
+ # plugin enablement out of the run but Claude Code still reads the installed
469
+ # plugin registry for autoupdate. Official Claude Code settings document
470
+ # `DISABLE_AUTOUPDATER=1` / `CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1` as the
471
+ # supported way to disable that background traffic, while preserving OAuth
472
+ # auth from the real HOME. `--strict-mcp-config` + an empty `mcpServers` object
473
+ # forces a hermetic MCP set. Skills still resolve via the project
474
+ # `.claude/skills` staged into the worktree.
423
475
  # `--debug-file` records per-arm init/runtime so the next hang has a
424
476
  # location, not a guess.
425
477
  TIMEOUT_FLAG="$RESULT_DIR/.timed_out"
@@ -436,7 +488,7 @@ else
436
488
  # PATH — they route Claude IMPLEMENT but Codex pair-JUDGE in VERIFY hits
437
489
  # `codex exec` through the wrapper for starvation safety.
438
490
  if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
439
- || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } \
491
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; } \
440
492
  && [ -x "$WORK_DIR/.devlyn-bin/codex" ]; then
441
493
  export PATH="$WORK_DIR/.devlyn-bin:$PATH"
442
494
  [ "$ARM" = "solo_claude" ] && export CODEX_BLOCKED=1
@@ -447,10 +499,19 @@ else
447
499
  # what the post-run verifier (run-fixture.sh:431-434) sets so the gate
448
500
  # sees the same environment shape.
449
501
  export BENCH_WORKDIR="$WORK_DIR"
502
+ # Python helper scripts run inside the benchmark worktree. Do not let them
503
+ # rewrite tracked __pycache__ artifacts and pollute the arm-only diff.
504
+ export PYTHONDONTWRITEBYTECODE=1
505
+ # Official Claude Code setting: disable background plugin/autoupdate traffic
506
+ # before process startup. Env set via project settings is applied too late
507
+ # for some startup paths.
508
+ export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1
509
+ export DISABLE_AUTOUPDATER=1
450
510
  exec claude \
451
511
  -p "$(cat "$PROMPT_FILE")" \
452
512
  --dangerously-skip-permissions \
453
513
  --effort xhigh \
514
+ --setting-sources project,local \
454
515
  --strict-mcp-config \
455
516
  --mcp-config '{"mcpServers":{}}' \
456
517
  --debug-file "$RESULT_DIR/claude-debug.log"
@@ -459,13 +520,21 @@ else
459
520
  set +m
460
521
 
461
522
  (
462
- sleep "$TIMEOUT"
463
- if kill -0 "$CHILD_PID" 2>/dev/null; then
464
- : > "$TIMEOUT_FLAG"
465
- kill -TERM -- "-$CHILD_PID" 2>/dev/null
466
- sleep 5
467
- kill -KILL -- "-$CHILD_PID" 2>/dev/null
468
- fi
523
+ deadline=$((T_START + TIMEOUT))
524
+ while kill -0 "$CHILD_PID" 2>/dev/null; do
525
+ now=$(date +%s)
526
+ if [ "$now" -ge "$deadline" ]; then
527
+ : > "$TIMEOUT_FLAG"
528
+ kill -TERM -- "-$CHILD_PID" 2>/dev/null
529
+ kill_worktree_processes "$WORK_DIR" TERM
530
+ sleep 5
531
+ kill -KILL -- "-$CHILD_PID" 2>/dev/null
532
+ kill_worktree_processes "$WORK_DIR" KILL
533
+ exit 0
534
+ fi
535
+ remaining=$((deadline - now))
536
+ [ "$remaining" -gt 30 ] && sleep 30 || sleep "$remaining"
537
+ done
469
538
  ) &
470
539
  WATCHDOG_PID=$!
471
540
 
@@ -479,7 +548,16 @@ else
479
548
  INVOKE_EXIT=124
480
549
  WATCHDOG_FIRED=1
481
550
  rm -f "$TIMEOUT_FLAG"
551
+ kill_worktree_processes "$WORK_DIR" TERM
552
+ sleep 1
553
+ kill_worktree_processes "$WORK_DIR" KILL
482
554
  echo "[run-fixture] arm timed out after ${TIMEOUT}s — INVOKE_EXIT=124" >&2
555
+ else
556
+ # A clean `claude -p` exit can still leave OTHER-engine pair-JUDGE
557
+ # descendants alive; reap any process group rooted in this arm worktree.
558
+ kill_worktree_processes "$WORK_DIR" TERM
559
+ sleep 1
560
+ kill_worktree_processes "$WORK_DIR" KILL
483
561
  fi
484
562
  set -e
485
563
  fi
@@ -487,6 +565,25 @@ fi
487
565
  T_END=$(date +%s)
488
566
  ELAPSED=$((T_END - T_START))
489
567
 
568
+ # Restore tracked Python bytecode to the scaffold commit and remove only
569
+ # untracked bytecode. Helper invocations must not count as model work, but
570
+ # deleting tracked scaffold files would also pollute changed-files.txt.
571
+ (cd "$WORK_DIR" \
572
+ && git restore --source "$SCAFFOLD_SHA" -- .claude/skills/_shared/__pycache__ 2>/dev/null || true)
573
+ cleanup_roots=()
574
+ [ -d "$WORK_DIR/.claude" ] && cleanup_roots+=("$WORK_DIR/.claude")
575
+ [ -d "$WORK_DIR/.devlyn" ] && cleanup_roots+=("$WORK_DIR/.devlyn")
576
+ if [ ${#cleanup_roots[@]} -gt 0 ]; then
577
+ find "${cleanup_roots[@]}" -type f \( -name '*.pyc' -o -name '*.pyo' \) -print0 \
578
+ | while IFS= read -r -d '' py_file; do
579
+ rel="${py_file#$WORK_DIR/}"
580
+ if ! (cd "$WORK_DIR" && git ls-files --error-unmatch "$rel" >/dev/null 2>&1); then
581
+ rm -f "$py_file"
582
+ fi
583
+ done
584
+ find "${cleanup_roots[@]}" -type d -name __pycache__ -empty -delete || true
585
+ fi
586
+
490
587
  # Capture the ARM-ONLY diff against the scaffold commit. Variant's
491
588
  # auto-resolve pipeline commits internally after each phase, so diffing
492
589
  # against HEAD would miss committed work. Diffing against SCAFFOLD_SHA after
@@ -518,6 +615,41 @@ python3 "$BENCH_ROOT/scripts/oracle-scope-tier-b.py" \
518
615
  echo '{"oracle":"scope-tier-b","findings":[],"error":"oracle invocation failed"}' \
519
616
  > "$RESULT_DIR/oracle-scope-tier-b.json"
520
617
 
618
+ if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
619
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ]; } \
620
+ && [ -f "$WORK_DIR/.devlyn/pipeline.state.json" ] \
621
+ && [ -f "$WORK_DIR/.claude/skills/_shared/verify-merge-findings.py" ]; then
622
+ if [ -f "$WORK_DIR/.devlyn/codex-judge.stdout" ] \
623
+ && [ -f "$WORK_DIR/.claude/skills/_shared/collect-codex-findings.py" ]; then
624
+ if ! python3 "$WORK_DIR/.claude/skills/_shared/collect-codex-findings.py" \
625
+ --devlyn-dir "$WORK_DIR/.devlyn" \
626
+ > "$RESULT_DIR/collect-codex-findings.log" 2>&1; then
627
+ echo "[run-fixture] Codex pair findings collection failed; see $RESULT_DIR/collect-codex-findings.log" >&2
628
+ fi
629
+ fi
630
+ if ! python3 "$WORK_DIR/.claude/skills/_shared/verify-merge-findings.py" \
631
+ --devlyn-dir "$WORK_DIR/.devlyn" --write-state \
632
+ > "$RESULT_DIR/verify-merge-normalize.log" 2>&1; then
633
+ echo "[run-fixture] verify merge normalization failed; see $RESULT_DIR/verify-merge-normalize.log" >&2
634
+ fi
635
+ fi
636
+
637
+ if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
638
+ || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ]; } && [ -d "$WORK_DIR/.devlyn" ]; then
639
+ run_dir=$(find "$WORK_DIR/.devlyn/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -1 || true)
640
+ if [ -n "$run_dir" ]; then
641
+ rm -rf "$RESULT_DIR/run-archive"
642
+ cp -R "$run_dir" "$RESULT_DIR/run-archive"
643
+ [ -f "$RESULT_DIR/run-archive/pipeline.state.json" ] \
644
+ || [ ! -f "$WORK_DIR/.devlyn/pipeline.state.json" ] \
645
+ || cp "$WORK_DIR/.devlyn/pipeline.state.json" "$RESULT_DIR/run-archive/pipeline.state.json"
646
+ else
647
+ rm -rf "$RESULT_DIR/run-archive"
648
+ mkdir -p "$RESULT_DIR/run-archive"
649
+ find "$WORK_DIR/.devlyn" -maxdepth 1 -type f -exec cp {} "$RESULT_DIR/run-archive/" \;
650
+ fi
651
+ fi
652
+
521
653
  # Run verification commands + forbidden pattern scan + deps check. Uses
522
654
  # the operator's real HOME (same as the arm saw). Fixtures that need HOME
523
655
  # isolation override it inline per verification command.
@@ -532,6 +664,9 @@ verify_env = os.environ.copy()
532
664
  # Expose the work-dir path so fixtures whose verification needs to reference
533
665
  # the work root can do so portably (e.g. F9's out-of-repo check).
534
666
  verify_env["BENCH_WORKDIR"] = work
667
+ # Hidden benchmark verifiers live in the fixture directory, outside the arm's
668
+ # work tree. This keeps oracle code from becoming implementation context.
669
+ verify_env["BENCH_FIXTURE_DIR"] = os.path.dirname(os.path.abspath(sys.argv[1]))
535
670
 
536
671
  verify = {"commands": [], "forbidden_pattern_hits": [], "deps_added": 0,
537
672
  "max_deps_added": expected.get("max_deps_added", 0),
@@ -669,6 +804,58 @@ try:
669
804
  except Exception:
670
805
  changed = []
671
806
 
807
+ state = {}
808
+ state_path = os.path.join(result_dir, "run-archive", "pipeline.state.json")
809
+ if os.path.isfile(state_path):
810
+ with open(state_path) as f:
811
+ state = json.load(f)
812
+ verify_phase = (state.get("phases") or {}).get("verify") or {}
813
+ sub_verdicts = verify_phase.get("sub_verdicts")
814
+ pair_trigger = verify_phase.get("pair_trigger") or ((state.get("verify") or {}).get("pair_trigger"))
815
+ pair_mode = bool(
816
+ isinstance(sub_verdicts, dict)
817
+ and (sub_verdicts.get("judge_codex") is not None or sub_verdicts.get("pair_judge") is not None)
818
+ ) or bool(verify_phase.get("pair_mode"))
819
+
820
+ invoke_exit = int(os.environ.get("INVOKE_EXIT", "0"))
821
+ plugin_contamination = False
822
+ plugin_contamination_reason = None
823
+ debug_path = os.path.join(result_dir, "claude-debug.log")
824
+ try:
825
+ with open(debug_path, errors="replace") as f:
826
+ debug_text = f.read()
827
+ except OSError:
828
+ debug_text = ""
829
+ if (
830
+ "Plugin autoupdate: checking installed plugins" in debug_text
831
+ or "Caching plugin from source:" in debug_text
832
+ or "Cloned repository from " in debug_text
833
+ or "Successfully cached plugin " in debug_text
834
+ or "Found 8 plugins (8 enabled" in debug_text
835
+ ):
836
+ if "Plugin autoupdate: skipped (auto-updater disabled)" not in debug_text:
837
+ plugin_contamination = True
838
+ plugin_contamination_reason = "plugin_contamination"
839
+
840
+ invoke_failure = (
841
+ (invoke_exit not in (0,) and not timing["timed_out"])
842
+ or plugin_contamination
843
+ )
844
+ invoke_failure_reason = None
845
+ if plugin_contamination:
846
+ invoke_failure_reason = plugin_contamination_reason
847
+ elif invoke_failure:
848
+ transcript_path = os.path.join(result_dir, "transcript.txt")
849
+ haystack = ""
850
+ for path in (transcript_path, debug_path):
851
+ try:
852
+ with open(path, errors="replace") as f:
853
+ haystack += "\n" + f.read()
854
+ except OSError:
855
+ pass
856
+ if "You've hit your limit" in haystack or "rate_limit_error" in haystack:
857
+ invoke_failure_reason = "provider_limit"
858
+
672
859
  result = {
673
860
  "fixture": fixture,
674
861
  "arm": arm,
@@ -681,8 +868,15 @@ result = {
681
868
  "files_changed": len(changed),
682
869
  "elapsed_seconds": elapsed,
683
870
  "timed_out": timing["timed_out"],
684
- "invoke_exit": int(os.environ.get("INVOKE_EXIT", "0")),
685
- "invoke_failure": int(os.environ.get("INVOKE_EXIT", "0")) not in (0,) and not timing["timed_out"],
871
+ "environment_contamination": plugin_contamination,
872
+ "environment_contamination_reason": plugin_contamination_reason,
873
+ "invoke_exit": invoke_exit,
874
+ "invoke_failure": invoke_failure,
875
+ "invoke_failure_reason": invoke_failure_reason,
876
+ "terminal_verdict": ((state.get("phases") or {}).get("final_report") or {}).get("verdict"),
877
+ "verify_verdict": verify_phase.get("verdict"),
878
+ "pair_trigger": pair_trigger,
879
+ "pair_mode": pair_mode,
686
880
  }
687
881
  json.dump(result, open(os.path.join(result_dir, "result.json"), "w"), indent=2)
688
882
  print(json.dumps(result, indent=2))