devlyn-cli 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/benchmark/auto-resolve/README.md +318 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +312 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:resolve/SKILL.md +62 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +32 -0
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Prepare a SWE-bench instance worktree for producing a candidate patch."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
import shutil
|
|
10
|
+
import subprocess
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
SAFE_ID = re.compile(r"^[A-Za-z0-9_.-]+$")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def run(cmd: list[str], cwd: Path | None = None) -> None:
    """Execute *cmd* and fail loudly if it exits non-zero.

    Args:
        cmd: Argument vector to execute (no shell is involved).
        cwd: Directory to run the command in; ``None`` keeps the current one.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    subprocess.check_call(cmd, cwd=cwd)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def read_instances(path: Path) -> list[dict[str, Any]]:
    """Load every JSON object from a JSONL file, skipping blank lines.

    Args:
        path: JSONL file with one JSON object per non-blank line.

    Returns:
        The decoded objects in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with ``path:line_no`` so the offending row
            can be located in the file.
        ValueError: If a line decodes to something other than a JSON object.
    """
    rows: list[dict[str, Any]] = []
    with path.open(encoding="utf8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                value = json.loads(line)
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type so existing handlers keep
                # working, but say which file and line was malformed.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {exc.msg}", exc.doc, exc.pos
                ) from exc
            if not isinstance(value, dict):
                raise ValueError(f"{path}:{line_no}: expected JSON object")
            rows.append(value)
    return rows
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def require_text(instance: dict[str, Any], key: str) -> str:
    """Return the whitespace-stripped string stored under *key*.

    Args:
        instance: Decoded SWE-bench instance row.
        key: Field that must hold a non-blank string.

    Returns:
        The field value with surrounding whitespace removed.

    Raises:
        ValueError: If the field is absent, not a string, or blank.
    """
    raw = instance.get(key)
    if isinstance(raw, str):
        text = raw.strip()
        if text:
            return text
    raise ValueError(f"SWE-bench instance missing non-empty {key!r}")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def pick_instance(path: Path, instance_id: str) -> dict[str, Any]:
    """Return the single row of *path* whose ``instance_id`` equals *instance_id*.

    Args:
        path: JSONL file of SWE-bench instances.
        instance_id: Identifier to look up (compared for exact equality).

    Raises:
        ValueError: If zero rows or more than one row carry that identifier.
    """
    matches = []
    for row in read_instances(path):
        if row.get("instance_id") == instance_id:
            matches.append(row)
    if len(matches) == 1:
        return matches[0]
    raise ValueError(f"expected exactly one {instance_id!r} row in {path}, found {len(matches)}")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def repo_cache_name(repo: str, base_commit: str) -> str:
    """Build the cache directory name for a repository at a given commit.

    Slashes in *repo* become double underscores and the commit is shortened
    to its first 12 characters.
    """
    flattened = "__".join(repo.split("/"))
    short_commit = base_commit[:12]
    return f"{flattened}-{short_commit}"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def prepare_repo(instance: dict[str, Any], repos_root: Path) -> Path:
    """Clone (if needed) and reset the cached repository for *instance*.

    The cache lives at ``repos_root / repo_cache_name(repo, base_commit)``.
    After this call it is checked out at ``base_commit`` with tracked files
    reset and untracked/ignored files removed.

    Args:
        instance: SWE-bench row providing ``repo`` and ``base_commit``.
        repos_root: Directory holding the per-repo caches; created if missing.

    Returns:
        Path of the prepared cache checkout.

    Raises:
        ValueError: If ``repo`` or ``base_commit`` is missing, if ``repo`` is
            not a plain ``owner/name`` pair, or if ``base_commit`` starts
            with ``-``.
        subprocess.CalledProcessError: If any git command fails.
    """
    repo = require_text(instance, "repo")
    base_commit = require_text(instance, "base_commit")

    # Both values come from the instances file and end up in a filesystem
    # path, a clone URL and a git argument vector. Reject anything that could
    # climb out of repos_root or be parsed by git as an option.
    owner, sep, name = repo.partition("/")
    for part in (owner, name):
        if (
            not sep
            or not re.fullmatch(r"[A-Za-z0-9_.-]+", part)
            or part in {".", ".."}
            or part.startswith("-")
        ):
            raise ValueError(f"unsafe repo for path/URL use: {repo!r}")
    if base_commit.startswith("-"):
        raise ValueError(f"unsafe base_commit for git use: {base_commit!r}")

    repos_root.mkdir(parents=True, exist_ok=True)
    dest = repos_root / repo_cache_name(repo, base_commit)

    if not dest.exists():
        run(["git", "clone", "--quiet", f"https://github.com/{repo}.git", str(dest)])

    run(["git", "fetch", "--quiet", "--all", "--tags"], cwd=dest)
    run(["git", "checkout", "--quiet", base_commit], cwd=dest)
    run(["git", "reset", "--hard", "--quiet"], cwd=dest)
    # -ff also removes nested git repositories; -x removes ignored files too.
    run(["git", "clean", "-ffdqx"], cwd=dest)
    return dest
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def copy_worktree(repo_path: Path, worktree: Path) -> None:
    """Recreate *worktree* as a fresh, independent clone of *repo_path*.

    Any existing directory at *worktree* is deleted first. The clone uses
    ``--no-hardlinks`` and is then reset and cleaned.

    Raises:
        subprocess.CalledProcessError: If any git command fails.
    """
    if worktree.exists():
        shutil.rmtree(worktree)
    source, target = str(repo_path), str(worktree)
    run(["git", "clone", "--quiet", "--no-hardlinks", source, target])
    cleanup_steps = (
        ["git", "checkout", "--quiet", "HEAD"],
        ["git", "reset", "--hard", "--quiet"],
        ["git", "clean", "-ffdqx"],
    )
    for step in cleanup_steps:
        run(step, cwd=worktree)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def write_spec(instance: dict[str, Any], worktree: Path) -> Path:
    """Write the roadmap spec for *instance* into *worktree*.

    The spec is written to ``docs/roadmap/phase-1/<instance_id>.md`` and holds
    YAML front matter, fixed requirement/constraint/verification sections and
    the instance's problem statement.

    Returns:
        Path of the written spec file.

    Raises:
        ValueError: If ``instance_id``, ``repo``, ``base_commit`` or
            ``problem_statement`` is missing or blank.
    """
    instance_id = require_text(instance, "instance_id")
    repo = require_text(instance, "repo")
    base_commit = require_text(instance, "base_commit")
    problem = require_text(instance, "problem_statement")

    lines = [
        "---",
        f'id: "{instance_id}"',
        f'title: "SWE-bench {instance_id}"',
        "status: planned",
        "complexity: high",
        "depends-on: []",
        "---",
        "",
        f"# SWE-bench {instance_id}",
        "",
        f"Repository: `{repo}`",
        f"Base commit: `{base_commit}`",
        "",
        "## Requirements",
        "",
        "- [ ] Resolve the issue described in the problem statement.",
        "- [ ] Preserve existing behavior outside the issue's scope.",
        "- [ ] Keep the implementation consistent with the repository's local style and",
        "  dependency policy.",
        "- [ ] Add focused regression coverage when practical.",
        "",
        "## Problem Statement",
        "",
        problem,
        "",
        "## Constraints",
        "",
        "- Do not inspect or rely on the SWE-bench gold `patch` or `test_patch` fields.",
        "- Do not add broad rewrites, unrelated formatting churn, or new dependencies",
        "  unless the visible problem statement strictly requires them.",
        "",
        "## Verification",
        "",
        "- Run the most focused practical verification for the changed behavior.",
        "",  # keeps the trailing newline at end of file
    ]

    spec_path = worktree / "docs" / "roadmap" / "phase-1" / f"{instance_id}.md"
    spec_path.parent.mkdir(parents=True, exist_ok=True)
    spec_path.write_text("\n".join(lines), encoding="utf8")
    return spec_path
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def copy_devlyn_context(worktree: Path) -> None:
    """Copy the devlyn skills tree and ``CLAUDE.md`` into *worktree*.

    Sources are looked up relative to the current working directory:
    ``config/skills`` is copied to ``<worktree>/.claude/skills`` (replacing
    any existing copy) and ``CLAUDE.md`` to ``<worktree>/CLAUDE.md``.

    A missing source is skipped, but a warning is printed to stderr so an
    explicitly requested context copy cannot silently turn into a no-op
    (for example when the script is started from the wrong directory).

    Args:
        worktree: Root of the prepared worktree.
    """
    import sys  # local import: only needed for the warning path

    skills_src = Path("config/skills")
    if skills_src.exists():
        skills_dst = worktree / ".claude" / "skills"
        # Remove a stale copy first so deleted skills do not linger.
        if skills_dst.exists():
            shutil.rmtree(skills_dst)
        shutil.copytree(skills_src, skills_dst)
    else:
        print(f"warning: {skills_src} not found in {Path.cwd()}; skills not copied", file=sys.stderr)

    claude_src = Path("CLAUDE.md")
    if claude_src.exists():
        shutil.copy2(claude_src, worktree / "CLAUDE.md")
    else:
        print(f"warning: {claude_src} not found in {Path.cwd()}; not copied", file=sys.stderr)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def main() -> int:
    """Prepare a solver worktree for one SWE-bench instance.

    Parses the command line, selects the instance row, prepares the cached
    repository, clones it into ``<worktrees-root>/<instance_id>``, writes the
    spec and ``solve-prompt.txt``, optionally copies the devlyn context, and
    prints a JSON summary of the produced paths to stdout.

    Returns:
        ``0`` on success.

    Raises:
        ValueError: If the instance cannot be selected, a required field is
            missing, or ``instance_id`` is unsafe to use as a path component.
        subprocess.CalledProcessError: If a git command fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--instances-jsonl", required=True, type=Path)
    parser.add_argument("--instance-id", required=True)
    parser.add_argument(
        "--repos-root",
        default=Path("benchmark/auto-resolve/external/swebench/repos-solver"),
        type=Path,
    )
    parser.add_argument(
        "--worktrees-root",
        default=Path("benchmark/auto-resolve/external/swebench/worktrees"),
        type=Path,
    )
    parser.add_argument("--copy-devlyn-context", action="store_true")
    args = parser.parse_args()

    instance = pick_instance(args.instances_jsonl, args.instance_id)
    instance_id = require_text(instance, "instance_id")
    # SAFE_ID alone accepts "." and "..", which would make the worktree path
    # resolve to worktrees_root itself or its parent; copy_worktree deletes
    # whatever already exists at that path, so those ids must be rejected.
    if not SAFE_ID.fullmatch(instance_id) or instance_id in {".", ".."}:
        raise ValueError(f"unsafe instance_id for path/spec use: {instance_id!r}")

    repo_path = prepare_repo(instance, args.repos_root)
    worktree = args.worktrees_root / instance_id
    args.worktrees_root.mkdir(parents=True, exist_ok=True)
    copy_worktree(repo_path, worktree)
    spec_path = write_spec(instance, worktree)
    if args.copy_devlyn_context:
        copy_devlyn_context(worktree)

    prompt = (
        f"You are solving SWE-bench instance {instance_id} in this checked-out repository at "
        "the base commit. Do not inspect any gold SWE-bench patch or test_patch. Read the "
        f"local code and the spec at {spec_path.relative_to(worktree)}. Make the smallest "
        "correct source/test change for the visible issue. Run a focused verification "
        "command. At the end, report changed files, verification command, and verdict."
    )
    (worktree / "solve-prompt.txt").write_text(prompt + "\n", encoding="utf8")
    # The JSON summary is the only thing written to stdout.
    print(
        json.dumps(
            {
                "instance_id": instance_id,
                "repo_dir": str(repo_path),
                "worktree": str(worktree),
                "spec_path": str(spec_path),
                "prompt_file": str(worktree / "solve-prompt.txt"),
            },
            indent=2,
        )
    )
    return 0


if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -15,10 +15,27 @@
|
|
|
15
15
|
set -euo pipefail
|
|
16
16
|
|
|
17
17
|
usage() {
|
|
18
|
-
echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]"
|
|
18
|
+
echo "usage: $0 --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID> [--resolve-skill new] [--dry-run]"
|
|
19
19
|
exit 1
|
|
20
20
|
}
|
|
21
21
|
|
|
22
|
+
# Send a signal to every process group that has a member whose `ps` line
# mentions the worktree path.
#   $1  worktree directory (matched both as given and as its physical path)
#   $2  signal name or number, passed to `kill -<signal>`
# The calling shell and every process in its own process group are skipped.
# Kill failures are ignored because a group may already have exited.
kill_worktree_processes() {
  local work_dir="$1"
  local signal="$2"
  local physical_work_dir current_pgid
  # An empty path cannot identify a worktree; refuse to match on it.
  [ -n "$work_dir" ] || return 0
  # Resolve symlinks so processes started via the physical path also match;
  # fall back to the given path when the directory no longer exists.
  physical_work_dir="$(cd "$work_dir" 2>/dev/null && pwd -P || printf '%s' "$work_dir")"
  current_pgid="$(ps -o pgid= -p "$$" | tr -d ' ')"
  ps -axo pid=,pgid=,command= \
    | awk -v p1="$work_dir" -v p2="$physical_work_dir" -v self="$$" -v current_pgid="$current_pgid" '
        $1 != self && $2 != current_pgid && (index($0, p1) || index($0, p2)) { print $2 }
      ' \
    | sort -u \
    | while IFS= read -r pgid; do
        [ -n "$pgid" ] || continue
        # Only accept a numeric pgid greater than 1: `kill -- -0` targets the
        # caller's own process group and `kill -- -1` targets every process
        # the caller is allowed to signal.
        case "$pgid" in
          *[!0-9]*|0|1) continue ;;
        esac
        kill "-$signal" -- "-$pgid" 2>/dev/null || true
      done
}
|
|
38
|
+
|
|
22
39
|
FIXTURE=""; ARM=""; RUN_ID=""; DRY_RUN=0
|
|
23
40
|
RESOLVE_SKILL="new"
|
|
24
41
|
while [ $# -gt 0 ]; do
|
|
@@ -35,18 +52,23 @@ done
|
|
|
35
52
|
# iter-0019: original 3 arms — variant (L2-old: Claude orchestrator + Codex BUILD pair via --engine auto),
|
|
36
53
|
# solo_claude (L1: Claude orchestrator, codex blocked by shim+wrapper enforcement),
|
|
37
54
|
# bare (L0: direct claude -p, no skill, no codex).
|
|
38
|
-
# iter-0033c (Codex R0-infra adoption, 2026-05-02): two
|
|
55
|
+
# iter-0033c (Codex R0-infra adoption, 2026-05-02): two L2 diagnostic arms for /devlyn:resolve —
|
|
39
56
|
# l2_gated (--engine claude, no --pair-verify; pair fires only on natural triggers),
|
|
40
|
-
#
|
|
57
|
+
# l2_risk_probes (--engine claude --risk-probes; pair converts visible Verification bullets to executable probes before IMPLEMENT),
|
|
58
|
+
# l2_forced (--engine claude --pair-verify; retired because it leaks pair-awareness before IMPLEMENT).
|
|
41
59
|
[ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] || [ "$ARM" = "bare" ] \
|
|
42
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ] || \
|
|
43
|
-
{ echo "arm must be variant|solo_claude|bare|l2_gated|l2_forced"; exit 1; }
|
|
60
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ] || \
|
|
61
|
+
{ echo "arm must be variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced"; exit 1; }
|
|
44
62
|
# iter-0033c (Codex R0-infra Q2): l2_* arms require NEW skill surface (only NEW
|
|
45
63
|
# `/devlyn:resolve` honors --pair-verify; OLD `/devlyn:auto-resolve` would silently
|
|
46
64
|
# ignore the flag and produce mis-attributed L2 numbers).
|
|
47
|
-
if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
|
|
65
|
+
if { [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; } && [ "$RESOLVE_SKILL" != "new" ]; then
|
|
48
66
|
echo "l2_* arms require --resolve-skill new (got '$RESOLVE_SKILL')"; exit 1
|
|
49
67
|
fi
|
|
68
|
+
if [ "$ARM" = "l2_forced" ]; then
|
|
69
|
+
echo "l2_forced is retired: it puts --pair-verify in the initial prompt, so IMPLEMENT can become pair-aware before the diff is frozen. Use scripts/run-frozen-verify-pair.sh for leak-free VERIFY-pair measurement." >&2
|
|
70
|
+
exit 1
|
|
71
|
+
fi
|
|
50
72
|
# iter-0034 Phase 4 cutover (2026-05-03): OLD `/devlyn:auto-resolve` was
|
|
51
73
|
# deleted. Only `new` (= /devlyn:resolve --spec) is supported. The flag stays
|
|
52
74
|
# an accepted no-op so historical runners (run-iter-0033c.sh:137) keep working
|
|
@@ -78,6 +100,13 @@ for f in "$META" "$EXPECTED" "$SPEC" "$TASK"; do
|
|
|
78
100
|
done
|
|
79
101
|
|
|
80
102
|
TIMEOUT=$(python3 -c "import json; print(json.load(open('$META'))['timeout_seconds'])")
|
|
103
|
+
if [ "$ARM" = "l2_risk_probes" ]; then
|
|
104
|
+
# This arm adds a bounded Codex probe-derive phase before IMPLEMENT and a
|
|
105
|
+
# bounded Codex pair-JUDGE during VERIFY. The full-pipeline gate still
|
|
106
|
+
# enforces wall-time efficiency by pair/solo ratio; this budget prevents a
|
|
107
|
+
# false timeout before the mandatory second judge can emit its contract line.
|
|
108
|
+
TIMEOUT=$((TIMEOUT + 600))
|
|
109
|
+
fi
|
|
81
110
|
|
|
82
111
|
RESULT_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE/$ARM"
|
|
83
112
|
mkdir -p "$RESULT_DIR"
|
|
@@ -104,7 +133,7 @@ cp -R "$BENCH_ROOT/fixtures/test-repo" "$WORK_DIR"
|
|
|
104
133
|
# while variant uses --engine auto (Codex IMPLEMENT). Pair-mode in
|
|
105
134
|
# /devlyn:resolve VERIFY phase pulls Codex via the OTHER-engine rule.
|
|
106
135
|
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
107
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
|
|
136
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
|
|
108
137
|
mkdir -p "$WORK_DIR/.claude"
|
|
109
138
|
if [ -d "$REPO_ROOT/.claude/skills" ]; then
|
|
110
139
|
cp -R "$REPO_ROOT/.claude/skills" "$WORK_DIR/.claude/skills"
|
|
@@ -164,11 +193,13 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
|
164
193
|
ARM_CODEX_BLOCKED=0
|
|
165
194
|
fi
|
|
166
195
|
python3 - "$WORK_DIR/.claude/settings.json" \
|
|
167
|
-
"$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" <<'PY'
|
|
196
|
+
"$INJECTED_PATH" "$CODEX_REAL_BIN" "$CODEX_MONITORED_PATH" "$ARM_CODEX_BLOCKED" "$ARM" <<'PY'
|
|
168
197
|
import json, sys
|
|
169
|
-
out_path, path_val, real_bin, monitored, codex_blocked = sys.argv[1:
|
|
198
|
+
out_path, path_val, real_bin, monitored, codex_blocked, arm = sys.argv[1:7]
|
|
170
199
|
env = {
|
|
171
200
|
"CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS": "1",
|
|
201
|
+
"CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC": "1",
|
|
202
|
+
"DISABLE_AUTOUPDATER": "1",
|
|
172
203
|
"PATH": path_val,
|
|
173
204
|
}
|
|
174
205
|
if codex_blocked == "1":
|
|
@@ -182,6 +213,10 @@ else:
|
|
|
182
213
|
# BUILD; both vars are required by the shim/wrapper handshake.
|
|
183
214
|
env["CODEX_REAL_BIN"] = real_bin
|
|
184
215
|
env["CODEX_MONITORED_PATH"] = monitored
|
|
216
|
+
if arm == "l2_risk_probes":
|
|
217
|
+
# Risk-probe derivation is a bounded contract-conversion step. A long
|
|
218
|
+
# Codex run is a harness failure, not useful extra quality signal.
|
|
219
|
+
env["CODEX_MONITORED_TIMEOUT_SEC"] = "300"
|
|
185
220
|
data = {"env": env}
|
|
186
221
|
with open(out_path, "w") as f:
|
|
187
222
|
json.dump(data, f, indent=2)
|
|
@@ -231,22 +266,25 @@ if [ -f "$SETUP" ] && [ -s "$SETUP" ]; then
|
|
|
231
266
|
fi
|
|
232
267
|
fi
|
|
233
268
|
|
|
234
|
-
# iter-0019.6: stage normalized .devlyn/spec-verify.json
|
|
235
|
-
#
|
|
236
|
-
#
|
|
237
|
-
#
|
|
238
|
-
#
|
|
239
|
-
# generate the same shape from a spec.md "## Verification" section for
|
|
240
|
-
# real-user runs (Codex R5, 2026-04-28). This stages all 3 arms — bare's
|
|
241
|
-
# .devlyn/ is created lazily by spec-verify-check.py if absent.
|
|
269
|
+
# iter-0019.6: stage normalized .devlyn/spec-verify.json for BUILD_GATE.
|
|
270
|
+
# Only commands safe to reveal before IMPLEMENT may be staged here. Commands
|
|
271
|
+
# that reference BENCH_FIXTURE_DIR are hidden post-run oracles; staging their
|
|
272
|
+
# path leaks verifier names into the arm and lets agents search for answer-key
|
|
273
|
+
# files. Those commands still run in the post-run verifier below.
|
|
242
274
|
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
243
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
|
|
275
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
|
|
244
276
|
python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" <<'PY'
|
|
245
277
|
import json, os, sys
|
|
246
278
|
expected = json.load(open(sys.argv[1]))
|
|
247
279
|
out_path = sys.argv[2]
|
|
248
|
-
|
|
280
|
+
visible_commands = [
|
|
281
|
+
cmd for cmd in expected.get("verification_commands", [])
|
|
282
|
+
if "BENCH_FIXTURE_DIR" not in str(cmd.get("cmd", ""))
|
|
283
|
+
]
|
|
284
|
+
normalized = {"verification_commands": visible_commands}
|
|
249
285
|
os.makedirs(os.path.dirname(out_path), exist_ok=True)
|
|
286
|
+
if not visible_commands:
|
|
287
|
+
raise SystemExit(0)
|
|
250
288
|
with open(out_path, "w") as f:
|
|
251
289
|
json.dump(normalized, f, indent=2)
|
|
252
290
|
f.write("\n")
|
|
@@ -270,7 +308,7 @@ PROMPT_FILE="$RESULT_DIR/input.md"
|
|
|
270
308
|
# arms pass the engine flag explicitly so they survive future runtime-default
|
|
271
309
|
# changes (post iter-0020 close-out: default flipped to claude).
|
|
272
310
|
if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
273
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; then
|
|
311
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
|
|
274
312
|
case "$ARM" in
|
|
275
313
|
solo_claude)
|
|
276
314
|
ENGINE_CLAUSE="--engine claude"
|
|
@@ -281,13 +319,22 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
|
281
319
|
ENGINE_PROMPT_HINT="Run with \`--engine auto\` so the experimental dual-engine routing fires (Codex BUILD/FIX, Claude EVAL/CRITIC) — do not override it."
|
|
282
320
|
;;
|
|
283
321
|
l2_gated)
|
|
284
|
-
#
|
|
285
|
-
#
|
|
286
|
-
#
|
|
287
|
-
#
|
|
322
|
+
# NEW L2 with natural pair-mode triggers. Claude does IMPLEMENT;
|
|
323
|
+
# pair-JUDGE in VERIFY fires per /devlyn:resolve PHASE 5 policy
|
|
324
|
+
# (high complexity, coverage_failed, or warning-level mechanical
|
|
325
|
+
# findings; never after HIGH/CRITICAL mechanical blockers). Codex
|
|
326
|
+
# remains available as the OTHER-engine pair-JUDGE candidate.
|
|
288
327
|
ENGINE_CLAUSE="--engine claude"
|
|
289
328
|
ENGINE_PROMPT_HINT="Run with \`--engine claude\` and let the orchestrator's pair-mode (VERIFY) trigger naturally per its policy. Codex is available as the OTHER-engine pair-JUDGE — the harness has not blocked it. Do NOT pass \`--pair-verify\`; this arm measures gated triggering."
|
|
290
329
|
;;
|
|
330
|
+
l2_risk_probes)
|
|
331
|
+
# NEW L2 probe-derive arm. Claude plans/implements; Codex is used before
|
|
332
|
+
# IMPLEMENT only to derive bounded executable probes from visible
|
|
333
|
+
# Verification bullets. BUILD_GATE and VERIFY execute those probes
|
|
334
|
+
# mechanically via spec-verify-check.py.
|
|
335
|
+
ENGINE_CLAUSE="--engine claude --risk-probes"
|
|
336
|
+
ENGINE_PROMPT_HINT="Run with \`--engine claude --risk-probes\`. Codex is available as the OTHER-engine probe derivation and pair-JUDGE engine. The probe phase may only derive executable checks from visible \`## Verification\` text; it must not read hidden fixture/verifier paths."
|
|
337
|
+
;;
|
|
291
338
|
l2_forced)
|
|
292
339
|
# iter-0033c: NEW L2 forced — pair-JUDGE always fires. Diagnostic arm
|
|
293
340
|
# for Gate 6 fixture-level cross-check + Gate 7 attribution causality.
|
|
@@ -414,12 +461,17 @@ else
|
|
|
414
461
|
# natural exit at or past the budget is no longer mislabeled as timeout.
|
|
415
462
|
#
|
|
416
463
|
# MCP/config isolation (iter 0004). The harness's `claude -p` subprocess
|
|
417
|
-
# must not load the operator's user-level MCP
|
|
418
|
-
# telegram, vercel,
|
|
419
|
-
# user
|
|
420
|
-
#
|
|
421
|
-
#
|
|
422
|
-
#
|
|
464
|
+
# must not load the operator's user-level MCP/plugins/settings (pencil,
|
|
465
|
+
# codex-cli, telegram, vercel, ...). Project policy is "MCP/plugins are not in
|
|
466
|
+
# the loop"; loading user config inside the arm is uncontrolled environment
|
|
467
|
+
# leaking into the experiment. `--setting-sources project,local` keeps user
|
|
468
|
+
# plugin enablement out of the run but Claude Code still reads the installed
|
|
469
|
+
# plugin registry for autoupdate. Official Claude Code settings document
|
|
470
|
+
# `DISABLE_AUTOUPDATER=1` / `CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1` as the
|
|
471
|
+
# supported way to disable that background traffic, while preserving OAuth
|
|
472
|
+
# auth from the real HOME. `--strict-mcp-config` + an empty `mcpServers` object
|
|
473
|
+
# forces a hermetic MCP set. Skills still resolve via the project
|
|
474
|
+
# `.claude/skills` staged into the worktree.
|
|
423
475
|
# `--debug-file` records per-arm init/runtime so the next hang has a
|
|
424
476
|
# location, not a guess.
|
|
425
477
|
TIMEOUT_FLAG="$RESULT_DIR/.timed_out"
|
|
@@ -436,7 +488,7 @@ else
|
|
|
436
488
|
# PATH — they route Claude IMPLEMENT but Codex pair-JUDGE in VERIFY hits
|
|
437
489
|
# `codex exec` through the wrapper for starvation safety.
|
|
438
490
|
if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
439
|
-
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_forced" ]; } \
|
|
491
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; } \
|
|
440
492
|
&& [ -x "$WORK_DIR/.devlyn-bin/codex" ]; then
|
|
441
493
|
export PATH="$WORK_DIR/.devlyn-bin:$PATH"
|
|
442
494
|
[ "$ARM" = "solo_claude" ] && export CODEX_BLOCKED=1
|
|
@@ -447,10 +499,19 @@ else
|
|
|
447
499
|
# what the post-run verifier (run-fixture.sh:431-434) sets so the gate
|
|
448
500
|
# sees the same environment shape.
|
|
449
501
|
export BENCH_WORKDIR="$WORK_DIR"
|
|
502
|
+
# Python helper scripts run inside the benchmark worktree. Do not let them
|
|
503
|
+
# rewrite tracked __pycache__ artifacts and pollute the arm-only diff.
|
|
504
|
+
export PYTHONDONTWRITEBYTECODE=1
|
|
505
|
+
# Official Claude Code setting: disable background plugin/autoupdate traffic
|
|
506
|
+
# before process startup. Project settings env is not early enough for all
|
|
507
|
+
# startup paths.
|
|
508
|
+
export CLAUDE_CODE_DISABLE_NONESSENTIAL_TRAFFIC=1
|
|
509
|
+
export DISABLE_AUTOUPDATER=1
|
|
450
510
|
exec claude \
|
|
451
511
|
-p "$(cat "$PROMPT_FILE")" \
|
|
452
512
|
--dangerously-skip-permissions \
|
|
453
513
|
--effort xhigh \
|
|
514
|
+
--setting-sources project,local \
|
|
454
515
|
--strict-mcp-config \
|
|
455
516
|
--mcp-config '{"mcpServers":{}}' \
|
|
456
517
|
--debug-file "$RESULT_DIR/claude-debug.log"
|
|
@@ -459,13 +520,21 @@ else
|
|
|
459
520
|
set +m
|
|
460
521
|
|
|
461
522
|
(
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
523
|
+
deadline=$((T_START + TIMEOUT))
|
|
524
|
+
while kill -0 "$CHILD_PID" 2>/dev/null; do
|
|
525
|
+
now=$(date +%s)
|
|
526
|
+
if [ "$now" -ge "$deadline" ]; then
|
|
527
|
+
: > "$TIMEOUT_FLAG"
|
|
528
|
+
kill -TERM -- "-$CHILD_PID" 2>/dev/null
|
|
529
|
+
kill_worktree_processes "$WORK_DIR" TERM
|
|
530
|
+
sleep 5
|
|
531
|
+
kill -KILL -- "-$CHILD_PID" 2>/dev/null
|
|
532
|
+
kill_worktree_processes "$WORK_DIR" KILL
|
|
533
|
+
exit 0
|
|
534
|
+
fi
|
|
535
|
+
remaining=$((deadline - now))
|
|
536
|
+
# Sleep in slices of at most 30s so the deadline is re-checked regularly.
# Use if/else rather than `A && B || C`: with the chained form, a `sleep 30`
# that exits non-zero (e.g. interrupted) would also run `sleep "$remaining"`.
if [ "$remaining" -gt 30 ]; then
  sleep 30
else
  sleep "$remaining"
fi
|
|
537
|
+
done
|
|
469
538
|
) &
|
|
470
539
|
WATCHDOG_PID=$!
|
|
471
540
|
|
|
@@ -479,7 +548,16 @@ else
|
|
|
479
548
|
INVOKE_EXIT=124
|
|
480
549
|
WATCHDOG_FIRED=1
|
|
481
550
|
rm -f "$TIMEOUT_FLAG"
|
|
551
|
+
kill_worktree_processes "$WORK_DIR" TERM
|
|
552
|
+
sleep 1
|
|
553
|
+
kill_worktree_processes "$WORK_DIR" KILL
|
|
482
554
|
echo "[run-fixture] arm timed out after ${TIMEOUT}s — INVOKE_EXIT=124" >&2
|
|
555
|
+
else
|
|
556
|
+
# A clean `claude -p` exit can still leave OTHER-engine pair-JUDGE
|
|
557
|
+
# descendants alive; reap any process group rooted in this arm worktree.
|
|
558
|
+
kill_worktree_processes "$WORK_DIR" TERM
|
|
559
|
+
sleep 1
|
|
560
|
+
kill_worktree_processes "$WORK_DIR" KILL
|
|
483
561
|
fi
|
|
484
562
|
set -e
|
|
485
563
|
fi
|
|
@@ -487,6 +565,25 @@ fi
|
|
|
487
565
|
T_END=$(date +%s)
|
|
488
566
|
ELAPSED=$((T_END - T_START))
|
|
489
567
|
|
|
568
|
+
# Restore tracked Python bytecode to the scaffold commit and remove only
|
|
569
|
+
# untracked bytecode. Helper invocations must not count as model work, but
|
|
570
|
+
# deleting tracked scaffold files would also pollute changed-files.txt.
|
|
571
|
+
(cd "$WORK_DIR" \
|
|
572
|
+
&& git restore --source "$SCAFFOLD_SHA" -- .claude/skills/_shared/__pycache__ 2>/dev/null || true)
|
|
573
|
+
cleanup_roots=()
|
|
574
|
+
[ -d "$WORK_DIR/.claude" ] && cleanup_roots+=("$WORK_DIR/.claude")
|
|
575
|
+
[ -d "$WORK_DIR/.devlyn" ] && cleanup_roots+=("$WORK_DIR/.devlyn")
|
|
576
|
+
if [ ${#cleanup_roots[@]} -gt 0 ]; then
|
|
577
|
+
find "${cleanup_roots[@]}" -type f \( -name '*.pyc' -o -name '*.pyo' \) -print0 \
|
|
578
|
+
| while IFS= read -r -d '' py_file; do
|
|
579
|
+
# Strip the worktree prefix to get a repo-relative path. The prefix is quoted
# so glob characters in $WORK_DIR (*, ?, [) are matched literally, not as a pattern.
rel="${py_file#"$WORK_DIR"/}"
|
|
580
|
+
if ! (cd "$WORK_DIR" && git ls-files --error-unmatch "$rel" >/dev/null 2>&1); then
|
|
581
|
+
rm -f "$py_file"
|
|
582
|
+
fi
|
|
583
|
+
done
|
|
584
|
+
find "${cleanup_roots[@]}" -type d -name __pycache__ -empty -delete || true
|
|
585
|
+
fi
|
|
586
|
+
|
|
490
587
|
# Capture the ARM-ONLY diff against the scaffold commit. Variant's
|
|
491
588
|
# auto-resolve pipeline commits internally after each phase, so diffing
|
|
492
589
|
# against HEAD would miss committed work. Diffing against SCAFFOLD_SHA after
|
|
@@ -518,6 +615,41 @@ python3 "$BENCH_ROOT/scripts/oracle-scope-tier-b.py" \
|
|
|
518
615
|
echo '{"oracle":"scope-tier-b","findings":[],"error":"oracle invocation failed"}' \
|
|
519
616
|
> "$RESULT_DIR/oracle-scope-tier-b.json"
|
|
520
617
|
|
|
618
|
+
if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
619
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ]; } \
|
|
620
|
+
&& [ -f "$WORK_DIR/.devlyn/pipeline.state.json" ] \
|
|
621
|
+
&& [ -f "$WORK_DIR/.claude/skills/_shared/verify-merge-findings.py" ]; then
|
|
622
|
+
if [ -f "$WORK_DIR/.devlyn/codex-judge.stdout" ] \
|
|
623
|
+
&& [ -f "$WORK_DIR/.claude/skills/_shared/collect-codex-findings.py" ]; then
|
|
624
|
+
if ! python3 "$WORK_DIR/.claude/skills/_shared/collect-codex-findings.py" \
|
|
625
|
+
--devlyn-dir "$WORK_DIR/.devlyn" \
|
|
626
|
+
> "$RESULT_DIR/collect-codex-findings.log" 2>&1; then
|
|
627
|
+
echo "[run-fixture] Codex pair findings collection failed; see $RESULT_DIR/collect-codex-findings.log" >&2
|
|
628
|
+
fi
|
|
629
|
+
fi
|
|
630
|
+
if ! python3 "$WORK_DIR/.claude/skills/_shared/verify-merge-findings.py" \
|
|
631
|
+
--devlyn-dir "$WORK_DIR/.devlyn" --write-state \
|
|
632
|
+
> "$RESULT_DIR/verify-merge-normalize.log" 2>&1; then
|
|
633
|
+
echo "[run-fixture] verify merge normalization failed; see $RESULT_DIR/verify-merge-normalize.log" >&2
|
|
634
|
+
fi
|
|
635
|
+
fi
|
|
636
|
+
|
|
637
|
+
if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
|
|
638
|
+
|| [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ]; } && [ -d "$WORK_DIR/.devlyn" ]; then
|
|
639
|
+
run_dir=$(find "$WORK_DIR/.devlyn/runs" -mindepth 1 -maxdepth 1 -type d 2>/dev/null | sort | tail -1 || true)
|
|
640
|
+
if [ -n "$run_dir" ]; then
|
|
641
|
+
rm -rf "$RESULT_DIR/run-archive"
|
|
642
|
+
cp -R "$run_dir" "$RESULT_DIR/run-archive"
|
|
643
|
+
[ -f "$RESULT_DIR/run-archive/pipeline.state.json" ] \
|
|
644
|
+
|| [ ! -f "$WORK_DIR/.devlyn/pipeline.state.json" ] \
|
|
645
|
+
|| cp "$WORK_DIR/.devlyn/pipeline.state.json" "$RESULT_DIR/run-archive/pipeline.state.json"
|
|
646
|
+
else
|
|
647
|
+
rm -rf "$RESULT_DIR/run-archive"
|
|
648
|
+
mkdir -p "$RESULT_DIR/run-archive"
|
|
649
|
+
find "$WORK_DIR/.devlyn" -maxdepth 1 -type f -exec cp {} "$RESULT_DIR/run-archive/" \;
|
|
650
|
+
fi
|
|
651
|
+
fi
|
|
652
|
+
|
|
521
653
|
# Run verification commands + forbidden pattern scan + deps check. Uses
|
|
522
654
|
# the operator's real HOME (same as the arm saw). Fixtures that need HOME
|
|
523
655
|
# isolation override it inline per verification command.
|
|
@@ -532,6 +664,9 @@ verify_env = os.environ.copy()
|
|
|
532
664
|
# Expose the work-dir path so fixtures whose verification needs to reference
|
|
533
665
|
# the work root can do so portably (e.g. F9's out-of-repo check).
|
|
534
666
|
verify_env["BENCH_WORKDIR"] = work
|
|
667
|
+
# Hidden benchmark verifiers live in the fixture directory, outside the arm's
|
|
668
|
+
# work tree. This keeps oracle code from becoming implementation context.
|
|
669
|
+
verify_env["BENCH_FIXTURE_DIR"] = os.path.dirname(os.path.abspath(sys.argv[1]))
|
|
535
670
|
|
|
536
671
|
verify = {"commands": [], "forbidden_pattern_hits": [], "deps_added": 0,
|
|
537
672
|
"max_deps_added": expected.get("max_deps_added", 0),
|
|
@@ -669,6 +804,58 @@ try:
|
|
|
669
804
|
except Exception:
|
|
670
805
|
changed = []
|
|
671
806
|
|
|
807
|
+
state = {}
|
|
808
|
+
state_path = os.path.join(result_dir, "run-archive", "pipeline.state.json")
|
|
809
|
+
if os.path.isfile(state_path):
|
|
810
|
+
with open(state_path) as f:
|
|
811
|
+
state = json.load(f)
|
|
812
|
+
verify_phase = (state.get("phases") or {}).get("verify") or {}
|
|
813
|
+
sub_verdicts = verify_phase.get("sub_verdicts")
|
|
814
|
+
pair_trigger = verify_phase.get("pair_trigger") or ((state.get("verify") or {}).get("pair_trigger"))
|
|
815
|
+
pair_mode = bool(
|
|
816
|
+
isinstance(sub_verdicts, dict)
|
|
817
|
+
and (sub_verdicts.get("judge_codex") is not None or sub_verdicts.get("pair_judge") is not None)
|
|
818
|
+
) or bool(verify_phase.get("pair_mode"))
|
|
819
|
+
|
|
820
|
+
# Treat an unset *or empty* INVOKE_EXIT as exit code 0; int("") would raise
# ValueError and abort before result.json is written.
invoke_exit = int(os.environ.get("INVOKE_EXIT") or "0")
|
|
821
|
+
plugin_contamination = False
|
|
822
|
+
plugin_contamination_reason = None
|
|
823
|
+
debug_path = os.path.join(result_dir, "claude-debug.log")
|
|
824
|
+
try:
|
|
825
|
+
with open(debug_path, errors="replace") as f:
|
|
826
|
+
debug_text = f.read()
|
|
827
|
+
except OSError:
|
|
828
|
+
debug_text = ""
|
|
829
|
+
if (
|
|
830
|
+
"Plugin autoupdate: checking installed plugins" in debug_text
|
|
831
|
+
or "Caching plugin from source:" in debug_text
|
|
832
|
+
or "Cloned repository from " in debug_text
|
|
833
|
+
or "Successfully cached plugin " in debug_text
|
|
834
|
+
# NOTE(review): this marker hard-codes a count of 8 plugins, so a debug log
# reporting any other plugin count will not match this clause — confirm intent.
or "Found 8 plugins (8 enabled" in debug_text
|
|
835
|
+
):
|
|
836
|
+
if "Plugin autoupdate: skipped (auto-updater disabled)" not in debug_text:
|
|
837
|
+
plugin_contamination = True
|
|
838
|
+
plugin_contamination_reason = "plugin_contamination"
|
|
839
|
+
|
|
840
|
+
invoke_failure = (
|
|
841
|
+
(invoke_exit not in (0,) and not timing["timed_out"])
|
|
842
|
+
or plugin_contamination
|
|
843
|
+
)
|
|
844
|
+
invoke_failure_reason = None
|
|
845
|
+
if plugin_contamination:
|
|
846
|
+
invoke_failure_reason = plugin_contamination_reason
|
|
847
|
+
elif invoke_failure:
|
|
848
|
+
transcript_path = os.path.join(result_dir, "transcript.txt")
|
|
849
|
+
haystack = ""
|
|
850
|
+
for path in (transcript_path, debug_path):
|
|
851
|
+
try:
|
|
852
|
+
with open(path, errors="replace") as f:
|
|
853
|
+
haystack += "\n" + f.read()
|
|
854
|
+
except OSError:
|
|
855
|
+
pass
|
|
856
|
+
if "You've hit your limit" in haystack or "rate_limit_error" in haystack:
|
|
857
|
+
invoke_failure_reason = "provider_limit"
|
|
858
|
+
|
|
672
859
|
result = {
|
|
673
860
|
"fixture": fixture,
|
|
674
861
|
"arm": arm,
|
|
@@ -681,8 +868,15 @@ result = {
|
|
|
681
868
|
"files_changed": len(changed),
|
|
682
869
|
"elapsed_seconds": elapsed,
|
|
683
870
|
"timed_out": timing["timed_out"],
|
|
684
|
-
"
|
|
685
|
-
"
|
|
871
|
+
"environment_contamination": plugin_contamination,
|
|
872
|
+
"environment_contamination_reason": plugin_contamination_reason,
|
|
873
|
+
"invoke_exit": invoke_exit,
|
|
874
|
+
"invoke_failure": invoke_failure,
|
|
875
|
+
"invoke_failure_reason": invoke_failure_reason,
|
|
876
|
+
"terminal_verdict": ((state.get("phases") or {}).get("final_report") or {}).get("verdict"),
|
|
877
|
+
"verify_verdict": verify_phase.get("verdict"),
|
|
878
|
+
"pair_trigger": pair_trigger,
|
|
879
|
+
"pair_mode": pair_mode,
|
|
686
880
|
}
|
|
687
881
|
# Write result.json through a context manager so the handle is flushed and
# closed deterministically instead of relying on garbage collection.
with open(os.path.join(result_dir, "result.json"), "w") as result_file:
    json.dump(result, result_file, indent=2)
|
|
688
882
|
print(json.dumps(result, indent=2))
|