devlyn-cli 2.1.0 → 2.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/benchmark/auto-resolve/README.md +321 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
- package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
- package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +363 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:resolve/SKILL.md +69 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +69 -20
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Prepare a SWE-bench instance worktree for producing a candidate patch."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
import re
|
|
9
|
+
import shutil
|
|
10
|
+
import subprocess
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
# Allow-list for instance ids that end up in filesystem paths and in the
# generated spec: ASCII letters, digits, underscore, dot and hyphen only.
# NOTE: dot-only names such as "." and ".." also satisfy this pattern.
SAFE_ID = re.compile(r"^[A-Za-z0-9_.-]+$")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def run(cmd: list[str], cwd: Path | None = None) -> None:
    """Execute ``cmd`` as a child process and fail loudly on a non-zero exit.

    Args:
        cmd: Argument vector handed to the subprocess (no shell is involved).
        cwd: Directory to run the command in; ``None`` keeps the current one.

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    # check_call is the documented shorthand for run(..., check=True).
    subprocess.check_call(cmd, cwd=cwd)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def read_instances(path: Path) -> list[dict[str, Any]]:
    """Load every JSON object from a JSON Lines file.

    Whitespace-only lines are skipped; every other line must hold one JSON
    object.

    Args:
        path: Location of the UTF-8 encoded JSONL file.

    Returns:
        The decoded objects, in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message is
            prefixed with ``path:line_no`` so the bad row can be located.
        ValueError: If a line decodes to something other than a JSON object.
    """
    rows: list[dict[str, Any]] = []
    with path.open(encoding="utf8") as f:
        for line_no, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                value = json.loads(line)
            except json.JSONDecodeError as err:
                # Keep the exception type callers may already catch, but say
                # which row of which file is broken, like the check below does.
                raise json.JSONDecodeError(
                    f"{path}:{line_no}: {err.msg}", err.doc, err.pos
                ) from err
            if not isinstance(value, dict):
                raise ValueError(f"{path}:{line_no}: expected JSON object")
            rows.append(value)
    return rows
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def require_text(instance: dict[str, Any], key: str) -> str:
    """Return the whitespace-trimmed string stored under ``key``.

    Args:
        instance: Decoded SWE-bench row.
        key: Field that must be present.

    Returns:
        The field value with leading and trailing whitespace removed.

    Raises:
        ValueError: If the field is absent, is not a string, or is empty
            after trimming.
    """
    raw = instance.get(key)
    # Non-string values are treated like an empty field.
    text = raw.strip() if isinstance(raw, str) else ""
    if text:
        return text
    raise ValueError(f"SWE-bench instance missing non-empty {key!r}")
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def pick_instance(path: Path, instance_id: str) -> dict[str, Any]:
    """Return the single row of the JSONL file whose id equals ``instance_id``.

    The comparison is an exact match against each row's raw ``"instance_id"``
    value.

    Args:
        path: JSONL file of SWE-bench instances.
        instance_id: Identifier to look up.

    Returns:
        The one matching row.

    Raises:
        ValueError: If zero rows or more than one row match.
    """
    candidates = [
        candidate
        for candidate in read_instances(path)
        if candidate.get("instance_id") == instance_id
    ]
    if len(candidates) == 1:
        return candidates[0]
    raise ValueError(f"expected exactly one {instance_id!r} row in {path}, found {len(candidates)}")
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def repo_cache_name(repo: str, base_commit: str) -> str:
    """Build the cache directory name for a repository at a given commit.

    Args:
        repo: Repository slug; every ``/`` is replaced with ``__``.
        base_commit: Commit id; only its first 12 characters are used.

    Returns:
        ``<flattened repo>-<first 12 characters of base_commit>``.
    """
    flattened = "__".join(repo.split("/"))
    short_commit = base_commit[:12]
    return f"{flattened}-{short_commit}"
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def prepare_repo(instance: dict[str, Any], repos_root: Path) -> Path:
    """Ensure a pristine cached checkout of the instance's repository exists.

    The repository is cloned from GitHub when its cache directory is absent.
    It is then fetched, switched to the instance's base commit, hard-reset and
    stripped of untracked and ignored files.

    Args:
        instance: SWE-bench row providing ``repo`` and ``base_commit``.
        repos_root: Directory holding the cached clones; created if missing.

    Returns:
        Path of the cached checkout.

    Raises:
        ValueError: If ``repo`` or ``base_commit`` is missing or empty.
        subprocess.CalledProcessError: If any git command fails.
    """
    repo = require_text(instance, "repo")
    base_commit = require_text(instance, "base_commit")
    repos_root.mkdir(parents=True, exist_ok=True)
    checkout_dir = repos_root / repo_cache_name(repo, base_commit)

    if not checkout_dir.exists():
        run(["git", "clone", "--quiet", f"https://github.com/{repo}.git", str(checkout_dir)])

    # Refresh refs, pin the base commit, then discard any leftover state.
    git_steps = (
        ["fetch", "--quiet", "--all", "--tags"],
        ["checkout", "--quiet", base_commit],
        ["reset", "--hard", "--quiet"],
        ["clean", "-ffdqx"],
    )
    for step in git_steps:
        run(["git", *step], cwd=checkout_dir)
    return checkout_dir
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def copy_worktree(repo_path: Path, worktree: Path) -> None:
    """Create a fresh, independent clone of ``repo_path`` at ``worktree``.

    Any existing directory at ``worktree`` is deleted first. The clone is made
    with ``--no-hardlinks`` and is then checked out at HEAD, hard-reset and
    cleaned.

    Args:
        repo_path: Local repository to clone from.
        worktree: Destination directory; replaced if it already exists.

    Raises:
        subprocess.CalledProcessError: If any git command fails.
    """
    if worktree.exists():
        shutil.rmtree(worktree)
    run(["git", "clone", "--quiet", "--no-hardlinks", str(repo_path), str(worktree)])

    normalize_steps = (
        ["checkout", "--quiet", "HEAD"],
        ["reset", "--hard", "--quiet"],
        ["clean", "-ffdqx"],
    )
    for step in normalize_steps:
        run(["git", *step], cwd=worktree)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def write_spec(instance: dict[str, Any], worktree: Path) -> Path:
    """Write the Markdown spec for an instance into the worktree.

    The spec is stored at ``docs/roadmap/phase-1/<instance_id>.md`` under
    ``worktree``. It holds YAML front matter, the repository and base commit,
    a fixed requirements checklist, the problem statement, constraints and a
    verification note.

    Args:
        instance: SWE-bench row providing ``instance_id``, ``repo``,
            ``base_commit`` and ``problem_statement``.
        worktree: Root of the checkout that receives the spec.

    Returns:
        Path of the written spec file.

    Raises:
        ValueError: If any required field is missing or empty.
    """
    instance_id = require_text(instance, "instance_id")
    repo = require_text(instance, "repo")
    base_commit = require_text(instance, "base_commit")
    problem = require_text(instance, "problem_statement")

    spec_dir = worktree / "docs" / "roadmap" / "phase-1"
    spec_dir.mkdir(parents=True, exist_ok=True)
    spec_path = spec_dir / f"{instance_id}.md"

    spec_text = f"""---
id: "{instance_id}"
title: "SWE-bench {instance_id}"
status: planned
complexity: high
depends-on: []
---

# SWE-bench {instance_id}

Repository: `{repo}`
Base commit: `{base_commit}`

## Requirements

- [ ] Resolve the issue described in the problem statement.
- [ ] Preserve existing behavior outside the issue's scope.
- [ ] Keep the implementation consistent with the repository's local style and
dependency policy.
- [ ] Add focused regression coverage when practical.

## Problem Statement

{problem}

## Constraints

- Do not inspect or rely on the SWE-bench gold `patch` or `test_patch` fields.
- Do not add broad rewrites, unrelated formatting churn, or new dependencies
unless the visible problem statement strictly requires them.

## Verification

- Run the most focused practical verification for the changed behavior.
"""
    spec_path.write_text(spec_text, encoding="utf8")
    return spec_path
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def copy_devlyn_context(worktree: Path) -> None:
    """Copy the skills tree and ``CLAUDE.md`` into the worktree, when present.

    Both sources are looked up relative to the current working directory:
    ``config/skills`` is copied to ``<worktree>/.claude/skills`` and
    ``CLAUDE.md`` to ``<worktree>/CLAUDE.md``. A source that does not exist is
    silently skipped.

    Args:
        worktree: Checkout that receives the copied files.
    """
    skills_source = Path("config/skills")
    if skills_source.exists():
        skills_target = worktree.joinpath(".claude", "skills")
        # Remove any earlier copy first; copytree needs a free destination.
        if skills_target.exists():
            shutil.rmtree(skills_target)
        shutil.copytree(skills_source, skills_target)

    claude_source = Path("CLAUDE.md")
    if claude_source.exists():
        shutil.copy2(claude_source, worktree.joinpath("CLAUDE.md"))
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def main() -> int:
    """Prepare a solver worktree for one SWE-bench instance.

    Parses the command line, looks up the requested instance, prepares the
    cached repository, clones it into a per-instance worktree, writes the spec
    and ``solve-prompt.txt``, and prints a JSON summary of the created paths
    to stdout.

    Command-line options:
        --instances-jsonl: JSONL file of SWE-bench instances (required).
        --instance-id: Identifier of the instance to prepare (required).
        --repos-root: Directory for cached clones.
        --worktrees-root: Directory for per-instance worktrees.
        --copy-devlyn-context: Also copy the skills tree and ``CLAUDE.md``.

    Returns:
        ``0`` on success.

    Raises:
        ValueError: If the instance cannot be selected, lacks a required
            field, or has an id that is unsafe to use as a directory name.
        subprocess.CalledProcessError: If a git command fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--instances-jsonl", required=True, type=Path)
    parser.add_argument("--instance-id", required=True)
    parser.add_argument(
        "--repos-root",
        default=Path("benchmark/auto-resolve/external/swebench/repos-solver"),
        type=Path,
    )
    parser.add_argument(
        "--worktrees-root",
        default=Path("benchmark/auto-resolve/external/swebench/worktrees"),
        type=Path,
    )
    parser.add_argument("--copy-devlyn-context", action="store_true")
    args = parser.parse_args()

    instance = pick_instance(args.instances_jsonl, args.instance_id)
    instance_id = require_text(instance, "instance_id")
    # "." and ".." consist only of allowed characters but are not real names:
    # joined onto worktrees_root they resolve to the root itself or its parent,
    # which copy_worktree would then delete before cloning.
    if not SAFE_ID.fullmatch(instance_id) or instance_id in {".", ".."}:
        raise ValueError(f"unsafe instance_id for path/spec use: {instance_id!r}")

    repo_path = prepare_repo(instance, args.repos_root)
    worktree = args.worktrees_root / instance_id
    args.worktrees_root.mkdir(parents=True, exist_ok=True)
    copy_worktree(repo_path, worktree)
    spec_path = write_spec(instance, worktree)
    if args.copy_devlyn_context:
        copy_devlyn_context(worktree)

    prompt = (
        f"You are solving SWE-bench instance {instance_id} in this checked-out repository at "
        "the base commit. Do not inspect any gold SWE-bench patch or test_patch. Read the "
        f"local code and the spec at {spec_path.relative_to(worktree)}. Make the smallest "
        "correct source/test change for the visible issue. Run a focused verification "
        "command. At the end, report changed files, verification command, and verdict."
    )
    prompt_file = worktree / "solve-prompt.txt"
    prompt_file.write_text(prompt + "\n", encoding="utf8")
    print(
        json.dumps(
            {
                "instance_id": instance_id,
                "repo_dir": str(repo_path),
                "worktree": str(worktree),
                "spec_path": str(spec_path),
                "prompt_file": str(prompt_file),
            },
            indent=2,
        )
    )
    return 0
|
|
189
|
+
|
|
190
|
+
|
|
191
|
+
# Script entry point: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|