devlyn-cli 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CLAUDE.md +1 -1
- package/benchmark/auto-resolve/README.md +318 -2
- package/benchmark/auto-resolve/RUBRIC.md +6 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
- package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
- package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
- package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
- package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
- package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
- package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
- package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
- package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
- package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
- package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
- package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
- package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
- package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
- package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
- package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
- package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
- package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
- package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
- package/benchmark/auto-resolve/scripts/judge.sh +82 -3
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
- package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
- package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
- package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
- package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
- package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
- package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
- package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
- package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
- package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
- package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
- package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
- package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
- package/config/skills/_shared/archive_run.py +3 -0
- package/config/skills/_shared/codex-config.md +2 -2
- package/config/skills/_shared/codex-monitored.sh +72 -7
- package/config/skills/_shared/collect-codex-findings.py +125 -0
- package/config/skills/_shared/engine-preflight.md +1 -1
- package/config/skills/_shared/expected.schema.json +18 -0
- package/config/skills/_shared/spec-verify-check.py +312 -10
- package/config/skills/_shared/verify-merge-findings.py +327 -0
- package/config/skills/devlyn:resolve/SKILL.md +62 -8
- package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
- package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
- package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
- package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
- package/package.json +1 -1
- package/scripts/lint-skills.sh +32 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Headroom gate for candidate L2/pair fixtures.
|
|
3
|
+
|
|
4
|
+
Pair lift is not measurable when bare and solo already score near the ceiling.
|
|
5
|
+
This gate checks the precondition recorded in HANDOFF.md: before an L2 pair
|
|
6
|
+
measurement is pre-registered, candidate fixtures must leave enough room for
|
|
7
|
+
pair to improve the outcome.
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import pathlib
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def load_json(path: pathlib.Path) -> dict | None:
|
|
18
|
+
if not path.is_file():
|
|
19
|
+
return None
|
|
20
|
+
return json.loads(path.read_text())
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def score_for(judge: dict, arm: str) -> int | None:
|
|
24
|
+
scores = judge.get("scores_by_arm") or {}
|
|
25
|
+
value = scores.get(arm)
|
|
26
|
+
return value if isinstance(value, int) else None
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def arm_clean_failures(fixture_dir: pathlib.Path, judge: dict, arm: str) -> list[str]:
    """Collect reasons why *arm* does not count as a clean run.

    An arm is clean when both result.json and verify.json exist and neither
    the judge record nor the artifacts themselves flag a disqualifier,
    timeout, or invoke failure. Returns an empty list for a clean arm.
    """
    arm_dir = fixture_dir / arm
    result = load_json(arm_dir / "result.json")
    verify = load_json(arm_dir / "verify.json")

    problems: list[str] = []
    if result is None:
        problems.append(f"{arm} result.json missing")
    if verify is None:
        problems.append(f"{arm} verify.json missing")

    # Judge-side disqualifier, tolerating a missing/None per-arm record.
    judge_dq = (judge.get("disqualifiers_by_arm") or {}).get(arm) or {}
    if judge_dq.get("disqualifier"):
        problems.append(f"{arm} judge disqualifier")

    if result is not None:
        for flag, label in (
            ("disqualifier", f"{arm} result disqualifier"),
            ("timed_out", f"{arm} timed out"),
            ("invoke_failure", f"{arm} invoke failure"),
        ):
            if result.get(flag):
                problems.append(label)

    if verify is not None and verify.get("disqualifier"):
        problems.append(f"{arm} verify disqualifier")
    return problems
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def main() -> int:
    """Evaluate the headroom gate for one run and emit JSON/Markdown reports.

    Reads per-fixture judge.json files under <results-root>/<run-id>/, checks
    that bare and solo_claude scores leave headroom (<= the configured caps)
    with both arms clean, and returns 0 on an overall PASS, 1 on FAIL, 2 when
    the results directory is missing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--run-id", required=True)
    parser.add_argument("--results-root", default="benchmark/auto-resolve/results")
    parser.add_argument("--bare-max", type=int, default=60)
    parser.add_argument("--solo-max", type=int, default=80)
    parser.add_argument("--min-fixtures", type=int, default=2)
    parser.add_argument("--out-json", default=None)
    parser.add_argument("--out-md", default=None)
    args = parser.parse_args()

    res_root = pathlib.Path(args.results_root) / args.run_id
    if not res_root.is_dir():
        print(f"no results dir: {res_root}", file=sys.stderr)
        return 2

    # One row per fixture directory; a fixture without judge.json is recorded
    # as MISSING_JUDGE (which is neither PASS nor FAIL-with-scores).
    rows = []
    for fixture_dir in sorted(p for p in res_root.iterdir() if p.is_dir()):
        judge = load_json(fixture_dir / "judge.json")
        if judge is None:
            rows.append({
                "fixture": fixture_dir.name,
                "status": "MISSING_JUDGE",
                "reason": "judge.json missing",
            })
            continue
        bare = score_for(judge, "bare")
        solo = score_for(judge, "solo_claude")
        bare_clean_failures = arm_clean_failures(fixture_dir, judge, "bare")
        solo_clean_failures = arm_clean_failures(fixture_dir, judge, "solo_claude")
        # Headroom precondition: score present, at or below its cap, and no
        # cleanliness failures for the arm.
        bare_ok = bare is not None and bare <= args.bare_max and not bare_clean_failures
        solo_ok = solo is not None and solo <= args.solo_max and not solo_clean_failures
        status = "PASS" if bare_ok and solo_ok else "FAIL"
        reasons = []
        if bare is None:
            reasons.append("bare score missing")
        elif bare > args.bare_max:
            reasons.append(f"bare score {bare} > {args.bare_max}")
        if solo is None:
            reasons.append("solo_claude score missing")
        elif solo > args.solo_max:
            reasons.append(f"solo_claude score {solo} > {args.solo_max}")
        reasons.extend(bare_clean_failures)
        reasons.extend(solo_clean_failures)
        rows.append({
            "fixture": fixture_dir.name,
            "status": status,
            "bare_score": bare,
            "solo_score": solo,
            "reason": "; ".join(reasons) if reasons else "",
        })

    # Overall verdict: every row PASS, at least one row, and enough fixtures.
    pass_count = sum(1 for row in rows if row["status"] == "PASS")
    fixture_count_ok = len(rows) >= args.min_fixtures
    verdict = "PASS" if pass_count == len(rows) and rows and fixture_count_ok else "FAIL"
    payload = {
        "run_id": args.run_id,
        "rule": f"at least {args.min_fixtures} candidate fixtures; each must satisfy bare <= {args.bare_max} and solo_claude <= {args.solo_max}, with both arms clean",
        "verdict": verdict,
        "fixtures_total": len(rows),
        "fixtures_passed": pass_count,
        "min_fixtures": args.min_fixtures,
        "fixture_count_ok": fixture_count_ok,
        "rows": rows,
    }

    if args.out_json:
        pathlib.Path(args.out_json).write_text(json.dumps(payload, indent=2) + "\n")

    # Human-readable Markdown report mirroring the JSON payload.
    lines = [
        f"# Headroom Gate — {args.run_id}",
        "",
        f"Verdict: **{verdict}**",
        "",
        f"Rule: at least {args.min_fixtures} fixtures; bare <= {args.bare_max}, "
        f"solo_claude <= {args.solo_max}, both arms clean.",
        "",
        "| Fixture | Bare | Solo | Status | Reason |",
        "|---|---:|---:|---|---|",
    ]
    for row in rows:
        lines.append(
            f"| {row['fixture']} | {row.get('bare_score')} | {row.get('solo_score')} | "
            f"{row['status']} | {row.get('reason', '')} |"
        )
    report = "\n".join(lines) + "\n"
    # Write to --out-md when given; otherwise print to stdout.
    if args.out_md:
        pathlib.Path(args.out_md).write_text(report)
    else:
        print(report)

    return 0 if verdict == "PASS" else 1


if __name__ == "__main__":
    sys.exit(main())
|
|
@@ -41,11 +41,13 @@ RES_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE"
|
|
|
41
41
|
# date iter-0019.
|
|
42
42
|
ARMS_PRESENT=()
|
|
43
43
|
# iter-0033c: l2_gated/l2_forced added for NEW L2 vs NEW L1 measurement.
|
|
44
|
+
# iter-0037: l2_risk_probes adds bounded visible-verification probes before
|
|
45
|
+
# IMPLEMENT; judge treats it as another blind arm when artifacts exist.
|
|
44
46
|
# Slot count is still A/B/C max 3 — pair-eligible iter-0033c fixtures supply
|
|
45
47
|
# {solo_claude, l2_gated, l2_forced}; non-pair-eligible fixtures supply
|
|
46
48
|
# {solo_claude, l2_gated}. The blind-shuffle slot mapping below already
|
|
47
49
|
# tolerates arbitrary ARMS_PRESENT counts ≥2.
|
|
48
|
-
for arm in variant solo_claude bare l2_gated l2_forced; do
|
|
50
|
+
for arm in variant solo_claude bare l2_gated l2_risk_probes l2_forced; do
|
|
49
51
|
if [ -f "$RES_DIR/$arm/diff.patch" ] && [ -f "$RES_DIR/$arm/verify.json" ]; then
|
|
50
52
|
ARMS_PRESENT+=("$arm")
|
|
51
53
|
fi
|
|
@@ -216,8 +218,34 @@ PY
|
|
|
216
218
|
# traceable. Run from a clean temp CWD so the judge can't peek at project
|
|
217
219
|
# files that would leak arm identity.
|
|
218
220
|
command -v codex >/dev/null 2>&1 || { echo "codex CLI not on PATH; cannot judge"; exit 1; }
|
|
219
|
-
CODEX_CLI_VER=$(
|
|
220
|
-
|
|
221
|
+
CODEX_CLI_VER=$(python3 - <<'PY'
|
|
222
|
+
import subprocess
|
|
223
|
+
|
|
224
|
+
try:
|
|
225
|
+
proc = subprocess.run(
|
|
226
|
+
["codex", "--version"],
|
|
227
|
+
stdout=subprocess.PIPE,
|
|
228
|
+
stderr=subprocess.PIPE,
|
|
229
|
+
text=True,
|
|
230
|
+
timeout=5,
|
|
231
|
+
)
|
|
232
|
+
out = (proc.stdout or proc.stderr).strip()
|
|
233
|
+
if proc.returncode == 0 and out:
|
|
234
|
+
print(out)
|
|
235
|
+
else:
|
|
236
|
+
print(f"codex-cli unknown (version-exit-{proc.returncode})")
|
|
237
|
+
except subprocess.TimeoutExpired:
|
|
238
|
+
print("codex-cli unknown (version-timeout)")
|
|
239
|
+
except FileNotFoundError:
|
|
240
|
+
print("codex-cli missing")
|
|
241
|
+
except Exception as exc:
|
|
242
|
+
print(f"codex-cli unknown ({type(exc).__name__})")
|
|
243
|
+
PY
|
|
244
|
+
)
|
|
245
|
+
JUDGE_MODEL=$({ grep -E '^model[[:space:]]*=' "${HOME}/.codex/config.toml" 2>/dev/null || true; } \
|
|
246
|
+
| head -1 \
|
|
247
|
+
| sed -E 's/.*=[[:space:]]*"?([^"]*)"?[[:space:]]*$/\1/' \
|
|
248
|
+
| xargs)
|
|
221
249
|
[ -z "$JUDGE_MODEL" ] && JUDGE_MODEL="(unknown — codex config.toml not readable)"
|
|
222
250
|
|
|
223
251
|
JUDGE_CWD="/tmp/judge-$RUN_ID-$FIXTURE"
|
|
@@ -237,6 +265,7 @@ fi
|
|
|
237
265
|
|
|
238
266
|
# Extract JSON (codex wraps with banners; pick the last {...} block)
|
|
239
267
|
python3 - "$JUDGE_OUT" "$RES_DIR/judge.json" "$A_ARM" "$B_ARM" "$C_ARM" "$SEED" "$CODEX_CLI_VER" "$JUDGE_MODEL" <<'PY'
|
|
268
|
+
import math
|
|
240
269
|
import sys, re, json, pathlib
|
|
241
270
|
out = pathlib.Path(sys.argv[1]).read_text()
|
|
242
271
|
target = pathlib.Path(sys.argv[2])
|
|
@@ -298,6 +327,56 @@ if axis_invalid_cells:
|
|
|
298
327
|
f"clamped: {axis_invalid_cells}\n"
|
|
299
328
|
)
|
|
300
329
|
|
|
330
|
+
# Verification is the machine-readable acceptance contract. RUBRIC.md puts
|
|
331
|
+
# verification behavior under Spec Compliance, but LLM judges can still grade
|
|
332
|
+
# generous prose around failed commands. Cap score mechanically so an arm that
|
|
333
|
+
# fails required verification cannot receive a ceiling score.
|
|
334
|
+
def arm_verify_score(arm: str):
|
|
335
|
+
path = target.parent / arm / "verify.json"
|
|
336
|
+
if not path.is_file():
|
|
337
|
+
return None
|
|
338
|
+
data = json.loads(path.read_text())
|
|
339
|
+
value = data.get("verify_score")
|
|
340
|
+
return float(value) if isinstance(value, (int, float)) else None
|
|
341
|
+
|
|
342
|
+
verify_caps = []
|
|
343
|
+
for letter, score_key, breakdown_key in (
|
|
344
|
+
("A", "a_score", "a_breakdown"),
|
|
345
|
+
("B", "b_score", "b_breakdown"),
|
|
346
|
+
("C", "c_score", "c_breakdown"),
|
|
347
|
+
):
|
|
348
|
+
arm = mapping.get(letter)
|
|
349
|
+
if not arm:
|
|
350
|
+
continue
|
|
351
|
+
verify_score = arm_verify_score(arm)
|
|
352
|
+
if verify_score is None:
|
|
353
|
+
continue
|
|
354
|
+
verify_score = max(0.0, min(1.0, verify_score))
|
|
355
|
+
score_cap = math.floor(100 * verify_score)
|
|
356
|
+
spec_cap = math.floor(25 * verify_score)
|
|
357
|
+
raw_score = chosen.get(score_key)
|
|
358
|
+
raw_spec = (chosen.get(breakdown_key) or {}).get("spec")
|
|
359
|
+
row = {
|
|
360
|
+
"letter": letter,
|
|
361
|
+
"arm": arm,
|
|
362
|
+
"verify_score": verify_score,
|
|
363
|
+
"score_cap": score_cap,
|
|
364
|
+
"spec_cap": spec_cap,
|
|
365
|
+
"raw_score": raw_score,
|
|
366
|
+
"raw_spec": raw_spec,
|
|
367
|
+
"score_capped": False,
|
|
368
|
+
"spec_capped": False,
|
|
369
|
+
}
|
|
370
|
+
if isinstance(raw_score, (int, float)) and raw_score > score_cap:
|
|
371
|
+
chosen[score_key] = score_cap
|
|
372
|
+
row["score_capped"] = True
|
|
373
|
+
breakdown = chosen.get(breakdown_key)
|
|
374
|
+
if isinstance(breakdown, dict) and isinstance(raw_spec, (int, float)) and raw_spec > spec_cap:
|
|
375
|
+
breakdown["spec"] = spec_cap
|
|
376
|
+
row["spec_capped"] = True
|
|
377
|
+
verify_caps.append(row)
|
|
378
|
+
chosen["_verify_score_caps"] = verify_caps
|
|
379
|
+
|
|
301
380
|
# scores_by_arm: arm-name → score, computed from the blind A/B/C scores.
|
|
302
381
|
# This is the canonical 3-arm-aware shape the report consumer reads. The
|
|
303
382
|
# legacy variant_score / bare_score / margin fields below are derived from
|
|
@@ -0,0 +1,244 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Prepare a SWE-bench instance for frozen VERIFY solo-vs-pair review.
|
|
3
|
+
|
|
4
|
+
The script does not run models and does not evaluate SWE-bench correctness.
|
|
5
|
+
It converts one official SWE-bench-style instance plus one candidate patch into
|
|
6
|
+
the case layout consumed by run-frozen-verify-pair.sh.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import re
|
|
14
|
+
import shlex
|
|
15
|
+
import shutil
|
|
16
|
+
import subprocess
|
|
17
|
+
from pathlib import Path
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
SAFE_ID = re.compile(r"^[A-Za-z0-9_.-]+$")
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def run(cmd: list[str], cwd: Path | None = None) -> None:
|
|
25
|
+
subprocess.run(cmd, cwd=cwd, check=True)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def read_json(path: Path) -> dict[str, Any]:
    """Load *path* as JSON and require the top-level value to be an object."""
    with path.open(encoding="utf8") as handle:
        parsed = json.load(handle)
    if isinstance(parsed, dict):
        return parsed
    raise ValueError(f"expected JSON object: {path}")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def require_text(instance: dict[str, Any], key: str) -> str:
    """Fetch *key* from *instance* as stripped, non-empty text or raise ValueError."""
    raw = instance.get(key)
    if isinstance(raw, str):
        text = raw.strip()
        if text:
            return text
    raise ValueError(f"SWE-bench instance missing non-empty {key!r}")
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def repo_cache_name(repo: str, base_commit: str) -> str:
    """Build a filesystem-safe cache directory name for *repo* at *base_commit*."""
    # "/" would create nested paths; map it to "__" and keep a short commit prefix.
    return "{}-{}".format(repo.replace("/", "__"), base_commit[:12])
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def prepare_repo(instance: dict[str, Any], repo_dir: Path | None, repos_root: Path) -> Path:
    """Materialize a pristine checkout of the instance's repo at base_commit.

    Clones into a per-(repo, commit) cache directory under *repos_root*. When
    *repo_dir* is given it is used as the clone source and the cache entry is
    rebuilt from scratch; otherwise GitHub is cloned only when the cache entry
    does not already exist. Returns the checkout path.
    """
    repo = require_text(instance, "repo")
    base_commit = require_text(instance, "base_commit")
    repos_root.mkdir(parents=True, exist_ok=True)
    dest = repos_root / repo_cache_name(repo, base_commit)

    if repo_dir is not None:
        # Local source: always re-clone so stale state from a previous run
        # cannot leak into the prepared case.
        if dest.exists():
            shutil.rmtree(dest)
        run(["git", "clone", "--quiet", "--no-hardlinks", str(repo_dir), str(dest)])
    elif not dest.exists():
        run(["git", "clone", "--quiet", f"https://github.com/{repo}.git", str(dest)])

    # Force the worktree to an exact, clean base_commit: fetch everything,
    # check out the commit, then drop local modifications and untracked files
    # (including ignored ones, via the -x in clean -ffdqx).
    run(["git", "fetch", "--quiet", "--all", "--tags"], cwd=dest)
    run(["git", "checkout", "--quiet", base_commit], cwd=dest)
    run(["git", "reset", "--hard", "--quiet"], cwd=dest)
    run(["git", "clean", "-ffdqx"], cwd=dest)
    return dest
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def write_case_files(
    instance: dict[str, Any],
    case_dir: Path,
    patch_text: str,
    timeout_seconds: int,
) -> None:
    """Write the full frozen-VERIFY case layout for one SWE-bench instance.

    Produces metadata.json, spec.md, task.txt, expected.json, setup.sh,
    NOTES.md, and model.patch under *case_dir*. Raises ValueError (via
    require_text) when a required instance field is missing or empty.
    """
    instance_id = require_text(instance, "instance_id")
    repo = require_text(instance, "repo")
    base_commit = require_text(instance, "base_commit")
    problem = require_text(instance, "problem_statement")
    case_dir.mkdir(parents=True, exist_ok=True)

    # Fixture metadata consumed by the benchmark harness; "source" records
    # provenance back to the original SWE-bench instance.
    metadata = {
        "id": instance_id,
        "category": "high-risk",
        "difficulty": instance.get("difficulty") or "swebench",
        "timeout_seconds": timeout_seconds,
        "required_tools": ["git", "python3"],
        "browser": False,
        "deps_change_expected": True,
        "intent": f"SWE-bench issue for {repo} at {base_commit}: resolve the supplied problem statement without using the gold patch.",
        "source": {
            "benchmark": "SWE-bench",
            "repo": repo,
            "base_commit": base_commit,
            "issue_url": instance.get("issue_url"),
            "pr_url": instance.get("pr_url"),
            "version": instance.get("version"),
        },
    }
    (case_dir / "metadata.json").write_text(json.dumps(metadata, indent=2) + "\n", encoding="utf8")

    # spec.md: YAML front matter plus the visible user contract; the problem
    # statement is interpolated verbatim.
    spec = f"""---
id: "{instance_id}"
title: "SWE-bench {instance_id}"
status: planned
complexity: high
depends-on: []
---

# SWE-bench {instance_id}

## Context

Repository: `{repo}`
Base commit: `{base_commit}`

This case is imported from a SWE-bench-style instance. Treat the problem
statement below as the visible user contract. Do not use the gold `patch` or
`test_patch` fields as implementation guidance during model generation or
review.

## Requirements

- [ ] Resolve the reported issue described in the problem statement.
- [ ] Preserve existing behavior outside the issue's scope.
- [ ] Keep the implementation consistent with the repository's local style and
  dependency policy.
- [ ] Surface failures explicitly; do not hide errors behind silent fallbacks.

## Problem Statement

{problem}

## Constraints

- Do not inspect or rely on the SWE-bench gold solution patch while producing
  or judging a candidate patch.
- Do not add broad rewrites, unrelated formatting churn, or new dependencies
  unless the problem statement strictly requires them.
- Frozen VERIFY compares reviewers on the same already-applied candidate patch;
  it is review evidence, not a full SWE-bench solve-rate measurement.

## Verification

- Run the official SWE-bench evaluator separately for solve-rate evidence.
- Use `/devlyn:resolve --verify-only` here only to compare solo vs gated pair
  review of the frozen candidate patch against the visible problem statement.
"""
    (case_dir / "spec.md").write_text(spec, encoding="utf8")
    (case_dir / "task.txt").write_text(problem + "\n", encoding="utf8")
    # Deliberately permissive expected.json: frozen VERIFY supplies no
    # machine-checkable commands for imported SWE-bench cases.
    (case_dir / "expected.json").write_text(
        json.dumps(
            {
                "verification_commands": [],
                "forbidden_patterns": [],
                "required_files": [],
                "forbidden_files": [],
                "tier_a_waivers": [],
                "spec_output_files": [],
                "max_deps_added": 999,
            },
            indent=2,
        )
        + "\n",
        encoding="utf8",
    )
    # No-op setup script, marked executable for the harness.
    (case_dir / "setup.sh").write_text("#!/usr/bin/env bash\nset -euo pipefail\n", encoding="utf8")
    (case_dir / "setup.sh").chmod(0o755)
    notes = f"""# {instance_id} — SWE-bench Frozen VERIFY Case

Source repo: `{repo}`
Base commit: `{base_commit}`

This case exists to measure whether gated pair VERIFY catches verdict-binding
review issues that solo VERIFY misses on a fixed candidate patch. It does not
replace official SWE-bench pass/fail evaluation.
"""
    (case_dir / "NOTES.md").write_text(notes, encoding="utf8")
    # The frozen candidate patch under review is stored alongside the case.
    (case_dir / "model.patch").write_text(patch_text, encoding="utf8")
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def main() -> int:
    """Prepare one SWE-bench instance as a frozen VERIFY case.

    Reads the instance JSON and candidate model patch, materializes the
    case directory, and prints a JSON summary including the exact pair
    review command to run. Returns 0 on success.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--instance-json", required=True, type=Path)
    ap.add_argument("--model-patch", required=True, type=Path)
    ap.add_argument(
        "--cases-root",
        type=Path,
        default=Path("benchmark/auto-resolve/external/swebench/cases"),
    )
    ap.add_argument(
        "--repos-root",
        type=Path,
        default=Path("benchmark/auto-resolve/external/swebench/repos"),
    )
    ap.add_argument(
        "--repo-dir",
        type=Path,
        help="Local clone/source repo to copy instead of cloning GitHub; useful for tests and cached runs.",
    )
    ap.add_argument("--timeout-seconds", type=int, default=2400)
    ns = ap.parse_args()

    instance = read_json(ns.instance_json)
    instance_id = require_text(instance, "instance_id")
    # The id becomes a directory name and is embedded in specs; reject unsafe ids.
    if not SAFE_ID.match(instance_id):
        raise ValueError(f"unsafe instance_id for path/spec use: {instance_id!r}")
    patch_text = ns.model_patch.read_text(encoding="utf8")
    if not patch_text.strip():
        raise ValueError(f"model patch is empty: {ns.model_patch}")

    repo_dir = prepare_repo(instance, ns.repo_dir, ns.repos_root)
    case_dir = ns.cases_root / instance_id
    write_case_files(instance, case_dir, patch_text, ns.timeout_seconds)

    # Record the exact reviewer-pair command so runs are reproducible.
    run_cmd = [
        "bash",
        "benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh",
        "--fixture",
        instance_id,
        "--fixtures-root",
        str(ns.cases_root),
        "--base-repo",
        str(repo_dir),
        "--diff",
        str(case_dir / "model.patch"),
        "--pair-mode",
        "gated",
    ]
    (case_dir / "run-command.txt").write_text(shlex.join(run_cmd) + "\n", encoding="utf8")

    summary = {
        "instance_id": instance_id,
        "case_dir": str(case_dir),
        "repo_dir": str(repo_dir),
        "run_command": run_cmd,
    }
    print(json.dumps(summary, indent=2))
    return 0
|
|
241
|
+
|
|
242
|
+
|
|
243
|
+
# Script entry point: exit with main()'s integer return code.
if __name__ == "__main__":
    raise SystemExit(main())
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Prepare a SWE-bench prediction JSONL as frozen VERIFY review cases."""
|
|
3
|
+
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import argparse
|
|
7
|
+
import json
|
|
8
|
+
import subprocess
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Parse *path* as JSON Lines, skipping blank lines.

    Each non-blank line must decode to a JSON object; otherwise a
    ValueError carrying ``path:line_no`` context is raised.
    """
    records: list[dict[str, Any]] = []
    with path.open(encoding="utf8") as handle:
        for number, raw in enumerate(handle, start=1):
            if not raw.strip():
                # Tolerate blank separator lines in hand-edited files.
                continue
            parsed = json.loads(raw)
            if not isinstance(parsed, dict):
                raise ValueError(f"{path}:{number}: expected JSON object")
            records.append(parsed)
    return records
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def require_text(row: dict[str, Any], key: str, source: str) -> str:
    """Return ``row[key]`` stripped of surrounding whitespace.

    Raises ValueError when the value is absent, not a string, or blank;
    *source* names the offending input in the error message.
    """
    raw = row.get(key)
    if isinstance(raw, str):
        text = raw.strip()
        if text:
            return text
    raise ValueError(f"{source} missing non-empty {key!r}")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def main() -> int:
    """Prepare frozen VERIFY cases for every selected SWE-bench prediction.

    Joins the instances JSONL with the predictions JSONL on ``instance_id``,
    invokes ``prepare-swebench-frozen-case.py`` once per selected instance,
    and prints (and optionally writes) a manifest of the prepared cases.

    Returns 0 on success. Raises ValueError on malformed or missing inputs
    and RuntimeError when a per-instance preparation subprocess fails.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--instances-jsonl", required=True, type=Path)
    parser.add_argument("--predictions-jsonl", required=True, type=Path)
    parser.add_argument(
        "--cases-root",
        default=Path("benchmark/auto-resolve/external/swebench/cases"),
        type=Path,
    )
    parser.add_argument(
        "--repos-root",
        default=Path("benchmark/auto-resolve/external/swebench/repos"),
        type=Path,
    )
    parser.add_argument("--repo-dir", type=Path, help="Use one local repo clone for every selected instance.")
    parser.add_argument("--instance-id", action="append", help="Prepare only these instance ids.")
    parser.add_argument("--limit", type=int, help="Prepare at most N matched instances after filtering.")
    parser.add_argument("--timeout-seconds", type=int, default=2400)
    parser.add_argument("--out-manifest", type=Path)
    args = parser.parse_args()

    # Index instances by id; predictions are checked for duplicate ids.
    instances = {
        require_text(row, "instance_id", f"{args.instances_jsonl}"): row
        for row in read_jsonl(args.instances_jsonl)
    }
    predictions: dict[str, dict[str, Any]] = {}
    for row in read_jsonl(args.predictions_jsonl):
        instance_id = require_text(row, "instance_id", f"{args.predictions_jsonl}")
        if instance_id in predictions:
            raise ValueError(f"duplicate prediction for {instance_id}")
        predictions[instance_id] = row

    # Default selection is every predicted instance, in prediction-file order.
    selected_ids = args.instance_id or list(predictions)
    script = Path(__file__).with_name("prepare-swebench-frozen-case.py")
    prepared: list[dict[str, Any]] = []
    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        for instance_id in selected_ids:
            if args.limit is not None and len(prepared) >= args.limit:
                break
            if instance_id not in instances:
                raise ValueError(f"prediction instance not found in instances JSONL: {instance_id}")
            prediction = predictions.get(instance_id)
            if prediction is None:
                raise ValueError(f"selected instance missing prediction: {instance_id}")
            patch_value = prediction.get("model_patch")
            if not isinstance(patch_value, str) or not patch_value.strip():
                raise ValueError(f"prediction {instance_id} missing non-empty 'model_patch'")

            # Stage per-instance inputs for the single-case preparer.
            instance_path = tmp_dir / f"{instance_id}.instance.json"
            patch_path = tmp_dir / f"{instance_id}.patch"
            instance_path.write_text(json.dumps(instances[instance_id], indent=2) + "\n", encoding="utf8")
            patch_path.write_text(patch_value, encoding="utf8")

            cmd = [
                "python3",
                str(script),
                "--instance-json",
                str(instance_path),
                "--model-patch",
                str(patch_path),
                "--cases-root",
                str(args.cases_root),
                "--repos-root",
                str(args.repos_root),
                "--timeout-seconds",
                str(args.timeout_seconds),
            ]
            if args.repo_dir is not None:
                cmd.extend(["--repo-dir", str(args.repo_dir)])
            try:
                completed = subprocess.run(cmd, check=True, text=True, capture_output=True)
            except subprocess.CalledProcessError as err:
                # With capture_output the child's stderr would otherwise be
                # swallowed, leaving only a bare exit code — surface it.
                raise RuntimeError(
                    f"prepare-swebench-frozen-case.py failed for {instance_id} "
                    f"(exit {err.returncode}):\n{err.stderr}"
                ) from err
            # The child prints a JSON summary of the prepared case on stdout.
            prepared.append(json.loads(completed.stdout))

    manifest = {
        "instances_jsonl": str(args.instances_jsonl),
        "predictions_jsonl": str(args.predictions_jsonl),
        "cases_root": str(args.cases_root),
        "repos_root": str(args.repos_root),
        "prepared_count": len(prepared),
        "prepared": prepared,
    }
    if args.out_manifest:
        args.out_manifest.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf8")
    print(json.dumps(manifest, indent=2))
    return 0
|
|
115
|
+
|
|
116
|
+
|
|
117
|
+
# Script entry point: exit with main()'s integer return code.
if __name__ == "__main__":
    raise SystemExit(main())
|