devlyn-cli 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127) hide show
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +321 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F1-cli-trivial-flag/spec.md +0 -1
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +51 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  12. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +50 -0
  18. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +57 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  25. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +51 -0
  31. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +57 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  40. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  41. package/benchmark/auto-resolve/fixtures/F2-cli-medium-subcommand/spec.md +0 -1
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +61 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  48. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  49. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +64 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  56. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  57. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +70 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  64. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  65. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +64 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  73. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  74. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +68 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  82. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  83. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  84. package/benchmark/auto-resolve/fixtures/F3-backend-contract-risk/spec.md +0 -1
  85. package/benchmark/auto-resolve/fixtures/F4-web-browser-design/spec.md +0 -1
  86. package/benchmark/auto-resolve/fixtures/F5-fix-loop-red-green/spec.md +0 -1
  87. package/benchmark/auto-resolve/fixtures/F6-dep-audit-native-module/spec.md +0 -1
  88. package/benchmark/auto-resolve/fixtures/F7-out-of-scope-trap/spec.md +0 -1
  89. package/benchmark/auto-resolve/fixtures/F9-e2e-ideate-to-resolve/spec.md +0 -3
  90. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  91. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  92. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  93. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  94. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  95. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  96. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  97. package/benchmark/auto-resolve/scripts/oracle-scope-tier-a.py +0 -11
  98. package/benchmark/auto-resolve/scripts/oracle-scope-tier-b.py +0 -10
  99. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  100. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  101. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  102. package/benchmark/auto-resolve/scripts/run-fixture.sh +257 -43
  103. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  104. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  105. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  106. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  107. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  108. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  109. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  110. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  111. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  112. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  113. package/config/skills/_shared/archive_run.py +3 -0
  114. package/config/skills/_shared/codex-config.md +2 -2
  115. package/config/skills/_shared/codex-monitored.sh +72 -7
  116. package/config/skills/_shared/collect-codex-findings.py +125 -0
  117. package/config/skills/_shared/engine-preflight.md +1 -1
  118. package/config/skills/_shared/expected.schema.json +18 -0
  119. package/config/skills/_shared/spec-verify-check.py +363 -10
  120. package/config/skills/_shared/verify-merge-findings.py +327 -0
  121. package/config/skills/devlyn:resolve/SKILL.md +69 -8
  122. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  123. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +183 -0
  124. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  125. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  126. package/package.json +1 -1
  127. package/scripts/lint-skills.sh +69 -20
@@ -0,0 +1,147 @@
1
+ #!/usr/bin/env python3
2
+ """Headroom gate for candidate L2/pair fixtures.
3
+
4
+ Pair lift is not measurable when bare and solo already score near the ceiling.
5
+ This gate checks the precondition recorded in HANDOFF.md: before an L2 pair
6
+ measurement is pre-registered, candidate fixtures must leave enough room for
7
+ pair to improve the outcome.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import pathlib
14
+ import sys
15
+
16
+
17
+ def load_json(path: pathlib.Path) -> dict | None:
18
+ if not path.is_file():
19
+ return None
20
+ return json.loads(path.read_text())
21
+
22
+
23
+ def score_for(judge: dict, arm: str) -> int | None:
24
+ scores = judge.get("scores_by_arm") or {}
25
+ value = scores.get(arm)
26
+ return value if isinstance(value, int) else None
27
+
28
+
29
+ def arm_clean_failures(fixture_dir: pathlib.Path, judge: dict, arm: str) -> list[str]:
30
+ failures: list[str] = []
31
+ result = load_json(fixture_dir / arm / "result.json")
32
+ verify = load_json(fixture_dir / arm / "verify.json")
33
+ if result is None:
34
+ failures.append(f"{arm} result.json missing")
35
+ if verify is None:
36
+ failures.append(f"{arm} verify.json missing")
37
+ dq_by_arm = judge.get("disqualifiers_by_arm") or {}
38
+ if bool((dq_by_arm.get(arm) or {}).get("disqualifier")):
39
+ failures.append(f"{arm} judge disqualifier")
40
+ if result is not None:
41
+ if bool(result.get("disqualifier")):
42
+ failures.append(f"{arm} result disqualifier")
43
+ if bool(result.get("timed_out")):
44
+ failures.append(f"{arm} timed out")
45
+ if bool(result.get("invoke_failure")):
46
+ failures.append(f"{arm} invoke failure")
47
+ if verify is not None and bool(verify.get("disqualifier")):
48
+ failures.append(f"{arm} verify disqualifier")
49
+ return failures
50
+
51
+
52
+ def main() -> int:
53
+ parser = argparse.ArgumentParser()
54
+ parser.add_argument("--run-id", required=True)
55
+ parser.add_argument("--results-root", default="benchmark/auto-resolve/results")
56
+ parser.add_argument("--bare-max", type=int, default=60)
57
+ parser.add_argument("--solo-max", type=int, default=80)
58
+ parser.add_argument("--min-fixtures", type=int, default=2)
59
+ parser.add_argument("--out-json", default=None)
60
+ parser.add_argument("--out-md", default=None)
61
+ args = parser.parse_args()
62
+
63
+ res_root = pathlib.Path(args.results_root) / args.run_id
64
+ if not res_root.is_dir():
65
+ print(f"no results dir: {res_root}", file=sys.stderr)
66
+ return 2
67
+
68
+ rows = []
69
+ for fixture_dir in sorted(p for p in res_root.iterdir() if p.is_dir()):
70
+ judge = load_json(fixture_dir / "judge.json")
71
+ if judge is None:
72
+ rows.append({
73
+ "fixture": fixture_dir.name,
74
+ "status": "MISSING_JUDGE",
75
+ "reason": "judge.json missing",
76
+ })
77
+ continue
78
+ bare = score_for(judge, "bare")
79
+ solo = score_for(judge, "solo_claude")
80
+ bare_clean_failures = arm_clean_failures(fixture_dir, judge, "bare")
81
+ solo_clean_failures = arm_clean_failures(fixture_dir, judge, "solo_claude")
82
+ bare_ok = bare is not None and bare <= args.bare_max and not bare_clean_failures
83
+ solo_ok = solo is not None and solo <= args.solo_max and not solo_clean_failures
84
+ status = "PASS" if bare_ok and solo_ok else "FAIL"
85
+ reasons = []
86
+ if bare is None:
87
+ reasons.append("bare score missing")
88
+ elif bare > args.bare_max:
89
+ reasons.append(f"bare score {bare} > {args.bare_max}")
90
+ if solo is None:
91
+ reasons.append("solo_claude score missing")
92
+ elif solo > args.solo_max:
93
+ reasons.append(f"solo_claude score {solo} > {args.solo_max}")
94
+ reasons.extend(bare_clean_failures)
95
+ reasons.extend(solo_clean_failures)
96
+ rows.append({
97
+ "fixture": fixture_dir.name,
98
+ "status": status,
99
+ "bare_score": bare,
100
+ "solo_score": solo,
101
+ "reason": "; ".join(reasons) if reasons else "",
102
+ })
103
+
104
+ pass_count = sum(1 for row in rows if row["status"] == "PASS")
105
+ fixture_count_ok = len(rows) >= args.min_fixtures
106
+ verdict = "PASS" if pass_count == len(rows) and rows and fixture_count_ok else "FAIL"
107
+ payload = {
108
+ "run_id": args.run_id,
109
+ "rule": f"at least {args.min_fixtures} candidate fixtures; each must satisfy bare <= {args.bare_max} and solo_claude <= {args.solo_max}, with both arms clean",
110
+ "verdict": verdict,
111
+ "fixtures_total": len(rows),
112
+ "fixtures_passed": pass_count,
113
+ "min_fixtures": args.min_fixtures,
114
+ "fixture_count_ok": fixture_count_ok,
115
+ "rows": rows,
116
+ }
117
+
118
+ if args.out_json:
119
+ pathlib.Path(args.out_json).write_text(json.dumps(payload, indent=2) + "\n")
120
+
121
+ lines = [
122
+ f"# Headroom Gate — {args.run_id}",
123
+ "",
124
+ f"Verdict: **{verdict}**",
125
+ "",
126
+ f"Rule: at least {args.min_fixtures} fixtures; bare <= {args.bare_max}, "
127
+ f"solo_claude <= {args.solo_max}, both arms clean.",
128
+ "",
129
+ "| Fixture | Bare | Solo | Status | Reason |",
130
+ "|---|---:|---:|---|---|",
131
+ ]
132
+ for row in rows:
133
+ lines.append(
134
+ f"| {row['fixture']} | {row.get('bare_score')} | {row.get('solo_score')} | "
135
+ f"{row['status']} | {row.get('reason', '')} |"
136
+ )
137
+ report = "\n".join(lines) + "\n"
138
+ if args.out_md:
139
+ pathlib.Path(args.out_md).write_text(report)
140
+ else:
141
+ print(report)
142
+
143
+ return 0 if verdict == "PASS" else 1
144
+
145
+
146
+ if __name__ == "__main__":
147
+ sys.exit(main())
@@ -41,11 +41,13 @@ RES_DIR="$BENCH_ROOT/results/$RUN_ID/$FIXTURE"
41
41
  # date iter-0019.
42
42
  ARMS_PRESENT=()
43
43
  # iter-0033c: l2_gated/l2_forced added for NEW L2 vs NEW L1 measurement.
44
+ # iter-0037: l2_risk_probes adds bounded visible-verification probes before
45
+ # IMPLEMENT; judge treats it as another blind arm when artifacts exist.
44
46
  # Slot count is still A/B/C max 3 — pair-eligible iter-0033c fixtures supply
45
47
  # {solo_claude, l2_gated, l2_forced}; non-pair-eligible fixtures supply
46
48
  # {solo_claude, l2_gated}. The blind-shuffle slot mapping below already
47
49
  # tolerates arbitrary ARMS_PRESENT counts ≥2.
48
- for arm in variant solo_claude bare l2_gated l2_forced; do
50
+ for arm in variant solo_claude bare l2_gated l2_risk_probes l2_forced; do
49
51
  if [ -f "$RES_DIR/$arm/diff.patch" ] && [ -f "$RES_DIR/$arm/verify.json" ]; then
50
52
  ARMS_PRESENT+=("$arm")
51
53
  fi
@@ -216,8 +218,34 @@ PY
216
218
  # traceable. Run from a clean temp CWD so the judge can't peek at project
217
219
  # files that would leak arm identity.
218
220
  command -v codex >/dev/null 2>&1 || { echo "codex CLI not on PATH; cannot judge"; exit 1; }
219
- CODEX_CLI_VER=$(codex --version 2>/dev/null || echo "codex-cli unknown")
220
- JUDGE_MODEL=$(grep -E '^model\s*=' "${HOME}/.codex/config.toml" 2>/dev/null | head -1 | sed -E 's/.*=\s*"?([^"]+)"?.*/\1/')
221
+ CODEX_CLI_VER=$(python3 - <<'PY'
222
+ import subprocess
223
+
224
+ try:
225
+ proc = subprocess.run(
226
+ ["codex", "--version"],
227
+ stdout=subprocess.PIPE,
228
+ stderr=subprocess.PIPE,
229
+ text=True,
230
+ timeout=5,
231
+ )
232
+ out = (proc.stdout or proc.stderr).strip()
233
+ if proc.returncode == 0 and out:
234
+ print(out)
235
+ else:
236
+ print(f"codex-cli unknown (version-exit-{proc.returncode})")
237
+ except subprocess.TimeoutExpired:
238
+ print("codex-cli unknown (version-timeout)")
239
+ except FileNotFoundError:
240
+ print("codex-cli missing")
241
+ except Exception as exc:
242
+ print(f"codex-cli unknown ({type(exc).__name__})")
243
+ PY
244
+ )
245
+ JUDGE_MODEL=$({ grep -E '^model[[:space:]]*=' "${HOME}/.codex/config.toml" 2>/dev/null || true; } \
246
+ | head -1 \
247
+ | sed -E 's/.*=[[:space:]]*"?([^"]*)"?[[:space:]]*$/\1/' \
248
+ | xargs)
221
249
  [ -z "$JUDGE_MODEL" ] && JUDGE_MODEL="(unknown — codex config.toml not readable)"
222
250
 
223
251
  JUDGE_CWD="/tmp/judge-$RUN_ID-$FIXTURE"
@@ -237,6 +265,7 @@ fi
237
265
 
238
266
  # Extract JSON (codex wraps with banners; pick the last {...} block)
239
267
  python3 - "$JUDGE_OUT" "$RES_DIR/judge.json" "$A_ARM" "$B_ARM" "$C_ARM" "$SEED" "$CODEX_CLI_VER" "$JUDGE_MODEL" <<'PY'
268
+ import math
240
269
  import sys, re, json, pathlib
241
270
  out = pathlib.Path(sys.argv[1]).read_text()
242
271
  target = pathlib.Path(sys.argv[2])
@@ -298,6 +327,56 @@ if axis_invalid_cells:
298
327
  f"clamped: {axis_invalid_cells}\n"
299
328
  )
300
329
 
330
+ # Verification is the machine-readable acceptance contract. RUBRIC.md puts
331
+ # verification behavior under Spec Compliance, but LLM judges can still grade
332
+ # generous prose around failed commands. Cap score mechanically so an arm that
333
+ # fails required verification cannot receive a ceiling score.
334
+ def arm_verify_score(arm: str):
335
+ path = target.parent / arm / "verify.json"
336
+ if not path.is_file():
337
+ return None
338
+ data = json.loads(path.read_text())
339
+ value = data.get("verify_score")
340
+ return float(value) if isinstance(value, (int, float)) else None
341
+
342
+ verify_caps = []
343
+ for letter, score_key, breakdown_key in (
344
+ ("A", "a_score", "a_breakdown"),
345
+ ("B", "b_score", "b_breakdown"),
346
+ ("C", "c_score", "c_breakdown"),
347
+ ):
348
+ arm = mapping.get(letter)
349
+ if not arm:
350
+ continue
351
+ verify_score = arm_verify_score(arm)
352
+ if verify_score is None:
353
+ continue
354
+ verify_score = max(0.0, min(1.0, verify_score))
355
+ score_cap = math.floor(100 * verify_score)
356
+ spec_cap = math.floor(25 * verify_score)
357
+ raw_score = chosen.get(score_key)
358
+ raw_spec = (chosen.get(breakdown_key) or {}).get("spec")
359
+ row = {
360
+ "letter": letter,
361
+ "arm": arm,
362
+ "verify_score": verify_score,
363
+ "score_cap": score_cap,
364
+ "spec_cap": spec_cap,
365
+ "raw_score": raw_score,
366
+ "raw_spec": raw_spec,
367
+ "score_capped": False,
368
+ "spec_capped": False,
369
+ }
370
+ if isinstance(raw_score, (int, float)) and raw_score > score_cap:
371
+ chosen[score_key] = score_cap
372
+ row["score_capped"] = True
373
+ breakdown = chosen.get(breakdown_key)
374
+ if isinstance(breakdown, dict) and isinstance(raw_spec, (int, float)) and raw_spec > spec_cap:
375
+ breakdown["spec"] = spec_cap
376
+ row["spec_capped"] = True
377
+ verify_caps.append(row)
378
+ chosen["_verify_score_caps"] = verify_caps
379
+
301
380
  # scores_by_arm: arm-name → score, computed from the blind A/B/C scores.
302
381
  # This is the canonical 3-arm-aware shape the report consumer reads. The
303
382
  # legacy variant_score / bare_score / margin fields below are derived from
@@ -154,20 +154,9 @@ def analyze(work_dir, scaffold_sha, waivers, fixture_id=None):
154
154
  findings = []
155
155
  entries = git_diff_status(scaffold_sha, work_dir)
156
156
 
157
- # Structural exemption: every benchmark fixture has its own spec at
158
- # docs/roadmap/phase-*/<fixture_id>.md, and auto-resolve's DOCS phase
159
- # Job 1 legitimately flips its frontmatter status. That flip is a
160
- # skill feature, not a scope violation — always exempt regardless of
161
- # per-fixture waivers.
162
- own_spec_globs = []
163
- if fixture_id:
164
- own_spec_globs.append(f"docs/roadmap/phase-*/{fixture_id}.md")
165
-
166
157
  for status, path in entries:
167
158
  if is_waived(path, waivers):
168
159
  continue
169
- if is_waived(path, own_spec_globs):
170
- continue
171
160
 
172
161
  # Lockfile deletion — only when file existed at scaffold.
173
162
  if status == "D" and os.path.basename(path) in LOCKFILE_NAMES:
@@ -173,22 +173,12 @@ def analyze(work_dir_str: str, scaffold_sha: str, tier_c_globs, waivers,
173
173
 
174
174
  reachable = bfs_trace(seeds, work_dir)
175
175
 
176
- # Structural exemption: the fixture's own spec file at
177
- # docs/roadmap/phase-*/<fixture_id>.md is always authorized — DOCS
178
- # phase Job 1 flips its frontmatter status by design. Kept in sync
179
- # with oracle-scope-tier-a.py.
180
- own_spec_globs = []
181
- if fixture_id:
182
- own_spec_globs.append(f"docs/roadmap/phase-*/{fixture_id}.md")
183
-
184
176
  findings = []
185
177
  for path in sorted(touched):
186
178
  if match_any(path, tier_c_globs):
187
179
  continue
188
180
  if match_any(path, waivers):
189
181
  continue
190
- if match_any(path, own_spec_globs):
191
- continue
192
182
  if path in reachable:
193
183
  depth, via = reachable[path]
194
184
  findings.append({
@@ -0,0 +1,244 @@
1
+ #!/usr/bin/env python3
2
+ """Prepare a SWE-bench instance for frozen VERIFY solo-vs-pair review.
3
+
4
+ The script does not run models and does not evaluate SWE-bench correctness.
5
+ It converts one official SWE-bench-style instance plus one candidate patch into
6
+ the case layout consumed by run-frozen-verify-pair.sh.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import argparse
12
+ import json
13
+ import re
14
+ import shlex
15
+ import shutil
16
+ import subprocess
17
+ from pathlib import Path
18
+ from typing import Any
19
+
20
+
21
+ SAFE_ID = re.compile(r"^[A-Za-z0-9_.-]+$")
22
+
23
+
24
+ def run(cmd: list[str], cwd: Path | None = None) -> None:
25
+ subprocess.run(cmd, cwd=cwd, check=True)
26
+
27
+
28
+ def read_json(path: Path) -> dict[str, Any]:
29
+ with path.open(encoding="utf8") as f:
30
+ data = json.load(f)
31
+ if not isinstance(data, dict):
32
+ raise ValueError(f"expected JSON object: {path}")
33
+ return data
34
+
35
+
36
+ def require_text(instance: dict[str, Any], key: str) -> str:
37
+ value = instance.get(key)
38
+ if not isinstance(value, str) or not value.strip():
39
+ raise ValueError(f"SWE-bench instance missing non-empty {key!r}")
40
+ return value.strip()
41
+
42
+
43
+ def repo_cache_name(repo: str, base_commit: str) -> str:
44
+ safe_repo = repo.replace("/", "__")
45
+ return f"{safe_repo}-{base_commit[:12]}"
46
+
47
+
48
+ def prepare_repo(instance: dict[str, Any], repo_dir: Path | None, repos_root: Path) -> Path:
49
+ repo = require_text(instance, "repo")
50
+ base_commit = require_text(instance, "base_commit")
51
+ repos_root.mkdir(parents=True, exist_ok=True)
52
+ dest = repos_root / repo_cache_name(repo, base_commit)
53
+
54
+ if repo_dir is not None:
55
+ if dest.exists():
56
+ shutil.rmtree(dest)
57
+ run(["git", "clone", "--quiet", "--no-hardlinks", str(repo_dir), str(dest)])
58
+ elif not dest.exists():
59
+ run(["git", "clone", "--quiet", f"https://github.com/{repo}.git", str(dest)])
60
+
61
+ run(["git", "fetch", "--quiet", "--all", "--tags"], cwd=dest)
62
+ run(["git", "checkout", "--quiet", base_commit], cwd=dest)
63
+ run(["git", "reset", "--hard", "--quiet"], cwd=dest)
64
+ run(["git", "clean", "-ffdqx"], cwd=dest)
65
+ return dest
66
+
67
+
68
+ def write_case_files(
69
+ instance: dict[str, Any],
70
+ case_dir: Path,
71
+ patch_text: str,
72
+ timeout_seconds: int,
73
+ ) -> None:
74
+ instance_id = require_text(instance, "instance_id")
75
+ repo = require_text(instance, "repo")
76
+ base_commit = require_text(instance, "base_commit")
77
+ problem = require_text(instance, "problem_statement")
78
+ case_dir.mkdir(parents=True, exist_ok=True)
79
+
80
+ metadata = {
81
+ "id": instance_id,
82
+ "category": "high-risk",
83
+ "difficulty": instance.get("difficulty") or "swebench",
84
+ "timeout_seconds": timeout_seconds,
85
+ "required_tools": ["git", "python3"],
86
+ "browser": False,
87
+ "deps_change_expected": True,
88
+ "intent": f"SWE-bench issue for {repo} at {base_commit}: resolve the supplied problem statement without using the gold patch.",
89
+ "source": {
90
+ "benchmark": "SWE-bench",
91
+ "repo": repo,
92
+ "base_commit": base_commit,
93
+ "issue_url": instance.get("issue_url"),
94
+ "pr_url": instance.get("pr_url"),
95
+ "version": instance.get("version"),
96
+ },
97
+ }
98
+ (case_dir / "metadata.json").write_text(json.dumps(metadata, indent=2) + "\n", encoding="utf8")
99
+
100
+ spec = f"""---
101
+ id: "{instance_id}"
102
+ title: "SWE-bench {instance_id}"
103
+ status: planned
104
+ complexity: high
105
+ depends-on: []
106
+ ---
107
+
108
+ # SWE-bench {instance_id}
109
+
110
+ ## Context
111
+
112
+ Repository: `{repo}`
113
+ Base commit: `{base_commit}`
114
+
115
+ This case is imported from a SWE-bench-style instance. Treat the problem
116
+ statement below as the visible user contract. Do not use the gold `patch` or
117
+ `test_patch` fields as implementation guidance during model generation or
118
+ review.
119
+
120
+ ## Requirements
121
+
122
+ - [ ] Resolve the reported issue described in the problem statement.
123
+ - [ ] Preserve existing behavior outside the issue's scope.
124
+ - [ ] Keep the implementation consistent with the repository's local style and
125
+ dependency policy.
126
+ - [ ] Surface failures explicitly; do not hide errors behind silent fallbacks.
127
+
128
+ ## Problem Statement
129
+
130
+ {problem}
131
+
132
+ ## Constraints
133
+
134
+ - Do not inspect or rely on the SWE-bench gold solution patch while producing
135
+ or judging a candidate patch.
136
+ - Do not add broad rewrites, unrelated formatting churn, or new dependencies
137
+ unless the problem statement strictly requires them.
138
+ - Frozen VERIFY compares reviewers on the same already-applied candidate patch;
139
+ it is review evidence, not a full SWE-bench solve-rate measurement.
140
+
141
+ ## Verification
142
+
143
+ - Run the official SWE-bench evaluator separately for solve-rate evidence.
144
+ - Use `/devlyn:resolve --verify-only` here only to compare solo vs gated pair
145
+ review of the frozen candidate patch against the visible problem statement.
146
+ """
147
+ (case_dir / "spec.md").write_text(spec, encoding="utf8")
148
+ (case_dir / "task.txt").write_text(problem + "\n", encoding="utf8")
149
+ (case_dir / "expected.json").write_text(
150
+ json.dumps(
151
+ {
152
+ "verification_commands": [],
153
+ "forbidden_patterns": [],
154
+ "required_files": [],
155
+ "forbidden_files": [],
156
+ "tier_a_waivers": [],
157
+ "spec_output_files": [],
158
+ "max_deps_added": 999,
159
+ },
160
+ indent=2,
161
+ )
162
+ + "\n",
163
+ encoding="utf8",
164
+ )
165
+ (case_dir / "setup.sh").write_text("#!/usr/bin/env bash\nset -euo pipefail\n", encoding="utf8")
166
+ (case_dir / "setup.sh").chmod(0o755)
167
+ notes = f"""# {instance_id} — SWE-bench Frozen VERIFY Case
168
+
169
+ Source repo: `{repo}`
170
+ Base commit: `{base_commit}`
171
+
172
+ This case exists to measure whether gated pair VERIFY catches verdict-binding
173
+ review issues that solo VERIFY misses on a fixed candidate patch. It does not
174
+ replace official SWE-bench pass/fail evaluation.
175
+ """
176
+ (case_dir / "NOTES.md").write_text(notes, encoding="utf8")
177
+ (case_dir / "model.patch").write_text(patch_text, encoding="utf8")
178
+
179
+
180
+ def main() -> int:
181
+ parser = argparse.ArgumentParser()
182
+ parser.add_argument("--instance-json", required=True, type=Path)
183
+ parser.add_argument("--model-patch", required=True, type=Path)
184
+ parser.add_argument(
185
+ "--cases-root",
186
+ default=Path("benchmark/auto-resolve/external/swebench/cases"),
187
+ type=Path,
188
+ )
189
+ parser.add_argument(
190
+ "--repos-root",
191
+ default=Path("benchmark/auto-resolve/external/swebench/repos"),
192
+ type=Path,
193
+ )
194
+ parser.add_argument(
195
+ "--repo-dir",
196
+ type=Path,
197
+ help="Local clone/source repo to copy instead of cloning GitHub; useful for tests and cached runs.",
198
+ )
199
+ parser.add_argument("--timeout-seconds", type=int, default=2400)
200
+ args = parser.parse_args()
201
+
202
+ instance = read_json(args.instance_json)
203
+ instance_id = require_text(instance, "instance_id")
204
+ if not SAFE_ID.match(instance_id):
205
+ raise ValueError(f"unsafe instance_id for path/spec use: {instance_id!r}")
206
+ patch_text = args.model_patch.read_text(encoding="utf8")
207
+ if not patch_text.strip():
208
+ raise ValueError(f"model patch is empty: {args.model_patch}")
209
+
210
+ repo_path = prepare_repo(instance, args.repo_dir, args.repos_root)
211
+ case_dir = args.cases_root / instance_id
212
+ write_case_files(instance, case_dir, patch_text, args.timeout_seconds)
213
+
214
+ command = [
215
+ "bash",
216
+ "benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh",
217
+ "--fixture",
218
+ instance_id,
219
+ "--fixtures-root",
220
+ str(args.cases_root),
221
+ "--base-repo",
222
+ str(repo_path),
223
+ "--diff",
224
+ str(case_dir / "model.patch"),
225
+ "--pair-mode",
226
+ "gated",
227
+ ]
228
+ (case_dir / "run-command.txt").write_text(shlex.join(command) + "\n", encoding="utf8")
229
+ print(
230
+ json.dumps(
231
+ {
232
+ "instance_id": instance_id,
233
+ "case_dir": str(case_dir),
234
+ "repo_dir": str(repo_path),
235
+ "run_command": command,
236
+ },
237
+ indent=2,
238
+ )
239
+ )
240
+ return 0
241
+
242
+
243
+ if __name__ == "__main__":
244
+ raise SystemExit(main())
@@ -0,0 +1,118 @@
1
+ #!/usr/bin/env python3
2
+ """Prepare a SWE-bench prediction JSONL as frozen VERIFY review cases."""
3
+
4
+ from __future__ import annotations
5
+
6
+ import argparse
7
+ import json
8
+ import subprocess
9
+ import tempfile
10
+ from pathlib import Path
11
+ from typing import Any
12
+
13
+
14
def read_jsonl(path: Path) -> list[dict[str, Any]]:
    """Parse *path* as JSON Lines and return one dict per non-blank line.

    Blank lines are skipped. Raises ValueError (prefixed with file and
    line number) when a non-blank line parses to anything other than a
    JSON object.
    """
    records: list[dict[str, Any]] = []
    with path.open(encoding="utf8") as handle:
        for number, raw in enumerate(handle, start=1):
            stripped = raw.strip()
            if not stripped:
                continue
            parsed = json.loads(stripped)
            if not isinstance(parsed, dict):
                raise ValueError(f"{path}:{number}: expected JSON object")
            records.append(parsed)
    return records
25
+
26
+
27
def require_text(row: dict[str, Any], key: str, source: str) -> str:
    """Return ``row[key]`` stripped of surrounding whitespace.

    Raises ValueError (mentioning *source*) unless the value is a string
    with non-whitespace content.
    """
    candidate = row.get(key)
    if isinstance(candidate, str):
        cleaned = candidate.strip()
        if cleaned:
            return cleaned
    raise ValueError(f"{source} missing non-empty {key!r}")
32
+
33
+
34
def main() -> int:
    """Prepare frozen VERIFY cases for each predicted SWE-bench instance.

    Reads an instances JSONL and a predictions JSONL, matches them by
    ``instance_id``, and shells out to ``prepare-swebench-frozen-case.py``
    once per selected instance. Prints (and optionally writes to
    ``--out-manifest``) a JSON manifest describing what was prepared.

    Returns:
        Process exit code (0 on success).

    Raises:
        ValueError: duplicate predictions, unknown instance ids, or a
            prediction missing a non-empty ``model_patch``.
        subprocess.CalledProcessError: when the per-case helper fails;
            its captured stderr is echoed first so the cause is visible.
    """
    import sys  # local import: only needed for sys.executable / stderr echo

    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("--instances-jsonl", required=True, type=Path)
    parser.add_argument("--predictions-jsonl", required=True, type=Path)
    parser.add_argument(
        "--cases-root",
        default=Path("benchmark/auto-resolve/external/swebench/cases"),
        type=Path,
    )
    parser.add_argument(
        "--repos-root",
        default=Path("benchmark/auto-resolve/external/swebench/repos"),
        type=Path,
    )
    parser.add_argument("--repo-dir", type=Path, help="Use one local repo clone for every selected instance.")
    parser.add_argument("--instance-id", action="append", help="Prepare only these instance ids.")
    parser.add_argument("--limit", type=int, help="Prepare at most N matched instances after filtering.")
    parser.add_argument("--timeout-seconds", type=int, default=2400)
    parser.add_argument("--out-manifest", type=Path)
    args = parser.parse_args()

    instances = {
        require_text(row, "instance_id", f"{args.instances_jsonl}"): row
        for row in read_jsonl(args.instances_jsonl)
    }
    predictions: dict[str, dict[str, Any]] = {}
    for row in read_jsonl(args.predictions_jsonl):
        instance_id = require_text(row, "instance_id", f"{args.predictions_jsonl}")
        if instance_id in predictions:
            raise ValueError(f"duplicate prediction for {instance_id}")
        predictions[instance_id] = row

    # De-duplicate explicit --instance-id values (order-preserving) so a
    # repeated id is not prepared -- and its case dir overwritten -- twice.
    selected_ids = list(dict.fromkeys(args.instance_id)) if args.instance_id else list(predictions)
    script = Path(__file__).with_name("prepare-swebench-frozen-case.py")
    prepared: list[dict[str, Any]] = []
    with tempfile.TemporaryDirectory() as tmp:
        tmp_dir = Path(tmp)
        for instance_id in selected_ids:
            if args.limit is not None and len(prepared) >= args.limit:
                break
            if instance_id not in instances:
                raise ValueError(f"prediction instance not found in instances JSONL: {instance_id}")
            prediction = predictions.get(instance_id)
            if prediction is None:
                raise ValueError(f"selected instance missing prediction: {instance_id}")
            patch_value = prediction.get("model_patch")
            if not isinstance(patch_value, str) or not patch_value.strip():
                raise ValueError(f"prediction {instance_id} missing non-empty 'model_patch'")
            instance_path = tmp_dir / f"{instance_id}.instance.json"
            patch_path = tmp_dir / f"{instance_id}.patch"
            instance_path.write_text(json.dumps(instances[instance_id], indent=2) + "\n", encoding="utf8")
            patch_path.write_text(patch_value, encoding="utf8")

            # Run the helper under the same interpreter as this script
            # (sys.executable), not a hard-coded "python3", so venv and
            # platform differences cannot split the two processes.
            cmd = [
                sys.executable,
                str(script),
                "--instance-json",
                str(instance_path),
                "--model-patch",
                str(patch_path),
                "--cases-root",
                str(args.cases_root),
                "--repos-root",
                str(args.repos_root),
                "--timeout-seconds",
                str(args.timeout_seconds),
            ]
            if args.repo_dir is not None:
                cmd.extend(["--repo-dir", str(args.repo_dir)])
            try:
                completed = subprocess.run(cmd, check=True, text=True, capture_output=True)
            except subprocess.CalledProcessError as err:
                # capture_output=True swallows the child's stderr; echo it
                # before re-raising so the failure cause is not lost.
                if err.stderr:
                    print(err.stderr, file=sys.stderr, end="")
                raise
            prepared.append(json.loads(completed.stdout))

    manifest = {
        "instances_jsonl": str(args.instances_jsonl),
        "predictions_jsonl": str(args.predictions_jsonl),
        "cases_root": str(args.cases_root),
        "repos_root": str(args.repos_root),
        "prepared_count": len(prepared),
        "prepared": prepared,
    }
    if args.out_manifest:
        args.out_manifest.write_text(json.dumps(manifest, indent=2) + "\n", encoding="utf8")
    print(json.dumps(manifest, indent=2))
    return 0
115
+
116
+
117
# Script entry point: propagate main()'s return code as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())