devlyn-cli 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. package/CLAUDE.md +1 -1
  2. package/benchmark/auto-resolve/README.md +318 -2
  3. package/benchmark/auto-resolve/RUBRIC.md +6 -0
  4. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/NOTES.md +63 -0
  5. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/expected.json +60 -0
  6. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/metadata.json +10 -0
  7. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/setup.sh +17 -0
  8. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/spec.md +52 -0
  9. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/task.txt +9 -0
  10. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/invalid.js +29 -0
  11. package/benchmark/auto-resolve/fixtures/F10-persist-write-collision/verifiers/parallel.js +50 -0
  12. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/NOTES.md +70 -0
  13. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/expected.json +52 -0
  14. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/metadata.json +10 -0
  15. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/setup.sh +171 -0
  16. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/spec.md +51 -0
  17. package/benchmark/auto-resolve/fixtures/F11-batch-import-all-or-nothing/task.txt +9 -0
  18. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/NOTES.md +83 -0
  19. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/expected.json +74 -0
  20. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/metadata.json +10 -0
  21. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/setup.sh +251 -0
  22. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/spec.md +58 -0
  23. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/task.txt +13 -0
  24. package/benchmark/auto-resolve/fixtures/F12-webhook-raw-body-signature/verifiers/replay-malformed-body.js +64 -0
  25. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/NOTES.md +98 -0
  26. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/expected.json +46 -0
  27. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/metadata.json +10 -0
  28. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/setup.sh +336 -0
  29. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/spec.md +52 -0
  30. package/benchmark/auto-resolve/fixtures/F15-frozen-diff-race-review/task.txt +9 -0
  31. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/NOTES.md +26 -0
  32. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/expected.json +64 -0
  33. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/metadata.json +10 -0
  34. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/setup.sh +32 -0
  35. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/spec.md +58 -0
  36. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/task.txt +7 -0
  37. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/exact-success.js +54 -0
  38. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/no-hardcoded-pricing.js +47 -0
  39. package/benchmark/auto-resolve/fixtures/F16-cli-quote-tax-rules/verifiers/stock-error.js +45 -0
  40. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/NOTES.md +27 -0
  41. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/expected.json +62 -0
  42. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/metadata.json +10 -0
  43. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/setup.sh +2 -0
  44. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/spec.md +62 -0
  45. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/task.txt +7 -0
  46. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/error-order.js +55 -0
  47. package/benchmark/auto-resolve/fixtures/F21-cli-scheduler-priority/verifiers/priority-blocked.js +48 -0
  48. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/NOTES.md +27 -0
  49. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/expected.json +56 -0
  50. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/metadata.json +10 -0
  51. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/setup.sh +2 -0
  52. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/spec.md +65 -0
  53. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/task.txt +7 -0
  54. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/conflicting-duplicate.js +34 -0
  55. package/benchmark/auto-resolve/fixtures/F22-cli-ledger-close/verifiers/idempotent-close.js +41 -0
  56. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/NOTES.md +27 -0
  57. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/expected.json +56 -0
  58. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/metadata.json +10 -0
  59. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/setup.sh +2 -0
  60. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/spec.md +71 -0
  61. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/task.txt +7 -0
  62. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/priority-rollback.js +64 -0
  63. package/benchmark/auto-resolve/fixtures/F23-cli-fulfillment-wave/verifiers/single-warehouse-fefo.js +66 -0
  64. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/NOTES.md +28 -0
  65. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/expected.json +66 -0
  66. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/metadata.json +10 -0
  67. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/setup.sh +36 -0
  68. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/spec.md +65 -0
  69. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/task.txt +7 -0
  70. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/catalog-source.js +57 -0
  71. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/exact-success.js +63 -0
  72. package/benchmark/auto-resolve/fixtures/F25-cli-cart-promotion-rules/verifiers/stock-error.js +34 -0
  73. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/NOTES.md +25 -0
  74. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/expected.json +68 -0
  75. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/metadata.json +10 -0
  76. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/setup.sh +17 -0
  77. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/spec.md +69 -0
  78. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/task.txt +7 -0
  79. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/conflicting-duplicate.js +29 -0
  80. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/exact-payout.js +58 -0
  81. package/benchmark/auto-resolve/fixtures/F26-cli-payout-ledger-rules/verifiers/rules-source.js +56 -0
  82. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/NOTES.md +24 -0
  83. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/expected.json +66 -0
  84. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/metadata.json +10 -0
  85. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/setup.sh +22 -0
  86. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/spec.md +62 -0
  87. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/task.txt +9 -0
  88. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/exact-success.js +48 -0
  89. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/insufficient-balance.js +36 -0
  90. package/benchmark/auto-resolve/fixtures/F27-cli-gift-card-redemption/verifiers/rules-source.js +55 -0
  91. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/NOTES.md +20 -0
  92. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/expected.json +66 -0
  93. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/metadata.json +10 -0
  94. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/setup.sh +23 -0
  95. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/spec.md +66 -0
  96. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/task.txt +11 -0
  97. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/exact-success.js +44 -0
  98. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/rules-source.js +58 -0
  99. package/benchmark/auto-resolve/fixtures/F28-cli-rental-quote-rules/verifiers/unavailable-inventory.js +35 -0
  100. package/benchmark/auto-resolve/fixtures/SCHEMA.md +13 -1
  101. package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py +98 -0
  102. package/benchmark/auto-resolve/scripts/fetch-swebench-instances.py +111 -0
  103. package/benchmark/auto-resolve/scripts/frozen-verify-gate.py +289 -0
  104. package/benchmark/auto-resolve/scripts/full-pipeline-pair-gate.py +250 -0
  105. package/benchmark/auto-resolve/scripts/headroom-gate.py +147 -0
  106. package/benchmark/auto-resolve/scripts/judge.sh +82 -3
  107. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py +244 -0
  108. package/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py +118 -0
  109. package/benchmark/auto-resolve/scripts/prepare-swebench-solver-worktree.py +192 -0
  110. package/benchmark/auto-resolve/scripts/run-fixture.sh +234 -40
  111. package/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh +511 -0
  112. package/benchmark/auto-resolve/scripts/run-full-pipeline-pair-candidate.sh +162 -0
  113. package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh +93 -0
  114. package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh +209 -0
  115. package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh +239 -0
  116. package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py +265 -0
  117. package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh +192 -0
  118. package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh +131 -0
  119. package/benchmark/auto-resolve/scripts/test-headroom-gate.sh +84 -0
  120. package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh +302 -0
  121. package/config/skills/_shared/archive_run.py +3 -0
  122. package/config/skills/_shared/codex-config.md +2 -2
  123. package/config/skills/_shared/codex-monitored.sh +72 -7
  124. package/config/skills/_shared/collect-codex-findings.py +125 -0
  125. package/config/skills/_shared/engine-preflight.md +1 -1
  126. package/config/skills/_shared/expected.schema.json +18 -0
  127. package/config/skills/_shared/spec-verify-check.py +312 -10
  128. package/config/skills/_shared/verify-merge-findings.py +327 -0
  129. package/config/skills/devlyn:resolve/SKILL.md +62 -8
  130. package/config/skills/devlyn:resolve/references/phases/build-gate.md +1 -1
  131. package/config/skills/devlyn:resolve/references/phases/probe-derive.md +164 -0
  132. package/config/skills/devlyn:resolve/references/phases/verify.md +156 -4
  133. package/config/skills/devlyn:resolve/references/state-schema.md +10 -4
  134. package/package.json +1 -1
  135. package/scripts/lint-skills.sh +32 -0
@@ -0,0 +1,93 @@
1
+ #!/usr/bin/env bash
2
+ # run-headroom-candidate.sh — calibrate candidate fixtures for L2/pair headroom.
3
+ #
4
+ # Runs only the arms needed by headroom-gate.py: bare and solo_claude.
5
+ # Then blind-judges those two arms and applies the mechanical gate.
6
+
7
+ set -euo pipefail
8
+
9
# Print the command synopsis to stderr and terminate the script.
#   $1 - exit status to use; defaults to 1 when omitted or empty.
usage() {
  local status="${1:-1}"
  printf '%s\n' "usage: $0 [--run-id ID] <fixture> [<fixture> ...]" >&2
  exit "$status"
}
14
+
15
# Command-line state: an optional explicit run id plus one or more fixture
# ids (arguments matching F<digit>...).
RUN_ID=""
FIXTURES=()
while [ $# -gt 0 ]; do
  case "$1" in
    --run-id)
      # Under `set -u` a trailing --run-id with no value would abort with an
      # opaque "unbound variable" error; report it and show the usage instead.
      [ $# -ge 2 ] || { echo "--run-id requires a value" >&2; usage; }
      RUN_ID="$2"; shift 2;;
    -h|--help) usage 0;;
    F[0-9]*) FIXTURES+=("$1"); shift;;
    *) echo "unknown arg: $1" >&2; usage;;
  esac
done

# At least one fixture is mandatory.
[ ${#FIXTURES[@]} -gt 0 ] || usage
27
+
28
# BENCH_ROOT is the parent of this script's directory; REPO_ROOT sits two
# levels above BENCH_ROOT.
BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"

# Without --run-id, build one from the UTC timestamp and the short HEAD sha
# ("nogit" when git cannot resolve HEAD in REPO_ROOT).
if [ -z "$RUN_ID" ]; then
  TS="$(date -u +%Y%m%dT%H%M%SZ)"
  SHA="$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)"
  RUN_ID="${TS}-${SHA}-headroom"
fi

# Run banner.
printf '%s\n' \
  "" \
  "═══ Headroom Candidate Run ═══" \
  "Run-id: $RUN_ID" \
  "Fixtures: ${FIXTURES[*]}" \
  "Arms: bare solo_claude"
if [ "${#FIXTURES[@]}" -lt 2 ]; then
  printf '%s\n' "Gate: will FAIL set gate unless at least 2 fixtures are supplied"
fi
printf '\n'
46
+
47
# Mirror each skill directory from config/skills/ into .claude/skills/,
# skipping the four *-workspace directories listed below.
SRC_SKILLS="$REPO_ROOT/config/skills"
DST_SKILLS="$REPO_ROOT/.claude/skills"
mkdir -p "$DST_SKILLS"
mirrored=0
for src_dir in "$SRC_SKILLS"/*/; do
  # An unmatched glob stays literal, so skip anything that is not a directory.
  if [ ! -d "$src_dir" ]; then
    continue
  fi
  skill_name="$(basename "$src_dir")"
  case "$skill_name" in
    devlyn:auto-resolve-workspace | devlyn:ideate-workspace | preflight-workspace | roadmap-archival-workspace)
      continue
      ;;
  esac
  # Copy into a hidden staging directory first, then swap it into place so
  # the destination is replaced by a complete copy.
  staging_dir="$DST_SKILLS/.${skill_name}.staging"
  rm -rf "$staging_dir"
  cp -R "$src_dir" "$staging_dir"
  rm -rf "$DST_SKILLS/$skill_name"
  mv "$staging_dir" "$DST_SKILLS/$skill_name"
  mirrored=$((mirrored + 1))
done
echo "[headroom] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
66
+
67
# For every fixture: run the bare and solo_claude arms, then judge the fixture.
# A failing arm or judge is reported but never aborts the loop.
for fid in "${FIXTURES[@]}"; do
  for arm in bare solo_claude; do
    echo "[headroom] ► $fid / $arm"
    bash "$BENCH_ROOT/scripts/run-fixture.sh" \
      --fixture "$fid" --arm "$arm" --run-id "$RUN_ID" \
      || echo "[headroom] ✗ $fid / $arm (arm failure tolerated; artifacts may still exist)"
  done

  echo "[headroom] ► judge $fid"
  bash "$BENCH_ROOT/scripts/judge.sh" --fixture "$fid" --run-id "$RUN_ID" \
    || echo "[headroom] ✗ judge failed for $fid"
done
82
+
83
echo ""
GATE_JSON="$BENCH_ROOT/results/$RUN_ID/headroom-gate.json"
GATE_MD="$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"

# Run the mechanical gate with errexit suspended so its exit status can be
# captured and used as this script's exit status.
set +e
python3 "$BENCH_ROOT/scripts/headroom-gate.py" \
  --run-id "$RUN_ID" \
  --out-json "$GATE_JSON" \
  --out-md "$GATE_MD"
GATE_EXIT=$?
set -e

# Print the report when it exists. A bare `cat` on a missing report would
# abort under `set -e` and replace the gate's exit code with cat's.
if [ -f "$GATE_MD" ]; then
  cat "$GATE_MD"
else
  echo "[headroom] gate report not written: $GATE_MD" >&2
  # A missing report is still a failure even if the gate reported success.
  [ "$GATE_EXIT" -ne 0 ] || GATE_EXIT=1
fi
exit "$GATE_EXIT"
@@ -0,0 +1,209 @@
1
+ #!/usr/bin/env bash
2
+ # Run a prepared SWE-bench frozen VERIFY corpus and gate the result set.
3
+ set -euo pipefail
4
+
5
# Print the option synopsis and a short description to stderr, then exit.
#   $1 - exit status to use; defaults to 1 when omitted or empty.
usage() {
  local status="${1:-1}"
  cat >&2 <<EOF
usage: $0 --manifest <path> [--run-prefix ID] [--pair-mode forced|gated]
       [--min-runs N] [--out-json <path>] [--out-md <path>]
       [--max-pair-solo-wall-ratio N] [--timeout-seconds N]
       [--run-ids-out <path>] [--resume-completed-arms]
       [--prepare-only] [--gate-only-run-ids <path>]

Reads the manifest from prepare-swebench-frozen-corpus.py, runs each prepared
case through run-frozen-verify-pair.sh, then applies frozen-verify-gate.py to
the resulting run ids. --prepare-only validates patch application without
provider calls and skips the gate. --gate-only-run-ids reruns the gate over an
existing newline-delimited run-id file without invoking providers.
EOF
  exit "$status"
}
21
+
22
# Option defaults. Empty strings mean "not supplied".
MANIFEST=""
RUN_PREFIX=""
PAIR_MODE="gated"
MIN_RUNS=2
OUT_JSON=""
OUT_MD=""
MAX_PAIR_SOLO_WALL_RATIO=""
PREPARE_ONLY=0
GATE_ONLY_RUN_IDS=""
TIMEOUT_SECONDS=""
RUN_IDS_OUT=""
RESUME_COMPLETED_ARMS=0
while [ $# -gt 0 ]; do
  # Every value-taking option needs a following argument. Under `set -u` a
  # trailing option would otherwise abort with an opaque "unbound variable"
  # error when "$2" is read below.
  case "$1" in
    --manifest|--run-prefix|--pair-mode|--min-runs|--out-json|--out-md|--max-pair-solo-wall-ratio|--timeout-seconds|--run-ids-out|--gate-only-run-ids)
      [ $# -ge 2 ] || { echo "$1 requires a value" >&2; usage 1; }
      ;;
  esac
  case "$1" in
    --manifest) MANIFEST="$2"; shift 2;;
    --run-prefix) RUN_PREFIX="$2"; shift 2;;
    --pair-mode) PAIR_MODE="$2"; shift 2;;
    --min-runs) MIN_RUNS="$2"; shift 2;;
    --out-json) OUT_JSON="$2"; shift 2;;
    --out-md) OUT_MD="$2"; shift 2;;
    --max-pair-solo-wall-ratio) MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
    --timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
    --run-ids-out) RUN_IDS_OUT="$2"; shift 2;;
    --resume-completed-arms) RESUME_COMPLETED_ARMS=1; shift;;
    --prepare-only) PREPARE_ONLY=1; shift;;
    --gate-only-run-ids) GATE_ONLY_RUN_IDS="$2"; shift 2;;
    -h|--help) usage 0;;
    *) echo "unknown arg: $1" >&2; usage 1;;
  esac
done
52
+
53
# Validate the parsed options before any work is done.
[ -n "$MANIFEST" ] || usage 1
[ -f "$MANIFEST" ] || { echo "manifest not found: $MANIFEST" >&2; exit 1; }
[ "$PAIR_MODE" = "forced" ] || [ "$PAIR_MODE" = "gated" ] || { echo "--pair-mode must be forced|gated" >&2; exit 1; }
# The case pattern rejects empty strings and anything containing a non-digit.
case "$MIN_RUNS" in ''|*[!0-9]*) echo "--min-runs must be an integer" >&2; exit 1;; esac
[ "$MIN_RUNS" -gt 0 ] || { echo "--min-runs must be > 0" >&2; exit 1; }
if [ -n "$TIMEOUT_SECONDS" ]; then
  case "$TIMEOUT_SECONDS" in ''|*[!0-9]*) echo "--timeout-seconds must be an integer" >&2; exit 1;; esac
  [ "$TIMEOUT_SECONDS" -gt 0 ] || { echo "--timeout-seconds must be > 0" >&2; exit 1; }
fi
if [ -n "$MAX_PAIR_SOLO_WALL_RATIO" ]; then
  python3 - "$MAX_PAIR_SOLO_WALL_RATIO" <<'PY' || { echo "--max-pair-solo-wall-ratio must be a positive number" >&2; exit 1; }
import math
import sys

try:
    value = float(sys.argv[1])
except ValueError:
    raise SystemExit(1)
# float() also parses "nan" and "inf"; NaN slips past a plain `<= 0` check
# because every comparison with NaN is False, so require a finite value.
if not math.isfinite(value) or value <= 0:
    raise SystemExit(1)
PY
fi
[ -z "$GATE_ONLY_RUN_IDS" ] || [ -f "$GATE_ONLY_RUN_IDS" ] || { echo "run ids file not found: $GATE_ONLY_RUN_IDS" >&2; exit 1; }
[ "$PREPARE_ONLY" -eq 0 ] || [ -z "$GATE_ONLY_RUN_IDS" ] || { echo "--prepare-only and --gate-only-run-ids are mutually exclusive" >&2; exit 1; }
75
+
76
# Directory of this script and its parent (the benchmark root).
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
BENCH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Default the run prefix to a UTC timestamp when --run-prefix was not given.
: "${RUN_PREFIX:=$(date -u +%Y%m%dT%H%M%SZ)-swebench-frozen}"

# Number of corpus rows whose pair run exited non-zero.
ROW_FAILURES=0

# Scratch list of run ids (one per line), removed when the script exits.
TMP_RUN_IDS="$(mktemp)"
trap 'rm -f "$TMP_RUN_IDS"' EXIT
85
+
86
if [ -n "$GATE_ONLY_RUN_IDS" ]; then
  # Gate-only mode: reuse the supplied run ids; nothing is executed here.
  cp "$GATE_ONLY_RUN_IDS" "$TMP_RUN_IDS"
else
  # Expand the manifest into tab-separated rows before looping. Feeding the
  # loop from a process substitution hides the parser's exit status, so a
  # malformed manifest would look like an empty corpus (and --prepare-only
  # would then exit 0). Writing to a file lets `set -e` abort on failure.
  TMP_ROWS="$(mktemp)"
  trap 'rm -f "$TMP_RUN_IDS" "$TMP_ROWS"' EXIT
  python3 - "$MANIFEST" > "$TMP_ROWS" <<'PY'
import json
import pathlib
import sys

manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
for index, row in enumerate(manifest.get("prepared") or [], start=1):
    instance_id = row["instance_id"]
    case_dir = pathlib.Path(row["case_dir"])
    repo_dir = pathlib.Path(row["repo_dir"])
    print("\t".join([
        str(index),
        instance_id,
        str(case_dir.parent),
        str(repo_dir),
        str(case_dir / "model.patch"),
    ]))
PY

  while IFS=$'\t' read -r index instance_id cases_root repo_dir diff_path; do
    [ -n "$instance_id" ] || continue
    run_id="${RUN_PREFIX}-${index}-${instance_id}"
    # Replace every character outside [A-Za-z0-9_.-] with '-'.
    safe_run_id="$(printf '%s' "$run_id" | tr -c 'A-Za-z0-9_.-' '-')"
    echo "[swebench-frozen-corpus] ${index}: ${instance_id} -> ${safe_run_id}"
    cmd=(
      bash "$SCRIPT_DIR/run-frozen-verify-pair.sh"
      --fixture "$instance_id"
      --fixtures-root "$cases_root"
      --base-repo "$repo_dir"
      --diff "$diff_path"
      --run-id "$safe_run_id"
      --pair-mode "$PAIR_MODE"
    )
    if [ -n "$TIMEOUT_SECONDS" ]; then
      cmd+=(--timeout-seconds "$TIMEOUT_SECONDS")
    fi
    if [ "$PREPARE_ONLY" -eq 1 ]; then
      cmd+=(--prepare-only)
    fi
    if [ "$RESUME_COMPLETED_ARMS" -eq 1 ]; then
      cmd+=(--resume-completed-arms)
    fi
    # Capture the row's exit status without aborting. stdin is redirected
    # from /dev/null so the child cannot consume the rows this loop reads.
    set +e
    "${cmd[@]}" </dev/null
    row_exit=$?
    set -e
    if [ "$row_exit" -ne 0 ]; then
      echo "[swebench-frozen-corpus] row failed: ${safe_run_id} exit=${row_exit}" >&2
      ROW_FAILURES=$((ROW_FAILURES + 1))
      # Backfill placeholder artifacts (per-arm input.md and compare.json)
      # for the failed row; files that already exist are left untouched.
      python3 - "$BENCH_ROOT/results/$safe_run_id" "$instance_id" "$row_exit" <<'PY'
import json
import pathlib
import sys

run_root = pathlib.Path(sys.argv[1])
instance_id = sys.argv[2]
row_exit = int(sys.argv[3])
run_root.mkdir(parents=True, exist_ok=True)
for arm in ("solo", "pair"):
    arm_root = run_root / arm
    arm_root.mkdir(parents=True, exist_ok=True)
    input_path = arm_root / "input.md"
    if not input_path.exists():
        input_path.write_text(
            f"Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/{instance_id}.md.\n",
            encoding="utf8",
        )
compare_path = run_root / "compare.json"
if not compare_path.exists():
    compare_path.write_text(
        json.dumps(
            {
                "solo": {"invoke_exit": row_exit, "timed_out": False},
                "pair": {"invoke_exit": row_exit, "timed_out": False, "pair_mode": False},
                "comparison": {
                    "pair_trigger_missed": False,
                    "pair_verdict_lift": False,
                    "pair_internal_verdict_lift": False,
                    "row_failed_before_compare": True,
                    "row_exit": row_exit,
                },
            },
            indent=2,
        )
        + "\n",
        encoding="utf8",
    )
PY
    fi
    # Failed rows are recorded too, so they appear in the run-id list.
    printf '%s\n' "$safe_run_id" >> "$TMP_RUN_IDS"
  done < "$TMP_ROWS"
fi
177
+
178
# Optionally persist the collected run ids for later --gate-only-run-ids use.
if [ -n "$RUN_IDS_OUT" ]; then
  mkdir -p "$(dirname "$RUN_IDS_OUT")"
  cp "$TMP_RUN_IDS" "$RUN_IDS_OUT"
fi

# In prepare-only mode the gate is skipped; exit 1 if any row failed.
if [ "$PREPARE_ONLY" -eq 1 ]; then
  echo "[swebench-frozen-corpus] prepare-only complete; gate skipped"
  [ "$ROW_FAILURES" -eq 0 ] || {
    echo "[swebench-frozen-corpus] row failures: $ROW_FAILURES" >&2
    exit 1
  }
  exit 0
fi
191
+
192
# Load the run ids to gate. `|| [ -n "$run_id" ]` keeps a final line that has
# no trailing newline (possible in a hand-written --gate-only-run-ids file),
# and blank lines are skipped instead of being passed on as empty run ids.
run_ids=()
while IFS= read -r run_id || [ -n "$run_id" ]; do
  [ -n "$run_id" ] || continue
  run_ids+=("$run_id")
done < "$TMP_RUN_IDS"
run_count="${#run_ids[@]}"
[ "$run_count" -gt 0 ] || { echo "manifest prepared no runs" >&2; exit 1; }

# The gate reads fixtures from the manifest's cases_root.
fixtures_root="$(python3 - "$MANIFEST" <<'PY'
import json, pathlib, sys
manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
print(manifest["cases_root"])
PY
)"
gate_args=(python3 "$SCRIPT_DIR/frozen-verify-gate.py" --fixtures-root "$fixtures_root" --min-runs "$MIN_RUNS")
# Optional flags are forwarded only when the user supplied them.
[ -z "$OUT_JSON" ] || gate_args+=(--out-json "$OUT_JSON")
[ -z "$OUT_MD" ] || gate_args+=(--out-md "$OUT_MD")
[ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || gate_args+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
for run_id in "${run_ids[@]}"; do
  gate_args+=(--run-id "$run_id")
done

# The gate is the last command, so its exit status is the script's.
"${gate_args[@]}"
@@ -0,0 +1,239 @@
1
+ #!/usr/bin/env bash
2
+ # Prepare SWE-bench solver worktrees, run a direct solver, and collect patches.
3
+ set -euo pipefail
4
+
5
# Print the option synopsis and a short description to stderr, then exit.
#   $1 - exit status to use; defaults to 1 when omitted or empty.
usage() {
  local status="${1:-1}"
  cat >&2 <<EOF
usage: $0 --instances-jsonl <path> --predictions-out <path>
       [--instance-id ID ...] [--limit N] [--model-name NAME]
       [--repos-root <path>] [--worktrees-root <path>]
       [--timeout-seconds N] [--copy-devlyn-context] [--resume]

Runs Claude Code directly against each selected SWE-bench instance without
reading gold patch/test_patch fields. Each worktree receives patch.diff plus
direct-transcript.txt and claude-direct-debug.log. At the end, patch.diff files
are collected into a SWE-bench predictions JSONL.
EOF
  exit "$status"
}
19
+
20
# Option defaults. The two root paths are relative, so they resolve against
# the directory the script is invoked from.
INSTANCES_JSONL=""
PREDICTIONS_OUT=""
MODEL_NAME="claude-direct"
REPOS_ROOT="benchmark/auto-resolve/external/swebench/repos-solver"
WORKTREES_ROOT="benchmark/auto-resolve/external/swebench/worktrees"
TIMEOUT_SECONDS=2400
COPY_DEVLYN_CONTEXT=0
RESUME=0
LIMIT=""
INSTANCE_IDS=()

while [ $# -gt 0 ]; do
  # Every value-taking option needs a following argument. Under `set -u` a
  # trailing option would otherwise abort with an opaque "unbound variable"
  # error when "$2" is read below.
  case "$1" in
    --instances-jsonl|--predictions-out|--model-name|--repos-root|--worktrees-root|--timeout-seconds|--limit|--instance-id)
      [ $# -ge 2 ] || { echo "$1 requires a value" >&2; usage 1; }
      ;;
  esac
  case "$1" in
    --instances-jsonl) INSTANCES_JSONL="$2"; shift 2;;
    --predictions-out) PREDICTIONS_OUT="$2"; shift 2;;
    --model-name) MODEL_NAME="$2"; shift 2;;
    --repos-root) REPOS_ROOT="$2"; shift 2;;
    --worktrees-root) WORKTREES_ROOT="$2"; shift 2;;
    --timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
    --copy-devlyn-context) COPY_DEVLYN_CONTEXT=1; shift;;
    --resume) RESUME=1; shift;;
    --limit) LIMIT="$2"; shift 2;;
    --instance-id) INSTANCE_IDS+=("$2"); shift 2;;
    -h|--help) usage 0;;
    *) echo "unknown arg: $1" >&2; usage 1;;
  esac
done
47
+
48
# Exit with an error unless the value is a non-empty string of digits that is
# greater than zero.
#   $1 - option name used in the error message
#   $2 - value to check
require_positive_int() {
  case "$2" in
    ''|*[!0-9]*)
      echo "$1 must be an integer" >&2
      exit 1
      ;;
  esac
  if [ ! "$2" -gt 0 ]; then
    echo "$1 must be > 0" >&2
    exit 1
  fi
}

# Both paths are mandatory, and the instances file must already exist.
if [ -z "$INSTANCES_JSONL" ] || [ -z "$PREDICTIONS_OUT" ]; then
  usage 1
fi
if [ ! -f "$INSTANCES_JSONL" ]; then
  echo "instances JSONL not found: $INSTANCES_JSONL" >&2
  exit 1
fi
require_positive_int --timeout-seconds "$TIMEOUT_SECONDS"
# --limit is optional; it is validated only when supplied.
if [ -n "$LIMIT" ]; then
  require_positive_int --limit "$LIMIT"
fi
# The solver shells out to the `claude` CLI, so it must be on PATH.
if ! command -v claude >/dev/null 2>&1; then
  echo "claude command not found" >&2
  exit 1
fi
mkdir -p "$REPOS_ROOT" "$WORKTREES_ROOT" "$(dirname "$PREDICTIONS_OUT")"
59
+
60
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
# TMP_IDS holds the selected instance ids, one per line.
# TMP_SELECTED_INSTANCES holds the matching JSONL rows.
TMP_IDS="$(mktemp)"
TMP_SELECTED_INSTANCES="$(mktemp)"
trap 'rm -f "$TMP_IDS" "$TMP_SELECTED_INSTANCES"' EXIT

# ${arr[@]+"${arr[@]}"} expands to nothing when the array is empty. A plain
# "${INSTANCE_IDS[@]}" aborts under `set -u` on bash older than 4.4 whenever
# no --instance-id was given.
python3 - "$INSTANCES_JSONL" "$TMP_SELECTED_INSTANCES" "$LIMIT" ${INSTANCE_IDS[@]+"${INSTANCE_IDS[@]}"} > "$TMP_IDS" <<'PY'
import json
import sys
from pathlib import Path

instances_path = Path(sys.argv[1])
selected_path = Path(sys.argv[2])
limit = int(sys.argv[3]) if sys.argv[3] else None
requested = set(sys.argv[4:])
found = set()
rows = []
with instances_path.open(encoding="utf8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            continue
        row = json.loads(line)
        instance_id = row.get("instance_id")
        if not isinstance(instance_id, str) or not instance_id:
            raise SystemExit(f"{instances_path}:{line_no}: missing instance_id")
        if requested:
            if instance_id not in requested:
                continue
            found.add(instance_id)
        if limit is None or len(rows) < limit:
            rows.append(row)
        # Stop once the limit is reached, but only after every requested id
        # has been seen: ids that exist further down the file must not be
        # reported as missing just because --limit capped the selection.
        if limit is not None and len(rows) >= limit and found >= requested:
            break
missing = sorted(requested - found)
if missing:
    raise SystemExit(f"requested instance ids not found: {', '.join(missing)}")
for row in rows:
    print(row["instance_id"])
with selected_path.open("w", encoding="utf8") as f:
    for row in rows:
        f.write(json.dumps(row) + "\n")
PY
99
+
100
# Run the Claude CLI once inside a prepared worktree, bounded by a watchdog.
#   $1 - worktree directory; solve-prompt.txt is read from it
#   $2 - timeout in seconds
# Writes direct-transcript.txt (stdout + stderr) and claude-direct-debug.log
# into the worktree. Returns the CLI's exit status, or 124 when the watchdog
# fired. Errexit is switched off while running and switched back on before
# returning.
run_solver() {
  local worktree
  # `cd ""` succeeds without moving, so an unresolved worktree must stop here
  # rather than let the solver run in the caller's directory.
  worktree="$(cd "$1" && pwd -P)" || return 1
  local timeout_seconds="$2"
  local prompt_file="$worktree/solve-prompt.txt"
  local transcript="$worktree/direct-transcript.txt"
  local debug_log="$worktree/claude-direct-debug.log"
  local timeout_flag="$worktree/.solver-timed-out"

  rm -f "$transcript" "$debug_log" "$timeout_flag"
  set +e
  # Job control gives the background job its own process group, so the
  # watchdog can signal the whole group through the negative pid below.
  set -m
  (
    cd "$worktree" || exit 1
    exec claude \
      -p "$(cat "$prompt_file")" \
      --dangerously-skip-permissions \
      --effort xhigh \
      --strict-mcp-config \
      --mcp-config '{"mcpServers":{}}' \
      --debug-file "$debug_log" \
      </dev/null
  ) > "$transcript" 2>&1 &
  local child_pid=$!
  set +m

  (
    # Run the timer in the background and wait on it: a trapped TERM
    # interrupts `wait` immediately, letting the trap kill the timer. A
    # foreground `sleep` would be orphaned when the watchdog is cancelled and
    # keep the script's stdout/stderr open for up to the full timeout.
    timer_pid=""
    trap '[ -z "$timer_pid" ] || kill "$timer_pid" 2>/dev/null; exit 0' TERM
    sleep "$timeout_seconds" &
    timer_pid=$!
    wait "$timer_pid"
    if kill -0 "$child_pid" 2>/dev/null; then
      # Mark the timeout, then escalate from TERM to KILL after 5 seconds.
      : > "$timeout_flag"
      kill -TERM -- "-$child_pid" 2>/dev/null
      sleep 5
      kill -KILL -- "-$child_pid" 2>/dev/null
    fi
  ) &
  local watchdog_pid=$!

  wait "$child_pid"
  local invoke_exit=$?
  # The solver is done; cancel the watchdog and reap it.
  kill -TERM "$watchdog_pid" 2>/dev/null || true
  wait "$watchdog_pid" 2>/dev/null || true

  if [ -f "$timeout_flag" ]; then
    rm -f "$timeout_flag"
    invoke_exit=124
  fi
  set -e
  return "$invoke_exit"
}
149
+
150
# Write the worktree's changes to <worktree>/patch.diff as a binary-safe diff.
#   $1 - worktree directory
# Untracked files are registered with `git add -N` (best effort) so that new
# files show up in `git diff`. Harness files are excluded from both commands.
write_patch() {
  local worktree
  worktree="$(cd "$1" && pwd -P)"
  # One exclude list shared by both git commands so they cannot drift apart.
  # patch.diff and solver-result.json are written by this script itself; they
  # are excluded so a reused worktree cannot leak them into the patch.
  # NOTE(review): whether the prepare step reuses a worktree is not visible
  # here — confirm.
  local -a excludes=(
    ':(exclude).claude/**'
    ':(exclude)CLAUDE.md'
    ':(exclude)benchmark/**'
    ':(exclude)docs/roadmap/phase-1/*.md'
    ':(exclude)solve-prompt.txt'
    ':(exclude)direct-transcript.txt'
    ':(exclude)claude-direct-debug.log'
    ':(exclude)latest'
    ':(exclude).solver-timed-out'
    ':(exclude)patch.diff'
    ':(exclude)solver-result.json'
  )
  (
    cd "$worktree"
    git add -N -- . "${excludes[@]}" >/dev/null 2>&1 || true
    git diff --binary -- . "${excludes[@]}" > patch.diff
  )
}
177
+
178
# Per-instance pipeline: prepare the worktree, run the solver, capture the
# patch, and write a solver-result.json summary.
while IFS= read -r instance_id; do
  if [ -z "$instance_id" ]; then
    continue
  fi
  worktree="$WORKTREES_ROOT/$instance_id"
  # With --resume, an instance that already has a non-empty patch is skipped.
  if [ "$RESUME" -eq 1 ] && [ -s "$worktree/patch.diff" ]; then
    echo "[swebench-solver] skip existing patch: $instance_id"
    continue
  fi

  echo "[swebench-solver] prepare: $instance_id"
  prepare_cmd=(
    python3 "$SCRIPT_DIR/prepare-swebench-solver-worktree.py"
    --instances-jsonl "$INSTANCES_JSONL"
    --instance-id "$instance_id"
    --repos-root "$REPOS_ROOT"
    --worktrees-root "$WORKTREES_ROOT"
  )
  [ "$COPY_DEVLYN_CONTEXT" -ne 1 ] || prepare_cmd+=(--copy-devlyn-context)
  "${prepare_cmd[@]}" > "$worktree.prepare.json"

  echo "[swebench-solver] solve: $instance_id"
  # A non-zero solver exit is recorded rather than aborting the batch.
  invoke_exit=0
  run_solver "$worktree" "$TIMEOUT_SECONDS" || invoke_exit=$?
  write_patch "$worktree"
  python3 - "$worktree" "$instance_id" "$invoke_exit" <<'PY'
import json
import subprocess
import sys
from pathlib import Path

worktree_arg, instance_id, exit_arg = sys.argv[1:4]
worktree = Path(worktree_arg)
invoke_exit = int(exit_arg)
patch = worktree / "patch.diff"
diff_stat = subprocess.run(
    ["git", "-C", str(worktree), "diff", "--stat", "--", "."],
    text=True,
    capture_output=True,
    check=False,
).stdout.strip()
patch_bytes = patch.stat().st_size if patch.exists() else 0
report = {
    "instance_id": instance_id,
    "invoke_exit": invoke_exit,
    "patch_path": str(patch),
    "patch_bytes": patch_bytes,
    "diff_stat": diff_stat,
}
rendered = json.dumps(report, indent=2)
(worktree / "solver-result.json").write_text(rendered + "\n", encoding="utf8")
print(rendered)
PY
done < "$TMP_IDS"
233
+
234
# Gather every worktree's patch.diff into the predictions JSONL. This is the
# last command, so its exit status is the script's.
collect_cmd=(
  python3 "$SCRIPT_DIR/collect-swebench-predictions.py"
  --patch-root "$WORKTREES_ROOT"
  --instances-jsonl "$TMP_SELECTED_INSTANCES"
  --model-name "$MODEL_NAME"
  --out "$PREDICTIONS_OUT"
  --allow-empty
)
"${collect_cmd[@]}"