npm - devlyn-cli - Versions diffs - 2.1.0 → 2.2.0 - Mend

devlyn-cli 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (135) hide show

package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh ADDED Viewed

@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# run-headroom-candidate.sh — calibrate candidate fixtures for L2/pair headroom.
+#
+# Runs only the arms needed by headroom-gate.py: bare and solo_claude.
+# Then blind-judges those two arms and applies the mechanical gate.
+set -euo pipefail
+usage() {
+  local code="${1:-1}"
+  echo "usage: $0 [--run-id ID] <fixture> [<fixture> ...]" >&2
+  exit "$code"
+}
+RUN_ID=""
+FIXTURES=()
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --run-id) RUN_ID="$2"; shift 2;;
+    -h|--help) usage 0;;
+    F[0-9]*) FIXTURES+=("$1"); shift;;
+    *) echo "unknown arg: $1" >&2; usage;;
+  esac
+done
+[ ${#FIXTURES[@]} -gt 0 ] || usage
+BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"
+if [ -z "$RUN_ID" ]; then
+  TS=$(date -u +%Y%m%dT%H%M%SZ)
+  SHA=$(git -C "$REPO_ROOT" rev-parse --short HEAD 2>/dev/null || echo nogit)
+  RUN_ID="${TS}-${SHA}-headroom"
+fi
+echo ""
+echo "═══ Headroom Candidate Run ═══"
+echo "Run-id:   $RUN_ID"
+echo "Fixtures: ${FIXTURES[*]}"
+echo "Arms:     bare solo_claude"
+if [ ${#FIXTURES[@]} -lt 2 ]; then
+  echo "Gate:     will FAIL set gate unless at least 2 fixtures are supplied"
+fi
+echo ""
+# Mirror every skill directory from config/skills/ into .claude/skills/,
+# skipping the "*-workspace" directories listed below. Each skill is copied
+# to a hidden staging directory first and only then moved into place.
+SRC_SKILLS="$REPO_ROOT/config/skills"
+DST_SKILLS="$REPO_ROOT/.claude/skills"
+mkdir -p "$DST_SKILLS"
+mirrored=0
+for src_dir in "$SRC_SKILLS"/*/; do
+  # An unmatched glob stays literal; the directory test filters it out.
+  if [ ! -d "$src_dir" ]; then
+    continue
+  fi
+  name=$(basename "$src_dir")
+  case "$name" in
+    devlyn:auto-resolve-workspace) continue ;;
+    devlyn:ideate-workspace) continue ;;
+    preflight-workspace) continue ;;
+    roadmap-archival-workspace) continue ;;
+  esac
+  target="$DST_SKILLS/$name"
+  staging="$DST_SKILLS/.${name}.staging"
+  rm -rf "$staging"
+  cp -R "$src_dir" "$staging"
+  rm -rf "$target"
+  mv "$staging" "$target"
+  mirrored=$((mirrored + 1))
+done
+echo "[headroom] mirrored $mirrored committed skill(s): config/skills/ -> .claude/skills/"
+# For each fixture run the two arms, then the judge. A failing arm or judge
+# is reported on stdout but never stops the loop.
+for fid in "${FIXTURES[@]}"; do
+  for arm in bare solo_claude; do
+    echo "[headroom] ► $fid / $arm"
+    if ! bash "$BENCH_ROOT/scripts/run-fixture.sh" \
+      --fixture "$fid" --arm "$arm" --run-id "$RUN_ID"; then
+      echo "[headroom] ✗ $fid / $arm (arm failure tolerated; artifacts may still exist)"
+    fi
+  done
+  echo "[headroom] ► judge $fid"
+  if ! bash "$BENCH_ROOT/scripts/judge.sh" --fixture "$fid" --run-id "$RUN_ID"; then
+    echo "[headroom] ✗ judge failed for $fid"
+  fi
+done
+echo ""
+set +e
+python3 "$BENCH_ROOT/scripts/headroom-gate.py" \
+  --run-id "$RUN_ID" \
+  --out-json "$BENCH_ROOT/results/$RUN_ID/headroom-gate.json" \
+  --out-md "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
+GATE_EXIT=$?
+set -e
+cat "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
+exit "$GATE_EXIT"

package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh ADDED Viewed

@@ -0,0 +1,209 @@
+#!/usr/bin/env bash
+# Run a prepared SWE-bench frozen VERIFY corpus and gate the result set.
+set -euo pipefail
+usage() {
+  cat >&2 <<EOF
+usage: $0 --manifest <path> [--run-prefix ID] [--pair-mode forced|gated]
+          [--min-runs N] [--out-json <path>] [--out-md <path>]
+          [--max-pair-solo-wall-ratio N] [--timeout-seconds N]
+          [--run-ids-out <path>] [--resume-completed-arms]
+          [--prepare-only] [--gate-only-run-ids <path>]
+Reads the manifest from prepare-swebench-frozen-corpus.py, runs each prepared
+case through run-frozen-verify-pair.sh, then applies frozen-verify-gate.py to
+the resulting run ids. --prepare-only validates patch application without
+provider calls and skips the gate. --gate-only-run-ids reruns the gate over an
+existing newline-delimited run-id file without invoking providers.
+EOF
+  exit "${1:-1}"
+}
+# Option defaults; each can be overridden on the command line.
+MANIFEST=""
+RUN_PREFIX=""
+PAIR_MODE="gated"
+MIN_RUNS=2
+OUT_JSON=""
+OUT_MD=""
+MAX_PAIR_SOLO_WALL_RATIO=""
+PREPARE_ONLY=0
+GATE_ONLY_RUN_IDS=""
+TIMEOUT_SECONDS=""
+RUN_IDS_OUT=""
+RESUME_COMPLETED_ARMS=0
+# Fail with a usage error when a value-taking flag is the last argument.
+# Without this check "$2" trips `set -u` with an opaque "unbound variable".
+# Arguments: $1 - flag name, $2 - number of arguments still to be parsed.
+require_value() {
+  [ "$2" -ge 2 ] || { echo "$1 requires a value" >&2; usage 1; }
+}
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --manifest) require_value "$1" $#; MANIFEST="$2"; shift 2;;
+    --run-prefix) require_value "$1" $#; RUN_PREFIX="$2"; shift 2;;
+    --pair-mode) require_value "$1" $#; PAIR_MODE="$2"; shift 2;;
+    --min-runs) require_value "$1" $#; MIN_RUNS="$2"; shift 2;;
+    --out-json) require_value "$1" $#; OUT_JSON="$2"; shift 2;;
+    --out-md) require_value "$1" $#; OUT_MD="$2"; shift 2;;
+    --max-pair-solo-wall-ratio) require_value "$1" $#; MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
+    --timeout-seconds) require_value "$1" $#; TIMEOUT_SECONDS="$2"; shift 2;;
+    --run-ids-out) require_value "$1" $#; RUN_IDS_OUT="$2"; shift 2;;
+    --resume-completed-arms) RESUME_COMPLETED_ARMS=1; shift;;
+    --prepare-only) PREPARE_ONLY=1; shift;;
+    --gate-only-run-ids) require_value "$1" $#; GATE_ONLY_RUN_IDS="$2"; shift 2;;
+    -h|--help) usage 0;;
+    *) echo "unknown arg: $1" >&2; usage 1;;
+  esac
+done
+# Validate the parsed options before any work starts.
+[ -n "$MANIFEST" ] || usage 1
+[ -f "$MANIFEST" ] || { echo "manifest not found: $MANIFEST" >&2; exit 1; }
+[ "$PAIR_MODE" = "forced" ] || [ "$PAIR_MODE" = "gated" ] || { echo "--pair-mode must be forced|gated" >&2; exit 1; }
+case "$MIN_RUNS" in ''|*[!0-9]*) echo "--min-runs must be an integer" >&2; exit 1;; esac
+[ "$MIN_RUNS" -gt 0 ] || { echo "--min-runs must be > 0" >&2; exit 1; }
+if [ -n "$TIMEOUT_SECONDS" ]; then
+  case "$TIMEOUT_SECONDS" in ''|*[!0-9]*) echo "--timeout-seconds must be an integer" >&2; exit 1;; esac
+  [ "$TIMEOUT_SECONDS" -gt 0 ] || { echo "--timeout-seconds must be > 0" >&2; exit 1; }
+fi
+if [ -n "$MAX_PAIR_SOLO_WALL_RATIO" ]; then
+  # float() also parses "nan" and "inf"; neither is rejected by `<= 0`, so
+  # require a finite value explicitly.
+  python3 - "$MAX_PAIR_SOLO_WALL_RATIO" <<'PY' || { echo "--max-pair-solo-wall-ratio must be a positive number" >&2; exit 1; }
+import math
+import sys
+try:
+    value = float(sys.argv[1])
+except ValueError:
+    raise SystemExit(1)
+if not math.isfinite(value) or value <= 0:
+    raise SystemExit(1)
+PY
+fi
+[ -z "$GATE_ONLY_RUN_IDS" ] || [ -f "$GATE_ONLY_RUN_IDS" ] || { echo "run ids file not found: $GATE_ONLY_RUN_IDS" >&2; exit 1; }
+[ "$PREPARE_ONLY" -eq 0 ] || [ -z "$GATE_ONLY_RUN_IDS" ] || { echo "--prepare-only and --gate-only-run-ids are mutually exclusive" >&2; exit 1; }
+# Locate this script's directory and the benchmark root above it.
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+BENCH_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+# Default run prefix: current UTC timestamp plus a fixed suffix.
+if [ -z "$RUN_PREFIX" ]; then
+  RUN_PREFIX="$(date -u +%Y%m%dT%H%M%SZ)-swebench-frozen"
+fi
+# Scratch file collecting one run id per line; removed on exit.
+TMP_RUN_IDS="$(mktemp)"
+trap 'rm -f "$TMP_RUN_IDS"' EXIT
+ROW_FAILURES=0
+if [ -n "$GATE_ONLY_RUN_IDS" ]; then
+  cp "$GATE_ONLY_RUN_IDS" "$TMP_RUN_IDS"
+else
+  while IFS=$'\t' read -r index instance_id cases_root repo_dir diff_path; do
+    [ -n "$instance_id" ] || continue
+    run_id="${RUN_PREFIX}-${index}-${instance_id}"
+    safe_run_id="$(printf '%s' "$run_id" | tr -c 'A-Za-z0-9_.-' '-')"
+    echo "[swebench-frozen-corpus] ${index}: ${instance_id} -> ${safe_run_id}"
+    cmd=(
+      bash "$SCRIPT_DIR/run-frozen-verify-pair.sh"
+      --fixture "$instance_id"
+      --fixtures-root "$cases_root"
+      --base-repo "$repo_dir"
+      --diff "$diff_path"
+      --run-id "$safe_run_id"
+      --pair-mode "$PAIR_MODE"
+    )
+    if [ -n "$TIMEOUT_SECONDS" ]; then
+      cmd+=(--timeout-seconds "$TIMEOUT_SECONDS")
+    fi
+    if [ "$PREPARE_ONLY" -eq 1 ]; then
+      cmd+=(--prepare-only)
+    fi
+    if [ "$RESUME_COMPLETED_ARMS" -eq 1 ]; then
+      cmd+=(--resume-completed-arms)
+    fi
+    set +e
+    "${cmd[@]}" </dev/null
+    row_exit=$?
+    set -e
+    if [ "$row_exit" -ne 0 ]; then
+      echo "[swebench-frozen-corpus] row failed: ${safe_run_id} exit=${row_exit}" >&2
+      ROW_FAILURES=$((ROW_FAILURES + 1))
+      python3 - "$BENCH_ROOT/results/$safe_run_id" "$instance_id" "$row_exit" <<'PY'
+import json
+import pathlib
+import sys
+run_root = pathlib.Path(sys.argv[1])
+instance_id = sys.argv[2]
+row_exit = int(sys.argv[3])
+run_root.mkdir(parents=True, exist_ok=True)
+for arm in ("solo", "pair"):
+    arm_root = run_root / arm
+    arm_root.mkdir(parents=True, exist_ok=True)
+    input_path = arm_root / "input.md"
+    if not input_path.exists():
+        input_path.write_text(
+            f"Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/{instance_id}.md.\n",
+            encoding="utf8",
+        )
+compare_path = run_root / "compare.json"
+if not compare_path.exists():
+    compare_path.write_text(
+        json.dumps(
+            {
+                "solo": {"invoke_exit": row_exit, "timed_out": False},
+                "pair": {"invoke_exit": row_exit, "timed_out": False, "pair_mode": False},
+                "comparison": {
+                    "pair_trigger_missed": False,
+                    "pair_verdict_lift": False,
+                    "pair_internal_verdict_lift": False,
+                    "row_failed_before_compare": True,
+                    "row_exit": row_exit,
+                },
+            },
+            indent=2,
+        )
+        + "\n",
+        encoding="utf8",
+    )
+PY
+    fi
+    printf '%s\n' "$safe_run_id" >> "$TMP_RUN_IDS"
+  done < <(python3 - "$MANIFEST" <<'PY'
+import json, pathlib, sys
+manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
+for index, row in enumerate(manifest.get("prepared") or [], start=1):
+    instance_id = row["instance_id"]
+    case_dir = pathlib.Path(row["case_dir"])
+    repo_dir = pathlib.Path(row["repo_dir"])
+    print("\t".join([
+        str(index),
+        instance_id,
+        str(case_dir.parent),
+        str(repo_dir),
+        str(case_dir / "model.patch"),
+    ]))
+PY
+)
+fi
+# Optionally persist the collected run ids for a later gate-only run.
+if [ -n "$RUN_IDS_OUT" ]; then
+  run_ids_dir="$(dirname "$RUN_IDS_OUT")"
+  mkdir -p "$run_ids_dir"
+  cp "$TMP_RUN_IDS" "$RUN_IDS_OUT"
+fi
+# Prepare-only mode stops here: the gate is skipped and the exit status
+# reflects whether any row failed.
+if [ "$PREPARE_ONLY" -eq 1 ]; then
+  echo "[swebench-frozen-corpus] prepare-only complete; gate skipped"
+  [ "$ROW_FAILURES" -gt 0 ] || exit 0
+  echo "[swebench-frozen-corpus] row failures: $ROW_FAILURES" >&2
+  exit 1
+fi
+run_count="$(wc -l < "$TMP_RUN_IDS" | tr -d ' ')"
+[ "$run_count" -gt 0 ] || { echo "manifest prepared no runs" >&2; exit 1; }
+fixtures_root="$(python3 - "$MANIFEST" <<'PY'
+import json, pathlib, sys
+manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
+print(manifest["cases_root"])
+PY
+)"
+gate_args=(python3 "$SCRIPT_DIR/frozen-verify-gate.py" --fixtures-root "$fixtures_root" --min-runs "$MIN_RUNS")
+[ -z "$OUT_JSON" ] || gate_args+=(--out-json "$OUT_JSON")
+[ -z "$OUT_MD" ] || gate_args+=(--out-md "$OUT_MD")
+[ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || gate_args+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
+while IFS= read -r run_id; do
+  gate_args+=(--run-id "$run_id")
+done < "$TMP_RUN_IDS"
+"${gate_args[@]}"

package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh ADDED Viewed

@@ -0,0 +1,239 @@
+#!/usr/bin/env bash
+# Prepare SWE-bench solver worktrees, run a direct solver, and collect patches.
+set -euo pipefail
+usage() {
+  cat >&2 <<EOF
+usage: $0 --instances-jsonl <path> --predictions-out <path>
+          [--instance-id ID ...] [--limit N] [--model-name NAME]
+          [--repos-root <path>] [--worktrees-root <path>]
+          [--timeout-seconds N] [--copy-devlyn-context] [--resume]
+Runs Claude Code directly against each selected SWE-bench instance without
+reading gold patch/test_patch fields. Each worktree receives patch.diff plus
+direct-transcript.txt and claude-direct-debug.log. At the end, patch.diff files
+are collected into a SWE-bench predictions JSONL.
+EOF
+  exit "${1:-1}"
+}
+INSTANCES_JSONL=""
+PREDICTIONS_OUT=""
+MODEL_NAME="claude-direct"
+REPOS_ROOT="benchmark/auto-resolve/external/swebench/repos-solver"
+WORKTREES_ROOT="benchmark/auto-resolve/external/swebench/worktrees"
+TIMEOUT_SECONDS=2400
+COPY_DEVLYN_CONTEXT=0
+RESUME=0
+LIMIT=""
+INSTANCE_IDS=()
+while [ $# -gt 0 ]; do
+  case "$1" in
+    --instances-jsonl) INSTANCES_JSONL="$2"; shift 2;;
+    --predictions-out) PREDICTIONS_OUT="$2"; shift 2;;
+    --model-name) MODEL_NAME="$2"; shift 2;;
+    --repos-root) REPOS_ROOT="$2"; shift 2;;
+    --worktrees-root) WORKTREES_ROOT="$2"; shift 2;;
+    --timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
+    --copy-devlyn-context) COPY_DEVLYN_CONTEXT=1; shift;;
+    --resume) RESUME=1; shift;;
+    --limit) LIMIT="$2"; shift 2;;
+    --instance-id) INSTANCE_IDS+=("$2"); shift 2;;
+    -h|--help) usage 0;;
+    *) echo "unknown arg: $1" >&2; usage 1;;
+  esac
+done
+[ -n "$INSTANCES_JSONL" ] || usage 1
+[ -n "$PREDICTIONS_OUT" ] || usage 1
+[ -f "$INSTANCES_JSONL" ] || { echo "instances JSONL not found: $INSTANCES_JSONL" >&2; exit 1; }
+case "$TIMEOUT_SECONDS" in ''|*[!0-9]*) echo "--timeout-seconds must be an integer" >&2; exit 1;; esac
+[ "$TIMEOUT_SECONDS" -gt 0 ] || { echo "--timeout-seconds must be > 0" >&2; exit 1; }
+if [ -n "$LIMIT" ]; then
+  case "$LIMIT" in ''|*[!0-9]*) echo "--limit must be an integer" >&2; exit 1;; esac
+  [ "$LIMIT" -gt 0 ] || { echo "--limit must be > 0" >&2; exit 1; }
+fi
+command -v claude >/dev/null 2>&1 || { echo "claude command not found" >&2; exit 1; }
+mkdir -p "$REPOS_ROOT" "$WORKTREES_ROOT" "$(dirname "$PREDICTIONS_OUT")"
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+TMP_IDS="$(mktemp)"
+TMP_SELECTED_INSTANCES="$(mktemp)"
+trap 'rm -f "$TMP_IDS" "$TMP_SELECTED_INSTANCES"' EXIT
+python3 - "$INSTANCES_JSONL" "$TMP_SELECTED_INSTANCES" "$LIMIT" "${INSTANCE_IDS[@]}" > "$TMP_IDS" <<'PY'
+import json
+import sys
+from pathlib import Path
+instances_path = Path(sys.argv[1])
+selected_path = Path(sys.argv[2])
+limit = int(sys.argv[3]) if sys.argv[3] else None
+requested = sys.argv[4:]
+requested_set = set(requested)
+rows = []
+with instances_path.open(encoding="utf8") as f:
+    for line_no, line in enumerate(f, start=1):
+        if not line.strip():
+            continue
+        row = json.loads(line)
+        instance_id = row.get("instance_id")
+        if not isinstance(instance_id, str) or not instance_id:
+            raise SystemExit(f"{instances_path}:{line_no}: missing instance_id")
+        if requested_set and instance_id not in requested_set:
+            continue
+        rows.append(row)
+        if limit is not None and len(rows) >= limit:
+            break
+if requested_set:
+    missing = sorted(requested_set - {row["instance_id"] for row in rows})
+    if missing:
+        raise SystemExit(f"requested instance ids not found: {', '.join(missing)}")
+for instance_id in rows:
+    print(instance_id["instance_id"])
+with selected_path.open("w", encoding="utf8") as f:
+    for row in rows:
+        f.write(json.dumps(row) + "\n")
+PY
+run_solver() {  # Run `claude -p` inside worktree $1 under a $2-second watchdog; returns the solver's wait status, or 124 when the watchdog fired.
+  local worktree
+  worktree="$(cd "$1" && pwd -P)"  # physical absolute path of the worktree
+  local timeout_seconds="$2"
+  local prompt_file="$worktree/solve-prompt.txt"  # prompt text is read from here; presumably written by the prepare step — verify
+  local transcript="$worktree/direct-transcript.txt"  # receives the solver's stdout and stderr
+  local debug_log="$worktree/claude-direct-debug.log"  # handed to claude via --debug-file
+  local timeout_flag="$worktree/.solver-timed-out"  # marker file the watchdog creates when it fires
+  rm -f "$transcript" "$debug_log" "$timeout_flag"  # drop leftovers from an earlier attempt
+  set +e  # statuses are collected by hand below; errexit is switched back on before returning
+  set -m  # job control on while the solver is launched; presumably so it gets its own process group for the group kills below — confirm
+  (
+    cd "$worktree"
+    exec claude \
+      -p "$(cat "$prompt_file")" \
+      --dangerously-skip-permissions \
+      --effort xhigh \
+      --strict-mcp-config \
+      --mcp-config '{"mcpServers":{}}' \
+      --debug-file "$debug_log" \
+      </dev/null
+  ) > "$transcript" 2>&1 &
+  local child_pid=$!
+  set +m
+  (
+    sleep "$timeout_seconds"  # watchdog: wait out the timeout, then check on the solver
+    if kill -0 "$child_pid" 2>/dev/null; then
+      : > "$timeout_flag"  # record the timeout before signalling
+      kill -TERM -- "-$child_pid" 2>/dev/null  # negative pid: signal the whole process group
+      sleep 5  # grace period before the forced kill
+      kill -KILL -- "-$child_pid" 2>/dev/null
+    fi
+  ) &
+  local watchdog_pid=$!
+  wait "$child_pid"
+  local invoke_exit=$?
+  kill -TERM "$watchdog_pid" 2>/dev/null || true  # NOTE(review): this stops the watchdog subshell, but its in-flight `sleep` may linger until it expires — confirm
+  wait "$watchdog_pid" 2>/dev/null || true
+  if [ -f "$timeout_flag" ]; then
+    rm -f "$timeout_flag"
+    invoke_exit=124  # report a timeout regardless of the solver's own status
+  fi
+  set -e
+  return "$invoke_exit"
+}
+write_patch() {
+  local worktree
+  worktree="$(cd "$1" && pwd -P)"
+  (
+    cd "$worktree"
+    git add -N -- . \
+      ':(exclude).claude/**' \
+      ':(exclude)CLAUDE.md' \
+      ':(exclude)benchmark/**' \
+      ':(exclude)docs/roadmap/phase-1/*.md' \
+      ':(exclude)solve-prompt.txt' \
+      ':(exclude)direct-transcript.txt' \
+      ':(exclude)claude-direct-debug.log' \
+      ':(exclude)latest' \
+      ':(exclude).solver-timed-out' >/dev/null 2>&1 || true
+    git diff --binary -- . \
+      ':(exclude).claude/**' \
+      ':(exclude)CLAUDE.md' \
+      ':(exclude)benchmark/**' \
+      ':(exclude)docs/roadmap/phase-1/*.md' \
+      ':(exclude)solve-prompt.txt' \
+      ':(exclude)direct-transcript.txt' \
+      ':(exclude)claude-direct-debug.log' \
+      ':(exclude)latest' \
+      ':(exclude).solver-timed-out' > patch.diff
+  )
+}
+# Solve each selected instance in turn: prepare its worktree, run the solver,
+# write patch.diff, and record a small solver-result.json report.
+while IFS= read -r instance_id; do
+  if [ -z "$instance_id" ]; then
+    continue
+  fi
+  tree="$WORKTREES_ROOT/$instance_id"
+  # With --resume, an instance that already has a non-empty patch is skipped.
+  if [ "$RESUME" -eq 1 ] && [ -s "$tree/patch.diff" ]; then
+    echo "[swebench-solver] skip existing patch: $instance_id"
+    continue
+  fi
+  echo "[swebench-solver] prepare: $instance_id"
+  prepare_cmd=(python3 "$SCRIPT_DIR/prepare-swebench-solver-worktree.py")
+  prepare_cmd+=(--instances-jsonl "$INSTANCES_JSONL" --instance-id "$instance_id")
+  prepare_cmd+=(--repos-root "$REPOS_ROOT" --worktrees-root "$WORKTREES_ROOT")
+  [ "$COPY_DEVLYN_CONTEXT" -ne 1 ] || prepare_cmd+=(--copy-devlyn-context)
+  "${prepare_cmd[@]}" > "$tree.prepare.json"
+  echo "[swebench-solver] solve: $instance_id"
+  # A failed or timed-out solve is recorded, not fatal: whatever the solver
+  # changed is still captured as a patch.
+  solver_rc=0
+  run_solver "$tree" "$TIMEOUT_SECONDS" || solver_rc=$?
+  write_patch "$tree"
+  python3 - "$tree" "$instance_id" "$solver_rc" <<'PY'
+import json
+import subprocess
+import sys
+from pathlib import Path
+worktree = Path(sys.argv[1])
+patch = worktree / "patch.diff"
+stat = subprocess.run(
+    ["git", "-C", str(worktree), "diff", "--stat", "--", "."],
+    text=True,
+    capture_output=True,
+    check=False,
+)
+patch_bytes = patch.stat().st_size if patch.exists() else 0
+report = {
+    "instance_id": sys.argv[2],
+    "invoke_exit": int(sys.argv[3]),
+    "patch_path": str(patch),
+    "patch_bytes": patch_bytes,
+    "diff_stat": stat.stdout.strip(),
+}
+rendered = json.dumps(report, indent=2)
+(worktree / "solver-result.json").write_text(rendered + "\n", encoding="utf8")
+print(rendered)
+PY
+done < "$TMP_IDS"
+# Gather every patch.diff into the SWE-bench predictions JSONL.
+collect_cmd=(python3 "$SCRIPT_DIR/collect-swebench-predictions.py")
+collect_cmd+=(--patch-root "$WORKTREES_ROOT" --instances-jsonl "$TMP_SELECTED_INSTANCES")
+collect_cmd+=(--model-name "$MODEL_NAME" --out "$PREDICTIONS_OUT" --allow-empty)
+"${collect_cmd[@]}"