npm - devlyn-cli - Versions diffs - 2.2.2 → 2.3.1 - Mend

devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

package/benchmark/auto-resolve/scripts/run-headroom-candidate.sh CHANGED Viewed

@@ -8,25 +8,93 @@ set -euo pipefail
 # usage [code]: print the option summary to stderr, then exit with `code`
 # (1 when no argument is passed). The replacement text comes from a quoted
 # heredoc, so it is printed verbatim and names the script literally instead
 # of expanding $0 as the removed echo did.
 usage() {
   local code="${1:-1}"
-  echo "usage: $0 [--run-id ID] <fixture> [<fixture> ...]" >&2
+  cat >&2 <<'EOF'
+usage: run-headroom-candidate.sh [options] <fixture> [<fixture> ...]
+Options:
+  --run-id ID
+  --bare-max N       (default: 60)
+  --solo-max N       (default: 80)
+  --min-bare-headroom N  (default: 5)
+  --min-solo-headroom N  (default: 5)
+  --min-fixtures N   (default: 2)
+  --allow-rejected-fixtures
+                    allow rejected/ceiling fixtures for diagnostics only
+  --dry-run          validate args/fixtures and print replay command only
+EOF
   exit "$code"
 }
# require_value FLAG [VALUE]: guard for options that take an argument.
# Prints "<FLAG> requires a value" to stderr and exits 1 when VALUE is
# missing, empty, or begins with "--" (i.e. looks like another option).
# Consequence: an option value can never itself start with "--".
+require_value() {
+  local flag="$1"
+  local value="${2:-}"
+  if [ -z "$value" ] || [[ "$value" == --* ]]; then
+    echo "$flag requires a value" >&2
+    exit 1
+  fi
+}
 # Option defaults followed by the argument loop. Value-taking flags are
 # checked by require_value before "$2" is read; tokens matching [FS][0-9]*
 # are collected as fixture ids (the removed pattern accepted F* only);
 # anything else prints "unknown arg" and calls usage, which exits 1.
 RUN_ID=""
+BARE_MAX=60
+SOLO_MAX=80
+MIN_BARE_HEADROOM=5
+MIN_SOLO_HEADROOM=5
+MIN_FIXTURES=2
+ALLOW_REJECTED_FIXTURES=0
+DRY_RUN=0
 FIXTURES=()
 while [ $# -gt 0 ]; do
   case "$1" in
-    --run-id) RUN_ID="$2"; shift 2;;
+    --run-id) require_value "$1" "${2:-}"; RUN_ID="$2"; shift 2;;
+    --bare-max) require_value "$1" "${2:-}"; BARE_MAX="$2"; shift 2;;
+    --solo-max) require_value "$1" "${2:-}"; SOLO_MAX="$2"; shift 2;;
+    --min-bare-headroom) require_value "$1" "${2:-}"; MIN_BARE_HEADROOM="$2"; shift 2;;
+    --min-solo-headroom) require_value "$1" "${2:-}"; MIN_SOLO_HEADROOM="$2"; shift 2;;
+    --min-fixtures) require_value "$1" "${2:-}"; MIN_FIXTURES="$2"; shift 2;;
+    --allow-rejected-fixtures) ALLOW_REJECTED_FIXTURES=1; shift;;
+    --dry-run) DRY_RUN=1; shift;;
     -h|--help) usage 0;;
-    F[0-9]*) FIXTURES+=("$1"); shift;;
+    [FS][0-9]*) FIXTURES+=("$1"); shift;;
     *) echo "unknown arg: $1" >&2; usage;;
   esac
 done
# Validate the numeric options. Each value is read through indirect expansion
# and must match ^[0-9]+$, otherwise the script exits 1 naming the offending
# flag; --min-fixtures must additionally be >= 1.
# NOTE(review): a value that passed the digits-only regex cannot be negative,
# so the two "must be >= 0" branches below look unreachable — confirm whether
# they are kept deliberately as defence in depth.
+for threshold in BARE_MAX SOLO_MAX MIN_BARE_HEADROOM MIN_SOLO_HEADROOM MIN_FIXTURES; do
+  value="${!threshold}"
+  case "$threshold" in
+    BARE_MAX) flag="bare-max" ;;
+    SOLO_MAX) flag="solo-max" ;;
+    MIN_BARE_HEADROOM) flag="min-bare-headroom" ;;
+    MIN_SOLO_HEADROOM) flag="min-solo-headroom" ;;
+    MIN_FIXTURES) flag="min-fixtures" ;;
+  esac
+  if [[ ! "$value" =~ ^[0-9]+$ ]]; then
+    echo "--$flag must be an integer: $value" >&2
+    exit 1
+  fi
+done
+if [ "$MIN_FIXTURES" -lt 1 ]; then
+  echo "--min-fixtures must be >= 1" >&2
+  exit 1
+fi
+if [ "$MIN_BARE_HEADROOM" -lt 0 ]; then
+  echo "--min-bare-headroom must be >= 0" >&2
+  exit 1
+fi
+if [ "$MIN_SOLO_HEADROOM" -lt 0 ]; then
+  echo "--min-solo-headroom must be >= 0" >&2
+  exit 1
+fi
 # Require at least one fixture, derive BENCH_ROOT (parent of this script's
 # directory) and REPO_ROOT (two levels above BENCH_ROOT), then source the
 # rejected-fixture registry and fail fast if it did not define the
 # rejected_pair_fixture_reason function.
 [ ${#FIXTURES[@]} -gt 0 ] || usage
 BENCH_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
 REPO_ROOT="$(cd "$BENCH_ROOT/../.." && pwd)"
+source "$BENCH_ROOT/scripts/pair-rejected-fixtures.sh"
+if ! declare -F rejected_pair_fixture_reason >/dev/null; then
+  echo "rejected fixture registry must define rejected_pair_fixture_reason" >&2
+  exit 1
+fi
 if [ -z "$RUN_ID" ]; then
   TS=$(date -u +%Y%m%dT%H%M%SZ)
@@ -34,16 +102,177 @@ if [ -z "$RUN_ID" ]; then
   RUN_ID="${TS}-${SHA}-headroom"
 fi
# print_command: write a "Command: ..." line that replays this invocation
# with every threshold spelled out. The prefix is `npx devlyn-cli benchmark
# headroom` when DEVLYN_BENCHMARK_CLI_SUBCOMMAND equals "headroom", otherwise
# `bash $0`. The two boolean flags are appended only when set, and each word
# is emitted through printf %q so the line is shell-quoted.
+print_command() {
+  local cmd
+  if [ "${DEVLYN_BENCHMARK_CLI_SUBCOMMAND:-}" = "headroom" ]; then
+    cmd=(npx devlyn-cli benchmark headroom --run-id "$RUN_ID")
+  else
+    cmd=(bash "$0" --run-id "$RUN_ID")
+  fi
+  cmd+=(--bare-max "$BARE_MAX")
+  cmd+=(--solo-max "$SOLO_MAX")
+  cmd+=(--min-bare-headroom "$MIN_BARE_HEADROOM")
+  cmd+=(--min-solo-headroom "$MIN_SOLO_HEADROOM")
+  cmd+=(--min-fixtures "$MIN_FIXTURES")
+  [ "$ALLOW_REJECTED_FIXTURES" -eq 0 ] || cmd+=(--allow-rejected-fixtures)
+  [ "$DRY_RUN" -eq 0 ] || cmd+=(--dry-run)
+  cmd+=("${FIXTURES[@]}")
+  printf 'Command: '
+  printf '%q ' "${cmd[@]}"
+  printf '\n'
+}
# fixture_exists FID: succeeds when FID is a directory under either
# $BENCH_ROOT/fixtures or $BENCH_ROOT/shadow-fixtures.
+fixture_exists() {
+  local fid="$1"
+  [ -d "$BENCH_ROOT/fixtures/$fid" ] || [ -d "$BENCH_ROOT/shadow-fixtures/$fid" ]
+}
# fixture_dir FID: print the fixture's directory, preferring fixtures/ over
# shadow-fixtures/.
# NOTE(review): the shadow-fixtures path is printed without an existence
# check; the caller visible in this diff runs fixture_exists first.
+fixture_dir() {
+  local fid="$1"
+  if [ -d "$BENCH_ROOT/fixtures/$fid" ]; then
+    printf '%s\n' "$BENCH_ROOT/fixtures/$fid"
+  else
+    printf '%s\n' "$BENCH_ROOT/shadow-fixtures/$fid"
+  fi
+}
# is_shadow_fixture FID: succeeds when shadow-fixtures/FID is a directory.
# It does not look at fixtures/, so it is also true for an id present in both.
+is_shadow_fixture() {
+  local fid="$1"
+  [ -d "$BENCH_ROOT/shadow-fixtures/$fid" ]
+}
# retired_fixture_exists FID: succeeds when fixtures/retired/FID is a
# directory under BENCH_ROOT.
+retired_fixture_exists() {
+  local fid="$1"
+  [ -d "$BENCH_ROOT/fixtures/retired/$fid" ]
+}
# fixture_smoke_only FID: succeeds when FID is exactly "S1" or begins with
# "S1-"; other ids (for example "S10") do not match.
+fixture_smoke_only() {
+  local fid="$1"
+  [[ "$fid" == S1 || "$fid" == S1-* ]]
+}
# fixture_category DIR: print the "category" value from DIR/metadata.json
# using an inline python3 script. Prints an empty line when the file does not
# exist or the key is absent.
# NOTE(review): only FileNotFoundError is handled — malformed JSON or a
# top-level value that is not an object would raise; confirm that failing
# loudly is the intended behaviour for broken metadata.
+fixture_category() {
+  local dir="$1"
+  python3 - "$dir/metadata.json" <<'PY'
+import json
+import sys
+try:
+    with open(sys.argv[1], encoding="utf-8") as handle:
+        print(json.load(handle).get("category", ""))
+except FileNotFoundError:
+    print("")
+PY
+}
# fixture_has_solo_headroom_hypothesis DIR: run solo-headroom-hypothesis.py
# with DIR/expected.json and DIR/spec.md; the function's status is that
# script's exit status. The check itself lives in the script (not shown here).
+fixture_has_solo_headroom_hypothesis() {
+  local dir="$1"
+  python3 "$BENCH_ROOT/scripts/solo-headroom-hypothesis.py" --expected-json "$dir/expected.json" "$dir/spec.md"
+}
# fixture_has_solo_ceiling_avoidance_note DIR: run solo-ceiling-avoidance.py
# on DIR/NOTES.md and return its exit status. What the script accepts is
# defined in that script (not shown here).
+fixture_has_solo_ceiling_avoidance_note() {
+  local dir="$1"
+  python3 "$BENCH_ROOT/scripts/solo-ceiling-avoidance.py" "$dir/NOTES.md"
+}
# fixture_has_pair_evidence FID: succeeds when some
# $BENCH_ROOT/results/*/full-pipeline-pair-gate.json has verdict "PASS" and a
# "rows" list containing an entry for FID with status "PASS". Fails when the
# results directory is missing or no file qualifies. Files that cannot be read
# or parsed as JSON are skipped.
# NOTE(review): a file whose top-level JSON is not an object would make
# data.get raise instead of being skipped, ending the scan early — confirm
# whether such files can occur.
+fixture_has_pair_evidence() {
+  local fid="$1"
+  python3 - "$BENCH_ROOT/results" "$fid" <<'PY'
+import json
+import pathlib
+import sys
+results = pathlib.Path(sys.argv[1])
+fixture = sys.argv[2]
+if not results.is_dir():
+    sys.exit(1)
+for path in results.glob("*/full-pipeline-pair-gate.json"):
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except (OSError, json.JSONDecodeError):
+        continue
+    if data.get("verdict") != "PASS":
+        continue
+    rows = data.get("rows")
+    if not isinstance(rows, list):
+        continue
+    for row in rows:
+        if isinstance(row, dict) and row.get("fixture") == fixture and row.get("status") == "PASS":
+            sys.exit(0)
+sys.exit(1)
+PY
+}
# validate_fixtures: check every id in FIXTURES and exit 1 after the loop if
# any check failed, so all problems are reported rather than only the first.
#   - unknown id: reported as retired when fixtures/retired/<id> exists,
#     otherwise as not found
#   - smoke-only id (S1 / S1-*) outside --dry-run: rejected
#   - id with a non-empty rejected_pair_fixture_reason: rejected unless
#     --allow-rejected-fixtures was given (the `|| true` keeps a non-zero
#     status from the registry function from aborting the assignment)
#   - id with no rejection reason, category "high-risk" and no recorded pair
#     evidence: must pass the solo-headroom hypothesis check, and shadow
#     fixtures must also pass the solo-ceiling-avoidance check
+validate_fixtures() {
+  local missing=0
+  local fid reason dir category
+  for fid in "${FIXTURES[@]}"; do
+    if ! fixture_exists "$fid"; then
+      if retired_fixture_exists "$fid"; then
+        echo "fixture is retired and is not rerun by pair-candidate runners: $fid. Use preserved results/docs for historical replay." >&2
+        missing=1
+        continue
+      fi
+      echo "fixture not found in fixtures/ or shadow-fixtures/: $fid" >&2
+      missing=1
+      continue
+    fi
+    if [ "$DRY_RUN" -eq 0 ] && fixture_smoke_only "$fid"; then
+      echo "fixture is smoke-only and cannot run providers: $fid. Use --dry-run for runner/package validation." >&2
+      missing=1
+      continue
+    fi
+    reason="$(rejected_pair_fixture_reason "$fid" || true)"
+    if [ "$ALLOW_REJECTED_FIXTURES" -eq 0 ]; then
+      if [ -n "$reason" ]; then
+        echo "fixture rejected for pair-candidate runs: $fid ($reason). Use --allow-rejected-fixtures for diagnostics only." >&2
+        missing=1
+        continue
+      fi
+    fi
+    if [ -z "$reason" ]; then
+      dir="$(fixture_dir "$fid")"
+      category="$(fixture_category "$dir")"
+      if [ "$category" = "high-risk" ] && ! fixture_has_pair_evidence "$fid"; then
+        if ! fixture_has_solo_headroom_hypothesis "$dir"; then
+          echo "fixture spec.md needs a solo-headroom hypothesis with solo_claude miss and observable command from expected.json before provider spend: $fid" >&2
+          missing=1
+        fi
+        if is_shadow_fixture "$fid" && ! fixture_has_solo_ceiling_avoidance_note "$dir"; then
+          echo "shadow fixture NOTES.md needs ## Solo ceiling avoidance with solo_claude, a rejected/solo-saturated control comparison, and headroom reasoning before provider spend: $fid" >&2
+          missing=1
+        fi
+      fi
+    fi
+  done
+  [ "$missing" -eq 0 ] || exit 1
+}
 # Print the run banner and the replay command, warn when fewer than
 # MIN_FIXTURES fixtures were supplied, then validate the fixtures. In
 # --dry-run mode the script stops here: exit 1 if the fixture count is below
 # MIN_FIXTURES, otherwise exit 0 before the statements that follow.
 echo ""
 echo "═══ Headroom Candidate Run ═══"
 echo "Run-id:   $RUN_ID"
 echo "Fixtures: ${FIXTURES[*]}"
 echo "Arms:     bare solo_claude"
-if [ ${#FIXTURES[@]} -lt 2 ]; then
-  echo "Gate:     will FAIL set gate unless at least 2 fixtures are supplied"
+echo "Gate:     bare <= $BARE_MAX (headroom >= $MIN_BARE_HEADROOM), solo_claude <= $SOLO_MAX (headroom >= $MIN_SOLO_HEADROOM), baseline evidence-complete, min fixtures $MIN_FIXTURES"
+[ "$DRY_RUN" -eq 0 ] || echo "Mode:     DRY RUN (no model/provider invocations)"
+print_command
+if [ ${#FIXTURES[@]} -lt "$MIN_FIXTURES" ]; then
+  echo "Gate:     will FAIL set gate unless at least $MIN_FIXTURES fixtures are supplied"
 fi
 echo ""
+validate_fixtures
+if [ "$DRY_RUN" -eq 1 ] && [ "${#FIXTURES[@]}" -lt "$MIN_FIXTURES" ]; then
+  echo "[headroom] DRY RUN failed — ${#FIXTURES[@]} fixture(s) supplied, --min-fixtures requires $MIN_FIXTURES." >&2
+  exit 1
+fi
+if [ "$DRY_RUN" -eq 1 ]; then
+  echo "[headroom] DRY RUN complete — fixtures resolved, no arms or judges executed."
+  exit 0
+fi
 SRC_SKILLS="$REPO_ROOT/config/skills"
 DST_SKILLS="$REPO_ROOT/.claude/skills"
 mkdir -p "$DST_SKILLS"
@@ -84,10 +313,24 @@ echo ""
 # Run headroom-gate.py with errexit switched off so its status can be
 # captured in GATE_EXIT, forwarding the five thresholds. The markdown report
 # is printed only if it exists (the removed line ran cat unconditionally), a
 # pass/fail line follows, and the script exits with the gate's status.
 set +e
 python3 "$BENCH_ROOT/scripts/headroom-gate.py" \
   --run-id "$RUN_ID" \
+  --bare-max "$BARE_MAX" \
+  --solo-max "$SOLO_MAX" \
+  --min-bare-headroom "$MIN_BARE_HEADROOM" \
+  --min-solo-headroom "$MIN_SOLO_HEADROOM" \
+  --min-fixtures "$MIN_FIXTURES" \
   --out-json "$BENCH_ROOT/results/$RUN_ID/headroom-gate.json" \
   --out-md "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
 GATE_EXIT=$?
 set -e
-cat "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
+if [ -f "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md" ]; then
+  cat "$BENCH_ROOT/results/$RUN_ID/headroom-gate.md"
+else
+  echo "[headroom] headroom gate report missing: $BENCH_ROOT/results/$RUN_ID/headroom-gate.md" >&2
+fi
+if [ "$GATE_EXIT" -eq 0 ]; then
+  echo "[headroom] headroom gate passed — candidate set accepted."
+else
+  echo "[headroom] headroom gate failed — candidate set rejected."
+fi
 exit "$GATE_EXIT"

package/benchmark/auto-resolve/scripts/run-iter-0033c.sh CHANGED Viewed

@@ -96,16 +96,29 @@ echo "[run-iter-0033c] RUN_ID=$RUN_ID"
 echo "[run-iter-0033c] RESULTS_DIR=$RESULTS_DIR"
 # Builds the draft manifest (the C1 summary is also passed as the
 # --l1-rerun-summary input) and reads fixtures_pair_eligible from it. The
 # replacement reader receives its paths through argv instead of having the
 # manifest path interpolated into Python source, and exits with an error
 # unless the field is a list of strings.
 # --- Determine pair-eligible set from manifest input bundle ---
-# Build a draft manifest using the C1 summary as the L1 placeholder; we'll
-# rebuild with the real L1 rerun summary at the end. For now we just need
-# the pair-eligible set for arm-selection per fixture.
+# Pair eligibility is pre-registered from C1/F9 before any iter-0033c arms run.
+# The later L1 rerun summary is archived into the final manifest for provenance;
+# it must not change the arm-selection set after execution has begun.
 DRAFT_MANIFEST="$RESULTS_DIR/manifest-draft.json"
 python3 benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py \
   --c1-summary "$C1_SUMMARY" \
   --f9-judge "$F9_JUDGE" \
   --l1-rerun-summary "$C1_SUMMARY" \
   --output "$DRAFT_MANIFEST"
-PAIR_ELIGIBLE=$(python3 -c "import json;print(' '.join(json.load(open('$DRAFT_MANIFEST'))['fixtures_pair_eligible']))")
+PAIR_ELIGIBLE=$(python3 - "$DRAFT_MANIFEST" "$REPO_ROOT/benchmark/auto-resolve/scripts" <<'PY'
+import pathlib
+import sys
+sys.path.insert(0, sys.argv[2])
+from pair_evidence_contract import loads_strict_json_object
+manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
+fixtures = manifest.get("fixtures_pair_eligible")
+if not isinstance(fixtures, list) or not all(isinstance(item, str) for item in fixtures):
+    raise SystemExit("manifest fixtures_pair_eligible must be a string array")
+print(" ".join(fixtures))
+PY
+)
 echo "[run-iter-0033c] pair-eligible: $PAIR_ELIGIBLE"
 # --- Per-fixture interleaved arm loop ---
@@ -161,50 +174,11 @@ done
 # The inline Python that assembled the L1 rerun summary is removed in favour
 # of iter-0033c-l1-summary.py, which is called with the same four inputs
 # (results directory, output path, run id, git sha) as named options.
 # NOTE(review): the new script is not part of this view — confirm it writes
 # the same summary fields the removed block did.
 # --- Build L1 rerun summary from solo_claude arm result.json + judge.json ---
 L1_RERUN_SUMMARY="$RESULTS_DIR/l1-rerun-summary.json"
-python3 - "$RESULTS_DIR" "$L1_RERUN_SUMMARY" "$RUN_ID" "$HEAD_SHA" <<'PY'
-import json, sys
-from pathlib import Path
-results_dir = Path(sys.argv[1])
-out_path = Path(sys.argv[2])
-run_id = sys.argv[3]
-head_sha = sys.argv[4]
-rows = []
-for fx_dir in sorted(results_dir.iterdir()):
-    if not fx_dir.is_dir():
-        continue
-    judge_p = fx_dir / "judge.json"
-    if not judge_p.is_file():
-        continue
-    judge = json.loads(judge_p.read_text())
-    mapping = judge.get("_blind_mapping") or {}
-    inv = {v: k for k, v in mapping.items()}
-    arms = {}
-    for arm_name in ("solo_claude", "l2_gated", "l2_forced", "bare"):
-        letter = inv.get(arm_name)
-        if not letter:
-            continue
-        arm_dir = fx_dir / arm_name
-        result = {}
-        if (arm_dir / "result.json").is_file():
-            result = json.loads((arm_dir / "result.json").read_text())
-        arms[arm_name] = {
-            "score": judge.get(f"{letter}_score"),
-            "wall_s": result.get("elapsed_seconds"),
-            "verify_score": result.get("verify_score"),
-            "files_changed": result.get("files_changed"),
-            "timed_out": result.get("timed_out"),
-            "disqualifier": result.get("disqualifier"),
-        }
-    rows.append({"fixture": fx_dir.name, "arms": arms})
-out = {
-    "run_id": run_id,
-    "git_sha": head_sha,
-    "fixtures_total": len(rows),
-    "rows": rows,
-}
-out_path.write_text(json.dumps(out, indent=2) + "\n")
-print(f"[l1-rerun-summary] wrote {out_path} (fixtures={len(rows)})")
-PY
+python3 benchmark/auto-resolve/scripts/iter-0033c-l1-summary.py \
+  --results-dir "$RESULTS_DIR" \
+  --out "$L1_RERUN_SUMMARY" \
+  --run-id "$RUN_ID" \
+  --git-sha "$HEAD_SHA"
 # --- Build final manifest with real L1 rerun summary ---
 FINAL_MANIFEST="$RESULTS_DIR/iter-0033c-pair-eligible.json"

package/benchmark/auto-resolve/scripts/run-suite.sh CHANGED Viewed

@@ -6,13 +6,13 @@
 #
 # Usage:
 #   run-suite.sh                            # all fixtures, n=1 smoke
-#   run-suite.sh --n 3                      # 3 runs per fixture for ship decisions
 #   run-suite.sh F2 F5                      # specific fixtures only
 #   run-suite.sh --dry-run                  # skip model invocations, validate setup
 #   run-suite.sh --judge-only --run-id X    # re-judge an existing run
 #   run-suite.sh --label v3.6               # tag this run
 #   run-suite.sh --bless                    # if ship-gate PASS, promote to baselines/shipped.json
 #   run-suite.sh --resolve-skill new        # invoke /devlyn:resolve --spec (the only supported value post iter-0034 cutover; flag kept as accepted no-op for historical runners)
+#   run-suite.sh --suite shadow --dry-run   # list shadow tasks; shadow suite refuses provider/judge runs
 #
 # Exits 0 on PASS, 1 on FAIL.
@@ -32,17 +32,26 @@ SUITE="golden"
 RESOLVE_SKILL="new"
 FIXTURES=()
# require_value FLAG [VALUE]: abort with "<FLAG> requires a value" on stderr
# and exit status 1 unless VALUE is present, non-empty and does not begin
# with "--". Values that legitimately start with "--" are therefore refused.
+require_value() {
+  local flag="$1"
+  local value="${2:-}"
+  if [ -z "$value" ] || [[ "$value" == --* ]]; then
+    echo "$flag requires a value" >&2
+    exit 1
+  fi
+}
 while [ $# -gt 0 ]; do
   case "$1" in
-    --n)              N="$2"; shift 2;;
-    --label)          LABEL="$2"; shift 2;;
+    --n)              require_value "$1" "${2:-}"; N="$2"; shift 2;;
+    --label)          require_value "$1" "${2:-}"; LABEL="$2"; shift 2;;
     --dry-run)        DRY_RUN=1; shift;;
     --judge-only)     JUDGE_ONLY=1; shift;;
-    --run-id)         RUN_ID_ARG="$2"; shift 2;;
+    --run-id)         require_value "$1" "${2:-}"; RUN_ID_ARG="$2"; shift 2;;
     --bless)          BLESS=1; shift;;
     --accept-missing) ACCEPT_MISSING=1; shift;;
-    --suite)          SUITE="$2"; shift 2;;
-    --resolve-skill)  RESOLVE_SKILL="$2"; shift 2;;
+    --suite)          require_value "$1" "${2:-}"; SUITE="$2"; shift 2;;
+    --resolve-skill)  require_value "$1" "${2:-}"; RESOLVE_SKILL="$2"; shift 2;;
     -h|--help)
       head -22 "$0" | sed -n '3,22p'; exit 0;;
     [FS][0-9]*)       FIXTURES+=("$1"); shift;;
@@ -69,8 +78,15 @@ case "$SUITE" in
   *)       echo "error: --suite must be 'golden' or 'shadow' (got '$SUITE')" >&2; exit 1;;
 esac
# Refuse shadow-suite runs that are not --dry-run, then require --n to be a
# non-empty string of digits greater than zero before the numeric comparison
# that follows this block.
+if [ "$SUITE" = "shadow" ] && [ "$DRY_RUN" -eq 0 ]; then
+  echo "shadow suite run-suite is dry-run only. Use benchmark headroom/pair with explicit S* candidates for real provider measurement." >&2
+  exit 1
+fi
 # n must be 1 while iteration semantics aren't wired through judge/report.
 # Remove this block when compile-report.py gains multi-iter aggregation.
+case "$N" in ''|*[!0-9]*) echo "error: --n must be an integer" >&2; exit 1;; esac
+[ "$N" -gt 0 ] || { echo "error: --n must be > 0" >&2; exit 1; }
 if [ "$N" -ne 1 ]; then
   echo "error: --n $N not yet supported — judge/report currently expect a single iteration per fixture." >&2
   echo "       Track progress in benchmark/auto-resolve/BENCHMARK-DESIGN.md (#multi-iter-roadmap)." >&2
@@ -101,6 +117,22 @@ fi
 RES_DIR="$BENCH_ROOT/results/$RUN_ID"
 mkdir -p "$RES_DIR"
# print_command: write a "Command: ..." line that replays this suite run.
# --n, --suite, --resolve-skill and --run-id are always included; --label is
# added only when LABEL is non-empty, each boolean flag only when set, and
# fixture ids only when at least one was given. Words are shell-quoted with
# printf %q.
+print_command() {
+  local cmd=(bash "$0" --n "$N" --suite "$SUITE" --resolve-skill "$RESOLVE_SKILL")
+  [ -z "$LABEL" ] || cmd+=(--label "$LABEL")
+  cmd+=(--run-id "$RUN_ID")
+  [ $DRY_RUN -eq 0 ] || cmd+=(--dry-run)
+  [ $JUDGE_ONLY -eq 0 ] || cmd+=(--judge-only)
+  [ $BLESS -eq 0 ] || cmd+=(--bless)
+  [ $ACCEPT_MISSING -eq 0 ] || cmd+=(--accept-missing)
+  if [ ${#FIXTURES[@]} -gt 0 ]; then
+    cmd+=("${FIXTURES[@]}")
+  fi
+  printf 'Command: '
+  printf '%q ' "${cmd[@]}"
+  printf '\n'
+}
 echo ""
 echo "═══ Benchmark Suite Run ═══"
 echo "Run-id:        $RUN_ID"
@@ -111,6 +143,7 @@ echo "n:             $N"
 echo "Resolve skill: $RESOLVE_SKILL"
 [ $DRY_RUN -eq 1 ] && echo "Mode:          DRY RUN (no model invocations)"
 [ $JUDGE_ONLY -eq 1 ] && echo "Mode:          JUDGE ONLY (re-judging existing artifacts)"
+print_command
 echo ""
 # ---- Mirror committed skills into .claude/skills (iter-0017) --------------
@@ -201,7 +234,11 @@ done
 # End of a dry run: report where results were written and exit 0. The hint
 # now depends on the suite — shadow runs point at benchmark headroom/pair,
 # every other suite keeps the original "Run without --dry-run" message.
 if [ $DRY_RUN -eq 1 ]; then
   echo ""
   echo "[suite] DRY RUN complete — results in $RES_DIR"
-  echo "Run without --dry-run to invoke models."
+  if [ "$SUITE" = "shadow" ]; then
+    echo "Use benchmark headroom/pair with explicit S* candidates for real provider measurement."
+  else
+    echo "Run without --dry-run to invoke models."
+  fi
   exit 0
 fi

package/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh CHANGED Viewed

@@ -19,6 +19,15 @@ EOF
   exit "${1:-1}"
 }
# require_value FLAG [VALUE]: exit 1 with "<FLAG> requires a value" on stderr
# when VALUE is absent, empty, or starts with "--"; otherwise return normally
# so the caller can read "$2".
+require_value() {
+  local flag="$1"
+  local value="${2:-}"
+  if [ -z "$value" ] || [[ "$value" == --* ]]; then
+    echo "$flag requires a value" >&2
+    exit 1
+  fi
+}
 MANIFEST=""
 RUN_PREFIX=""
 PAIR_MODE="gated"
@@ -33,18 +42,18 @@ RUN_IDS_OUT=""
 RESUME_COMPLETED_ARMS=0
 while [ $# -gt 0 ]; do
   case "$1" in
-    --manifest) MANIFEST="$2"; shift 2;;
-    --run-prefix) RUN_PREFIX="$2"; shift 2;;
-    --pair-mode) PAIR_MODE="$2"; shift 2;;
-    --min-runs) MIN_RUNS="$2"; shift 2;;
-    --out-json) OUT_JSON="$2"; shift 2;;
-    --out-md) OUT_MD="$2"; shift 2;;
-    --max-pair-solo-wall-ratio) MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
-    --timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
-    --run-ids-out) RUN_IDS_OUT="$2"; shift 2;;
+    --manifest) require_value "$1" "${2:-}"; MANIFEST="$2"; shift 2;;
+    --run-prefix) require_value "$1" "${2:-}"; RUN_PREFIX="$2"; shift 2;;
+    --pair-mode) require_value "$1" "${2:-}"; PAIR_MODE="$2"; shift 2;;
+    --min-runs) require_value "$1" "${2:-}"; MIN_RUNS="$2"; shift 2;;
+    --out-json) require_value "$1" "${2:-}"; OUT_JSON="$2"; shift 2;;
+    --out-md) require_value "$1" "${2:-}"; OUT_MD="$2"; shift 2;;
+    --max-pair-solo-wall-ratio) require_value "$1" "${2:-}"; MAX_PAIR_SOLO_WALL_RATIO="$2"; shift 2;;
+    --timeout-seconds) require_value "$1" "${2:-}"; TIMEOUT_SECONDS="$2"; shift 2;;
+    --run-ids-out) require_value "$1" "${2:-}"; RUN_IDS_OUT="$2"; shift 2;;
     --resume-completed-arms) RESUME_COMPLETED_ARMS=1; shift;;
     --prepare-only) PREPARE_ONLY=1; shift;;
-    --gate-only-run-ids) GATE_ONLY_RUN_IDS="$2"; shift 2;;
+    --gate-only-run-ids) require_value "$1" "${2:-}"; GATE_ONLY_RUN_IDS="$2"; shift 2;;
     -h|--help) usage 0;;
     *) echo "unknown arg: $1" >&2; usage 1;;
   esac
@@ -79,12 +88,91 @@ if [ -z "$RUN_PREFIX" ]; then
   RUN_PREFIX="$(date -u +%Y%m%dT%H%M%SZ)-swebench-frozen"
 fi
# print_command: write a "Command: ..." line that replays this corpus run.
# --manifest, --run-prefix, --pair-mode and --min-runs are always included;
# the remaining value options are added only when their variable is
# non-empty and the two boolean flags only when set. Words are shell-quoted
# with printf %q.
+print_command() {
+  local cmd=(bash "$0" --manifest "$MANIFEST" --run-prefix "$RUN_PREFIX")
+  cmd+=(--pair-mode "$PAIR_MODE")
+  cmd+=(--min-runs "$MIN_RUNS")
+  [ -z "$OUT_JSON" ] || cmd+=(--out-json "$OUT_JSON")
+  [ -z "$OUT_MD" ] || cmd+=(--out-md "$OUT_MD")
+  [ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || cmd+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")
+  [ -z "$TIMEOUT_SECONDS" ] || cmd+=(--timeout-seconds "$TIMEOUT_SECONDS")
+  [ -z "$RUN_IDS_OUT" ] || cmd+=(--run-ids-out "$RUN_IDS_OUT")
+  [ "$RESUME_COMPLETED_ARMS" -eq 0 ] || cmd+=(--resume-completed-arms)
+  [ "$PREPARE_ONLY" -eq 0 ] || cmd+=(--prepare-only)
+  [ -z "$GATE_ONLY_RUN_IDS" ] || cmd+=(--gate-only-run-ids "$GATE_ONLY_RUN_IDS")
+  printf 'Command: '
+  printf '%q ' "${cmd[@]}"
+  printf '\n'
+}
# Print the run banner (the wall-cap line only when a ratio was supplied) and
# the replay command, then create the temporary run-id file, register its
# removal on EXIT, and reset the row failure counter.
+echo ""
+echo "═══ SWE-bench Frozen VERIFY Corpus Run ═══"
+echo "Run-prefix: $RUN_PREFIX"
+echo "Pair mode:  $PAIR_MODE"
+echo "Min runs:   $MIN_RUNS"
+[ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || echo "Wall cap:   pair/solo <= ${MAX_PAIR_SOLO_WALL_RATIO}x"
+print_command
+echo ""
 TMP_RUN_IDS="$(mktemp)"
 trap 'rm -f "$TMP_RUN_IDS"' EXIT
 ROW_FAILURES=0
# Validate the manifest before any work starts. It must load as a JSON object
# with a non-empty string "cases_root". When a gate-only run-id file was
# given, validation stops there; otherwise "prepared" must be a non-empty
# array whose rows are objects with non-empty string instance_id, case_dir
# and repo_dir. Row numbers in the error messages are 1-based.
# NOTE(review): the ValueError branch matches the helper's error by exact
# message text, so it depends on pair_evidence_contract keeping that wording —
# confirm against the helper.
+python3 - "$MANIFEST" "$GATE_ONLY_RUN_IDS" "$SCRIPT_DIR" <<'PY'
+import pathlib
+import sys
+sys.path.insert(0, sys.argv[3])
+from pair_evidence_contract import loads_strict_json_object
+manifest_path = pathlib.Path(sys.argv[1])
+gate_only_run_ids = sys.argv[2]
+try:
+    manifest = loads_strict_json_object(manifest_path.read_text())
+except ValueError as exc:
+    if str(exc) == "top-level JSON value must be an object":
+        raise SystemExit("manifest malformed: expected JSON object") from exc
+    raise
+if not isinstance(manifest, dict):
+    raise SystemExit("manifest malformed: expected JSON object")
+cases_root = manifest.get("cases_root")
+if not isinstance(cases_root, str) or not cases_root.strip():
+    raise SystemExit("manifest malformed: missing non-empty cases_root")
+if gate_only_run_ids:
+    raise SystemExit(0)
+prepared = manifest.get("prepared")
+if not isinstance(prepared, list) or not prepared:
+    raise SystemExit("manifest malformed: prepared must be a non-empty array")
+for index, row in enumerate(prepared, start=1):
+    if not isinstance(row, dict):
+        raise SystemExit(f"manifest malformed: prepared[{index}] expected JSON object")
+    for key in ("instance_id", "case_dir", "repo_dir"):
+        value = row.get(key)
+        if not isinstance(value, str) or not value.strip():
+            raise SystemExit(f"manifest malformed: prepared[{index}] missing non-empty {key}")
+PY
 if [ -n "$GATE_ONLY_RUN_IDS" ]; then
# The plain copy of the gate-only run-id file is replaced by a validating
# copy: each line is stripped and must be non-empty and match
# ^[A-Za-z0-9_.-]+$, and at least one id is required. The cleaned ids are
# written to the temp file one per line.
# NOTE(review): the module-level `list[str]` annotation is evaluated at run
# time, which needs Python 3.9 or newer — confirm the minimum supported
# interpreter.
-  cp "$GATE_ONLY_RUN_IDS" "$TMP_RUN_IDS"
+  python3 - "$GATE_ONLY_RUN_IDS" "$TMP_RUN_IDS" <<'PY'
+import pathlib
+import re
+import sys
+source = pathlib.Path(sys.argv[1])
+dest = pathlib.Path(sys.argv[2])
+safe = re.compile(r"^[A-Za-z0-9_.-]+$")
+run_ids: list[str] = []
+for line_no, line in enumerate(source.read_text(encoding="utf8").splitlines(), start=1):
+    run_id = line.strip()
+    if not run_id:
+        raise SystemExit(f"run ids malformed: line {line_no} is empty")
+    if not safe.match(run_id):
+        raise SystemExit(f"run ids malformed: line {line_no} has unsafe run id")
+    run_ids.append(run_id)
+if not run_ids:
+    raise SystemExit("run ids malformed: no run ids")
+dest.write_text("\n".join(run_ids) + "\n", encoding="utf8")
+PY
 else
   while IFS=$'\t' read -r index instance_id cases_root repo_dir diff_path; do
     [ -n "$instance_id" ] || continue
@@ -157,10 +245,14 @@ if not compare_path.exists():
 PY
     fi
     printf '%s\n' "$safe_run_id" >> "$TMP_RUN_IDS"
-  done < <(python3 - "$MANIFEST" <<'PY'
-import json, pathlib, sys
-manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
-for index, row in enumerate(manifest.get("prepared") or [], start=1):
+  done < <(python3 - "$MANIFEST" "$SCRIPT_DIR" <<'PY'
+import pathlib, sys
+sys.path.insert(0, sys.argv[2])
+from pair_evidence_contract import loads_strict_json_object
+manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
+for index, row in enumerate(manifest["prepared"], start=1):
     instance_id = row["instance_id"]
     case_dir = pathlib.Path(row["case_dir"])
     repo_dir = pathlib.Path(row["repo_dir"])
@@ -192,13 +284,22 @@ fi
 # Count the collected run ids (exit 1 when there are none), read cases_root
 # from the manifest through the shared loader instead of json.loads, and
 # assemble the frozen-verify-gate.py argument list. The gate is now always
 # given --require-hypothesis-trigger; the output and wall-ratio options are
 # appended only when their variables are non-empty.
 run_count="$(wc -l < "$TMP_RUN_IDS" | tr -d ' ')"
 [ "$run_count" -gt 0 ] || { echo "manifest prepared no runs" >&2; exit 1; }
-fixtures_root="$(python3 - "$MANIFEST" <<'PY'
-import json, pathlib, sys
-manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
+fixtures_root="$(python3 - "$MANIFEST" "$SCRIPT_DIR" <<'PY'
+import pathlib, sys
+sys.path.insert(0, sys.argv[2])
+from pair_evidence_contract import loads_strict_json_object
+manifest = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
 print(manifest["cases_root"])
 PY
 )"
-gate_args=(python3 "$SCRIPT_DIR/frozen-verify-gate.py" --fixtures-root "$fixtures_root" --min-runs "$MIN_RUNS")
+gate_args=(
+  python3 "$SCRIPT_DIR/frozen-verify-gate.py"
+  --fixtures-root "$fixtures_root"
+  --min-runs "$MIN_RUNS"
+  --require-hypothesis-trigger
+)
 [ -z "$OUT_JSON" ] || gate_args+=(--out-json "$OUT_JSON")
 [ -z "$OUT_MD" ] || gate_args+=(--out-md "$OUT_MD")
 [ -z "$MAX_PAIR_SOLO_WALL_RATIO" ] || gate_args+=(--max-pair-solo-wall-ratio "$MAX_PAIR_SOLO_WALL_RATIO")