npm - devlyn-cli - Versions diffs - 2.1.0 → 2.2.0 - Mend

devlyn-cli 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (135) hide show

package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py ADDED Viewed

@@ -0,0 +1,265 @@
+#!/usr/bin/env python3
+"""Render a SWE-bench frozen VERIFY matrix from compare artifacts."""
+from __future__ import annotations
+import argparse
+import json
+import re
+from collections import Counter
+from pathlib import Path
+from typing import Any
+RANK = {
+    "PASS": 0,
+    "PASS_WITH_ISSUES": 1,
+    "NEEDS_WORK": 2,
+    "BLOCKED": 3,
+}
+def rank(verdict: str | None) -> int:
+    return RANK.get(verdict or "", -1)
+def load_json(path: Path) -> dict[str, Any]:
+    return json.loads(path.read_text(encoding="utf8"))
+def transcript_failure_reason(results_root: Path, run_id: str, arm: str) -> str | None:
+    transcript_path = results_root / run_id / arm / "transcript.txt"
+    if not transcript_path.is_file():
+        return None
+    transcript = transcript_path.read_text(encoding="utf8", errors="replace")
+    if "You've hit your limit" in transcript:
+        return "provider_limit"
+    return None
+def infer_fixture_id(results_root: Path, run_id: str) -> str:
+    for arm in ("pair", "solo"):
+        input_path = results_root / run_id / arm / "input.md"
+        if not input_path.exists():
+            continue
+        match = re.search(r"docs/roadmap/phase-1/([^`\s]+)\.md", input_path.read_text())
+        if match:
+            return match.group(1)
+    return "unknown"
+def elapsed_ratio(pair_elapsed: Any, solo_elapsed: Any) -> float | None:
+    if not isinstance(pair_elapsed, (int, float)) or not isinstance(solo_elapsed, (int, float)):
+        return None
+    if solo_elapsed <= 0:
+        return None
+    return pair_elapsed / solo_elapsed
+def load_gate_rows(gate_json: Path | None) -> dict[str, dict[str, Any]]:
+    if gate_json is None:
+        return {}
+    doc = load_json(gate_json)
+    return {row["run_id"]: row for row in doc.get("rows", [])}
+def min_gate_rate(value: str) -> float:
+    rate = float(value)
+    if rate < 0 or rate > 1:
+        raise argparse.ArgumentTypeError("--min-gate-rate must be between 0 and 1")
+    return rate
+def non_negative_int(value: str) -> int:
+    parsed = int(value)
+    if parsed < 0:
+        raise argparse.ArgumentTypeError("value must be >= 0")
+    return parsed
+def classify(row: dict[str, Any], included: bool) -> str:
+    """Return the human-readable classification label for one matrix row.
+
+    Args:
+        row: Row dict carrying the keys read below (lift flags, failure
+            markers, verdicts, gate failures).
+        included: Whether the row is counted as a gate row.
+
+    The checks run in a fixed priority order and the first match wins:
+    gate rows, failed attempts, solo mechanical failure, lift that stayed
+    outside the gate, and finally recall-only / no-lift outcomes.
+    """
+    if included:
+        # Gate rows are labelled only by which kind(s) of lift they showed.
+        external = row["external_lift"]
+        internal = row["internal_lift"]
+        if external and internal:
+            return "gate: external + internal lift"
+        if external:
+            return "gate: external lift"
+        if internal:
+            return "gate: internal lift"
+        return "gate"
+    # Failed attempts: each cause below takes precedence over the ones after it.
+    if row.get("row_failed_before_compare"):
+        row_exit = row.get("row_exit")
+        # The exit code is appended only when it is an integer.
+        suffix = f" exit={row_exit}" if isinstance(row_exit, int) else ""
+        return f"failed attempt: row runner{suffix}"
+    if row.get("compare_missing"):
+        return "failed attempt: missing compare"
+    if row.get("solo_timed_out") or row.get("pair_timed_out"):
+        return "failed attempt: timeout"
+    if row.get("solo_failure_reason") == "provider_limit" or row.get("pair_failure_reason") == "provider_limit":
+        return "failed attempt: provider limit"
+    # A missing (None) exit code is not treated as a failure; only nonzero is.
+    if row.get("solo_invoke_exit") not in (None, 0) or row.get("pair_invoke_exit") not in (None, 0):
+        return "failed attempt: nonzero invoke exit"
+    if row["solo_mechanical"] == "FAIL":
+        return "excluded: solo mechanical dominated"
+    if row["external_lift"] or row["internal_lift"]:
+        # Lift was observed but the row is not a gate row: report the gate's
+        # recorded failures when there are any.
+        failures = row.get("gate_failures") or []
+        if failures:
+            return "lift excluded: " + "; ".join(failures)
+        return "lift outside gate"
+    # No lift flag set: pair ranked higher than solo in the RANK ordering.
+    if rank(row["pair_verdict"]) > rank(row["solo_verdict"]):
+        return "recall-only advisory"
+    if row["pair_found_more_low_or_worse"] or row["pair_found_more_findings"]:
+        return "recall-only findings"
+    return "no verdict lift"
+def build_row(results_root: Path, run_id: str, gate_rows_by_id: dict[str, dict[str, Any]]) -> dict[str, Any]:
+    """Assemble the matrix row for one run from its compare artifact.
+
+    Args:
+        results_root: Directory containing one sub-directory per run id.
+        run_id: Run whose ``compare.json`` is summarised.
+        gate_rows_by_id: Gate rows keyed by run id (may be empty).
+
+    Returns:
+        A dict of verdicts, lift flags, failure markers, timing ratio and
+        gate status, plus a ``classification`` label from ``classify``.
+    """
+    compare_path = results_root / run_id / "compare.json"
+    if compare_path.exists():
+        compare = load_json(compare_path)
+    else:
+        # No artifact: use an empty stand-in flagged as compare_missing so the
+        # row is still reported instead of aborting the whole matrix.
+        compare = {
+            "solo": {},
+            "pair": {},
+            "comparison": {"compare_missing": True},
+        }
+    # `or {}` also covers sections that are present but null.
+    solo = compare.get("solo") or {}
+    pair = compare.get("pair") or {}
+    comparison = compare.get("comparison") or {}
+    pair_ratio = elapsed_ratio(pair.get("elapsed_seconds"), solo.get("elapsed_seconds"))
+    gate_row = gate_rows_by_id.get(run_id) or {}
+    row = {
+        "fixture_id": infer_fixture_id(results_root, run_id),
+        "run_id": run_id,
+        # Prefer the verdicts recorded in `comparison`; fall back to each
+        # arm's own verify_verdict when they are absent.
+        "solo_verdict": comparison.get("solo_verdict") or solo.get("verify_verdict"),
+        "pair_verdict": comparison.get("pair_verdict") or pair.get("verify_verdict"),
+        "pair_mode": bool(pair.get("pair_mode")),
+        "external_lift": bool(comparison.get("pair_verdict_lift")),
+        "internal_lift": bool(comparison.get("pair_internal_verdict_lift")),
+        "pair_found_more_findings": bool(comparison.get("pair_found_more_findings")),
+        "pair_found_more_low_or_worse": bool(comparison.get("pair_found_more_low_or_worse")),
+        "row_failed_before_compare": bool(comparison.get("row_failed_before_compare")),
+        "row_exit": comparison.get("row_exit"),
+        "compare_missing": bool(comparison.get("compare_missing")),
+        "solo_invoke_exit": solo.get("invoke_exit"),
+        "pair_invoke_exit": pair.get("invoke_exit"),
+        # Use the recorded failure reason when present; otherwise look for one
+        # in the arm's transcript.
+        "solo_failure_reason": solo.get("invoke_failure_reason")
+        or transcript_failure_reason(results_root, run_id, "solo"),
+        "pair_failure_reason": pair.get("invoke_failure_reason")
+        or transcript_failure_reason(results_root, run_id, "pair"),
+        "solo_timed_out": bool(solo.get("timed_out")),
+        "pair_timed_out": bool(pair.get("timed_out")),
+        "pair_solo_wall_ratio": pair_ratio,
+        "solo_mechanical": (solo.get("sub_verdicts") or {}).get("mechanical"),
+        "pair_mechanical": (pair.get("sub_verdicts") or {}).get("mechanical"),
+        # A row is a gate row only when the gate artifact marks it PASS.
+        "included_in_gate": gate_row.get("status") == "PASS",
+        "gate_failures": gate_row.get("failures") or [],
+    }
+    row["classification"] = classify(row, row["included_in_gate"])
+    return row
+def fmt_ratio(value: Any) -> str:
+    return f"{value:.2f}x" if isinstance(value, (int, float)) else "n/a"
+def write_md(path: Path, report: dict[str, Any]) -> None:
+    """Write ``report`` to ``path`` as a Markdown summary (UTF-8).
+
+    The document contains the title and verdict, the run/gate counters, the
+    yield verdict (only when thresholds were configured), the gate artifact
+    path (when present), sorted classification counts, and one table line
+    per row.
+    """
+    lines = [
+        f"# {report['title']}",
+        "",
+        f"Verdict: **{report['verdict']}**",
+        "",
+        f"Runs: {report['runs_total']}",
+        f"Included in gate: {report['gate_rows']}",
+        f"Excluded/recall/no-lift: {report['excluded_or_recall_rows']}",
+        f"Gate rate: {report['gate_rate']:.3f}",
+        f"Trailing non-gate rows: {report['trailing_non_gate_rows']}",
+    ]
+    # The yield section is emitted only when at least one threshold is set.
+    if report["yield_thresholds"]:
+        lines.extend(["", f"Yield verdict: **{report['yield_verdict']}**"])
+        if report["yield_failures"]:
+            lines.append("Yield failures:")
+            lines.extend(f"- {failure}" for failure in report["yield_failures"])
+    if report.get("gate_artifact_json"):
+        lines.extend(["", f"Gate artifact: `{report['gate_artifact_json']}`"])
+    lines.extend(["", "Classification counts:"])
+    # Sorted by label so the output order is stable.
+    for name, count in sorted(report["classification_counts"].items()):
+        lines.append(f"- {name}: {count}")
+    lines.extend(
+        [
+            "",
+            "| Fixture | Solo | Pair | Pair mode | Wall ratio | External lift | Internal lift | Included | Classification |",
+            "|---|---|---|---|---:|---|---|---|---|",
+        ]
+    )
+    for row in report["rows"]:
+        # Booleans are rendered lowercase ("true"/"false").
+        lines.append(
+            f"| {row['fixture_id']} | {row['solo_verdict']} | {row['pair_verdict']} | "
+            f"{str(row['pair_mode']).lower()} | {fmt_ratio(row.get('pair_solo_wall_ratio'))} | "
+            f"{str(row['external_lift']).lower()} | {str(row['internal_lift']).lower()} | "
+            f"{str(row['included_in_gate']).lower()} | {row['classification']} |"
+        )
+    # The trailing empty entry makes the joined text end with a newline.
+    lines.append("")
+    path.write_text("\n".join(lines), encoding="utf8")
+def main() -> int:
+    """Build the matrix report for the requested runs and write JSON + Markdown.
+
+    The report is also printed to stdout as JSON.
+
+    Returns:
+        2 when any configured yield threshold fails, otherwise 0.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--results-root", default="benchmark/auto-resolve/results", type=Path)
+    parser.add_argument("--run-id", action="append", required=True)
+    parser.add_argument("--gate-json", type=Path)
+    parser.add_argument("--title", required=True)
+    parser.add_argument("--verdict", required=True)
+    parser.add_argument("--min-gate-rate", type=min_gate_rate)
+    parser.add_argument("--max-trailing-non-gate", type=non_negative_int)
+    parser.add_argument("--out-json", required=True, type=Path)
+    parser.add_argument("--out-md", required=True, type=Path)
+    args = parser.parse_args()
+    gate_rows_by_id = load_gate_rows(args.gate_json)
+    # Rows keep the order in which --run-id was given on the command line.
+    rows = [build_row(args.results_root, run_id, gate_rows_by_id) for run_id in args.run_id]
+    gate_rows = sum(1 for row in rows if row["included_in_gate"])
+    # Count the consecutive non-gate rows at the end of the run list.
+    trailing_non_gate_rows = 0
+    for row in reversed(rows):
+        if row["included_in_gate"]:
+            break
+        trailing_non_gate_rows += 1
+    gate_rate = gate_rows / len(rows) if rows else 0.0
+    yield_thresholds = {
+        "min_gate_rate": args.min_gate_rate,
+        "max_trailing_non_gate": args.max_trailing_non_gate,
+    }
+    thresholds_configured = any(value is not None for value in yield_thresholds.values())
+    yield_failures = []
+    # Each threshold is checked only when its option was supplied.
+    if args.min_gate_rate is not None and gate_rate < args.min_gate_rate:
+        yield_failures.append(f"gate rate {gate_rate:.3f} < minimum {args.min_gate_rate:.3f}")
+    if args.max_trailing_non_gate is not None and trailing_non_gate_rows > args.max_trailing_non_gate:
+        yield_failures.append(
+            f"trailing non-gate rows {trailing_non_gate_rows} > maximum {args.max_trailing_non_gate}"
+        )
+    report = {
+        "title": args.title,
+        "verdict": args.verdict,
+        "runs_total": len(rows),
+        "gate_rows": gate_rows,
+        "excluded_or_recall_rows": len(rows) - gate_rows,
+        "gate_rate": gate_rate,
+        "trailing_non_gate_rows": trailing_non_gate_rows,
+        "classification_counts": dict(Counter(row["classification"] for row in rows)),
+        # Only the thresholds that were actually supplied are reported.
+        "yield_thresholds": {
+            key: value for key, value in yield_thresholds.items() if value is not None
+        },
+        # FAIL on any failure, PASS when thresholds exist and none failed,
+        # NOT_CONFIGURED when no threshold option was given.
+        "yield_verdict": "FAIL" if yield_failures else "PASS" if thresholds_configured else "NOT_CONFIGURED",
+        "yield_failures": yield_failures,
+        "gate_artifact_json": str(args.gate_json) if args.gate_json else None,
+        "rows": rows,
+    }
+    args.out_json.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
+    write_md(args.out_md, report)
+    print(json.dumps(report, indent=2))
+    return 2 if yield_failures else 0
+if __name__ == "__main__":
+    raise SystemExit(main())

package/benchmark/auto-resolve/scripts/test-frozen-verify-gate.sh ADDED Viewed

@@ -0,0 +1,192 @@
+#!/usr/bin/env bash
+# Regression tests for frozen-verify-gate.py evidence guards.
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+GATE="$SCRIPT_DIR/frozen-verify-gate.py"
+TMP_DIR="$(mktemp -d /tmp/frozen-verify-gate-test.XXXXXX)"
+FIXTURES_DIR="$TMP_DIR/fixtures"
+trap 'rm -rf "$TMP_DIR"' EXIT
+mkdir -p "$FIXTURES_DIR"
+# write_run RUN_ID FIXTURE_ID SOLO_VERDICT PAIR_VERDICT LIFT [INTERNAL_LIFT] [PAIR_PRIMARY] [PAIR_JUDGE]
+# Creates $TMP_DIR/RUN_ID with a compare.json (solo 100s, pair 200s, both
+# invoke_exit 0) and, when FIXTURE_ID is non-empty, a pair/input.md that
+# references docs/roadmap/phase-1/FIXTURE_ID.md.
+# LIFT and INTERNAL_LIFT are inserted unquoted as JSON literals (true/false);
+# INTERNAL_LIFT defaults to false, PAIR_PRIMARY and PAIR_JUDGE default to
+# PAIR_VERDICT.
+write_run() {
+  local run_id="$1"
+  local fixture_id="${2:-}"
+  local solo_verdict="$3"
+  local pair_verdict="$4"
+  local lift="$5"
+  local internal_lift="${6:-false}"
+  local pair_primary="${7:-$pair_verdict}"
+  local pair_judge="${8:-$pair_verdict}"
+  mkdir -p "$TMP_DIR/$run_id/pair"
+  # An empty FIXTURE_ID leaves the run without an input.md.
+  if [ -n "$fixture_id" ]; then
+    cat > "$TMP_DIR/$run_id/pair/input.md" <<EOF
+Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/$fixture_id.md.
+EOF
+  fi
+  cat > "$TMP_DIR/$run_id/compare.json" <<EOF
+{
+  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "$solo_verdict", "elapsed_seconds": 100},
+  "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "$pair_verdict", "pair_mode": true, "elapsed_seconds": 200},
+  "comparison": {
+    "pair_trigger_missed": false,
+    "pair_verdict_lift": $lift,
+    "pair_internal_verdict_lift": $internal_lift,
+    "solo_verdict": "$solo_verdict",
+    "pair_verdict": "$pair_verdict",
+    "pair_primary_verdict": "$pair_primary",
+    "pair_judge_verdict": "$pair_judge"
+  }
+}
+EOF
+}
+# expect_fail_contains LABEL NEEDLE COMMAND...
+# Runs COMMAND with stdout and stderr captured to $TMP_DIR/LABEL.out.
+# The test script exits 1 if COMMAND succeeds, or if the captured output does
+# not contain NEEDLE as a fixed string; the captured output is echoed to
+# stderr in both cases.
+expect_fail_contains() {
+  local label="$1"
+  local needle="$2"
+  shift 2
+  local out="$TMP_DIR/$label.out"
+  # Running the command as an `if` condition keeps `set -e` from aborting on
+  # the expected nonzero exit.
+  if "$@" > "$out" 2>&1; then
+    echo "expected failure for $label" >&2
+    cat "$out" >&2
+    exit 1
+  fi
+  if ! grep -Fq "$needle" "$out"; then
+    echo "missing expected text for $label: $needle" >&2
+    cat "$out" >&2
+    exit 1
+  fi
+}
+write_run pass-a F10-persist-write-collision PASS_WITH_ISSUES NEEDS_WORK true
+write_run pass-b F12-webhook-raw-body-signature PASS_WITH_ISSUES NEEDS_WORK true
+mkdir -p "$FIXTURES_DIR/F10-persist-write-collision" "$FIXTURES_DIR/F12-webhook-raw-body-signature"
+python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+  --run-id pass-a --run-id pass-b --min-runs 2 --max-pair-solo-wall-ratio 3 \
+  > "$TMP_DIR/pass.out"
+grep -Fq '"verdict": "PASS"' "$TMP_DIR/pass.out"
+grep -Fq '"avg_pair_solo_wall_ratio": 2.0' "$TMP_DIR/pass.out"
+grep -Fq '"pair_solo_wall_ratio": 2.0' "$TMP_DIR/pass.out"
+mkdir -p "$TMP_DIR/summary-verdicts/pair"
+cat > "$TMP_DIR/summary-verdicts/pair/input.md" <<'EOF'
+Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/F13-summary-verdict-fallback.md.
+EOF
+cat > "$TMP_DIR/summary-verdicts/compare.json" <<'EOF'
+{
+  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
+  "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": 200},
+  "comparison": {"pair_trigger_missed": false, "pair_verdict_lift": true, "pair_internal_verdict_lift": false}
+}
+EOF
+mkdir -p "$FIXTURES_DIR/F13-summary-verdict-fallback"
+python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+  --run-id summary-verdicts --min-runs 1 \
+  > "$TMP_DIR/summary-verdicts.out"
+grep -Fq '"verdict": "PASS"' "$TMP_DIR/summary-verdicts.out"
+write_run dup-a F12-webhook-raw-body-signature PASS_WITH_ISSUES NEEDS_WORK true
+write_run dup-b F12-webhook-raw-body-signature PASS_WITH_ISSUES NEEDS_WORK true
+expect_fail_contains duplicate-fixture "duplicate fixture_id=F12-webhook-raw-body-signature" \
+  python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+    --run-id dup-a --run-id dup-b --min-runs 2
+write_run missing-fixture "" PASS_WITH_ISSUES NEEDS_WORK true
+expect_fail_contains missing-fixture "fixture_id missing" \
+  python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+    --run-id missing-fixture --min-runs 1
+write_run unknown-fixture F99-not-a-real-fixture PASS_WITH_ISSUES NEEDS_WORK true
+expect_fail_contains unknown-fixture "fixture_id not found: F99-not-a-real-fixture" \
+  python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+    --run-id unknown-fixture --min-runs 1
+write_run recall-only F11-batch-import-all-or-nothing PASS PASS_WITH_ISSUES false
+mkdir -p "$FIXTURES_DIR/F11-batch-import-all-or-nothing"
+expect_fail_contains recall-only "pair verdict PASS_WITH_ISSUES is not verdict-binding" \
+  python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+    --run-id recall-only --min-runs 1
+write_run internal-lift F14-internal-pair-lift NEEDS_WORK NEEDS_WORK false true PASS_WITH_ISSUES NEEDS_WORK
+mkdir -p "$FIXTURES_DIR/F14-internal-pair-lift"
+python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+  --run-id internal-lift --min-runs 1 \
+  > "$TMP_DIR/internal-lift.out"
+grep -Fq '"verdict": "PASS"' "$TMP_DIR/internal-lift.out"
+write_run slow-pair F15-slow-pair PASS_WITH_ISSUES NEEDS_WORK true
+mkdir -p "$FIXTURES_DIR/F15-slow-pair"
+python3 - "$TMP_DIR/slow-pair/compare.json" <<'PY'
+import json
+import sys
+path = sys.argv[1]
+with open(path) as f:
+    data = json.load(f)
+data["pair"]["elapsed_seconds"] = 401
+with open(path, "w") as f:
+    json.dump(data, f)
+PY
+expect_fail_contains slow-pair "pair/solo wall ratio 4.01 exceeds 3.00" \
+  python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+    --run-id slow-pair --min-runs 1 --max-pair-solo-wall-ratio 3
+mkdir -p "$TMP_DIR/missing-elapsed/pair"
+cat > "$TMP_DIR/missing-elapsed/pair/input.md" <<'EOF'
+Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/F16-missing-elapsed.md.
+EOF
+cat > "$TMP_DIR/missing-elapsed/compare.json" <<'EOF'
+{
+  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES"},
+  "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true},
+  "comparison": {
+    "pair_trigger_missed": false,
+    "pair_verdict_lift": true,
+    "pair_internal_verdict_lift": false,
+    "solo_verdict": "PASS_WITH_ISSUES",
+    "pair_verdict": "NEEDS_WORK",
+    "pair_primary_verdict": "NEEDS_WORK",
+    "pair_judge_verdict": "NEEDS_WORK"
+  }
+}
+EOF
+mkdir -p "$FIXTURES_DIR/F16-missing-elapsed"
+expect_fail_contains missing-elapsed "pair/solo wall ratio missing" \
+  python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+    --run-id missing-elapsed --min-runs 1 --max-pair-solo-wall-ratio 3
+mkdir -p "$TMP_DIR/missing-compare/pair"
+cat > "$TMP_DIR/missing-compare/pair/input.md" <<'EOF'
+Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/F17-missing-compare.md.
+EOF
+mkdir -p "$FIXTURES_DIR/F17-missing-compare"
+expect_fail_contains missing-compare "missing compare.json for missing-compare" \
+  python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+    --run-id missing-compare --min-runs 1
+mkdir -p "$TMP_DIR/provider-limit/pair"
+cat > "$TMP_DIR/provider-limit/pair/input.md" <<'EOF'
+Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/F18-provider-limit.md.
+EOF
+cat > "$TMP_DIR/provider-limit/pair/transcript.txt" <<'EOF'
+You've hit your limit · resets 3am (Asia/Seoul)
+EOF
+cat > "$TMP_DIR/provider-limit/compare.json" <<'EOF'
+{
+  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS", "elapsed_seconds": 100},
+  "pair": {"invoke_exit": 1, "timed_out": false, "verify_verdict": null, "pair_mode": false, "elapsed_seconds": 1},
+  "comparison": {
+    "pair_trigger_missed": false,
+    "pair_verdict_lift": false,
+    "pair_internal_verdict_lift": false,
+    "solo_verdict": "PASS",
+    "pair_verdict": null
+  }
+}
+EOF
+mkdir -p "$FIXTURES_DIR/F18-provider-limit"
+expect_fail_contains provider-limit "pair provider limit" \
+  python3 "$GATE" --results-root "$TMP_DIR" --fixtures-root "$FIXTURES_DIR" \
+    --run-id provider-limit --min-runs 1
+echo "✓ test-frozen-verify-gate"

package/benchmark/auto-resolve/scripts/test-full-pipeline-pair-gate.sh ADDED Viewed

@@ -0,0 +1,131 @@
+#!/usr/bin/env bash
+# Regression tests for full-pipeline-pair-gate.py.
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+GATE="$SCRIPT_DIR/full-pipeline-pair-gate.py"
+TMP_DIR="$(mktemp -d /tmp/full-pipeline-pair-gate-test.XXXXXX)"
+trap 'rm -rf "$TMP_DIR"' EXIT
+# write_fixture RUN_ID FIXTURE BARE SOLO PAIR [PAIR_MODE] [PAIR_ELAPSED] [SOLO_ELAPSED] [PAIR_ARM]
+# Creates $TMP_DIR/RUN_ID/FIXTURE with a judge.json holding the three arm
+# scores (BARE, SOLO, PAIR) plus a verify.json and result.json for each of
+# the bare, solo_claude and PAIR_ARM arms.
+# Defaults: PAIR_MODE=true, PAIR_ELAPSED=200, SOLO_ELAPSED=100,
+# PAIR_ARM=l2_gated. The bare arm always records 20 elapsed seconds.
+write_fixture() {
+  local run_id="$1"
+  local fixture="$2"
+  local bare="$3"
+  local solo="$4"
+  local pair="$5"
+  local pair_mode="${6:-true}"
+  local pair_elapsed="${7:-200}"
+  local solo_elapsed="${8:-100}"
+  local pair_arm="${9:-l2_gated}"
+  local dir="$TMP_DIR/$run_id/$fixture"
+  mkdir -p "$dir/bare" "$dir/solo_claude" "$dir/$pair_arm"
+  cat > "$dir/judge.json" <<EOF
+{
+  "scores_by_arm": {"bare": $bare, "solo_claude": $solo, "$pair_arm": $pair},
+  "disqualifiers_by_arm": {}
+}
+EOF
+  # Every arm gets the same clean verify.json.
+  for arm in bare solo_claude "$pair_arm"; do
+    cat > "$dir/$arm/verify.json" <<'EOF'
+{"disqualifier": false}
+EOF
+  done
+  cat > "$dir/bare/result.json" <<'EOF'
+{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": 20}
+EOF
+  cat > "$dir/solo_claude/result.json" <<EOF
+{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $solo_elapsed}
+EOF
+  cat > "$dir/$pair_arm/result.json" <<EOF
+{"timed_out": false, "invoke_failure": false, "disqualifier": false, "elapsed_seconds": $pair_elapsed, "pair_mode": $pair_mode}
+EOF
+}
+# expect_fail_contains LABEL NEEDLE COMMAND...
+# Runs COMMAND with stdout and stderr captured to $TMP_DIR/LABEL.out.
+# The test script exits 1 if COMMAND succeeds, or if the captured output does
+# not contain NEEDLE as a fixed string; the captured output is echoed to
+# stderr in both cases.
+expect_fail_contains() {
+  local label="$1"
+  local needle="$2"
+  shift 2
+  local out="$TMP_DIR/$label.out"
+  # Running the command as an `if` condition keeps `set -e` from aborting on
+  # the expected nonzero exit.
+  if "$@" > "$out" 2>&1; then
+    echo "expected failure for $label" >&2
+    cat "$out" >&2
+    exit 1
+  fi
+  if ! grep -Fq "$needle" "$out"; then
+    echo "missing expected text for $label: $needle" >&2
+    cat "$out" >&2
+    exit 1
+  fi
+}
+write_fixture pass F21 50 75 82 true 220 110
+write_fixture pass F22 60 80 88 true 280 140
+python3 "$GATE" --results-root "$TMP_DIR" --run-id pass \
+  --max-pair-solo-wall-ratio 3 \
+  --out-json "$TMP_DIR/pass.json" \
+  --out-md "$TMP_DIR/pass.md"
+grep -Fq '"verdict": "PASS"' "$TMP_DIR/pass.json"
+grep -Fq '"avg_pair_solo_wall_ratio": 2.0' "$TMP_DIR/pass.json"
+grep -Fq 'Verdict: **PASS**' "$TMP_DIR/pass.md"
+write_fixture no-headroom F21 50 81 90 true
+write_fixture no-headroom F22 60 80 88 true
+expect_fail_contains no-headroom "solo_claude score 81 > 80" \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id no-headroom
+write_fixture no-pair-mode F21 50 75 85 false
+write_fixture no-pair-mode F22 60 80 90 true
+expect_fail_contains no-pair-mode "l2_gated pair_mode not true" \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id no-pair-mode
+write_fixture weak-margin F21 50 75 79 true
+write_fixture weak-margin F22 60 80 88 true
+expect_fail_contains weak-margin "l2_gated margin +4 < +5" \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id weak-margin
+write_fixture custom-pair-arm F21 50 75 82 true 220 110 l2_risk_probes
+write_fixture custom-pair-arm F22 60 80 88 true 280 140 l2_risk_probes
+python3 "$GATE" --results-root "$TMP_DIR" --run-id custom-pair-arm \
+  --pair-arm l2_risk_probes \
+  --max-pair-solo-wall-ratio 3 \
+  --out-json "$TMP_DIR/custom-pair-arm.json" \
+  --out-md "$TMP_DIR/custom-pair-arm.md"
+grep -Fq '"pair_arm": "l2_risk_probes"' "$TMP_DIR/custom-pair-arm.json"
+grep -Fq 'l2_risk_probes - solo_claude >= 5' "$TMP_DIR/custom-pair-arm.md"
+write_fixture provider-limit F21 50 75 85 true 37 100 l2_risk_probes
+python3 - "$TMP_DIR/provider-limit/F21/l2_risk_probes/result.json" <<'PY'
+import json, sys
+path = sys.argv[1]
+data = json.load(open(path))
+data["invoke_failure"] = True
+data["invoke_failure_reason"] = "provider_limit"
+json.dump(data, open(path, "w"), indent=2)
+PY
+expect_fail_contains provider-limit "l2_risk_probes invoke failure (provider_limit)" \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id provider-limit \
+    --pair-arm l2_risk_probes --min-fixtures 1
+python3 "$GATE" --results-root "$TMP_DIR" --run-id provider-limit \
+  --pair-arm l2_risk_probes --min-fixtures 1 \
+  --out-json "$TMP_DIR/provider-limit.json" \
+  --out-md "$TMP_DIR/provider-limit.md" >/dev/null 2>&1 || true
+grep -Fq '"pair_margin": null' "$TMP_DIR/provider-limit.json"
+grep -Fq '"pair_solo_wall_ratio": null' "$TMP_DIR/provider-limit.json"
+if grep -Fq 'margin -' "$TMP_DIR/provider-limit.md"; then
+  echo "provider-limit row must not report quality margin" >&2
+  cat "$TMP_DIR/provider-limit.md" >&2
+  exit 1
+fi
+write_fixture slow-pair F21 50 75 85 true 401 100
+write_fixture slow-pair F22 60 80 88 true 280 140
+expect_fail_contains slow-pair "pair/solo wall ratio 4.01 > 3.00" \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id slow-pair --max-pair-solo-wall-ratio 3
+write_fixture one-fixture F21 50 75 85 true
+expect_fail_contains one-fixture "fixture_count_ok" \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id one-fixture --out-json "$TMP_DIR/one-fixture.json"
+grep -Fq '"fixture_count_ok": false' "$TMP_DIR/one-fixture.json"
+echo "PASS test-full-pipeline-pair-gate"

package/benchmark/auto-resolve/scripts/test-headroom-gate.sh ADDED Viewed

@@ -0,0 +1,84 @@
+#!/usr/bin/env bash
+# Regression tests for headroom-gate.py candidate-set guards.
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+GATE="$SCRIPT_DIR/headroom-gate.py"
+TMP_DIR="$(mktemp -d /tmp/headroom-gate-test.XXXXXX)"
+trap 'rm -rf "$TMP_DIR"' EXIT
+# write_fixture RUN_ID FIXTURE BARE SOLO [SOLO_TIMED_OUT]
+# Creates $TMP_DIR/RUN_ID/FIXTURE with a judge.json holding the bare and
+# solo_claude scores, plus result.json and verify.json for both arms.
+# SOLO_TIMED_OUT (default false) is inserted unquoted as the solo arm's
+# timed_out JSON value.
+write_fixture() {
+  local run_id="$1"
+  local fixture="$2"
+  local bare="$3"
+  local solo="$4"
+  local solo_timed_out="${5:-false}"
+  local dir="$TMP_DIR/$run_id/$fixture"
+  mkdir -p "$dir/bare" "$dir/solo_claude"
+  cat > "$dir/judge.json" <<EOF
+{
+  "scores_by_arm": {"bare": $bare, "solo_claude": $solo},
+  "disqualifiers_by_arm": {}
+}
+EOF
+  cat > "$dir/bare/result.json" <<'EOF'
+{"timed_out": false, "invoke_failure": false}
+EOF
+  cat > "$dir/bare/verify.json" <<'EOF'
+{"disqualifier": false}
+EOF
+  cat > "$dir/solo_claude/result.json" <<EOF
+{"timed_out": $solo_timed_out, "invoke_failure": false}
+EOF
+  cat > "$dir/solo_claude/verify.json" <<'EOF'
+{"disqualifier": false}
+EOF
+}
+# expect_fail_contains LABEL NEEDLE COMMAND...
+# Runs COMMAND with stdout and stderr captured to $TMP_DIR/LABEL.out.
+# The test script exits 1 if COMMAND succeeds, or if the captured output does
+# not contain NEEDLE as a fixed string; the captured output is echoed to
+# stderr in both cases.
+expect_fail_contains() {
+  local label="$1"
+  local needle="$2"
+  shift 2
+  local out="$TMP_DIR/$label.out"
+  # Running the command as an `if` condition keeps `set -e` from aborting on
+  # the expected nonzero exit.
+  if "$@" > "$out" 2>&1; then
+    echo "expected failure for $label" >&2
+    cat "$out" >&2
+    exit 1
+  fi
+  if ! grep -Fq "$needle" "$out"; then
+    echo "missing expected text for $label: $needle" >&2
+    cat "$out" >&2
+    exit 1
+  fi
+}
+write_fixture one-pass F10 50 75
+expect_fail_contains min-fixtures 'Verdict: **FAIL**' \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id one-pass --out-json "$TMP_DIR/one-pass.json"
+grep -Fq '"fixture_count_ok": false' "$TMP_DIR/one-pass.json"
+write_fixture two-pass F10 50 75
+write_fixture two-pass F12 60 80
+python3 "$GATE" --results-root "$TMP_DIR" --run-id two-pass --out-json "$TMP_DIR/two-pass.json" \
+  > "$TMP_DIR/two-pass.out"
+grep -Fq '"verdict": "PASS"' "$TMP_DIR/two-pass.json"
+grep -Fq '"fixture_count_ok": true' "$TMP_DIR/two-pass.json"
+write_fixture solo-ceiling F10 50 75
+write_fixture solo-ceiling F12 20 92
+expect_fail_contains solo-ceiling "solo_claude score 92 > 80" \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id solo-ceiling
+write_fixture dirty-solo F10 50 75
+write_fixture dirty-solo F12 20 70 true
+expect_fail_contains dirty-solo "solo_claude timed out" \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id dirty-solo
+write_fixture missing-artifact F10 50 75
+write_fixture missing-artifact F12 20 70
+rm "$TMP_DIR/missing-artifact/F12/solo_claude/verify.json"
+expect_fail_contains missing-artifact "solo_claude verify.json missing" \
+  python3 "$GATE" --results-root "$TMP_DIR" --run-id missing-artifact
+echo "✓ test-headroom-gate"