npm - devlyn-cli - Versions diffs - 2.3.0 → 2.3.2 - Mend

devlyn-cli 2.3.0 → 2.3.2

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (219) [hide / show]

package/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py CHANGED Viewed

@@ -5,11 +5,20 @@ from __future__ import annotations
 import argparse
 import json
+import math
 import re
 from collections import Counter
 from pathlib import Path
 from typing import Any
+from pair_evidence_contract import (
+    all_known_pair_trigger_reasons,
+    has_canonical_pair_trigger_reason,
+    has_known_pair_trigger_reason,
+    loads_strict_json_object,
+    path_has_actionable_solo_headroom_hypothesis,
+)
 RANK = {
     "PASS": 0,
@@ -24,7 +33,29 @@ def rank(verdict: str | None) -> int:
 def load_json(path: Path) -> dict[str, Any]:
-    return json.loads(path.read_text(encoding="utf8"))
+    try:
+        return loads_strict_json_object(path.read_text(encoding="utf8"))
+    except (json.JSONDecodeError, ValueError):
+        return {}
+def object_field(payload: dict[str, Any], key: str) -> dict[str, Any]:
+    value = payload.get(key)
+    return value if isinstance(value, dict) else {}
+def verdict_field(payload: dict[str, Any], key: str) -> str | None:
+    value = payload.get(key)
+    return value if isinstance(value, str) else None
+def number_field(payload: dict[str, Any], key: str) -> int | float | None:
+    value = payload.get(key)
+    if isinstance(value, bool):
+        return None
+    if not isinstance(value, (int, float)) or not math.isfinite(value):
+        return None
+    return value
 def transcript_failure_reason(results_root: Path, run_id: str, arm: str) -> str | None:
@@ -51,16 +82,112 @@ def infer_fixture_id(results_root: Path, run_id: str) -> str:
 def elapsed_ratio(pair_elapsed: Any, solo_elapsed: Any) -> float | None:
     if not isinstance(pair_elapsed, (int, float)) or not isinstance(solo_elapsed, (int, float)):
         return None
-    if solo_elapsed <= 0:
+    if pair_elapsed <= 0 or solo_elapsed <= 0:
         return None
     return pair_elapsed / solo_elapsed
+def is_true(value: Any) -> bool:
+    return value is True
+def pair_trigger_failures(
+    pair: dict[str, Any],
+    *,
+    fixture_spec: Path | None = None,
+    require_hypothesis_trigger: bool = False,
+) -> list[str]:
+    trigger = pair.get("pair_trigger")
+    if not isinstance(trigger, dict):
+        return ["pair_trigger missing or malformed"]
+    eligible = trigger.get("eligible")
+    reasons = trigger.get("reasons")
+    skipped_reason = trigger.get("skipped_reason")
+    if not isinstance(eligible, bool):
+        return ["pair_trigger.eligible malformed"]
+    if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
+        return ["pair_trigger.reasons malformed"]
+    if skipped_reason is not None and not isinstance(skipped_reason, str):
+        return ["pair_trigger.skipped_reason malformed"]
+    if eligible is True:
+        failures = []
+        if not reasons:
+            failures.append("pair_trigger eligible with empty reasons")
+        if reasons and not has_known_pair_trigger_reason(reasons):
+            failures.append("pair_trigger reasons missing known trigger reason")
+        if (
+            reasons
+            and has_known_pair_trigger_reason(reasons)
+            and not all_known_pair_trigger_reasons(reasons)
+        ):
+            failures.append("pair_trigger reasons contain unknown trigger reason")
+        if (
+            reasons
+            and all_known_pair_trigger_reasons(reasons)
+            and not has_canonical_pair_trigger_reason(reasons)
+        ):
+            failures.append("pair_trigger reasons missing canonical trigger reason")
+        if skipped_reason is not None:
+            failures.append("pair_trigger eligible with skipped_reason")
+        if (
+            require_hypothesis_trigger
+            and fixture_spec is not None
+            and path_has_actionable_solo_headroom_hypothesis(fixture_spec)
+            and "spec.solo_headroom_hypothesis" not in reasons
+        ):
+            failures.append("pair_trigger missing spec.solo_headroom_hypothesis")
+        return failures
+    if reasons:
+        return ["pair_trigger ineligible with reasons"]
+    return []
+def pair_trigger_eligible(pair: dict[str, Any]) -> bool:
+    trigger = pair.get("pair_trigger")
+    return (
+        isinstance(trigger, dict)
+        and trigger.get("eligible") is True
+        and isinstance(trigger.get("reasons"), list)
+        and all(isinstance(reason, str) for reason in trigger["reasons"])
+        and len(trigger["reasons"]) > 0
+        and all_known_pair_trigger_reasons(trigger["reasons"])
+        and has_canonical_pair_trigger_reason(trigger["reasons"])
+        and trigger.get("skipped_reason") is None
+    )
+def pair_trigger_reasons(pair: dict[str, Any]) -> list[str]:
+    trigger = pair.get("pair_trigger")
+    if not isinstance(trigger, dict):
+        return []
+    reasons = trigger.get("reasons")
+    if not isinstance(reasons, list) or not all(isinstance(reason, str) for reason in reasons):
+        return []
+    return reasons
+def pair_trigger_label(row: dict[str, Any]) -> str:
+    if row["pair_trigger_missed"]:
+        return "missed"
+    failures = row.get("pair_trigger_failures") or []
+    if failures:
+        return "malformed"
+    if row["pair_trigger_eligible"]:
+        return "eligible"
+    return "not_eligible"
 def load_gate_rows(gate_json: Path | None) -> dict[str, dict[str, Any]]:
     if gate_json is None:
         return {}
     doc = load_json(gate_json)
-    return {row["run_id"]: row for row in doc.get("rows", [])}
+    rows = doc.get("rows")
+    if not isinstance(rows, list):
+        return {}
+    return {
+        row["run_id"]: row for row in rows
+        if isinstance(row, dict) and isinstance(row.get("run_id"), str)
+    }
 def min_gate_rate(value: str) -> float:
@@ -98,8 +225,21 @@ def classify(row: dict[str, Any], included: bool) -> str:
         return "failed attempt: timeout"
     if row.get("solo_failure_reason") == "provider_limit" or row.get("pair_failure_reason") == "provider_limit":
         return "failed attempt: provider limit"
+    if row.get("solo_environment_contamination") or row.get("pair_environment_contamination"):
+        return "failed attempt: environment contamination"
+    if row.get("solo_disqualifier") or row.get("pair_disqualifier"):
+        return "failed attempt: disqualifier"
+    if row.get("solo_invoke_failure") or row.get("pair_invoke_failure"):
+        return "failed attempt: invoke failure"
     if row.get("solo_invoke_exit") not in (None, 0) or row.get("pair_invoke_exit") not in (None, 0):
         return "failed attempt: nonzero invoke exit"
+    if row.get("malformed_compare"):
+        return "failed attempt: malformed compare"
+    if row.get("pair_trigger_missed"):
+        return "failed attempt: pair trigger missed"
+    trigger_failures = row.get("pair_trigger_failures") or []
+    if trigger_failures:
+        return "failed attempt: pair trigger contract: " + "; ".join(trigger_failures)
     if row["solo_mechanical"] == "FAIL":
         return "excluded: solo mechanical dominated"
     if row["external_lift"] or row["internal_lift"]:
@@ -114,47 +254,95 @@ def classify(row: dict[str, Any], included: bool) -> str:
     return "no verdict lift"
-def build_row(results_root: Path, run_id: str, gate_rows_by_id: dict[str, dict[str, Any]]) -> dict[str, Any]:
+def build_row(
+    results_root: Path,
+    run_id: str,
+    gate_rows_by_id: dict[str, dict[str, Any]],
+    *,
+    fixtures_root: Path | None,
+    require_hypothesis_trigger: bool,
+) -> dict[str, Any]:
     compare_path = results_root / run_id / "compare.json"
+    malformed_compare = False
     if compare_path.exists():
         compare = load_json(compare_path)
+        malformed_compare = not bool(compare)
     else:
         compare = {
             "solo": {},
             "pair": {},
             "comparison": {"compare_missing": True},
         }
-    solo = compare.get("solo") or {}
-    pair = compare.get("pair") or {}
-    comparison = compare.get("comparison") or {}
-    pair_ratio = elapsed_ratio(pair.get("elapsed_seconds"), solo.get("elapsed_seconds"))
+    solo = object_field(compare, "solo")
+    pair = object_field(compare, "pair")
+    comparison = object_field(compare, "comparison")
+    malformed_compare = malformed_compare or any(
+        key in compare and not isinstance(compare.get(key), dict)
+        for key in ("solo", "pair", "comparison")
+    )
+    pair_ratio = elapsed_ratio(
+        number_field(pair, "elapsed_seconds"),
+        number_field(solo, "elapsed_seconds"),
+    )
     gate_row = gate_rows_by_id.get(run_id) or {}
+    solo_verdict = (
+        verdict_field(comparison, "solo_verdict")
+        or verdict_field(solo, "verify_verdict")
+    )
+    pair_verdict = (
+        verdict_field(comparison, "pair_verdict")
+        or verdict_field(pair, "verify_verdict")
+    )
+    solo_sub = object_field(solo, "sub_verdicts")
+    pair_sub = object_field(pair, "sub_verdicts")
+    fixture_id = infer_fixture_id(results_root, run_id)
+    fixture_spec = None
+    if fixtures_root is not None and fixture_id != "unknown":
+        fixture_spec = fixtures_root / fixture_id / "spec.md"
+    trigger_failures = pair_trigger_failures(
+        pair,
+        fixture_spec=fixture_spec,
+        require_hypothesis_trigger=require_hypothesis_trigger,
+    )
+    trigger_reasons = pair_trigger_reasons(pair)
     row = {
-        "fixture_id": infer_fixture_id(results_root, run_id),
+        "fixture_id": fixture_id,
         "run_id": run_id,
-        "solo_verdict": comparison.get("solo_verdict") or solo.get("verify_verdict"),
-        "pair_verdict": comparison.get("pair_verdict") or pair.get("verify_verdict"),
-        "pair_mode": bool(pair.get("pair_mode")),
-        "external_lift": bool(comparison.get("pair_verdict_lift")),
-        "internal_lift": bool(comparison.get("pair_internal_verdict_lift")),
-        "pair_found_more_findings": bool(comparison.get("pair_found_more_findings")),
-        "pair_found_more_low_or_worse": bool(comparison.get("pair_found_more_low_or_worse")),
-        "row_failed_before_compare": bool(comparison.get("row_failed_before_compare")),
+        "solo_verdict": solo_verdict,
+        "pair_verdict": pair_verdict,
+        "pair_mode": is_true(pair.get("pair_mode")),
+        "pair_trigger_eligible": pair_trigger_eligible(pair),
+        "pair_trigger_reasons": trigger_reasons,
+        "pair_trigger_has_canonical_reason": has_canonical_pair_trigger_reason(trigger_reasons),
+        "pair_trigger_missed": is_true(comparison.get("pair_trigger_missed")),
+        "pair_trigger_failures": trigger_failures,
+        "external_lift": is_true(comparison.get("pair_verdict_lift")),
+        "internal_lift": is_true(comparison.get("pair_internal_verdict_lift")),
+        "pair_found_more_findings": is_true(comparison.get("pair_found_more_findings")),
+        "pair_found_more_low_or_worse": is_true(comparison.get("pair_found_more_low_or_worse")),
+        "row_failed_before_compare": is_true(comparison.get("row_failed_before_compare")),
         "row_exit": comparison.get("row_exit"),
-        "compare_missing": bool(comparison.get("compare_missing")),
+        "compare_missing": is_true(comparison.get("compare_missing")),
         "solo_invoke_exit": solo.get("invoke_exit"),
         "pair_invoke_exit": pair.get("invoke_exit"),
         "solo_failure_reason": solo.get("invoke_failure_reason")
         or transcript_failure_reason(results_root, run_id, "solo"),
         "pair_failure_reason": pair.get("invoke_failure_reason")
         or transcript_failure_reason(results_root, run_id, "pair"),
-        "solo_timed_out": bool(solo.get("timed_out")),
-        "pair_timed_out": bool(pair.get("timed_out")),
+        "solo_invoke_failure": is_true(solo.get("invoke_failure")),
+        "pair_invoke_failure": is_true(pair.get("invoke_failure")),
+        "solo_environment_contamination": is_true(solo.get("environment_contamination")),
+        "pair_environment_contamination": is_true(pair.get("environment_contamination")),
+        "solo_disqualifier": is_true(solo.get("disqualifier")),
+        "pair_disqualifier": is_true(pair.get("disqualifier")),
+        "solo_timed_out": is_true(solo.get("timed_out")),
+        "pair_timed_out": is_true(pair.get("timed_out")),
         "pair_solo_wall_ratio": pair_ratio,
-        "solo_mechanical": (solo.get("sub_verdicts") or {}).get("mechanical"),
-        "pair_mechanical": (pair.get("sub_verdicts") or {}).get("mechanical"),
+        "solo_mechanical": verdict_field(solo_sub, "mechanical"),
+        "pair_mechanical": verdict_field(pair_sub, "mechanical"),
         "included_in_gate": gate_row.get("status") == "PASS",
         "gate_failures": gate_row.get("failures") or [],
+        "malformed_compare": malformed_compare,
     }
     row["classification"] = classify(row, row["included_in_gate"])
     return row
@@ -164,6 +352,12 @@ def fmt_ratio(value: Any) -> str:
     return f"{value:.2f}x" if isinstance(value, (int, float)) else "n/a"
+def fmt_trigger_reasons(value: Any) -> str:
+    if not isinstance(value, list) or not all(isinstance(item, str) for item in value):
+        return ""
+    return ",".join(value)
 def write_md(path: Path, report: dict[str, Any]) -> None:
     lines = [
         f"# {report['title']}",
@@ -189,14 +383,16 @@ def write_md(path: Path, report: dict[str, Any]) -> None:
     lines.extend(
         [
             "",
-            "| Fixture | Solo | Pair | Pair mode | Wall ratio | External lift | Internal lift | Included | Classification |",
-            "|---|---|---|---|---:|---|---|---|---|",
+            "| Fixture | Solo VERIFY | Pair VERIFY | Pair mode | Pair trigger | Triggers | Wall ratio | External lift | Internal lift | Included | Classification |",
+            "|---|---|---|---|---|---|---:|---|---|---|---|",
         ]
     )
     for row in report["rows"]:
         lines.append(
             f"| {row['fixture_id']} | {row['solo_verdict']} | {row['pair_verdict']} | "
-            f"{str(row['pair_mode']).lower()} | {fmt_ratio(row.get('pair_solo_wall_ratio'))} | "
+            f"{str(row['pair_mode']).lower()} | {pair_trigger_label(row)} | "
+            f"{fmt_trigger_reasons(row.get('pair_trigger_reasons'))} | "
+            f"{fmt_ratio(row.get('pair_solo_wall_ratio'))} | "
             f"{str(row['external_lift']).lower()} | {str(row['internal_lift']).lower()} | "
             f"{str(row['included_in_gate']).lower()} | {row['classification']} |"
         )
@@ -207,18 +403,35 @@ def write_md(path: Path, report: dict[str, Any]) -> None:
 def main() -> int:
     parser = argparse.ArgumentParser()
     parser.add_argument("--results-root", default="benchmark/auto-resolve/results", type=Path)
+    parser.add_argument("--fixtures-root", type=Path)
     parser.add_argument("--run-id", action="append", required=True)
     parser.add_argument("--gate-json", type=Path)
     parser.add_argument("--title", required=True)
     parser.add_argument("--verdict", required=True)
     parser.add_argument("--min-gate-rate", type=min_gate_rate)
     parser.add_argument("--max-trailing-non-gate", type=non_negative_int)
+    parser.add_argument(
+        "--require-hypothesis-trigger",
+        action="store_true",
+        help="require fixtures with actionable solo-headroom hypotheses to expose spec.solo_headroom_hypothesis in pair_trigger.reasons",
+    )
     parser.add_argument("--out-json", required=True, type=Path)
     parser.add_argument("--out-md", required=True, type=Path)
     args = parser.parse_args()
+    if args.require_hypothesis_trigger and args.fixtures_root is None:
+        parser.error("--require-hypothesis-trigger requires --fixtures-root")
     gate_rows_by_id = load_gate_rows(args.gate_json)
-    rows = [build_row(args.results_root, run_id, gate_rows_by_id) for run_id in args.run_id]
+    rows = [
+        build_row(
+            args.results_root,
+            run_id,
+            gate_rows_by_id,
+            fixtures_root=args.fixtures_root,
+            require_hypothesis_trigger=args.require_hypothesis_trigger,
+        )
+        for run_id in args.run_id
+    ]
     gate_rows = sum(1 for row in rows if row["included_in_gate"])
     trailing_non_gate_rows = 0
     for row in reversed(rows):

package/benchmark/auto-resolve/scripts/test-audit-headroom-rejections.sh ADDED Viewed

@@ -0,0 +1,288 @@
+#!/usr/bin/env bash
+# Regression tests for audit-headroom-rejections.py.
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+SCRIPT="$SCRIPT_DIR/audit-headroom-rejections.py"
+TMP_DIR="$(mktemp -d /tmp/audit-headroom-rejections-test.XXXXXX)"
+trap 'rm -rf "$TMP_DIR"' EXIT
+fixtures="$TMP_DIR/fixtures"
+results="$TMP_DIR/results"
+registry="$TMP_DIR/pair-rejected-fixtures.sh"
+mkdir -p "$fixtures/F16-cli-quote-tax-rules" \
+  "$fixtures/F33-cli-new-candidate" \
+  "$fixtures/F34-cli-rejected-candidate" \
+  "$fixtures/F35-cli-missing-judge" \
+  "$fixtures/F36-unsupported-rejection" \
+  "$results/old-f16" \
+  "$results/f33-headroom" \
+  "$results/f33-weak-pair-pass" \
+  "$results/f34-headroom" \
+  "$results/f35-missing-judge" \
+  "$results/20260512-f36-headroom" \
+  "$results/bad-json-headroom" \
+  "$results/malformed-headroom" \
+  "$results/f16-pair-pass"
+cat > "$registry" <<'SH'
+rejected_pair_fixture_reason() {
+  local fid="$1"
+  case "$fid" in
+    F34-*|F34)
+      echo "measured solo ceiling"
+      ;;
+    F36-*|F36)
+      echo "bare 33 / solo_claude 98 in 20260512-missing-headroom"
+      ;;
+    *)
+      return 1
+      ;;
+  esac
+}
+SH
+s_only_registry="$TMP_DIR/s-only-registry.sh"
+cat > "$s_only_registry" <<'SH'
+rejected_pair_fixture_reason() {
+  local fid="$1"
+  case "$fid" in
+    S3-*|S3)
+      echo "shadow solo ceiling"
+      ;;
+    *)
+      return 1
+      ;;
+  esac
+}
+SH
+python3 - "$SCRIPT" "$s_only_registry" <<'PY'
+import importlib.util
+import pathlib
+import sys
+spec = importlib.util.spec_from_file_location("audit_headroom_rejections", sys.argv[1])
+module = importlib.util.module_from_spec(spec)
+assert spec.loader is not None
+spec.loader.exec_module(module)
+assert module.registry_short_ids(pathlib.Path(sys.argv[2])) == {"S3"}
+PY
+write_headroom_fail() {
+  local run_id="$1"
+  local fixture="$2"
+  local bare="$3"
+  local solo="$4"
+  cat > "$results/$run_id/headroom-gate.json" <<JSON
+{
+  "run_id": "$run_id",
+  "verdict": "FAIL",
+  "rows": [
+    {
+      "fixture": "$fixture",
+      "status": "FAIL",
+      "bare_score": $bare,
+      "solo_score": $solo,
+      "reason": "solo_claude score $solo > 80"
+    }
+  ]
+}
+JSON
+}
+write_headroom_fail old-f16 F16-cli-quote-tax-rules 50 98
+write_headroom_fail f33-headroom F33-cli-new-candidate 33 98
+write_headroom_fail f34-headroom F34-cli-rejected-candidate 33 98
+cat > "$results/f35-missing-judge/headroom-gate.json" <<'JSON'
+{
+  "run_id": "f35-missing-judge",
+  "verdict": "FAIL",
+  "rows": [
+    {
+      "fixture": "F35-cli-missing-judge",
+      "status": "MISSING_JUDGE",
+      "reason": "judge.json missing"
+    }
+  ]
+}
+JSON
+cat > "$results/malformed-headroom/headroom-gate.json" <<'JSON'
+{
+  "run_id": "malformed-headroom",
+  "verdict": "FAIL",
+  "rows": []
+}
+JSON
+printf '{not-json\n' > "$results/bad-json-headroom/headroom-gate.json"
+cat > "$results/f16-pair-pass/full-pipeline-pair-gate.json" <<'JSON'
+{
+  "run_id": "f16-pair-pass",
+  "verdict": "PASS",
+  "pair_arm": "l2_risk_probes",
+  "rows": [
+    {
+      "fixture": "F16-cli-quote-tax-rules",
+      "status": "PASS",
+      "bare_score": 50,
+      "solo_score": 75,
+      "pair_score": 96,
+      "pair_margin": 21,
+      "pair_mode": true,
+      "pair_trigger_eligible": true,
+      "pair_solo_wall_ratio": 1.28
+    }
+  ]
+}
+JSON
+mkdir -p "$results/f16-pair-pass/F16-cli-quote-tax-rules/l2_risk_probes"
+cat > "$results/f16-pair-pass/F16-cli-quote-tax-rules/l2_risk_probes/result.json" <<'JSON'
+{
+  "pair_trigger": {
+    "eligible": true,
+    "reasons": ["complexity.high"],
+    "skipped_reason": null
+  }
+}
+JSON
+python3 - "$SCRIPT" "$results" <<'PY'
+import importlib.util
+import pathlib
+import sys
+spec = importlib.util.spec_from_file_location("audit_headroom_rejections", sys.argv[1])
+module = importlib.util.module_from_spec(spec)
+assert spec.loader is not None
+spec.loader.exec_module(module)
+results_root = pathlib.Path(sys.argv[2])
+kwargs = {
+    "results_root": results_root,
+    "run_id": "f16-pair-pass",
+    "fixture": "F16-cli-quote-tax-rules",
+    "pair_arm": "l2_risk_probes",
+}
+assert module.pair_result_trigger_reasons(**kwargs) == ["complexity.high"]
+path = (
+    results_root
+    / "f16-pair-pass"
+    / "F16-cli-quote-tax-rules"
+    / "l2_risk_probes"
+    / "result.json"
+)
+path.write_text(
+    '{"pair_trigger":{"eligible":true,"reasons":["risk high"],"skipped_reason":null}}\n',
+    encoding="utf8",
+)
+assert module.pair_result_trigger_reasons(**kwargs) == []
+path.write_text(
+    '{"pair_trigger":{"eligible":true,"reasons":["complexity.high"],"skipped_reason":null}}\n',
+    encoding="utf8",
+)
+PY
+cat > "$results/f33-weak-pair-pass/full-pipeline-pair-gate.json" <<'JSON'
+{
+  "run_id": "f33-weak-pair-pass",
+  "verdict": "PASS",
+  "pair_arm": "l2_risk_probes",
+  "rows": [
+    {
+      "fixture": "F33-cli-new-candidate",
+      "status": "PASS",
+      "bare_score": 33,
+      "solo_score": 98,
+      "pair_score": 96,
+      "pair_margin": -2,
+      "pair_mode": true,
+      "pair_trigger_eligible": true,
+      "pair_solo_wall_ratio": 1.1
+    }
+  ]
+}
+JSON
+if python3 "$SCRIPT" \
+  --fixtures-root "$fixtures" \
+  --registry "$registry" \
+  --results-root "$results" \
+  --out-json "$TMP_DIR/audit.json" > "$TMP_DIR/audit.out" 2> "$TMP_DIR/audit.err"; then
+  echo "expected unrecorded F33 failure" >&2
+  exit 1
+fi
+grep -Fq 'F33-cli-new-candidate' "$TMP_DIR/audit.err"
+grep -Fq 'F35-cli-missing-judge' "$TMP_DIR/audit.err"
+grep -Fq 'status=MISSING_JUDGE' "$TMP_DIR/audit.err"
+grep -Fq 'malformed-headroom <unknown>' "$TMP_DIR/audit.err"
+grep -Fq 'status=MALFORMED_ROWS' "$TMP_DIR/audit.err"
+grep -Fq 'bad-json-headroom <unknown>' "$TMP_DIR/audit.err"
+grep -Fq 'status=MALFORMED_JSON' "$TMP_DIR/audit.err"
+grep -Fq 'unsupported registry rejection(s)' "$TMP_DIR/audit.err"
+grep -Fq 'F36-unsupported-rejection' "$TMP_DIR/audit.err"
+grep -Fq 'expected_run=20260512-missing-headroom' "$TMP_DIR/audit.err"
+grep -Fq 'solo_claude=98' "$TMP_DIR/audit.err"
+grep -Fq 'expected_solo_claude=98' "$TMP_DIR/audit.err"
+grep -Fq '"verdict": "FAIL"' "$TMP_DIR/audit.json"
+grep -Fq '"fixture": "F33-cli-new-candidate"' "$TMP_DIR/audit.json"
+grep -Fq '"fixture": "F35-cli-missing-judge"' "$TMP_DIR/audit.json"
+grep -Fq '"fixture": "<unknown>"' "$TMP_DIR/audit.json"
+grep -Fq '"unsupported_registry_rejections"' "$TMP_DIR/audit.json"
+if grep -Fq 'F16-cli-quote-tax-rules' "$TMP_DIR/audit.err"; then
+  echo "F16 has passing pair evidence and must not be reported" >&2
+  cat "$TMP_DIR/audit.err" >&2
+  exit 1
+fi
+if grep -Fq 'F34-cli-rejected-candidate' "$TMP_DIR/audit.err"; then
+  echo "F34 is rejected and must not be reported" >&2
+  cat "$TMP_DIR/audit.err" >&2
+  exit 1
+fi
+python3 - "$registry" <<'PY'
+from pathlib import Path
+import sys
+path = Path(sys.argv[1])
+text = path.read_text()
+text = text.replace(
+    '    F34-*|F34)',
+    '    F33-*|F33)\n'
+    '      echo "measured solo ceiling"\n'
+    '      ;;\n'
+    '    F35-*|F35)\n'
+    '      echo "missing judge artifact"\n'
+    '      ;;\n'
+    '    F34-*|F34)'
+)
+path.write_text(text)
+PY
+rm -rf "$results/malformed-headroom"
+rm -rf "$results/bad-json-headroom"
+write_headroom_fail 20260512-f36-headroom F36-unsupported-rejection 33 98
+python3 - "$registry" <<'PY'
+from pathlib import Path
+import sys
+path = Path(sys.argv[1])
+text = path.read_text()
+text = text.replace(
+    "bare 33 / solo_claude 98 in 20260512-missing-headroom",
+    "bare 33 / solo_claude 98 in 20260512-f36-headroom",
+)
+path.write_text(text)
+PY
+python3 "$SCRIPT" \
+  --fixtures-root "$fixtures" \
+  --registry "$registry" \
+  --results-root "$results" \
+  --out-json "$TMP_DIR/audit-pass.json" \
+  > "$TMP_DIR/audit-pass.out"
+grep -Fq 'PASS audit-headroom-rejections' "$TMP_DIR/audit-pass.out"
+grep -Fq '"verdict": "PASS"' "$TMP_DIR/audit-pass.json"
+grep -Fq '"unsupported_registry_rejections": []' "$TMP_DIR/audit-pass.json"
+echo "PASS test-audit-headroom-rejections"