npm - devlyn-cli - Versions diffs - 2.2.2 → 2.3.1 - Mend

devlyn-cli 2.2.2 → 2.3.1

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.

Files changed (220) hide show

package/benchmark/auto-resolve/scripts/run-swebench-solver-batch.sh CHANGED Viewed

@@ -28,18 +28,27 @@ RESUME=0
 LIMIT=""
 INSTANCE_IDS=()
+require_value() {
+  local flag="$1"
+  local value="${2:-}"
+  if [ -z "$value" ] || [[ "$value" == --* ]]; then
+    echo "$flag requires a value" >&2
+    exit 1
+  fi
+}
 while [ $# -gt 0 ]; do
   case "$1" in
-    --instances-jsonl) INSTANCES_JSONL="$2"; shift 2;;
-    --predictions-out) PREDICTIONS_OUT="$2"; shift 2;;
-    --model-name) MODEL_NAME="$2"; shift 2;;
-    --repos-root) REPOS_ROOT="$2"; shift 2;;
-    --worktrees-root) WORKTREES_ROOT="$2"; shift 2;;
-    --timeout-seconds) TIMEOUT_SECONDS="$2"; shift 2;;
+    --instances-jsonl) require_value "$1" "${2:-}"; INSTANCES_JSONL="$2"; shift 2;;
+    --predictions-out) require_value "$1" "${2:-}"; PREDICTIONS_OUT="$2"; shift 2;;
+    --model-name) require_value "$1" "${2:-}"; MODEL_NAME="$2"; shift 2;;
+    --repos-root) require_value "$1" "${2:-}"; REPOS_ROOT="$2"; shift 2;;
+    --worktrees-root) require_value "$1" "${2:-}"; WORKTREES_ROOT="$2"; shift 2;;
+    --timeout-seconds) require_value "$1" "${2:-}"; TIMEOUT_SECONDS="$2"; shift 2;;
     --copy-devlyn-context) COPY_DEVLYN_CONTEXT=1; shift;;
     --resume) RESUME=1; shift;;
-    --limit) LIMIT="$2"; shift 2;;
-    --instance-id) INSTANCE_IDS+=("$2"); shift 2;;
+    --limit) require_value "$1" "${2:-}"; LIMIT="$2"; shift 2;;
+    --instance-id) require_value "$1" "${2:-}"; INSTANCE_IDS+=("$2"); shift 2;;
     -h|--help) usage 0;;
     *) echo "unknown arg: $1" >&2; usage 1;;
   esac
@@ -62,22 +71,31 @@ TMP_IDS="$(mktemp)"
 TMP_SELECTED_INSTANCES="$(mktemp)"
 trap 'rm -f "$TMP_IDS" "$TMP_SELECTED_INSTANCES"' EXIT
-python3 - "$INSTANCES_JSONL" "$TMP_SELECTED_INSTANCES" "$LIMIT" "${INSTANCE_IDS[@]}" > "$TMP_IDS" <<'PY'
+selection_args=("$INSTANCES_JSONL" "$TMP_SELECTED_INSTANCES" "$LIMIT")
+if [ "${#INSTANCE_IDS[@]}" -gt 0 ]; then
+  selection_args+=("${INSTANCE_IDS[@]}")
+fi
+python3 - "$SCRIPT_DIR" "${selection_args[@]}" > "$TMP_IDS" <<'PY'
 import json
 import sys
 from pathlib import Path
+sys.path.insert(0, sys.argv[1])
+from pair_evidence_contract import reject_json_constant
-instances_path = Path(sys.argv[1])
-selected_path = Path(sys.argv[2])
-limit = int(sys.argv[3]) if sys.argv[3] else None
-requested = sys.argv[4:]
+instances_path = Path(sys.argv[2])
+selected_path = Path(sys.argv[3])
+limit = int(sys.argv[4]) if sys.argv[4] else None
+requested = sys.argv[5:]
 requested_set = set(requested)
 rows = []
 with instances_path.open(encoding="utf8") as f:
     for line_no, line in enumerate(f, start=1):
         if not line.strip():
             continue
-        row = json.loads(line)
+        row = json.loads(line, parse_constant=reject_json_constant)
+        if not isinstance(row, dict):
+            raise SystemExit(f"{instances_path}:{line_no}: expected JSON object")
         instance_id = row.get("instance_id")
         if not isinstance(instance_id, str) or not instance_id:
             raise SystemExit(f"{instances_path}:{line_no}: missing instance_id")

package/benchmark/auto-resolve/scripts/ship-gate.py CHANGED Viewed

@@ -11,6 +11,83 @@ Exits 0 on PASS, 1 on FAIL.
 from __future__ import annotations
 import argparse, json, pathlib, sys, shutil, datetime
+SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPT_DIR))
+from pair_evidence_contract import reject_json_constant
+def load_dict_json(path: pathlib.Path) -> tuple[dict | None, str | None]:
+    try:
+        data = json.loads(path.read_text(), parse_constant=reject_json_constant)
+    except (ValueError, json.JSONDecodeError):
+        return None, "invalid JSON"
+    if not isinstance(data, dict):
+        return None, "expected object"
+    return data, None
+def object_or_empty(value) -> dict:
+    return value if isinstance(value, dict) else {}
+def rows_from_summary(summary: dict, failures: list[str]) -> list[dict]:
+    raw_rows = summary.get("rows")
+    if not isinstance(raw_rows, list):
+        failures.append("summary rows missing or malformed — measurement invalid")
+        return []
+    rows = [row for row in raw_rows if isinstance(row, dict)]
+    if len(rows) != len(raw_rows):
+        failures.append("summary rows contain non-object entries — measurement invalid")
+    return rows
+def int_or_none(value) -> int | None:
+    return value if isinstance(value, int) and not isinstance(value, bool) else None
+def number_or_none(value) -> int | float | None:
+    if isinstance(value, bool):
+        return None
+    return value if isinstance(value, (int, float)) else None
+def bool_or_none(value) -> bool | None:
+    return value if isinstance(value, bool) else None
+def axis_invalid_count(rows: list[dict], arm: str, failures: list[str]) -> int:
+    total = 0
+    for row in rows:
+        arms = object_or_empty(row.get("arms"))
+        payload = object_or_empty(arms.get(arm))
+        raw_count = payload.get("_axis_validation_out_of_range_count", 0)
+        count = number_or_none(raw_count)
+        if count is None:
+            failures.append(f"{arm} axis count malformed — measurement invalid")
+        elif count > 0:
+            total += 1
+    return total
+def unmapped_axis_invalid_count(rows: list[dict], failures: list[str]) -> int:
+    total = 0
+    for row in rows:
+        raw_count = row.get("_axis_validation_unmapped_out_of_range_count", 0)
+        count = number_or_none(raw_count)
+        if count is None:
+            failures.append("unmapped axis count malformed — measurement invalid")
+        elif count > 0:
+            total += 1
+    return total
+def is_known_limit(row: dict) -> bool:
+    raw_category = row.get("category")
+    category = raw_category.lower() if isinstance(raw_category, str) else ""
+    return category in {"edge", "known-limit"}
 def main() -> int:
     p = argparse.ArgumentParser()
@@ -25,68 +102,134 @@ def main() -> int:
     summary_p = root / "results" / args.run_id / "summary.json"
     if not summary_p.exists():
         print(f"no summary at {summary_p}", file=sys.stderr); return 1
-    summary = json.loads(summary_p.read_text())
+    summary, summary_error = load_dict_json(summary_p)
+    if summary is None:
+        print(f"measurement invalid: malformed summary.json ({summary_error})", file=sys.stderr)
+        return 1
     baseline_p = root / "history" / "baselines" / "shipped.json"
     baseline = None
     if baseline_p.exists():
-        try:
-            baseline = json.loads(baseline_p.read_text())
-        except Exception:
+        baseline, _ = load_dict_json(baseline_p)
+        if baseline is None:
             baseline = None
     failures: list[str] = []
     warnings: list[str] = []
+    rows = rows_from_summary(summary, failures)
     # Hard floor 1: no disqualifier in variant
-    if summary["hard_floor_violations"] > 0:
-        failures.append(f"{summary['hard_floor_violations']} variant disqualifier(s) — see report")
+    hard_floor_violations = int_or_none(summary.get("hard_floor_violations"))
+    if hard_floor_violations is None:
+        failures.append("summary hard_floor_violations missing or malformed — measurement invalid")
+    elif hard_floor_violations > 0:
+        failures.append(f"{hard_floor_violations} variant disqualifier(s) — see report")
+    variant_axis_invalid = axis_invalid_count(rows, "variant", failures)
+    if variant_axis_invalid > 0:
+        failures.append(
+            f"variant axis-invalid: {variant_axis_invalid} fixture(s) have out-of-range axis cells — "
+            "re-judge before trusting L2 margins"
+        )
+    bare_axis_invalid = axis_invalid_count(rows, "bare", failures)
+    if bare_axis_invalid > 0:
+        failures.append(
+            f"bare axis-invalid: {bare_axis_invalid} fixture(s) have out-of-range axis cells — "
+            "re-judge before trusting margins"
+        )
+    unmapped_axis_invalid = unmapped_axis_invalid_count(rows, failures)
+    if unmapped_axis_invalid > 0:
+        failures.append(
+            f"judge axis-invalid unmapped: {unmapped_axis_invalid} fixture(s) have out-of-range axis cells "
+            "that could not be mapped to an arm — re-judge before trusting margins"
+        )
     # Hard floor 2: F9 must pass (skipped during bootstrap via --accept-missing)
     # Variant arm legacy gate kept for L2 baseline comparability.
     # iter-0033a (2026-04-30): renamed F9 dir from -to-preflight to -to-resolve to
     # match the shipped 2-skill contract (no preflight). The OLD pre-rename id
     # is preserved in fixtures/retired/ for replay.
-    f9_row = next((r for r in summary["rows"] if r.get("fixture") == "F9-e2e-ideate-to-resolve"), None)
+    f9_row = next((r for r in rows if r.get("fixture") == "F9-e2e-ideate-to-resolve"), None)
     if f9_row is None:
         if not args.accept_missing:
             failures.append("F9 (E2E novice flow) missing — add fixture or run with --accept-missing")
     else:
-        if (f9_row.get("margin") or -999) < 5:
+        f9_margin = number_or_none(f9_row.get("margin"))
+        if f9_margin is None:
+            failures.append("F9 (E2E novice flow) margin missing or malformed — measurement invalid")
+        elif f9_margin < 5:
             failures.append("F9 (E2E novice flow) must have variant margin ≥ +5")
-    # Hard floor 3: ≥ 7 of 9 gated fixtures with margin ≥ +5
+    for row in rows:
+        if not is_known_limit(row):
+            continue
+        margin = number_or_none(row.get("margin"))
+        if margin is not None and (margin < -3 or margin > 3):
+            warnings.append(
+                f"{row.get('fixture')} known-limit margin {margin:+g} outside expected [-3,+3] range"
+            )
+    # Hard floor 3: at least 7 gated fixtures with margin ≥ +5
     # (skipped during bootstrap via --accept-missing)
-    if summary["gated_fixtures"] > 0 and summary["margin_ge_5_count"] < 7:
+    gated_fixtures = int_or_none(summary.get("gated_fixtures"))
+    margin_ge_5_count = int_or_none(summary.get("margin_ge_5_count"))
+    if gated_fixtures is None or margin_ge_5_count is None:
+        failures.append("summary gated fixture counts missing or malformed — measurement invalid")
+    elif gated_fixtures > 0 and margin_ge_5_count < 7:
         if not args.accept_missing:
             failures.append(
-                f"only {summary['margin_ge_5_count']} of {summary['gated_fixtures']} "
+                f"only {margin_ge_5_count} of {gated_fixtures} "
                 f"gated fixtures have variant margin ≥ +5 (need ≥ 7)"
             )
     # iter-0023 — L1 (solo_claude) gates per NORTH-STAR.md ops test #1.
     # Codex R1 (this iter) caught that ship-gate enforced only legacy L2
     # `variant` margin and never read `solo_over_bare`. Now NORTH-STAR's
-    # documented L1 floor (≥ +5, ≥ 7/9 fixtures, F9 ≥ +5, no L1
+    # documented L1 floor (≥ +5 on at least 7 gated fixtures, F9 ≥ +5, no L1
     # disqualifier) is mechanically enforced.
-    arms_present = summary.get("arms_present", {})
-    margins_avg = summary.get("margins_avg", {})
-    if arms_present.get("solo_claude"):
+    raw_arms_present = summary.get("arms_present")
+    if raw_arms_present is not None and not isinstance(raw_arms_present, dict):
+        failures.append("summary arms_present malformed — measurement invalid")
+    arms_present = object_or_empty(raw_arms_present)
+    raw_margins_avg = summary.get("margins_avg")
+    margins_avg = object_or_empty(raw_margins_avg)
+    raw_solo_present = arms_present.get("solo_claude")
+    solo_present = bool_or_none(raw_solo_present)
+    if raw_solo_present is not None and solo_present is None:
+        failures.append("summary arms_present.solo_claude malformed — measurement invalid")
+    if solo_present is True:
+        if raw_margins_avg is not None and not isinstance(raw_margins_avg, dict):
+            failures.append("summary margins_avg malformed — measurement invalid")
+        l1_dq_by_fixture: dict[str, bool] = {}
+        for r in rows:
+            fixture = str(r.get("fixture"))
+            l1 = object_or_empty(object_or_empty(r.get("arms")).get("solo_claude"))
+            raw_l1_dq = l1.get("disqualifier")
+            parsed_l1_dq = bool_or_none(raw_l1_dq)
+            if raw_l1_dq is not None and parsed_l1_dq is None:
+                failures.append(f"{fixture} L1 disqualifier malformed — measurement invalid")
+                l1_dq_by_fixture[fixture] = True
+            else:
+                l1_dq_by_fixture[fixture] = parsed_l1_dq is True
         l1_avg = margins_avg.get("solo_over_bare")
-        if l1_avg is not None and l1_avg < 5:
+        if l1_avg is not None and number_or_none(l1_avg) is None:
+            failures.append("L1 (solo_over_bare) suite avg malformed — measurement invalid")
+        elif l1_avg is not None and l1_avg < 5:
             warnings.append(
                 f"L1 (solo_over_bare) suite avg {l1_avg:+.1f} below NORTH-STAR floor +5 "
                 "(reporting only — per-fixture L1 gates below are decisive)"
             )
         # F9 L1 floor
         if f9_row is not None:
-            f9_l1 = (f9_row.get("margins") or {}).get("solo_over_bare")
+            f9_l1 = object_or_empty(f9_row.get("margins")).get("solo_over_bare")
             if f9_l1 is None:
                 if not args.accept_missing:
                     failures.append("F9 L1 (solo_over_bare) margin missing — measurement invalid")
+            elif number_or_none(f9_l1) is None:
+                failures.append("F9 L1 (solo_over_bare) margin malformed — measurement invalid")
             elif f9_l1 < 5:
-                failures.append(f"F9 L1 (solo_over_bare) margin {f9_l1:+d} < +5 floor")
-        # 7-of-9 L1 floor — headroom-aware (added 2026-05-02 per iter-0033 R4
+                failures.append(f"F9 L1 (solo_over_bare) margin {f9_l1:+g} < +5 floor")
+        # 7-fixture L1 floor — headroom-aware (added 2026-05-02 per iter-0033 R4
         # Codex collab + NORTH-STAR amendment + RUBRIC hard-floor 3 update).
         # A fixture is excluded from the denominator when 100 - L0_score < 5
         # AND L1_score >= 95 AND the L1 arm has no disqualifier / CRITICAL-HIGH
@@ -96,25 +239,26 @@ def main() -> int:
         l1_ge_5 = 0
         l1_gated = 0
         l1_excluded_headroom = []
-        for r in summary.get("rows", []):
-            if (r.get("category") or "").lower() == "known-limit":
+        for r in rows:
+            if is_known_limit(r):
                 continue
-            arms = r.get("arms") or {}
-            l0 = arms.get("bare") or {}
-            l1 = arms.get("solo_claude") or {}
-            l0_score = l0.get("score")
-            l1_score = l1.get("score")
-            m = (r.get("margins") or {}).get("solo_over_bare")
+            arms = object_or_empty(r.get("arms"))
+            l0 = object_or_empty(arms.get("bare"))
+            l1 = object_or_empty(arms.get("solo_claude"))
+            l0_score = number_or_none(l0.get("score"))
+            l1_score = number_or_none(l1.get("score"))
+            m = number_or_none(object_or_empty(r.get("margins")).get("solo_over_bare"))
             if m is None:
                 continue
             # Headroom carve-out — must satisfy ALL conditions:
             # (a) bare ceiling-near (100 - L0 < 5)
             # (b) L1 also ceiling-near (>=95)
             # (c) L1 arm clean (no disqualifier, no axis-invalid, fix-loop didn't fail)
-            l1_dq_here = bool(l1.get("disqualifier"))
-            l1_axis_inv = (l1.get("_axis_validation_out_of_range_count") or 0) > 0
+            l1_dq_here = l1_dq_by_fixture.get(str(r.get("fixture")), False)
+            l1_axis_count = number_or_none(l1.get("_axis_validation_out_of_range_count", 0))
+            l1_axis_inv = bool(l1_axis_count is not None and l1_axis_count > 0)
             if (
-                isinstance(l0_score, (int, float)) and isinstance(l1_score, (int, float))
+                l0_score is not None and l1_score is not None
                 and (100 - l0_score) < 5 and l1_score >= 95
                 and not l1_dq_here and not l1_axis_inv
             ):
@@ -136,14 +280,14 @@ def main() -> int:
             warnings.append(
                 "L1 headroom-excluded (saturation candidates per RUBRIC two-shipped-version rule): "
                 + ", ".join(
-                    f"{x['fixture']} (L0={x['l0_score']} L1={x['l1_score']} margin={x['margin']:+d})"
+                    f"{x['fixture']} (L0={x['l0_score']} L1={x['l1_score']} margin={x['margin']:+g})"
                     for x in l1_excluded_headroom
                 )
             )
         # L1 disqualifier floor
         l1_dq = sum(
-            1 for r in summary.get("rows", [])
-            if ((r.get("arms") or {}).get("solo_claude") or {}).get("disqualifier")
+            1 for r in rows
+            if l1_dq_by_fixture.get(str(r.get("fixture")), False)
         )
         if l1_dq > 0:
             failures.append(f"L1 disqualifier(s): {l1_dq} solo_claude arm(s) hit a disqualifier")
@@ -151,10 +295,13 @@ def main() -> int:
         # `_axis_validation` per fixture). If any L1 row has invalid axis data,
         # the L1 score for that row is not trustworthy.
         l1_axis_invalid = 0
-        for r in summary.get("rows", []):
-            av = (r.get("arms") or {}).get("solo_claude") or {}
+        for r in rows:
+            av = object_or_empty(object_or_empty(r.get("arms")).get("solo_claude"))
             inv = av.get("_axis_validation_out_of_range_count")
-            if inv is not None and inv > 0:
+            count = number_or_none(inv)
+            if inv is not None and count is None:
+                failures.append("L1 axis count malformed — measurement invalid")
+            elif count is not None and count > 0:
                 l1_axis_invalid += 1
         if l1_axis_invalid > 0:
             failures.append(
@@ -164,31 +311,53 @@ def main() -> int:
     # Hard floor 4: no per-fixture regression worse than −5 vs shipped baseline
     if baseline:
-        prev_rows = {r["fixture"]: r for r in baseline.get("rows", [])}
-        for r in summary["rows"]:
+        prev_rows = {
+            r["fixture"]: r for r in baseline.get("rows", [])
+            if isinstance(r, dict) and isinstance(r.get("fixture"), str)
+        }
+        for r in rows:
+            if is_known_limit(r):
+                continue
             fid = r.get("fixture")
             prev = prev_rows.get(fid)
-            if prev and r.get("variant_score") is not None and prev.get("variant_score") is not None:
-                delta = r["variant_score"] - prev["variant_score"]
+            current_score = number_or_none(r.get("variant_score"))
+            previous_score = number_or_none(prev.get("variant_score")) if prev else None
+            if prev and current_score is not None and previous_score is not None:
+                delta = current_score - previous_score
                 if delta < -5:
-                    failures.append(f"{fid} regressed {delta:+d} vs shipped (floor: −5)")
+                    failures.append(f"{fid} regressed {delta:+g} vs shipped (floor: −5)")
     # Soft gate: suite average margin drop > 3
     if baseline:
-        margin_delta = summary["margin_avg"] - baseline.get("margin_avg", 0)
-        if margin_delta < -3:
-            warnings.append(f"suite margin dropped {margin_delta:+.1f} vs shipped (soft gate: > −3)")
+        current_margin_avg = number_or_none(summary.get("margin_avg"))
+        baseline_margin_avg = number_or_none(baseline.get("margin_avg"))
+        if current_margin_avg is None:
+            failures.append("suite margin missing — measurement invalid")
+        elif baseline_margin_avg is None:
+            warnings.append("shipped baseline margin malformed; skipping suite margin delta")
+        else:
+            margin_delta = current_margin_avg - baseline_margin_avg
+            if margin_delta < -3:
+                warnings.append(f"suite margin dropped {margin_delta:+.1f} vs shipped (soft gate: > −3)")
     # Soft gate: any fixture that was > +5 before is now ≤ 0
     if baseline:
-        prev_rows = {r["fixture"]: r for r in baseline.get("rows", [])}
-        for r in summary["rows"]:
+        prev_rows = {
+            r["fixture"]: r for r in baseline.get("rows", [])
+            if isinstance(r, dict) and isinstance(r.get("fixture"), str)
+        }
+        for r in rows:
             fid = r.get("fixture")
             prev = prev_rows.get(fid)
-            if prev and (prev.get("margin") or 0) > 5 and (r.get("margin") or 0) <= 0:
-                warnings.append(
-                    f"{fid} lost its margin: was {prev['margin']:+d}, now {r['margin']:+d}"
-                )
+            prev_margin = number_or_none(prev.get("margin")) if prev else None
+            current_margin = number_or_none(r.get("margin"))
+            if prev and prev_margin is not None and prev_margin > 5:
+                if current_margin is None:
+                    warnings.append(f"{fid} margin missing; was {prev_margin:+g}")
+                elif current_margin <= 0:
+                    warnings.append(
+                        f"{fid} lost its margin: was {prev_margin:+g}, now {current_margin:+g}"
+                    )
     verdict = "PASS" if not failures else "FAIL"
     print(f"\n═══ SHIP-GATE VERDICT: {verdict} ═══\n")

package/benchmark/auto-resolve/scripts/solo-ceiling-avoidance.py ADDED Viewed

@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""Validate a shadow fixture solo ceiling avoidance note."""
+from __future__ import annotations
+import argparse
+import pathlib
+import re
+import sys
+SECTION_RE = re.compile(r"(?ms)^##[ \t]+Solo ceiling avoidance\b[^\n]*\n(.*?)(?=^##[ \t]+|\Z)")
+CONTROL_RE = re.compile(r"\bS[2-6]\b|S2-S6|solo-saturated|rejected controls?", re.IGNORECASE)
+REASON_RE = re.compile(r"\bdiffer(?:s|ent|ence)?\b|\bunlike\b|\bbecause\b|\bpreserve\b|\bheadroom\b", re.IGNORECASE)
+def read_text(path: pathlib.Path) -> str:
+    try:
+        return path.read_text(encoding="utf-8")
+    except UnicodeDecodeError as exc:
+        print(f"{path}: expected UTF-8 text ({exc})", file=sys.stderr)
+        raise SystemExit(2) from None
+    except OSError as exc:
+        print(f"{path}: unable to read ({exc})", file=sys.stderr)
+        raise SystemExit(2) from None
+def solo_ceiling_avoidance_error(text: str) -> str | None:
+    match = SECTION_RE.search(text)
+    if not match:
+        return "missing ## Solo ceiling avoidance section"
+    section = match.group(1)
+    if "solo_claude" not in section:
+        return "solo ceiling avoidance must mention solo_claude"
+    if not CONTROL_RE.search(section):
+        return "solo ceiling avoidance must compare against rejected or solo-saturated controls such as S2-S6"
+    if not REASON_RE.search(section):
+        return "solo ceiling avoidance must state difference/headroom reasoning"
+    return None
+def main(argv: list[str]) -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("path", type=pathlib.Path)
+    args = parser.parse_args(argv)
+    err = solo_ceiling_avoidance_error(read_text(args.path))
+    if err:
+        print(f"{args.path}: {err}", file=sys.stderr)
+        return 1
+    return 0
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))

package/benchmark/auto-resolve/scripts/solo-headroom-hypothesis.py ADDED Viewed

@@ -0,0 +1,77 @@
+#!/usr/bin/env python3
+"""Validate that a pair-candidate fixture states an actionable solo-headroom hypothesis."""
+from __future__ import annotations
+import argparse
+import json
+import pathlib
+import sys
+from pair_evidence_contract import (
+    actionable_observable_commands,
+    has_actionable_solo_headroom_hypothesis_text,
+)
+def combined_text(paths: list[pathlib.Path]) -> str:
+    chunks: list[str] = []
+    for path in paths:
+        if not path.is_file():
+            continue
+        try:
+            chunks.append(path.read_text(encoding="utf-8"))
+        except UnicodeDecodeError as exc:
+            print(f"{path}: expected UTF-8 text ({exc})", file=sys.stderr)
+            raise SystemExit(2) from None
+    return "\n".join(chunks)
+def has_actionable_hypothesis(text: str) -> bool:
+    return has_actionable_solo_headroom_hypothesis_text(text)
+def expected_commands(path: pathlib.Path) -> set[str]:
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except UnicodeDecodeError as exc:
+        print(f"{path}: expected UTF-8 JSON ({exc})", file=sys.stderr)
+        raise SystemExit(2) from None
+    except json.JSONDecodeError as exc:
+        print(f"{path}: invalid JSON ({exc})", file=sys.stderr)
+        raise SystemExit(2) from None
+    commands = data.get("verification_commands")
+    if not isinstance(commands, list):
+        print(f"{path}: verification_commands must be a list", file=sys.stderr)
+        raise SystemExit(2)
+    result: set[str] = set()
+    for index, command in enumerate(commands):
+        if not isinstance(command, dict) or not isinstance(command.get("cmd"), str):
+            print(f"{path}: verification_commands[{index}].cmd must be a string", file=sys.stderr)
+            raise SystemExit(2)
+        result.add(command["cmd"])
+    return result
+def main(argv: list[str]) -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--expected-json",
+        type=pathlib.Path,
+        help="Require the observable hypothesis command to match expected.json verification_commands[].cmd.",
+    )
+    parser.add_argument("paths", nargs="+", type=pathlib.Path)
+    args = parser.parse_args(argv)
+    text = combined_text(args.paths)
+    if not has_actionable_hypothesis(text):
+        return 1
+    if args.expected_json is None:
+        return 0
+    expected = expected_commands(args.expected_json)
+    return 0 if any(command in expected for command in actionable_observable_commands(text)) else 1
+if __name__ == "__main__":
+    raise SystemExit(main(sys.argv[1:]))