npm - devlyn-cli - Versions diffs - 2.2.2 → 2.3.1 - Mend

devlyn-cli 2.2.2 → 2.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

package/benchmark/auto-resolve/scripts/recent-benchmark-summary.py ADDED Viewed

@@ -0,0 +1,232 @@
+#!/usr/bin/env python3
+"""Print a compact, wrap-safe benchmark snapshot from local artifacts."""
+from __future__ import annotations
+import argparse
+import importlib.util
+import json
+import pathlib
+import sys
+import textwrap
+from typing import Any
+SCRIPT_DIR = pathlib.Path(__file__).resolve().parent
+FRONTIER_PATH = SCRIPT_DIR / "pair-candidate-frontier.py"
+def load_frontier_module() -> Any:
+    """Load the sibling ``pair-candidate-frontier.py`` script as a module.
+
+    The script's file name contains hyphens, so it cannot be imported with a
+    normal ``import`` statement; it is loaded from ``FRONTIER_PATH`` instead.
+
+    Returns:
+        The executed module object.
+
+    Raises:
+        RuntimeError: If no import spec or loader can be created for
+            ``FRONTIER_PATH``.
+    """
+    module_name = "pair_candidate_frontier"
+    spec = importlib.util.spec_from_file_location(module_name, FRONTIER_PATH)
+    if spec is None or spec.loader is None:
+        raise RuntimeError(f"cannot load frontier module: {FRONTIER_PATH}")
+    module = importlib.util.module_from_spec(spec)
+    # Register the module before executing it, as the importlib recipe for
+    # importing a source file directly recommends. Code that looks itself up
+    # through sys.modules[__name__] while the module body runs (dataclasses
+    # and pickling do this) fails when the entry is missing.
+    sys.modules[module_name] = module
+    try:
+        spec.loader.exec_module(module)
+    except BaseException:
+        # Do not leave a half-initialised module behind on failure.
+        sys.modules.pop(module_name, None)
+        raise
+    return module
+FRONTIER = load_frontier_module()
+def best_rows(report: dict[str, Any]) -> list[dict[str, Any]]:
+    """Collect the best passing pair-evidence entry for each passing fixture.
+
+    Only rows whose ``status`` is ``"pair_evidence_passed"`` are considered.
+    For each of them ``FRONTIER.best_pair_evidence`` picks one entry from
+    ``passing_pair_evidence``; rows for which it returns ``None`` are dropped.
+
+    Returns:
+        One dict per kept row: the row's ``fixture`` merged with the keys of
+        the selected evidence entry.
+    """
+    selected: list[dict[str, Any]] = []
+    for entry in report.get("rows", []):
+        if entry.get("status") == "pair_evidence_passed":
+            top = FRONTIER.best_pair_evidence(entry.get("passing_pair_evidence", []))
+            if top is not None:
+                selected.append({"fixture": entry["fixture"], **top})
+    return selected
+def display_fixture(fixture: str) -> str:
+    """Turn a hyphenated fixture id into a space-separated display label.
+
+    The text before the first hyphen is kept as-is and every hyphen after it
+    becomes a space. Ids with nothing after the first hyphen (or no hyphen at
+    all) are returned unchanged.
+    """
+    head, _sep, tail = fixture.partition("-")
+    if not tail:
+        return fixture
+    return head + " " + tail.replace("-", " ")
+def fmt_margin(value: Any) -> str:
+    """Format an integer margin with an explicit sign, or ``n/a``.
+
+    Booleans are rejected even though ``bool`` is a subclass of ``int``.
+    """
+    if isinstance(value, bool) or not isinstance(value, int):
+        return "n/a"
+    return format(value, "+d")
+def fmt_decimal_margin(value: Any) -> str:
+    """Format a numeric margin as a signed two-decimal value, or ``n/a``.
+
+    Accepts ints and floats; booleans and every other type yield ``n/a``.
+    """
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        return "n/a"
+    return format(value, "+.2f")
+def fmt_wall(value: Any) -> str:
+    """Format a wall-time ratio as ``<two decimals>x``, or ``n/a``.
+
+    Accepts ints and floats; booleans and every other type yield ``n/a``.
+    """
+    if isinstance(value, bool) or not isinstance(value, (int, float)):
+        return "n/a"
+    return format(value, ".2f") + "x"
+def fmt_score(value: Any) -> str:
+    """Return an integer score as text, or ``n/a`` for anything else.
+
+    Booleans and floats are not treated as scores.
+    """
+    if isinstance(value, bool) or not isinstance(value, int):
+        return "n/a"
+    return str(value)
+def wrap_item(prefix: str, text: str, *, width: int) -> list[str]:
+    """Wrap ``text`` behind ``prefix`` with a hanging indent.
+
+    The first line starts with ``prefix``; continuation lines are indented by
+    the same number of spaces. Long words and hyphenated words are never
+    split, so a single line may exceed ``width``. When ``text`` wraps to
+    nothing, the right-stripped prefix is returned as the only line.
+    """
+    wrapper = textwrap.TextWrapper(
+        width=width,
+        initial_indent=prefix,
+        subsequent_indent=" " * len(prefix),
+        break_long_words=False,
+        break_on_hyphens=False,
+    )
+    wrapped = wrapper.wrap(text)
+    if wrapped:
+        return wrapped
+    return [prefix.rstrip()]
+def render_text(report: dict[str, Any], *, width: int) -> str:
+    """Render the benchmark report as an indented plain-text snapshot.
+
+    The output has a title, a "Status" section, a "Pair Lift" section and a
+    "Pair Evidence" section with one entry per row returned by ``best_rows``.
+    Summary values that are missing from ``report`` are shown as ``n/a``.
+
+    Args:
+        report: Report mapping; summary values are read with ``.get``.
+        width: Wrap width applied to the ``run:`` and ``triggers:`` lines
+            only; all other lines are emitted unwrapped.
+
+    Returns:
+        The rendered text, terminated by a newline.
+    """
+    rows = best_rows(report)
+    lines = [
+        "Recent Benchmark Snapshot",
+        "=========================",
+        "",
+        "Status",
+        f"  Verdict: {report.get('verdict', 'n/a')}",
+        f"  Active fixtures: {report.get('fixtures_total', 'n/a')}",
+        f"  Rejected controls: {report.get('rejected_total', 'n/a')}",
+        f"  Pair evidence rows: {report.get('pair_evidence_total', 'n/a')}",
+        f"  Unmeasured candidates: {report.get('unmeasured_candidate_total', 'n/a')}",
+        "",
+        "Pair Lift",
+        f"  Average margin: {fmt_decimal_margin(report.get('pair_margin_avg'))}",
+        f"  Minimum margin: {fmt_margin(report.get('pair_margin_min'))}",
+        f"  Average wall ratio: {fmt_wall(report.get('pair_solo_wall_ratio_avg'))}",
+        f"  Maximum wall ratio: {fmt_wall(report.get('pair_solo_wall_ratio_max'))}",
+        f"  Gate: margin >= {fmt_margin(report.get('min_pair_margin'))}; wall <= {fmt_wall(report.get('max_pair_solo_wall_ratio'))}",
+        "",
+        "Pair Evidence",
+    ]
+    # Without any passing evidence, emit a placeholder instead of per-fixture entries.
+    if not rows:
+        lines.append("  No passing pair evidence rows found.")
+        return "\n".join(lines) + "\n"
+    for item in rows:
+        lines.append(f"  {display_fixture(item['fixture'])}")
+        lines.append(
+            "    scores: bare {bare} | solo_claude {solo} | pair {pair}".format(
+                bare=fmt_score(item.get("bare_score")),
+                solo=fmt_score(item.get("solo_score")),
+                pair=fmt_score(item.get("pair_score")),
+            )
+        )
+        lines.append(
+            "    lift: {margin} | wall {wall} | arm {arm}".format(
+                margin=fmt_margin(item.get("pair_margin")),
+                wall=fmt_wall(item.get("pair_solo_wall_ratio")),
+                arm=item.get("pair_arm") or "n/a",
+            )
+        )
+        # Run ids and trigger lists are free-form, so they are the only lines wrapped.
+        lines.extend(wrap_item("    run: ", str(item.get("run_id") or "n/a"), width=width))
+        triggers = ", ".join(item.get("pair_trigger_reasons") or [])
+        lines.extend(wrap_item("    triggers: ", triggers or "n/a", width=width))
+    return "\n".join(lines) + "\n"
+def render_markdown(report: dict[str, Any], *, width: int) -> str:
+    """Render the benchmark report as a Markdown document.
+
+    Produces a top-level heading, "Status" and "Pair Lift" bullet lists, and a
+    "Pair Evidence" section with one ``###`` subsection per row returned by
+    ``best_rows``. Summary values missing from ``report`` are shown as ``n/a``.
+
+    Args:
+        report: Report mapping; summary values are read with ``.get``.
+        width: Wrap width applied to the ``Triggers`` bullet only; all other
+            lines are emitted unwrapped.
+
+    Returns:
+        The rendered Markdown, terminated by a newline.
+    """
+    rows = best_rows(report)
+    lines = [
+        "# Recent Benchmark Snapshot",
+        "",
+        "## Status",
+        "",
+        f"- Verdict: **{report.get('verdict', 'n/a')}**",
+        f"- Active fixtures: {report.get('fixtures_total', 'n/a')}",
+        f"- Rejected controls: {report.get('rejected_total', 'n/a')}",
+        f"- Pair evidence rows: {report.get('pair_evidence_total', 'n/a')}",
+        f"- Unmeasured candidates: {report.get('unmeasured_candidate_total', 'n/a')}",
+        "",
+        "## Pair Lift",
+        "",
+        f"- Average margin: **{fmt_decimal_margin(report.get('pair_margin_avg'))}**",
+        f"- Minimum margin: **{fmt_margin(report.get('pair_margin_min'))}**",
+        f"- Average wall ratio: {fmt_wall(report.get('pair_solo_wall_ratio_avg'))}",
+        f"- Maximum wall ratio: {fmt_wall(report.get('pair_solo_wall_ratio_max'))}",
+        f"- Gate: margin >= {fmt_margin(report.get('min_pair_margin'))}; wall <= {fmt_wall(report.get('max_pair_solo_wall_ratio'))}",
+        "",
+        "## Pair Evidence",
+        "",
+    ]
+    # Without any passing evidence, emit a placeholder instead of per-fixture sections.
+    if not rows:
+        lines.append("No passing pair evidence rows found.")
+        return "\n".join(lines) + "\n"
+    for item in rows:
+        lines.extend(
+            [
+                f"### {display_fixture(item['fixture'])}",
+                "",
+                f"- Scores: bare {fmt_score(item.get('bare_score'))}, solo_claude {fmt_score(item.get('solo_score'))}, pair {fmt_score(item.get('pair_score'))}.",
+                f"- Lift: {fmt_margin(item.get('pair_margin'))}; wall {fmt_wall(item.get('pair_solo_wall_ratio'))}; arm `{item.get('pair_arm') or 'n/a'}`.",
+                f"- Run: `{item.get('run_id') or 'n/a'}`.",
+            ]
+        )
+        triggers = ", ".join(item.get("pair_trigger_reasons") or [])
+        wrapped = wrap_item("- Triggers: ", triggers or "n/a", width=width)
+        lines.extend(wrapped)
+        # Blank line separates this fixture's section from the next one.
+        lines.append("")
+    # Strip the trailing separator so the document ends with exactly one newline.
+    return "\n".join(lines).rstrip() + "\n"
+def main() -> int:
+    """Parse CLI options, build the frontier report and print a text snapshot.
+
+    The report comes from ``FRONTIER.build_report``. When ``--out-json`` or
+    ``--out-md`` is given, the raw report or its Markdown rendering is also
+    written to that path, creating parent directories as needed. The plain-text
+    rendering is always printed to stdout.
+
+    Returns:
+        ``0`` on success; ``2`` when ``--max-width`` is below 60 or when
+        ``build_report`` raises ``ValueError``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--fixtures-root",
+        type=pathlib.Path,
+        default=pathlib.Path("benchmark/auto-resolve/fixtures"),
+    )
+    parser.add_argument(
+        "--registry",
+        type=pathlib.Path,
+        default=SCRIPT_DIR / "pair-rejected-fixtures.sh",
+    )
+    parser.add_argument(
+        "--results-root",
+        type=pathlib.Path,
+        default=pathlib.Path("benchmark/auto-resolve/results"),
+    )
+    parser.add_argument("--out-json", type=pathlib.Path)
+    parser.add_argument("--out-md", type=pathlib.Path)
+    parser.add_argument(
+        "--max-width",
+        type=int,
+        default=92,
+        help="target maximum line width for text and markdown output",
+    )
+    parser.add_argument(
+        "--min-pair-margin",
+        type=int,
+        default=5,
+        help="minimum pair-over-solo margin required to count passing pair evidence",
+    )
+    parser.add_argument(
+        "--max-pair-solo-wall-ratio",
+        type=float,
+        default=3.0,
+        help="maximum pair/solo wall-time ratio allowed to count passing pair evidence",
+    )
+    args = parser.parse_args()
+    # Reject widths below 60 before doing any work.
+    if args.max_width < 60:
+        print("error: --max-width must be >= 60", file=sys.stderr)
+        return 2
+    try:
+        report = FRONTIER.build_report(
+            fixtures_root=args.fixtures_root,
+            registry=args.registry,
+            results_root=args.results_root,
+            min_pair_margin=args.min_pair_margin,
+            max_pair_solo_wall_ratio=args.max_pair_solo_wall_ratio,
+        )
+    except ValueError as exc:
+        # Report the failure as a one-line error instead of a traceback.
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+    if args.out_json:
+        args.out_json.parent.mkdir(parents=True, exist_ok=True)
+        args.out_json.write_text(json.dumps(report, indent=2) + "\n", encoding="utf8")
+    if args.out_md:
+        args.out_md.parent.mkdir(parents=True, exist_ok=True)
+        args.out_md.write_text(render_markdown(report, width=args.max_width), encoding="utf8")
+    # render_text already ends with a newline, so suppress print's own.
+    print(render_text(report, width=args.max_width), end="")
+    return 0
+# Script entry point: exit with the status code returned by main().
+if __name__ == "__main__":
+    sys.exit(main())

package/benchmark/auto-resolve/scripts/run-fixture.sh CHANGED Viewed

@@ -5,8 +5,8 @@
 # subprocess (isolated session), then captures artifacts + runs verification.
 #
 # Usage:
-#   run-fixture.sh --fixture <FID> --arm <variant|bare> --run-id <ID>
-#   run-fixture.sh --fixture <FID> --arm <variant|bare> --run-id <ID> --dry-run
+#   run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID>
+#   run-fixture.sh --fixture <FID> --arm <variant|solo_claude|bare|l2_gated|l2_risk_probes|l2_forced> --run-id <ID> --dry-run
 #
 # Outputs to benchmark/auto-resolve/results/<run-id>/<fixture>/<arm>/:
 #   input.md, transcript.txt, diff.patch, changed-files.txt, verify.json,
@@ -19,6 +19,15 @@ usage() {
   exit 1
 }
+# Abort with exit status 1 when the option named by $1 has no usable value.
+# $2 is the candidate value; it is rejected when it is empty or starts with
+# "--", i.e. when it looks like the next option rather than a value.
+require_value() {
+  local flag="$1"
+  local value="${2:-}"
+  if [ -z "$value" ] || [[ "$value" == --* ]]; then
+    echo "$flag requires a value" >&2
+    exit 1
+  fi
+}
 kill_worktree_processes() {
   local work_dir="$1"
   local signal="$2"
@@ -40,16 +49,16 @@ FIXTURE=""; ARM=""; RUN_ID=""; DRY_RUN=0
 RESOLVE_SKILL="new"
 while [ $# -gt 0 ]; do
   case "$1" in
-    --fixture)        FIXTURE="$2"; shift 2;;
-    --arm)            ARM="$2";     shift 2;;
-    --run-id)         RUN_ID="$2";  shift 2;;
-    --resolve-skill)  RESOLVE_SKILL="$2"; shift 2;;
+    --fixture)        require_value "$1" "${2:-}"; FIXTURE="$2"; shift 2;;
+    --arm)            require_value "$1" "${2:-}"; ARM="$2";     shift 2;;
+    --run-id)         require_value "$1" "${2:-}"; RUN_ID="$2";  shift 2;;
+    --resolve-skill)  require_value "$1" "${2:-}"; RESOLVE_SKILL="$2"; shift 2;;
     --dry-run)        DRY_RUN=1;    shift;;
     *) usage;;
   esac
 done
 [ -n "$FIXTURE" ] && [ -n "$ARM" ] && [ -n "$RUN_ID" ] || usage
-# iter-0019: original 3 arms — variant (L2-old: Claude orchestrator + Codex BUILD pair via --engine auto),
+# iter-0019/0037: 3 smoke arms — variant (L2: Claude orchestrator + risk-probes pair path),
 # solo_claude (L1: Claude orchestrator, codex blocked by shim+wrapper enforcement),
 # bare (L0: direct claude -p, no skill, no codex).
 # iter-0033c (Codex R0-infra adoption, 2026-05-02): two L2 diagnostic arms for /devlyn:resolve —
@@ -99,8 +108,21 @@ for f in "$META" "$EXPECTED" "$SPEC" "$TASK"; do
   [ -f "$f" ] || { echo "fixture missing required file: $f (see SCHEMA.md)"; exit 1; }
 done
-TIMEOUT=$(python3 -c "import json; print(json.load(open('$META'))['timeout_seconds'])")
-if [ "$ARM" = "l2_risk_probes" ]; then
+TIMEOUT=$(python3 - "$META" "$BENCH_ROOT/scripts" <<'PY'
+import pathlib
+import sys
+sys.path.insert(0, sys.argv[2])
+from pair_evidence_contract import loads_strict_json_object
+metadata = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
+timeout = metadata.get("timeout_seconds")
+if not isinstance(timeout, int) or isinstance(timeout, bool) or timeout <= 0:
+    raise SystemExit("metadata timeout_seconds must be a positive integer")
+print(timeout)
+PY
+)
+if [ "$ARM" = "variant" ] || [ "$ARM" = "l2_risk_probes" ]; then
   # This arm adds a bounded Codex probe-derive phase before IMPLEMENT and a
   # bounded Codex pair-JUDGE during VERIFY. The full-pipeline gate still
   # enforces wall-time efficiency by pair/solo ratio; this budget prevents a
@@ -119,19 +141,18 @@ WORK_DIR="/tmp/bench-${RUN_ID}-${FIXTURE}-${ARM}"
 rm -rf "$WORK_DIR"
 cp -R "$BENCH_ROOT/fixtures/test-repo" "$WORK_DIR"
-# All skill-driven arms (variant / solo_claude / l2_gated / l2_forced) get
-# devlyn skills + project CLAUDE.md pre-baseline + codex shim + monitored
-# wrapper. Bare gets nothing (no skill, no shim, no env).
+# All skill-driven arms (variant / solo_claude / l2_gated / l2_risk_probes /
+# l2_forced) get devlyn skills + project CLAUDE.md pre-baseline + codex shim
+# + monitored wrapper. Bare gets nothing (no skill, no shim, no env).
 #
 # iter-0019: solo_claude (L1) shares variant-arm staging because the L1 arm
 # runs the same orchestrator on the same skills — only difference is codex
 # is blocked. Shim catches PATH resolution; wrapper catches direct-path
 # invocations.
-# iter-0033c (Codex R0-infra Q6): l2_gated/l2_forced share variant staging
-# (codex unblocked, shim+wrapper routing). Difference vs variant is the
-# ENGINE_CLAUSE branch below — l2_* run --engine claude (Claude IMPLEMENT)
-# while variant uses --engine auto (Codex IMPLEMENT). Pair-mode in
-# /devlyn:resolve VERIFY phase pulls Codex via the OTHER-engine rule.
+# iter-0033c/0037 (Codex R0-infra Q6 + risk probes): pair arms share variant
+# staging (codex unblocked, shim+wrapper routing). The smoke `variant` arm now
+# follows the current measured risk-probes path rather than an older
+# auto-engine implement route.
 if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
    || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
   mkdir -p "$WORK_DIR/.claude"
@@ -183,7 +204,7 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
     # the bypass weapon. Across iter-0019 paid 5-fixture run the bypass
     # was OPEN but never exercised; this closes the surface preemptively
     # before iter-0020's 9-fixture L0/L1/L2 run.
-    # iter-0033c (Codex R0-infra Q5): l2_gated/l2_forced are codex-UNBLOCKED
+    # iter-0033c/0037 (Codex R0-infra Q5 + risk probes): l2_* arms are codex-UNBLOCKED
     # (codex must be reachable for VERIFY pair-JUDGE). They take the variant
     # path: ARM_CODEX_BLOCKED=0 → python writer omits CODEX_BLOCKED from env
     # entirely (the shim refuses on any non-empty value, so 0 ≠ unset).
@@ -209,11 +230,12 @@ if codex_blocked == "1":
     # CODEX_BLOCKED enforcement gap.
     env["CODEX_BLOCKED"] = "1"
 else:
-    # variant arm (L2) — codex routes through wrapper as part of pair-mode
-    # BUILD; both vars are required by the shim/wrapper handshake.
+    # variant / pair arms — codex routes through wrapper for risk-probe
+    # derivation and VERIFY pair-JUDGE; both vars are required by the
+    # shim/wrapper handshake.
     env["CODEX_REAL_BIN"] = real_bin
     env["CODEX_MONITORED_PATH"] = monitored
-    if arm == "l2_risk_probes":
+    if arm in ("variant", "l2_risk_probes"):
         # Risk-probe derivation is a bounded contract-conversion step. A long
         # Codex run is a harness failure, not useful extra quality signal.
         env["CODEX_MONITORED_TIMEOUT_SEC"] = "300"
@@ -273,9 +295,12 @@ fi
 # files. Those commands still run in the post-run verifier below.
 if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
    || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
-  python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" <<'PY'
-import json, os, sys
-expected = json.load(open(sys.argv[1]))
+  python3 - "$EXPECTED" "$WORK_DIR/.devlyn/spec-verify.json" "$BENCH_ROOT/scripts" <<'PY'
+import json, os, pathlib, sys
+sys.path.insert(0, sys.argv[3])
+from pair_evidence_contract import loads_strict_json_object
+expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
 out_path = sys.argv[2]
 visible_commands = [
     cmd for cmd in expected.get("verification_commands", [])
@@ -301,11 +326,11 @@ fi
 #   2. Spec-mode `/devlyn:resolve --spec <path>` for the rest (post iter-0034
 #      Phase 4 cutover the OLD `/devlyn:auto-resolve` route was deleted).
 PROMPT_FILE="$RESULT_DIR/input.md"
-# Variant uses --engine auto (experimental dual-engine: codex BUILD + claude
-# critique pair); solo_claude uses --engine claude explicitly so the orchestrator
-# routes every phase to Claude and never tries to invoke codex. The CODEX_BLOCKED
-# shim enforces this at the binary layer if the orchestrator misroutes. Both
-# arms pass the engine flag explicitly so they survive future runtime-default
+# Variant uses the current measured risk-probes pair path; solo_claude uses
+# --engine claude explicitly so the orchestrator routes every implementation
+# phase to Claude and never tries to invoke codex. The CODEX_BLOCKED shim
+# enforces this at the binary layer if the orchestrator misroutes. Both arms
+# pass the engine flag explicitly so they survive future runtime-default
 # changes (post iter-0020 close-out: default flipped to claude).
 if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
    || [ "$ARM" = "l2_gated" ] || [ "$ARM" = "l2_risk_probes" ] || [ "$ARM" = "l2_forced" ]; then
@@ -315,8 +340,8 @@ if [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
       ENGINE_PROMPT_HINT="Run with \`--engine claude\` for every phase. Codex must not be invoked — the harness has blocked it at the binary layer for this run."
       ;;
     variant)
-      ENGINE_CLAUSE="--engine auto"
-      ENGINE_PROMPT_HINT="Run with \`--engine auto\` so the experimental dual-engine routing fires (Codex BUILD/FIX, Claude EVAL/CRITIC) — do not override it."
+      ENGINE_CLAUSE="--engine claude --risk-probes"
+      ENGINE_PROMPT_HINT="Run with \`--engine claude --risk-probes\` so the smoke L2 arm uses the current measured pair path: Claude implements, Codex derives bounded visible-verification probes and can act as VERIFY pair-JUDGE."
       ;;
     l2_gated)
       # NEW L2 with natural pair-mode triggers. Claude does IMPLEMENT;
@@ -484,7 +509,7 @@ else
     # iter-0009 + iter-0019: prepend codex shim PATH for any arm that staged
     # one. variant routes through codex-monitored.sh; solo_claude refuses on
     # CODEX_BLOCKED=1; bare has no shim.
-    # iter-0033c (Codex R0-infra Q6): l2_gated/l2_forced ALSO need the shim
+    # iter-0033c/0037 (Codex R0-infra Q6 + risk probes): l2_* arms ALSO need the shim
     # PATH — they route Claude IMPLEMENT but Codex pair-JUDGE in VERIFY hits
     # `codex exec` through the wrapper for starvation safety.
     if { [ "$ARM" = "variant" ] || [ "$ARM" = "solo_claude" ] \
@@ -652,10 +677,13 @@ fi
 # Run verification commands + forbidden pattern scan + deps check. Uses
 # the operator's real HOME (same as the arm saw). Fixtures that need HOME
 # isolation override it inline per verification command.
-python3 - "$EXPECTED" "$RESULT_DIR" "$WORK_DIR" <<'PY'
-import json, os, re, subprocess, sys
+python3 - "$EXPECTED" "$RESULT_DIR" "$WORK_DIR" "$BENCH_ROOT/scripts" <<'PY'
+import json, os, pathlib, re, subprocess, sys
+sys.path.insert(0, sys.argv[4])
+from pair_evidence_contract import loads_strict_json_object
-expected = json.load(open(sys.argv[1]))
+expected = loads_strict_json_object(pathlib.Path(sys.argv[1]).read_text())
 result_dir = sys.argv[2]
 work = sys.argv[3]
@@ -771,12 +799,39 @@ for oracle_file in (
     "oracle-scope-tier-b.json",
     "oracle-test-fidelity.json",
 ):
+    oracle_path = os.path.join(result_dir, oracle_file)
     try:
-        data = json.load(open(os.path.join(result_dir, oracle_file)))
-    except Exception:
+        raw_oracle = loads_strict_json_object(pathlib.Path(oracle_path).read_text())
+    except (OSError, ValueError) as exc:
+        oracle_name = oracle_file.removesuffix(".json")
+        verify["oracle_findings"].append({
+            "oracle": oracle_name,
+            "type": "oracle-error",
+            "severity": "hard",
+            "verdict": "Deterministic oracle failed or emitted an invalid artifact",
+            "error": f"oracle artifact malformed or unreadable: {exc}",
+        })
+        verify["oracle_disqualifier"] = True
         continue
+    data = raw_oracle
     oracle_name = data.get("oracle") or oracle_file.removesuffix(".json")
-    for finding in data.get("findings", []) or []:
+    if not isinstance(oracle_name, str) or not oracle_name:
+        oracle_name = oracle_file.removesuffix(".json")
+    oracle_error = data.get("error")
+    if isinstance(oracle_error, str) and oracle_error:
+        verify["oracle_findings"].append({
+            "oracle": oracle_name,
+            "type": "oracle-error",
+            "severity": "hard",
+            "verdict": "Deterministic oracle failed or emitted an invalid artifact",
+            "error": oracle_error,
+        })
+        verify["oracle_disqualifier"] = True
+    raw_findings = data.get("findings")
+    findings = raw_findings if isinstance(raw_findings, list) else []
+    for finding in findings:
+        if not isinstance(finding, dict):
+            continue
         item = dict(finding)
         item["oracle"] = oracle_name
         verify["oracle_findings"].append(item)
@@ -796,12 +851,15 @@ PY
 # Timing + aggregate
 export INVOKE_EXIT WATCHDOG_FIRED
-python3 - "$RESULT_DIR" "$FIXTURE" "$ARM" "$RUN_ID" "$T_END" "$ELAPSED" "$TIMEOUT" <<'PY'
-import json, os, sys
+python3 - "$RESULT_DIR" "$FIXTURE" "$ARM" "$RUN_ID" "$T_END" "$ELAPSED" "$TIMEOUT" "$BENCH_ROOT/scripts" <<'PY'
+import json, os, pathlib, sys
 result_dir, fixture, arm, run_id = sys.argv[1:5]
 t_end, elapsed, timeout = int(sys.argv[5]), int(sys.argv[6]), int(sys.argv[7])
-timing = json.load(open(os.path.join(result_dir, "timing.json")))
+sys.path.insert(0, sys.argv[8])
+from pair_evidence_contract import loads_strict_json_object
+timing = loads_strict_json_object(pathlib.Path(result_dir, "timing.json").read_text())
 timing["end_epoch"] = t_end
 timing["elapsed_seconds"] = elapsed
 timing["timeout_seconds"] = timeout
@@ -812,7 +870,10 @@ timing["timeout_seconds"] = timeout
 timing["timed_out"] = os.environ.get("WATCHDOG_FIRED", "0") == "1"
 json.dump(timing, open(os.path.join(result_dir, "timing.json"), "w"), indent=2)
-verify = json.load(open(os.path.join(result_dir, "verify.json")))
+def as_dict(value):
+    return value if isinstance(value, dict) else {}
+verify = as_dict(loads_strict_json_object(pathlib.Path(result_dir, "verify.json").read_text()))
 try:
     with open(os.path.join(result_dir, "diff.patch")) as f: diff_size = len(f.read())
 except Exception: diff_size = 0
@@ -825,15 +886,21 @@ except Exception:
 state = {}
 state_path = os.path.join(result_dir, "run-archive", "pipeline.state.json")
 if os.path.isfile(state_path):
-    with open(state_path) as f:
-        state = json.load(f)
-verify_phase = (state.get("phases") or {}).get("verify") or {}
+    state = as_dict(loads_strict_json_object(pathlib.Path(state_path).read_text()))
+phases = as_dict(state.get("phases"))
+verify_phase = as_dict(phases.get("verify"))
+legacy_verify = as_dict(state.get("verify"))
 sub_verdicts = verify_phase.get("sub_verdicts")
-pair_trigger = verify_phase.get("pair_trigger") or ((state.get("verify") or {}).get("pair_trigger"))
-pair_mode = bool(
-    isinstance(sub_verdicts, dict)
-    and (sub_verdicts.get("judge_codex") is not None or sub_verdicts.get("pair_judge") is not None)
-) or bool(verify_phase.get("pair_mode"))
+pair_trigger = verify_phase.get("pair_trigger") or legacy_verify.get("pair_trigger")
+PAIR_VERDICTS = {"PASS", "PASS_WITH_ISSUES", "NEEDS_WORK", "BLOCKED", "FAIL"}
+def has_pair_judge_verdict(sub_verdicts):
+    return isinstance(sub_verdicts, dict) and (
+        sub_verdicts.get("judge_codex") in PAIR_VERDICTS
+        or sub_verdicts.get("pair_judge") in PAIR_VERDICTS
+    )
+pair_mode = has_pair_judge_verdict(sub_verdicts) or verify_phase.get("pair_mode") is True
 invoke_exit = int(os.environ.get("INVOKE_EXIT", "0"))
 plugin_contamination = False
@@ -893,7 +960,7 @@ result = {
     "invoke_exit": invoke_exit,
     "invoke_failure": invoke_failure,
     "invoke_failure_reason": invoke_failure_reason,
-    "terminal_verdict": ((state.get("phases") or {}).get("final_report") or {}).get("verdict"),
+    "terminal_verdict": as_dict(phases.get("final_report")).get("verdict"),
     "verify_verdict": verify_phase.get("verdict"),
     "pair_trigger": pair_trigger,
     "pair_mode": pair_mode,