npm - devlyn-cli - Versions diffs - 2.2.2 → 2.3.1 - Mend

devlyn-cli 2.2.2 → 2.3.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (220) hide show

package/benchmark/auto-resolve/scripts/build-pair-eligible-manifest.py CHANGED Viewed

@@ -14,7 +14,7 @@ ensure_ascii=False, allow_nan=False`, then sha256 the bytes.
 Inputs (all required):
   --c1-summary <path>        iter-0033 (C1) summary.json (selection grounds; never a comparison baseline)
   --f9-judge <path>          iter-0033a F9 judge.json (F9 inclusion proof)
-  --l1-rerun-summary <path>  L1 rerun summary at iter-0033c HEAD (fresh baseline)
+  --l1-rerun-summary <path>  L1 rerun summary archived for provenance, not selection
   --output <path>            destination .devlyn/manifests/iter-0033c-pair-eligible.json
 Selection rule (frozen pre-registration, iter-0033c §"Pair-eligible fixture set"):
@@ -25,18 +25,38 @@ Selection rule (frozen pre-registration, iter-0033c §"Pair-eligible fixture set
   pair_eligible = high_value ∪ promoted_by_l1_le_l0 ∪ {F9 if iter-0033a passed}
                   − reporting_only
                   − conditional_excluded that did not get promoted
+                  − current rejected/ceiling registry
 """
 import argparse
 import copy
 import hashlib
 import json
+import re
 import subprocess
 import sys
 from pathlib import Path
+SCRIPT_DIR = Path(__file__).resolve().parent
+if str(SCRIPT_DIR) not in sys.path:
+    sys.path.insert(0, str(SCRIPT_DIR))
+from pair_evidence_contract import is_score, reject_json_constant
 HIGH_VALUE = ["F2", "F3", "F4", "F6", "F7"]
 CONDITIONAL = ["F1", "F5"]
 REPORTING_ONLY = ["F8"]
+REJECTED_REGISTRY = Path(__file__).with_name("pair-rejected-fixtures.sh")
+def exact_bool(value: object) -> bool | None:
+    return value if isinstance(value, bool) else None
+def disqualifier_flag(value: object, *, default: bool = False) -> bool:
+    if value is None:
+        return default
+    parsed = exact_bool(value)
+    return parsed if parsed is not None else True
 def file_sha256(path: Path) -> str:
@@ -62,28 +82,111 @@ def fixture_short_id(full: str) -> str:
     return full.split("-", 1)[0] if "-" in full else full
+def load_rejected_fixture_reasons(path: Path) -> dict[str, str]:
+    if not path.is_file():
+        raise ValueError(f"rejected fixture registry not found: {path}")
+    rejected: dict[str, str] = {}
+    current: str | None = None
+    for line in path.read_text().splitlines():
+        match = re.match(r"\s*([FS]\d+)-\*\|([FS]\d+)\)", line)
+        if match and match.group(1) == match.group(2):
+            current = match.group(1)
+            continue
+        reason = re.match(r'\s*echo "([^"]+)"', line)
+        if current and reason:
+            rejected[current] = reason.group(1)
+            current = None
+    return dict(sorted(rejected.items(), key=lambda item: (item[0][0], int(item[0][1:]))))
+def load_rejected_short_ids(path: Path) -> list[str]:
+    return list(load_rejected_fixture_reasons(path))
+def load_json_object(path: Path, label: str) -> dict:
+    try:
+        data = json.loads(path.read_text(), parse_constant=reject_json_constant)
+    except (ValueError, json.JSONDecodeError) as exc:
+        raise ValueError(f"{label} malformed: invalid JSON") from exc
+    if not isinstance(data, dict):
+        raise ValueError(f"{label} malformed: expected object")
+    return data
 def compute_promoted_l1_le_l0(c1_rows: list) -> list:
     """Return short fixture IDs (e.g. 'F3') where solo_claude.score ≤ bare.score in C1."""
     promoted = []
     for row in c1_rows:
-        arms = row.get("arms", {})
-        solo = arms.get("solo_claude", {}).get("score")
-        bare = arms.get("bare", {}).get("score")
-        if solo is None or bare is None:
+        if not isinstance(row, dict):
+            continue
+        raw_arms = row.get("arms")
+        arms = raw_arms if isinstance(raw_arms, dict) else {}
+        raw_solo = arms.get("solo_claude")
+        raw_bare = arms.get("bare")
+        solo_arm = raw_solo if isinstance(raw_solo, dict) else {}
+        bare_arm = raw_bare if isinstance(raw_bare, dict) else {}
+        if (
+            disqualifier_flag(solo_arm.get("disqualifier"))
+            or disqualifier_flag(bare_arm.get("disqualifier"))
+        ):
+            continue
+        solo = solo_arm.get("score")
+        bare = bare_arm.get("score")
+        if not is_score(solo) or not is_score(bare):
             continue
         if solo <= bare:
-            promoted.append(fixture_short_id(row["fixture"]))
+            fixture = row.get("fixture")
+            if isinstance(fixture, str):
+                promoted.append(fixture_short_id(fixture))
     return promoted
+def mapped_score(judge: dict, arm: str) -> int | None:
+    mapping = judge.get("_blind_mapping")
+    if not isinstance(mapping, dict):
+        return None
+    letter = next((slot for slot, mapped in mapping.items() if mapped == arm), None)
+    if letter is None:
+        return None
+    raw_scores = judge.get("scores_by_arm")
+    scores = raw_scores if isinstance(raw_scores, dict) else {}
+    score = scores.get(arm)
+    if is_score(score):
+        return score
+    legacy = judge.get(f"{letter.lower()}_score")
+    return legacy if is_score(legacy) else None
+def mapped_disqualifier(judge: dict, arm: str) -> bool:
+    mapping = judge.get("_blind_mapping")
+    if not isinstance(mapping, dict):
+        return True
+    letter = next((slot for slot, mapped in mapping.items() if mapped == arm), None)
+    if letter is None:
+        return True
+    raw_by_arm = judge.get("disqualifiers_by_arm")
+    if raw_by_arm is not None and not isinstance(raw_by_arm, dict):
+        return True
+    by_arm = raw_by_arm if isinstance(raw_by_arm, dict) else {}
+    if arm in by_arm:
+        entry = by_arm.get(arm)
+        return disqualifier_flag(
+            entry.get("disqualifier") if isinstance(entry, dict) else entry
+        )
+    raw_legacy = judge.get("disqualifiers")
+    if raw_legacy is not None and not isinstance(raw_legacy, dict):
+        return True
+    legacy = raw_legacy if isinstance(raw_legacy, dict) else {}
+    return disqualifier_flag(legacy.get(letter))
 def f9_passed(f9_judge: dict) -> bool:
-    """iter-0033a passed iff A score > B score AND A is not disqualified."""
-    a = f9_judge.get("a_score")
-    b = f9_judge.get("b_score")
-    dqs = f9_judge.get("disqualifiers") or {}
-    if a is None or b is None:
+    """iter-0033a passed iff solo_claude beats bare and solo is not disqualified."""
+    solo = mapped_score(f9_judge, "solo_claude")
+    bare = mapped_score(f9_judge, "bare")
+    if solo is None or bare is None:
         return False
-    return a > b and not bool(dqs.get("A", False))
+    return solo > bare and not mapped_disqualifier(f9_judge, "solo_claude")
 def head_sha() -> str:
@@ -114,10 +217,21 @@ def main() -> int:
             print(f"error: {label} not found: {p}", file=sys.stderr)
             return 2
-    c1 = json.loads(c1_path.read_text())
-    f9 = json.loads(f9_path.read_text())
+    try:
+        c1 = load_json_object(c1_path, "c1-summary")
+        f9 = load_json_object(f9_path, "f9-judge")
+        rejected_reasons = load_rejected_fixture_reasons(REJECTED_REGISTRY)
+        rejected_short_ids = list(rejected_reasons)
+    except ValueError as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        return 2
+    c1_rows = c1.get("rows")
+    if not isinstance(c1_rows, list):
+        print("error: c1-summary malformed: rows must be an array", file=sys.stderr)
+        return 2
-    promoted = compute_promoted_l1_le_l0(c1.get("rows", []))
+    promoted = compute_promoted_l1_le_l0(c1_rows)
     f9_in = f9_passed(f9)
     pair_eligible = list(HIGH_VALUE)  # frozen high-value list, ordered
@@ -127,10 +241,23 @@ def main() -> int:
     if f9_in and "F9" not in pair_eligible:
         pair_eligible.append("F9")
     pair_eligible = [fx for fx in pair_eligible if fx not in REPORTING_ONLY]
+    rejected_excluded = sorted(
+        {fx for fx in pair_eligible if fx in rejected_short_ids},
+        key=lambda s: (s[0], int(s[1:])),
+    )
+    pair_eligible = [fx for fx in pair_eligible if fx not in rejected_short_ids]
     conditional_promoted = [fx for fx in CONDITIONAL if fx in promoted]
     conditional_excluded = [fx for fx in CONDITIONAL if fx not in promoted]
     pair_eligible_sorted = sorted(pair_eligible, key=lambda s: (s[0], int(s[1:])))
+    if not pair_eligible_sorted:
+        rejected_text = ", ".join(rejected_excluded) if rejected_excluded else "none"
+        print(
+            "error: no pair-eligible fixtures remain after rejected-registry filtering "
+            f"(rejected_excluded={rejected_text})",
+            file=sys.stderr,
+        )
+        return 1
     gate3_total = len(pair_eligible_sorted)
     gate3_threshold = (gate3_total + 1) // 2  # ≥50% — ceil(gate3_total / 2)
@@ -152,6 +279,11 @@ def main() -> int:
             "reporting_only": REPORTING_ONLY,
             "conditional_excluded": conditional_excluded,
             "conditional_promoted": conditional_promoted,
+            "rejected_registry": str(REJECTED_REGISTRY),
+            "rejected_excluded": rejected_excluded,
+            "rejected_excluded_reasons": {
+                fixture: rejected_reasons[fixture] for fixture in rejected_excluded
+            },
         },
         "fixtures_pair_eligible": pair_eligible_sorted,
         "gate3_threshold_count": gate3_threshold,

package/benchmark/auto-resolve/scripts/check-f9-artifacts.py CHANGED Viewed

@@ -1,9 +1,9 @@
 #!/usr/bin/env python3
-"""F9 variant/solo arm artifact + transcript fingerprint check.
+"""F9 skill-driven arm artifact + transcript fingerprint check.
 Out-of-band per Codex R0.5 §B (iter-0033a): expected.json.verification_commands
 apply to ALL arms (run-fixture.sh:472), so a `docs/specs/**` check there would
-punish bare. This script runs AFTER run-fixture.sh and asserts variant/solo
+punish bare. This script runs AFTER run-fixture.sh and asserts skill-driven
 arms produced the artifacts the 2-skill ideate→resolve chain should emit.
 Bare arm is exempt by construction.
@@ -13,7 +13,7 @@ Usage:
 Exits:
   0 — all checks pass (or bare arm — exempt).
-  1 — variant/solo arm but artifact contract violated.
+  1 — skill-driven arm but artifact contract violated.
   2 — invalid invocation (missing args, missing dir).
 Emits a small JSON report at <result-dir>/check-f9-artifacts.json.
@@ -25,8 +25,10 @@ import re
 import sys
 from pathlib import Path
+from pair_evidence_contract import loads_strict_json_object
-VARIANT_ARMS = {"variant", "solo_claude", "l2_gated", "l2_forced"}
+SKILL_DRIVEN_ARMS = {"variant", "solo_claude", "l2_gated", "l2_risk_probes", "l2_forced"}
 EXEMPT_ARMS = {"bare"}
 SPEC_DIR_GLOB = "docs/specs/*/spec.md"
@@ -39,6 +41,18 @@ RE_AUTO_RESOLVE = re.compile(r"/devlyn:auto-resolve\b")
 RE_PREFLIGHT = re.compile(r"/devlyn:preflight\b")
+def _load_json_object(path: Path) -> tuple[dict | None, str | None]:
+    try:
+        data = loads_strict_json_object(path.read_text())
+    except json.JSONDecodeError as exc:
+        return None, f"{exc.__class__.__name__}: {exc}"
+    except ValueError as exc:
+        if str(exc) == "top-level JSON value must be an object":
+            return None, "expected JSON object"
+        return None, f"{exc.__class__.__name__}: {exc}"
+    return data, None
 def main() -> int:
     p = argparse.ArgumentParser(description=__doc__.split("\n", 1)[0])
     p.add_argument("--result-dir", required=True,
@@ -71,8 +85,8 @@ def main() -> int:
         _write_report(result_dir, report)
         return 0
-    if arm not in VARIANT_ARMS:
-        print(f"error: unknown arm '{arm}' (expected one of {VARIANT_ARMS | EXEMPT_ARMS})",
+    if arm not in SKILL_DRIVEN_ARMS:
+        print(f"error: unknown arm '{arm}' (expected one of {SKILL_DRIVEN_ARMS | EXEMPT_ARMS})",
               file=sys.stderr)
         return 2
@@ -81,13 +95,13 @@ def main() -> int:
     timing_path = result_dir / "timing.json"
     work_dir: Path
     if timing_path.is_file():
-        try:
-            timing = json.loads(timing_path.read_text())
+        timing, _timing_error = _load_json_object(timing_path)
+        if timing is not None:
             work_dir = Path(timing.get("work_dir", ""))
-        except Exception:
-            work_dir = Path("")
+        else:
+            work_dir = Path("__invalid_timing_work_dir__")
     else:
-        work_dir = Path("")
+        work_dir = Path("__missing_timing_work_dir__")
     if not work_dir.is_dir():
         report["checks"].append({
@@ -163,16 +177,14 @@ def main() -> int:
     else:
         # Read the most recent run.
         state_path = sorted(state_paths)[-1]
-        try:
-            state = json.loads(state_path.read_text())
-        except Exception as exc:
+        state, state_error = _load_json_object(state_path)
+        if state is None:
             report["checks"].append({
                 "name": "pipeline.state.json-parses",
                 "pass": False,
-                "reason": f"{exc.__class__.__name__}: {exc}",
+                "reason": state_error,
             })
             report["pass"] = False
-            state = None
         if state is not None:
             archived = "/runs/" in str(state_path)

package/benchmark/auto-resolve/scripts/collect-swebench-predictions.py CHANGED Viewed

@@ -8,6 +8,8 @@ import json
 from pathlib import Path
 from typing import Any
+from pair_evidence_contract import reject_json_constant
 def read_jsonl(path: Path) -> list[dict[str, Any]]:
     rows: list[dict[str, Any]] = []
@@ -15,7 +17,7 @@ def read_jsonl(path: Path) -> list[dict[str, Any]]:
         for line_no, line in enumerate(f, start=1):
             if not line.strip():
                 continue
-            value = json.loads(line)
+            value = json.loads(line, parse_constant=reject_json_constant)
             if not isinstance(value, dict):
                 raise ValueError(f"{path}:{line_no}: expected JSON object")
             rows.append(value)
@@ -36,11 +38,17 @@ def instance_ids_from_jsonl(path: Path | None) -> set[str] | None:
 def collect_from_root(root: Path, patch_name: str, keep: set[str] | None) -> list[tuple[str, Path]]:
     patches: list[tuple[str, Path]] = []
+    seen: set[str] = set()
     for patch_path in sorted(root.glob(f"*/{patch_name}")):
         instance_id = patch_path.parent.name
         if keep is not None and instance_id not in keep:
             continue
+        seen.add(instance_id)
         patches.append((instance_id, patch_path))
+    if keep is not None:
+        missing = sorted(keep - seen)
+        if missing:
+            raise ValueError(f"missing {patch_name} for instance ids: {', '.join(missing)}")
     return patches
@@ -81,6 +89,8 @@ def main() -> int:
                 + "\n"
             )
             written += 1
+    if written == 0:
+        raise ValueError("no non-empty patches collected")
     report = {
         "patch_root": str(args.patch_root),