npm - devlyn-cli - Versions diffs - 2.1.0 → 2.2.1 - Mend

devlyn-cli 2.1.0 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (127) hide show

package/config/skills/_shared/verify-merge-findings.py ADDED Viewed

@@ -0,0 +1,327 @@
+#!/usr/bin/env python3
+"""Merge VERIFY findings and derive a deterministic verdict.
+VERIFY judges are model-written, but routing on finding severity must be
+mechanical. This script reads the known VERIFY JSONL finding files, writes a
+merged JSONL artifact, computes source-level and overall verdicts, and can
+write the merged verdict back to `.devlyn/pipeline.state.json`.
+"""
+from __future__ import annotations
+import argparse
+import json
+import pathlib
+import sys
+import tempfile
+from typing import Any
+# (source label, findings file name) pairs, in the order they are merged.
+# Two different file names feed the single "pair_judge" source.
+SOURCE_FILES = (
+    ("mechanical", "verify-mechanical.findings.jsonl"),
+    ("judge", "verify.findings.jsonl"),
+    ("pair_judge", "verify.pair.findings.jsonl"),
+    ("pair_judge", "verify.pair-judge.findings.jsonl"),
+)
+# Numeric severity of each verdict; "FAIL" and "NEEDS_WORK" share rank 2.
+VERDICT_RANK = {
+    "PASS": 0,
+    "PASS_WITH_ISSUES": 1,
+    "FAIL": 2,
+    "NEEDS_WORK": 2,
+    "BLOCKED": 3,
+}
+# Rank -> canonical verdict. Rank 2 always maps back to "NEEDS_WORK", so a
+# "FAIL" input is reported as "NEEDS_WORK" after going through worse().
+RANK_VERDICT = {0: "PASS", 1: "PASS_WITH_ISSUES", 2: "NEEDS_WORK", 3: "BLOCKED"}
+def rank(verdict: str | None) -> int:
+    """Return the numeric rank of *verdict*; None, empty, and unknown verdicts rank 0."""
+    return VERDICT_RANK.get(verdict or "PASS", 0)
+def worse(a: str | None, b: str | None) -> str:
+    """Return the canonical verdict for the higher-ranked of *a* and *b*.
+    The result is looked up in RANK_VERDICT, so it is always one of its values.
+    """
+    return RANK_VERDICT[max(rank(a), rank(b))]
+def finding_rank(finding: dict[str, Any]) -> int:
+    """Map a finding's severity to a verdict rank (0, 1, or 2).
+    CRITICAL/HIGH rank 2, as does MEDIUM when ``verdict_binding`` is exactly
+    True; other MEDIUM and LOW rank 1; a missing or unrecognised severity ranks 0.
+    """
+    # Severity is compared case-insensitively; a missing/None value becomes "".
+    severity = str(finding.get("severity") or "").upper()
+    if severity in {"CRITICAL", "HIGH"}:
+        return 2
+    # Only the literal boolean True promotes a MEDIUM finding.
+    if severity == "MEDIUM" and finding.get("verdict_binding") is True:
+        return 2
+    if severity in {"LOW", "MEDIUM"}:
+        return 1
+    return 0
+def read_findings(devlyn: pathlib.Path) -> tuple[list[dict[str, Any]], dict[str, str]]:
+    """Read every known findings file in *devlyn* and derive per-source verdicts.
+    Returns ``(findings, source_verdicts)``. Missing files are skipped. A line
+    that is not valid JSON produces a synthetic CRITICAL finding and marks the
+    source BLOCKED. Pair-judge stdout contract violations are appended last.
+    """
+    findings: list[dict[str, Any]] = []
+    # Every source starts at PASS, including sources whose files are absent.
+    source_verdicts = {source: "PASS" for source, _ in SOURCE_FILES}
+    for source, name in SOURCE_FILES:
+        path = devlyn / name
+        if not path.is_file():
+            continue
+        with path.open(encoding="utf-8") as handle:
+            for line_no, line in enumerate(handle, 1):
+                raw = line.strip()
+                if not raw:
+                    continue
+                try:
+                    item = json.loads(raw)
+                except json.JSONDecodeError as exc:
+                    blocked = {
+                        "id": f"verify-merge-invalid-json-{name}-{line_no}",
+                        "rule_id": "verify.findings.invalid-json",
+                        "severity": "CRITICAL",
+                        "confidence": "high",
+                        "file": name,
+                        "line": line_no,
+                        "message": f"Invalid JSONL finding: {exc}",
+                        "criterion_ref": "verify-merge",
+                        "source": source,
+                    }
+                    findings.append(blocked)
+                    source_verdicts[source] = "BLOCKED"
+                    continue
+                # Valid JSON that is not an object is dropped without a finding.
+                if not isinstance(item, dict):
+                    continue
+                item = dict(item)
+                # Keep a "source" already present in the finding; otherwise tag it.
+                item.setdefault("source", source)
+                findings.append(item)
+                # worse() never lowers the verdict, so an earlier BLOCKED sticks.
+                source_verdicts[source] = worse(
+                    source_verdicts[source], RANK_VERDICT[finding_rank(item)]
+                )
+    # This call may also set source_verdicts["pair_judge"] to "BLOCKED" in place.
+    findings.extend(detect_pair_stdout_contract_violations(devlyn, source_verdicts))
+    return findings, source_verdicts
+def has_pair_findings(devlyn: pathlib.Path) -> bool:
+    """Return True when either pair findings file exists with non-whitespace content."""
+    for name in ("verify.pair.findings.jsonl", "verify.pair-judge.findings.jsonl"):
+        path = devlyn / name
+        if path.is_file() and path.read_text(encoding="utf-8").strip():
+            return True
+    return False
+def pair_trigger_required(devlyn: pathlib.Path) -> bool:
+    """Report whether ``pipeline.state.json`` records an eligible pair trigger.
+    Returns False when the state file is missing or is not valid JSON. Otherwise
+    returns True only if the ``pair_trigger`` object has ``eligible`` exactly
+    True and a truthy ``reasons`` value.
+    """
+    state_path = devlyn / "pipeline.state.json"
+    if not state_path.is_file():
+        return False
+    try:
+        state = json.loads(state_path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError:
+        return False
+    phases = state.get("phases") if isinstance(state, dict) else {}
+    verify_phase = phases.get("verify") if isinstance(phases, dict) else None
+    trigger = None
+    # Preferred location: state["phases"]["verify"]["pair_trigger"].
+    if isinstance(verify_phase, dict):
+        trigger = verify_phase.get("pair_trigger")
+    # Fallback location: state["verify"]["pair_trigger"].
+    if trigger is None and isinstance(state, dict):
+        verify_state = state.get("verify")
+        if isinstance(verify_state, dict):
+            trigger = verify_state.get("pair_trigger")
+    return bool(
+        isinstance(trigger, dict)
+        and trigger.get("eligible") is True
+        and trigger.get("reasons")
+    )
+def pair_blocker(id_: str, message: str, file_: str | None = None) -> dict[str, Any]:
+    """Build a CRITICAL ``verify.pair.emission-contract`` finding for the pair_judge source.
+    ``line`` is 1 when *file_* is truthy and None otherwise.
+    """
+    return {
+        "id": id_,
+        "rule_id": "verify.pair.emission-contract",
+        "severity": "CRITICAL",
+        "confidence": "high",
+        "file": file_,
+        "line": 1 if file_ else None,
+        "message": message,
+        "criterion_ref": "verify.pair.findings",
+        "source": "pair_judge",
+    }
+def detect_pair_stdout_contract_violations(
+    devlyn: pathlib.Path,
+    source_verdicts: dict[str, str],
+) -> list[dict[str, Any]]:
+    stdout_path = devlyn / "codex-judge.stdout"
+    if has_pair_findings(devlyn):
+        return []
+    if not stdout_path.is_file():
+        if pair_trigger_required(devlyn):
+            source_verdicts["pair_judge"] = "BLOCKED"
+            return [
+                pair_blocker(
+                    "verify-pair-required-output-missing",
+                    "Pair-mode was required, but Codex pair-JUDGE produced no stdout or canonical findings file.",
+                    "codex-judge.stdout",
+                )
+            ]
+        return []
+    raw_text = stdout_path.read_text(encoding="utf-8")
+    if not raw_text.strip():
+        source_verdicts["pair_judge"] = "BLOCKED"
+        return [
+            pair_blocker(
+                "verify-pair-empty-output",
+                "Codex pair-JUDGE stdout was empty; the bounded contract requires a JSONL finding or PASS line.",
+                "codex-judge.stdout",
+            )
+        ]
+    has_jsonl_finding = False
+    has_nonpass_summary = False
+    for line in raw_text.splitlines():
+        raw = line.strip()
+        if not raw:
+            continue
+        if raw.startswith("# SUMMARY "):
+            try:
+                summary = json.loads(raw.removeprefix("# SUMMARY ").strip())
+            except json.JSONDecodeError:
+                continue
+            if summary.get("verdict") in {"NEEDS_WORK", "FAIL", "BLOCKED"}:
+                has_nonpass_summary = True
+            continue
+        if raw.startswith("#"):
+            continue
+        try:
+            item = json.loads(raw)
+        except json.JSONDecodeError:
+            continue
+        if isinstance(item, dict) and str(item.get("severity") or "").upper() in {
+            "CRITICAL",
+            "HIGH",
+            "MEDIUM",
+            "LOW",
+        }:
+            has_jsonl_finding = True
+    if not has_jsonl_finding and not has_nonpass_summary:
+        return []
+    source_verdicts["pair_judge"] = "BLOCKED"
+    return [
+        pair_blocker(
+            "verify-pair-emission-contract-violated",
+            (
+                "Codex pair-JUDGE stdout contained findings or a non-PASS summary, "
+                "but the canonical pair findings JSONL file was empty."
+            ),
+            "codex-judge.stdout",
+        )
+    ]
+def write_outputs(
+    devlyn: pathlib.Path,
+    findings: list[dict[str, Any]],
+    source_verdicts: dict[str, str],
+) -> dict[str, Any]:
+    """Write the merged findings JSONL and the summary JSON, and return the summary.
+    The overall verdict is the worst of all source verdicts, starting from PASS.
+    """
+    merged_path = devlyn / "verify-merged.findings.jsonl"
+    summary_path = devlyn / "verify-merge.summary.json"
+    with merged_path.open("w", encoding="utf-8") as handle:
+        for finding in findings:
+            # One compact JSON object per line, with keys sorted.
+            handle.write(json.dumps(finding, sort_keys=True, separators=(",", ":")) + "\n")
+    verdict = "PASS"
+    for source_verdict in source_verdicts.values():
+        verdict = worse(verdict, source_verdict)
+    summary = {
+        "verdict": verdict,
+        # The caller's dict is stored as-is, not copied.
+        "source_verdicts": source_verdicts,
+        "findings_count": len(findings),
+        "findings_file": str(merged_path),
+    }
+    summary_path.write_text(json.dumps(summary, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+    return summary
+def write_state(devlyn: pathlib.Path, summary: dict[str, Any]) -> None:
+    state_path = devlyn / "pipeline.state.json"
+    if not state_path.is_file():
+        raise SystemExit(f"error: {state_path} not found")
+    state = json.loads(state_path.read_text(encoding="utf-8"))
+    phases = state.setdefault("phases", {})
+    verify = phases.get("verify")
+    if not isinstance(verify, dict):
+        verify = {}
+        phases["verify"] = verify
+    verify["verdict"] = summary["verdict"]
+    sub = verify.setdefault("sub_verdicts", {})
+    for source, source_verdict in summary["source_verdicts"].items():
+        if source in {"mechanical", "judge", "pair_judge"}:
+            sub[source] = source_verdict
+    verify["merged"] = {
+        "verdict": summary["verdict"],
+        "findings_file": ".devlyn/verify-merged.findings.jsonl",
+        "summary_file": ".devlyn/verify-merge.summary.json",
+    }
+    state_path.write_text(json.dumps(state, indent=2, sort_keys=True) + "\n", encoding="utf-8")
+def self_test() -> int:
+    """Run three merge scenarios in a temporary directory; return 0 if all asserts hold."""
+    with tempfile.TemporaryDirectory() as tmp:
+        devlyn = pathlib.Path(tmp)
+        (devlyn / "pipeline.state.json").write_text(
+            json.dumps({"phases": {"verify": {"verdict": "PASS", "sub_verdicts": {}}}}),
+            encoding="utf-8",
+        )
+        # Scenario 1: a LOW judge finding plus a HIGH pair finding -> NEEDS_WORK.
+        (devlyn / "verify.findings.jsonl").write_text(
+            json.dumps({"id": "j1", "severity": "LOW"}) + "\n",
+            encoding="utf-8",
+        )
+        (devlyn / "verify.pair.findings.jsonl").write_text(
+            json.dumps({"id": "p1", "severity": "HIGH"}) + "\n",
+            encoding="utf-8",
+        )
+        findings, source_verdicts = read_findings(devlyn)
+        summary = write_outputs(devlyn, findings, source_verdicts)
+        write_state(devlyn, summary)
+        state = json.loads((devlyn / "pipeline.state.json").read_text(encoding="utf-8"))
+        assert summary["verdict"] == "NEEDS_WORK", summary
+        assert state["phases"]["verify"]["verdict"] == "NEEDS_WORK", state
+        assert state["phases"]["verify"]["sub_verdicts"]["pair_judge"] == "NEEDS_WORK", state
+        assert (devlyn / "verify-merged.findings.jsonl").read_text(encoding="utf-8")
+        # Scenario 2: both findings files emptied -> PASS, and state is overwritten.
+        (devlyn / "verify.findings.jsonl").write_text("", encoding="utf-8")
+        (devlyn / "verify.pair.findings.jsonl").write_text("", encoding="utf-8")
+        findings, source_verdicts = read_findings(devlyn)
+        summary = write_outputs(devlyn, findings, source_verdicts)
+        write_state(devlyn, summary)
+        state = json.loads((devlyn / "pipeline.state.json").read_text(encoding="utf-8"))
+        assert summary["verdict"] == "PASS", summary
+        assert state["phases"]["verify"]["verdict"] == "PASS", state
+        assert state["phases"]["verify"]["sub_verdicts"]["pair_judge"] == "PASS", state
+        # Scenario 3: stdout holds a finding and a non-PASS summary while the
+        # pair findings file is still empty -> BLOCKED.
+        (devlyn / "codex-judge.stdout").write_text(
+            json.dumps({"id": "cj1", "severity": "HIGH"}) + "\n"
+            + '# SUMMARY {"verdict":"NEEDS_WORK"}\n',
+            encoding="utf-8",
+        )
+        findings, source_verdicts = read_findings(devlyn)
+        summary = write_outputs(devlyn, findings, source_verdicts)
+        write_state(devlyn, summary)
+        state = json.loads((devlyn / "pipeline.state.json").read_text(encoding="utf-8"))
+        assert summary["verdict"] == "BLOCKED", summary
+        assert state["phases"]["verify"]["sub_verdicts"]["pair_judge"] == "BLOCKED", state
+    return 0
+def main() -> int:
+    """Parse CLI flags, merge findings, print the summary as JSON, and return an exit code.
+    Returns 1 when ``--devlyn-dir`` is not a directory; otherwise 0, or the
+    result of ``self_test()`` when ``--self-test`` is given.
+    """
+    # The first line of the module docstring doubles as the CLI description.
+    parser = argparse.ArgumentParser(description=__doc__.splitlines()[0])
+    parser.add_argument("--devlyn-dir", default=".devlyn")
+    parser.add_argument("--write-state", action="store_true")
+    parser.add_argument("--self-test", action="store_true")
+    args = parser.parse_args()
+    # --self-test runs before, and instead of, any work on --devlyn-dir.
+    if args.self_test:
+        return self_test()
+    devlyn = pathlib.Path(args.devlyn_dir)
+    if not devlyn.is_dir():
+        sys.stderr.write(f"error: {devlyn} is not a directory\n")
+        return 1
+    findings, source_verdicts = read_findings(devlyn)
+    summary = write_outputs(devlyn, findings, source_verdicts)
+    if args.write_state:
+        write_state(devlyn, summary)
+    print(json.dumps(summary, sort_keys=True))
+    return 0
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    raise SystemExit(main())

package/config/skills/devlyn:resolve/SKILL.md CHANGED Viewed

@@ -1,6 +1,6 @@
 ---
 name: devlyn:resolve
-description: Hands-free pipeline for any coding task — bug fix, feature, refactor, debug, modify, PR review. Free-form goal or formal spec input. Plan → Implement → Build-gate → Cleanup → Verify (fresh subagent, findings-only). Mechanical-first verification; pair-mode optional in Verify. Use when the user says "resolve this", "fix this", "implement this", "refactor this", "debug this", "review this PR", or wants hands-off completion.
+description: Hands-free pipeline for any coding task — bug fix, feature, refactor, debug, modify, PR review. Free-form goal or formal spec input. Plan → Implement → Build-gate → Cleanup → Verify (fresh subagent, findings-only). Mechanical-first verification; pair-mode is gated in Verify. Use when the user says "resolve this", "fix this", "implement this", "refactor this", "debug this", "review this PR", or wants hands-off completion.
 ---
 Orchestrator for the 2-skill harness pipeline. One subagent per phase; file-based handoff via `.devlyn/pipeline.state.json`. VERIFY spawns a fresh-context subagent so independence is structural — not advisory.
@@ -55,10 +55,11 @@ Once `state.implement_passed_sha` is non-null (PHASE 2 returned and produced a d
 1. Parse flags from `<pipeline_config>`:
    - `--max-rounds N` (default 4) — fix-loop budget shared across BUILD_GATE and VERIFY.
-   - `--engine MODE` (default `claude`) — picks the adapter for IMPLEMENT and CLEANUP.
+   - `--engine MODE` (default `claude`) — picks the adapter for IMPLEMENT, CLEANUP, and the primary VERIFY judge. It does not disable VERIFY pair-mode; when a VERIFY pair trigger fires, the second judge uses the OTHER engine.
    - `--spec <path>` — switches to spec mode.
    - `--verify-only <ref>` — switches to verify-only mode. Requires `--spec`.
    - `--pair-verify` — force pair-mode JUDGE in PHASE 5 even when not auto-triggered.
+   - `--risk-probes` — insert PHASE 1.5 cross-engine probe derivation. The OTHER engine converts visible `## Verification` bullets into bounded executable probes before IMPLEMENT; BUILD_GATE and VERIFY replay them mechanically.
    - `--bypass <phase>[,...]` — skip specific phases. Valid: `build-gate`, `cleanup`. PLAN, IMPLEMENT, VERIFY are non-bypassable.
    - `--perf` — opt in to per-phase timing.
@@ -87,6 +88,57 @@ After return:
 1. If `.devlyn/plan.md` lists zero files → halt with verdict `BLOCKED:plan-empty`.
 2. If risk list flags an out-of-scope expansion the user did not authorize → re-spawn once with the reminder; second fail → halt.
+## PHASE 1.5: RISK_PROBES
+Skip unless `--risk-probes` is set. This phase is findings-as-executable-checks,
+not a second plan and not debate.
+Engine: OTHER engine from PHASE 2's selected IMPLEMENT engine. Prompt body:
+`references/phases/probe-derive.md`.
+Inputs: source spec/criteria, `.devlyn/plan.md`, and repo read/search. Forbidden:
+`spec.expected.json`, `.devlyn/spec-verify.json`, `BENCH_FIXTURE_DIR`, hidden
+fixture/verifier paths, previous findings, and harness docs unless excerpted.
+Output: `.devlyn/risk-probes.jsonl`, 1 to 3 JSONL entries. Each entry must be
+one verification command shape plus `id`, `derived_from`, `tags`, and
+`tag_evidence`, where `derived_from` is an exact substring of the visible
+`## Verification` bullet the command directly exercises. `tag_evidence` must be
+a JSON object keyed by tag, with marker arrays as values; a top-level array or
+tag-only probe is malformed. `ordering_inversion` must include
+`input_order_would_choose_wrong_winner` and `asserts_processing_order_result`;
+`prior_consumption` must include `same_resource_consumed_first` and
+`later_entity_fails_or_reroutes`; `stdout_stderr_contract` and `shape_contract`
+do not require marker strings. Cart/pricing success probes should use
+`shape_contract` unless they satisfy the `ordering_inversion` markers. The probe
+command must not reference external network URLs; use only worktree-local or
+localhost resources.
+For high-complexity specs with multiple behavior bullets, at least one probe
+must be compound: it must exercise two or more visible verification bullets in a
+single command. Empty output is invalid when `--risk-probes` is set.
+State write: `phases.probe_derive.{started_at, verdict, completed_at, duration_ms, artifacts}`.
+Invocation contract when OTHER engine is Codex:
+- Invoke Codex only through the monitored wrapper path in `CODEX_MONITORED_PATH`,
+  or `.claude/skills/_shared/codex-monitored.sh` when the env var is absent:
+  `bash "$CODEX_MONITORED_PATH" -C "$PWD" --full-auto -c model_reasoning_effort=high "<probe prompt>"`.
+- Do not run `codex`, `codex exec`, `/Users/.../codex`, or a plugin-provided
+  Codex binary directly. A raw Codex child can outlive the phase and makes the
+  benchmark run invalid even if `.devlyn/risk-probes.jsonl` is written.
+- Capture wrapper stdout/stderr to `.devlyn/probe-derive.stdout` and
+  `.devlyn/probe-derive.stderr`; branch on the wrapper exit code before
+  validating `.devlyn/risk-probes.jsonl`.
+After return:
+1. Run `python3 .claude/skills/_shared/spec-verify-check.py --validate-risk-probes`
+   for the artifact boundary before IMPLEMENT; malformed probes halt with
+   `BLOCKED:probe-derive-malformed`.
+2. IMPLEMENT receives `.devlyn/plan.md` plus `.devlyn/risk-probes.jsonl` as
+   concrete acceptance obligations. It must not receive the producer engine's
+   commentary or any mention of pair/critic/debate.
 ## PHASE 2: IMPLEMENT
 Skip in verify-only mode. Constrained design judgment within PLAN's invariants. Writes code, tests, and inline doc-comments. No standalone DOCS phase — what the spec licenses is updated here, what it does not is out of scope.
@@ -107,7 +159,7 @@ Skip in verify-only mode OR when `build-gate` in `state.bypasses`. Deterministic
 Spawn Claude `Agent` (`mode: "bypassPermissions"`) with prompt body `references/phases/build-gate.md`. The agent:
 1. Detects language/framework via project files (`package.json`, `pyproject.toml`, etc.).
 2. Runs language-specific gates (tsc / lint / test).
-3. Always runs `python3 .claude/skills/_shared/spec-verify-check.py` (verification_commands literal-match).
+3. Always runs `python3 .claude/skills/_shared/spec-verify-check.py --include-risk-probes` (verification_commands literal-match plus `.devlyn/risk-probes.jsonl` when present).
 4. If `spec.expected.json.browser_flows` declared OR diff touches web-surface files: invokes the browser runner (Chrome MCP → Playwright → curl tier as available).
 5. Emits `.devlyn/build_gate.findings.jsonl` + `.devlyn/build_gate.log.md`.
@@ -140,16 +192,25 @@ Independent quality layer. **Spawned with empty conversation context** — no ca
 Two sub-phases:
-1. **MECHANICAL** (deterministic): re-run `python3 .claude/skills/_shared/spec-verify-check.py` against the post-CLEANUP code (independent of BUILD_GATE's earlier run). Re-scan `spec.expected.json.forbidden_patterns` against the diff. Re-check `required_files` and `forbidden_files`. Emit `.devlyn/verify-mechanical.findings.jsonl`.
+1. **MECHANICAL** (deterministic): re-run `python3 .claude/skills/_shared/spec-verify-check.py --include-risk-probes` against the post-CLEANUP code (independent of BUILD_GATE's earlier run). Re-scan `spec.expected.json.forbidden_patterns` against the diff. Re-check `required_files` and `forbidden_files`. Emit `.devlyn/verify-mechanical.findings.jsonl`.
-2. **JUDGE** (fresh-context Agent): grade the diff against the spec on rubric axes (spec compliance, scope, quality, consistency). Default engine = same as IMPLEMENT (solo). Pair-mode (cross-model JUDGE) fires when:
+2. **JUDGE** (fresh-context Agent): grade the diff against the spec on rubric axes (spec compliance, scope, quality, consistency). Split each Requirement into binding clauses and trace code-order counterexamples; a passing verifier proves only the case it exercises, not neighboring `once` / `regardless` / `duplicate` / auth-order / rollback invariants. Respect scope qualifiers such as `inside a warehouse`, `per resource`, `for this line`, and `after validation`; do not widen a scoped clause into a global invariant, and compose multiple ordering rules in the stated order. For stateful flows, explicitly trace failed-operation rollback and the next entity's state before hunting broader edge cases. For high-complexity specs, construct at least one interaction counterexample that combines ordering/priority with failure handling and state mutation, then execute at least one such scenario through the repo's existing CLI/API/test runner without leaving tracked files behind; one-axis examples and pure mental tracing are insufficient. Default engine = same as IMPLEMENT (solo). Pair-mode (cross-model JUDGE) is eligible only when MECHANICAL has no HIGH/CRITICAL findings; deterministic blockers already decide the verdict and route to the fix loop. Pair-mode fires when eligible and:
    - `--pair-verify` flag set, OR
+   - spec frontmatter has `complexity: high`, OR `state.complexity` is `"high"` or `"large"`, OR
    - MECHANICAL emits findings flagged `severity: warning` (not disqualifier — those route to fix loop directly), OR
    - `state.verify.coverage_failed == true` (judge could not exercise a required spec axis from available evidence).
-Pair-mode JUDGE: spawn a second Agent with the OTHER engine's adapter; both judgments merge with the rule "any HIGH/CRITICAL finding either model surfaces is the verdict-binding finding." Cross-model disagreement on lower-severity findings is logged but does not change the verdict.
+Before spawning JUDGE, compute `pair_trigger = { eligible, reasons[] }` and write it into `state.phases.verify`. If `eligible == true` and `reasons` is non-empty, you MUST spawn the second OTHER-engine judge. Skipping that second judge is a VERIFY contract violation, not a discretion call.
+The `--engine` flag never suppresses this rule. Explicit `--engine claude`
+means "Claude is the primary judge"; it does not mean "do not run Codex as the
+second pair judge." The only valid skip reasons after a non-empty eligible
+trigger are deterministic MECHANICAL HIGH/CRITICAL blockers or Codex
+unavailability proven by the invocation layer.
+Pair-mode JUDGE: spawn a second Agent with the OTHER engine's adapter; the second judge is a bounded adversarial complement, not a duplicate broad audit. The primary judge owns broad coverage; pair-JUDGE targets the two highest-risk explicit `## Verification` bullets that cross state mutation, all-or-nothing rollback, ordering, idempotency, auth, or error-priority clauses. It must not read `.claude/skills`, `.codex/skills`, `CLAUDE.md`, `AGENTS.md`, or other harness docs unless the orchestrator pasted a specific excerpt into the prompt. It may use only the spec, diff, implementation files, tests, and the repo's existing CLI/API/test runner. It may execute at most two targeted probes before first output, and each probe must compare the full externally visible result (exit/stdout/stderr plus full parsed output object, including accepted/scheduled rows, rejected rows, and remaining state when present), not just a single property. For priority/stateful specs, at least one probe must include an earlier input entity that would succeed under input-order processing, a later higher-priority entity that consumes or blocks the critical resource, and a failure/blocked/rollback edge that determines a later entity's state. For cart/pricing specs where visible verification combines duplicate items, line promotions, tax, coupon, and shipping, the success-path probe must include interleaved duplicates plus taxable and non-taxable items and assert full output rows. Scope qualifiers are binding: pair-JUDGE must not reinterpret `inside a warehouse`, `per resource`, or line-scoped rules as global rules. 
When both priority ordering and rollback/blocked-interval behavior appear in the spec, this dominance-loss probe is mandatory and comes before any other probe: an earlier lower-priority entity that would succeed alone or under input-order processing must lose because a later higher-priority entity is processed first; a failed/blocked middle entity must not corrupt later state; and the assertion must cover complete accepted/scheduled and rejected output ordering. It must stop and emit JSONL immediately on the first verdict-binding finding, and must emit PASS immediately if both probes plus static scope/dependency checks pass. Both judgments merge with the rule "any HIGH/CRITICAL finding either model surfaces is verdict-binding; high-confidence MEDIUM findings are also verdict-binding when they identify a concrete behavioral regression against the spec, public contract, or existing test contract." Cross-model disagreement on advisory lower-severity findings is logged but does not change the verdict. If MECHANICAL has a HIGH/CRITICAL finding, skip the second judge and record `pair_judge: null`; the fix loop needs the deterministic finding, not duplicate review.
-Findings written to `.devlyn/verify.findings.jsonl`. **VERIFY agents have no code-mutation tools.** State write: `phases.verify.{started_at, verdict, completed_at, duration_ms, sub_verdicts: {mechanical, judge, pair_judge?}, artifacts}`.
+Findings written to `.devlyn/verify.findings.jsonl`. **VERIFY agents have no code-mutation tools.** Codex pair-JUDGE is read-only: invoke `codex-monitored.sh` directly with `-c model_reasoning_effort=medium` for this bounded two-probe review, without piping to `tail`/`head`/`grep`, capture stdout/stderr by direct tool capture or file redirection, require JSONL findings on stdout, and have the orchestrator write `.devlyn/verify.pair.findings.jsonl`. If stdout is first captured as `.devlyn/codex-judge.stdout`, run `python3 .claude/skills/_shared/collect-codex-findings.py` before merge; that script is the deterministic boundary writer for `.devlyn/verify.pair.findings.jsonl`. Raw stdout remains diagnostic only: if stdout contains findings or a non-PASS summary while `.devlyn/verify.pair.findings.jsonl` is empty, `verify-merge-findings.py` blocks VERIFY for `verify.pair.emission-contract`. Do not ask Codex to `apply_patch` or edit `.devlyn`. After primary and pair findings are written, run `python3 .claude/skills/_shared/verify-merge-findings.py --write-state`. Branch only on the merged `state.phases.verify.verdict`; a HIGH/CRITICAL finding from either judge must mechanically become `NEEDS_WORK`. Never write `.devlyn/verify-merged.findings.jsonl` or `.devlyn/verify-merge.summary.json` by hand; `verify-merge-findings.py` is their only writer. State write: `phases.verify.{started_at, verdict, completed_at, duration_ms, sub_verdicts: {mechanical, judge, pair_judge?}, artifacts}`.
 Branch:
 - `PASS` → PHASE 6.
@@ -166,7 +227,7 @@ State write: `phases.final_report.started_at` at the top of this phase.
 3. State write: `phases.final_report.{verdict, completed_at, duration_ms}` BEFORE archive runs (archive prune logic skips runs whose `final_report.verdict` is null).
-4. **Archive** — invoke the deterministic script: `python3 .claude/skills/_shared/archive_run.py`. The script reads `run_id` from `.devlyn/pipeline.state.json`, moves per-run artifacts (state.json + `*.findings.jsonl` + `*.log.md` + `fix-batch.round-*.json` + `criteria.generated.md` + `spec-verify*.json` + `spec-verify-findings.jsonl`) into `.devlyn/runs/<run_id>/`, then best-effort prunes to last 10 completed runs. Archive must run; running this step as deterministic-script-not-prose ensures the move actually happens (iter-0033a Smoke 3 caught a case where the agent claimed archive ran without moving the files).
+4. **Archive** — invoke the deterministic script: `python3 .claude/skills/_shared/archive_run.py`. The script reads `run_id` from `.devlyn/pipeline.state.json`, moves per-run artifacts (state.json + `*.findings.jsonl` + `*.log.md` + `fix-batch.round-*.json` + `criteria.generated.md` + `risk-probes.jsonl` + `spec-verify*.json` + `spec-verify-findings.jsonl`) into `.devlyn/runs/<run_id>/`, then best-effort prunes to last 10 completed runs. Archive must run; running this step as deterministic-script-not-prose ensures the move actually happens (iter-0033a Smoke 3 caught a case where the agent claimed archive ran without moving the files).
 5. Kill any dev server PHASE 3 left running.

package/config/skills/devlyn:resolve/references/phases/build-gate.md CHANGED Viewed

@@ -22,7 +22,7 @@ Run in this order; each emits findings into `.devlyn/build_gate.findings.jsonl`:
 1. **Type check** (TypeScript / mypy / etc.). Each error → one finding, severity `HIGH`, rule `correctness.type-check`.
 2. **Lint** (eslint / ruff / clippy / etc.). Each error → finding, severity `MEDIUM`, rule `quality.lint`. Warnings stay LOW unless the spec elevates them.
 3. **Test suite** (npm test / pytest / go test / cargo test). Each failing test → finding, severity `HIGH`, rule `correctness.test-failure`. Include the failing test's file:line and the assertion.
-4. **Spec literal verification**: `python3 .claude/skills/_shared/spec-verify-check.py`. The script reads `.devlyn/spec-verify.json` (pre-staged from spec or self-staged from `state.source.spec_path`). Each command mismatch → finding `correctness.spec-literal-mismatch`, severity `CRITICAL`. Missing/malformed carrier on a generated source → finding `correctness.spec-verify-malformed`, severity `CRITICAL`.
+4. **Spec literal verification + risk probes**: `python3 .claude/skills/_shared/spec-verify-check.py --include-risk-probes`. The script reads `.devlyn/spec-verify.json` (pre-staged from spec or self-staged from `state.source.spec_path`) and appends `.devlyn/risk-probes.jsonl` when present. Each verification command mismatch → finding `correctness.spec-literal-mismatch`, severity `CRITICAL`. Each risk-probe mismatch → finding `correctness.risk-probe-failed`, severity `CRITICAL`. Missing/malformed carrier on a generated source → finding `correctness.spec-verify-malformed`, severity `CRITICAL`.
 5. **Browser** (only when `spec.expected.json.browser_flows` declared OR diff touches `*.tsx`, `*.jsx`, `*.vue`, `*.svelte`, `page.*`, `layout.*`, `route.*`, `*.css`, `*.html`): start dev server, run declared flows via Chrome MCP if available, falling back to Playwright, falling back to curl. Each failed flow → finding, severity `HIGH`, rule `correctness.browser-flow-failed`.
 Append all findings; do not stop on the first failure.

package/config/skills/devlyn:resolve/references/phases/probe-derive.md ADDED Viewed

@@ -0,0 +1,183 @@
+# PHASE 1.5 — RISK_PROBES (canonical body)
+Per-engine adapter header is prepended at runtime. This file is engine-agnostic.
+<role>
+Convert visible verification obligations into executable probes. You are not a
+second planner, an essay-writing critic, or a debate participant. Your output
+is JSONL only.
+</role>
+<input>
+- Source spec or generated criteria.
+- `.devlyn/plan.md`.
+- Codebase read/search at `state.base_ref.sha`.
+</input>
+<forbidden_input>
+Do not read `spec.expected.json`, `.devlyn/spec-verify.json`,
+`BENCH_FIXTURE_DIR`, benchmark fixture/verifier paths, `.devlyn/*.findings.jsonl`,
+`.claude/skills`, `.codex/skills`, `CLAUDE.md`, `AGENTS.md`, or other harness
+docs unless the orchestrator pasted a specific excerpt into the prompt.
+</forbidden_input>
+<task>
+Read the visible `## Verification` section. Emit 1 to 3 executable probes
+that cover the highest-risk bullets whose failure would change observable
+behavior. Prefer bullets that combine ordering/priority, rollback/state
+mutation, idempotency, auth/error priority, stdout/stderr, or exact output
+shape.
+For high-complexity specs with two or more behavior bullets, at least one probe
+must be compound: one command must exercise two or more visible verification
+bullets together. Do not split every risk into isolated one-axis probes.
+Compound means interaction, not a checklist in one script. If the visible
+verification text includes priority/ordering plus rollback, blocked intervals,
+or failed-operation state, the first probe must be a dominance-loss scenario:
+an earlier lower-priority/input-order entity would succeed alone, a later
+higher-priority entity consumes or blocks the critical resource first, a failed
+or blocked middle entity must not corrupt state, and the assertion must compare
+the complete externally-visible result (accepted/scheduled rows, rejected rows,
+remaining/state rows when present, exit/stdout/stderr).
+When a verification bullet contains an alternative such as "rejected or moved
+later", the probes must cover both sides when bounded: one case with a later
+valid placement and one case with no later valid placement, where rejection is
+the only correct outcome. Do not test only the easier side of an "or" clause.
+For blocked-interval bullets, include boundary probes for three separate cases:
+a candidate that starts exactly at `blocked.start`, a candidate that ends
+exactly at `blocked.end`, and a candidate with only a one-minute overlap.
+Half-open assumptions must be tested by the command rather than left implicit
+in prose.
+When a placement algorithm may advance a candidate start past a blocked
+interval or already-accepted entity, include a no-later-valid case where the
+advanced start would exceed the active availability/window bound. The expected
+result must reject that entity. A probe is too weak if every advanced candidate
+still has enough room after the advance; that misses window-bound recheck bugs.
+When a verification bullet names `remaining`, inventory, stock, balances, or
+state after failures, assert the full externally-visible state. Rows with zero
+quantity do not represent remaining availability; a probe that checks remaining
+state should fail if zero-quantity rows are emitted unless the visible spec
+explicitly requires zero rows. For all-or-nothing rollback, include a later
+entity that can succeed only if the failed entity returned every tentative
+allocation.
+When visible bullets combine priority ordering, all-or-nothing rollback,
+single-resource or single-warehouse constraints, choice ordering such as FEFO,
+and `remaining` output, prefer one compound probe over isolated checks. The
+probe must include: a lower-priority input-first entity that loses because a
+higher-priority entity consumes stock first; a middle entity that tentatively
+allocates at least one line/lot and then fails another line; a later entity that
+can succeed only if that failed entity rolled back; a single-resource constraint
+case where total cross-resource stock would be enough but no single allowed
+resource is enough; and full expected `remaining` output sorted exactly as the
+visible spec says, with zero-quantity rows absent unless explicitly required.
+For all-or-nothing allocation probes, the failed middle entity must not be
+pre-rejected by a whole-order availability shortcut. It must allocate a scarce
+first line from mutable state, then fail a later line because that SKU/resource
+is absent or otherwise impossible under the visible contract. The later entity
+must request the same scarce first-line SKU so the probe proves rollback by
+observable success, not by internal reasoning.
+Each probe must run entirely from the worktree with standard shell/Node/Python
+tools already present in the repo. Use inline temp-file scripts when needed.
+Leave no tracked files behind. Probe commands must not call external network
+APIs or write to external memory/telemetry services.
+</task>
+<output>
+Write `.devlyn/risk-probes.jsonl`. Each line is one JSON object:
+```json
+{"id":"P1","derived_from":"verbatim substring from ## Verification","cmd":"shell command","exit_code":0,"stdout_contains":[],"stdout_not_contains":[],"tags":["ordering_inversion"],"tag_evidence":{"ordering_inversion":["input_order_would_choose_wrong_winner","asserts_processing_order_result"]}}
+```
+Rules:
+- `derived_from` must be an exact substring of the visible `## Verification`
+  bullet that the command directly exercises. For `error_contract`, use the
+  invalid-input/stderr/JSON-error/exit-2 bullet, not a generic test-runner
+  bullet.
+- `tags` is required. Use only these shape tags:
+  `ordering_inversion`, `boundary_overlap`, `prior_consumption`,
+  `rollback_state`, `positive_remaining`, `stdout_stderr_contract`,
+  `error_contract`, `shape_contract`.
+- `tag_evidence` is required and must be a JSON object keyed by tag, never a
+  top-level array. For each tag in the marker list below, include every listed
+  evidence marker in that tag's array and make the command actually exercise it.
+- Do not emit a shape tag unless the visible `## Verification` text names that
+  kind of risk and the command exercises it. In particular, `boundary_overlap`
+  is only for visible blocked-interval/window/overlap boundary semantics; do not
+  use it for inventory, warehouse, or generic resource constraints.
+- Required evidence markers per tag:
+  - `ordering_inversion`: `input_order_would_choose_wrong_winner`,
+    `asserts_processing_order_result`.
+  - `boundary_overlap`: `starts_at_blocked_start`, `ends_at_blocked_end`,
+    `one_minute_overlap`.
+  - `prior_consumption`: `same_resource_consumed_first`,
+    `later_entity_fails_or_reroutes`.
+  - `rollback_state`: `failed_entity_tentative_state_absent`,
+    `later_entity_uses_released_state`.
+  - `positive_remaining`: `asserts_full_remaining_state`,
+    `zero_quantity_rows_absent`.
+  Tags not listed here may use an empty evidence list or be omitted from
+  `tag_evidence`.
+- `cmd` must not reference `BENCH_FIXTURE_DIR`, `verifiers/`, benchmark fixture
+  paths, hidden oracle files, external URLs, or files outside the worktree.
+  Localhost URLs are allowed only when the visible verification command needs a
+  local server.
+- Match the spec's visible input and output key names literally; do not invent
+  aliases such as `stock` for `lots`, `order_id` for `id`, or `warehouse_id`
+  for `warehouse`.
+- For cart/pricing specs whose visible verification covers duplicate combining,
+  multiple line-promotion types, tax, coupon, and shipping, the compound success
+  probe must include interleaved duplicate SKUs plus taxable and non-taxable
+  items, then assert the full output object and item rows. Use `shape_contract`
+  for this probe unless the command also proves the required
+  `ordering_inversion` evidence markers.
+- Empty output is invalid when this phase is enabled. If no bounded executable
+  probe can be derived, write one JSONL object whose command exits nonzero and
+  whose `derived_from` names the blocking verification bullet; BUILD_GATE will
+  surface the inability as a concrete failure instead of silently proceeding.
+- No prose, no Markdown, no summaries, no alternate plan.
+</output>
+<quality_bar>
+- Executable beats rhetorical. A risk that cannot become a bounded command does
+  not belong in this artifact.
+- Keep probes small. They are BUILD_GATE obligations, not a replacement for the
+  full test suite.
+- Coverage over cleverness: mirror the verification bullet literally before
+  inventing an edge case.
+- If a probe passes while an implementation processes entities in input order
+  instead of the required priority/order, or emits extra zero-value state rows,
+  the probe is too weak.
+- If priority/order appears in the visible contract, at least one probe must
+  carry `ordering_inversion`.
+- If blocked intervals, forbidden windows, or overlap appear in the visible
+  contract, at least one probe must carry `boundary_overlap`.
+  `boundary_overlap` is not satisfied by a generic overlap case. The same
+  probe must assert a candidate starting exactly at the blocked interval start,
+  a candidate ending exactly at the blocked interval end, and a one-minute
+  overlap case with no later valid placement.
+- If the domain has both windows/availability and conflicts that can push a
+  candidate later, at least one probe must assert the pushed candidate is
+  rejected when the pushed start plus duration no longer fits inside the same
+  window. The full expected output must exclude that row from scheduled/accepted
+  output and include the required rejection reason.
+- If accepted operations reduce stock/state/availability for later operations,
+  at least one probe must carry `prior_consumption`: a later lower-priority or
+  later-submitted entity must fail or reroute only because an earlier accepted
+  entity consumed the exact resource/lot/slot.
+- If a visible contract has all-or-nothing rollback plus `remaining`, at least
+  one probe must carry both `rollback_state` and `positive_remaining`; it must
+  prove the rollback by a later successful entity and by the final remaining
+  rows, not just by the rejected order reason.
+- If `remaining` state appears in the visible contract, at least one probe must
+  carry `positive_remaining` and assert that zero-quantity/zero-value rows are
+  absent unless the visible spec explicitly requires them.
+</quality_bar>
+<runtime_principles>
+Read `_shared/runtime-principles.md`. The discipline here is: visible contract
+in, executable obligation out. Hidden oracle leakage is a blocker.
+</runtime_principles>