npm - devlyn-cli - Versions diffs - 2.0.0 → 2.2.0 - Mend

devlyn-cli 2.0.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (138) hide show

package/benchmark/auto-resolve/scripts/test-swebench-frozen-case.sh ADDED Viewed

@@ -0,0 +1,302 @@
+#!/usr/bin/env bash
+# Regression test for the SWE-bench frozen VERIFY case importer.
+set -euo pipefail
+ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
+TMP="$(mktemp -d)"
+trap 'rm -rf "$TMP"' EXIT
+REPO="$TMP/repo"
+mkdir -p "$REPO"
+git -C "$REPO" init -q
+git -C "$REPO" config user.email bench@example.com
+git -C "$REPO" config user.name bench
+printf 'hello\n' > "$REPO/app.txt"
+git -C "$REPO" add app.txt
+git -C "$REPO" commit -q -m base
+BASE_SHA="$(git -C "$REPO" rev-parse HEAD)"
+printf 'goodbye\n' > "$REPO/app.txt"
+git -C "$REPO" diff > "$TMP/model.patch"
+git -C "$REPO" checkout -q -- app.txt
+cat > "$TMP/instance.json" <<JSON
+{
+  "instance_id": "local__repo-1",
+  "repo": "local/repo",
+  "base_commit": "$BASE_SHA",
+  "problem_statement": "Change app.txt so it says goodbye instead of hello.",
+  "version": "test",
+  "issue_url": "https://example.test/issue",
+  "pr_url": "https://example.test/pr"
+}
+JSON
+python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-case.py" \
+  --instance-json "$TMP/instance.json" \
+  --model-patch "$TMP/model.patch" \
+  --cases-root "$TMP/cases" \
+  --repos-root "$TMP/repos" \
+  --repo-dir "$REPO" \
+  --timeout-seconds 60 > "$TMP/prepare.json"
+CASE_DIR="$TMP/cases/local__repo-1"
+BASE_REPO="$TMP/repos/local__repo-${BASE_SHA:0:12}"
+test -f "$CASE_DIR/spec.md"
+test -f "$CASE_DIR/model.patch"
+test -x "$CASE_DIR/setup.sh"
+grep -q 'SWE-bench local__repo-1' "$CASE_DIR/spec.md"
+grep -q -- '--pair-mode gated' "$CASE_DIR/run-command.txt"
+python3 "$ROOT/benchmark/auto-resolve/scripts/fetch-swebench-instances.py" \
+  --dataset lite \
+  --limit 1 \
+  --out "$TMP/fetched-lite.jsonl" > "$TMP/fetch.json"
+grep -q '"rows_written": 1' "$TMP/fetch.json"
+python3 - "$TMP/fetched-lite.jsonl" <<'PY'
+import json, pathlib, sys
+row = json.loads(pathlib.Path(sys.argv[1]).read_text().splitlines()[0])
+for key in ("instance_id", "repo", "base_commit", "problem_statement"):
+    assert row.get(key), key
+PY
+python3 - "$TMP/instance.json" "$TMP/instances.jsonl" "$TMP/model.patch" "$TMP/predictions.jsonl" <<'PY'
+import json, pathlib, sys
+instance = json.loads(pathlib.Path(sys.argv[1]).read_text())
+pathlib.Path(sys.argv[2]).write_text(json.dumps(instance) + "\n")
+patch = pathlib.Path(sys.argv[3]).read_text()
+pathlib.Path(sys.argv[4]).write_text(json.dumps({
+    "instance_id": "local__repo-1",
+    "model_name_or_path": "local-test",
+    "model_patch": patch,
+}) + "\n")
+PY
+mkdir -p "$TMP/patch-root/local__repo-1"
+cp "$TMP/model.patch" "$TMP/patch-root/local__repo-1/patch.diff"
+python3 "$ROOT/benchmark/auto-resolve/scripts/collect-swebench-predictions.py" \
+  --patch-root "$TMP/patch-root" \
+  --instances-jsonl "$TMP/instances.jsonl" \
+  --model-name local-patch-root \
+  --out "$TMP/collected-predictions.jsonl" > "$TMP/collect.json"
+grep -q '"predictions_written": 1' "$TMP/collect.json"
+python3 - "$TMP/collected-predictions.jsonl" <<'PY'
+import json, pathlib, sys
+row = json.loads(pathlib.Path(sys.argv[1]).read_text())
+assert row["instance_id"] == "local__repo-1"
+assert row["model_name_or_path"] == "local-patch-root"
+assert row["model_patch"].endswith("\n")
+PY
+rm -rf "$TMP/cases-batch" "$TMP/repos-batch"
+python3 "$ROOT/benchmark/auto-resolve/scripts/prepare-swebench-frozen-corpus.py" \
+  --instances-jsonl "$TMP/instances.jsonl" \
+  --predictions-jsonl "$TMP/predictions.jsonl" \
+  --cases-root "$TMP/cases-batch" \
+  --repos-root "$TMP/repos-batch" \
+  --repo-dir "$REPO" \
+  --out-manifest "$TMP/manifest.json" > "$TMP/batch.json"
+grep -q '"prepared_count": 1' "$TMP/manifest.json"
+test -f "$TMP/cases-batch/local__repo-1/model.patch"
+bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
+  --fixture local__repo-1 \
+  --fixtures-root "$TMP/cases" \
+  --base-repo "$BASE_REPO" \
+  --diff "$CASE_DIR/model.patch" \
+  --run-id swebench-frozen-case-test \
+  --pair-mode gated \
+  --timeout-seconds 7 \
+  --prepare-only > "$TMP/runner.log"
+grep -q 'Timeout: 7s per arm' "$TMP/runner.log"
+grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/app.txt
+grep -q '^goodbye$' /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/app.txt
+test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-solo/.devlyn/spec-verify.json
+test ! -e /tmp/bench-swebench-frozen-case-test-local__repo-1-pair/.devlyn/spec-verify.json
+RESULTS_DIR="$ROOT/benchmark/auto-resolve/results"
+RESUME_RUN_ID="swebench-resume-arm-test-local__repo-1"
+mkdir -p "$RESULTS_DIR/$RESUME_RUN_ID/solo" "$TMP/fakebin"
+cat > "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json" <<'EOF'
+{
+  "elapsed_seconds": 1,
+  "invoke_exit": 0,
+  "timed_out": false,
+  "verify_verdict": "PASS",
+  "terminal_verdict": "PASS"
+}
+EOF
+cat > "$TMP/fakebin/claude" <<'EOF'
+#!/usr/bin/env bash
+echo "fake claude invoked"
+exit 1
+EOF
+chmod +x "$TMP/fakebin/claude"
+PATH="$TMP/fakebin:$PATH" bash "$ROOT/benchmark/auto-resolve/scripts/run-frozen-verify-pair.sh" \
+  --fixture local__repo-1 \
+  --fixtures-root "$TMP/cases" \
+  --base-repo "$BASE_REPO" \
+  --diff "$CASE_DIR/model.patch" \
+  --run-id "$RESUME_RUN_ID" \
+  --pair-mode gated \
+  --timeout-seconds 3 \
+  --resume-completed-arms > "$TMP/resume-arm.log" 2>&1
+grep -Fq '[frozen-verify] solo: reuse completed summary' "$TMP/resume-arm.log"
+grep -Fq 'fake claude invoked' "$RESULTS_DIR/$RESUME_RUN_ID/pair/transcript.txt"
+grep -q '"invoke_exit": 0' "$RESULTS_DIR/$RESUME_RUN_ID/solo/summary.json"
+bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
+  --manifest "$TMP/manifest.json" \
+  --run-prefix swebench-frozen-corpus-test \
+  --timeout-seconds 7 \
+  --run-ids-out "$TMP/prepare-run-ids.txt" \
+  --out-json "$TMP/gate.json" \
+  --out-md "$TMP/gate.md" \
+  --prepare-only > "$TMP/corpus-runner.log"
+grep -q 'prepare-only complete; gate skipped' "$TMP/corpus-runner.log"
+grep -q 'Timeout: 7s per arm' "$TMP/corpus-runner.log"
+grep -q '^swebench-frozen-corpus-test-1-local__repo-1$' "$TMP/prepare-run-ids.txt"
+grep -q '^goodbye$' /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo/app.txt
+grep -q '^goodbye$' /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair/app.txt
+test ! -e /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo/.devlyn/spec-verify.json
+test ! -e /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair/.devlyn/spec-verify.json
+test ! -e "$TMP/gate.json"
+test ! -e "$TMP/gate.md"
+python3 - "$TMP/manifest.json" "$TMP/manifest-bad-diff.json" <<'PY'
+import json, pathlib, sys
+manifest = json.loads(pathlib.Path(sys.argv[1]).read_text())
+manifest["prepared"][0]["case_dir"] = str(pathlib.Path(manifest["prepared"][0]["case_dir"]).parent / "missing-case")
+pathlib.Path(sys.argv[2]).write_text(json.dumps(manifest, indent=2) + "\n")
+PY
+set +e
+bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
+  --manifest "$TMP/manifest-bad-diff.json" \
+  --run-prefix swebench-frozen-corpus-fail-test \
+  --run-ids-out "$TMP/fail-run-ids.txt" \
+  --prepare-only > "$TMP/corpus-fail.log" 2>&1
+fail_status=$?
+set -e
+[ "$fail_status" -ne 0 ]
+grep -q 'row failed: swebench-frozen-corpus-fail-test-1-local__repo-1' "$TMP/corpus-fail.log"
+grep -q '^swebench-frozen-corpus-fail-test-1-local__repo-1$' "$TMP/fail-run-ids.txt"
+test -f "$ROOT/benchmark/auto-resolve/results/swebench-frozen-corpus-fail-test-1-local__repo-1/compare.json"
+python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
+  --title "Local SWE-bench Failed Matrix" \
+  --verdict FAIL \
+  --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
+  --out-json "$TMP/fail-matrix.json" \
+  --out-md "$TMP/fail-matrix.md" > "$TMP/fail-matrix.log"
+grep -q '"classification": "failed attempt: row runner exit=1"' "$TMP/fail-matrix.json"
+grep -q '"trailing_non_gate_rows": 1' "$TMP/fail-matrix.json"
+grep -q '"failed attempt: row runner exit=1": 1' "$TMP/fail-matrix.json"
+grep -Fq 'failed attempt: row runner exit=1' "$TMP/fail-matrix.md"
+grep -Fq 'Trailing non-gate rows: 1' "$TMP/fail-matrix.md"
+set +e
+python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
+  --title "Local SWE-bench Failed Matrix" \
+  --verdict FAIL \
+  --run-id swebench-frozen-corpus-fail-test-1-local__repo-1 \
+  --max-trailing-non-gate 0 \
+  --out-json "$TMP/fail-yield-matrix.json" \
+  --out-md "$TMP/fail-yield-matrix.md" > "$TMP/fail-yield-matrix.log"
+yield_status=$?
+set -e
+[ "$yield_status" -eq 2 ]
+grep -q '"yield_verdict": "FAIL"' "$TMP/fail-yield-matrix.json"
+grep -q '"trailing non-gate rows 1 > maximum 0"' "$TMP/fail-yield-matrix.json"
+grep -Fq 'Yield verdict: **FAIL**' "$TMP/fail-yield-matrix.md"
+PROVIDER_LIMIT_RUN_ID="swebench-provider-limit-test-local__repo-1"
+mkdir -p "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo" "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair"
+cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/solo/input.md" <<'EOF'
+Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
+EOF
+cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/pair/transcript.txt" <<'EOF'
+You've hit your limit · resets 3am (Asia/Seoul)
+EOF
+cat > "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID/compare.json" <<'EOF'
+{
+  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS", "elapsed_seconds": 1},
+  "pair": {"invoke_exit": 1, "timed_out": false, "verify_verdict": null, "elapsed_seconds": 1},
+  "comparison": {
+    "pair_trigger_missed": false,
+    "pair_verdict_lift": false,
+    "solo_verdict": "PASS",
+    "pair_verdict": null
+  }
+}
+EOF
+python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
+  --title "Local SWE-bench Provider Limit Matrix" \
+  --verdict FAIL \
+  --run-id "$PROVIDER_LIMIT_RUN_ID" \
+  --out-json "$TMP/provider-limit-matrix.json" \
+  --out-md "$TMP/provider-limit-matrix.md" > "$TMP/provider-limit-matrix.log"
+grep -q '"classification": "failed attempt: provider limit"' "$TMP/provider-limit-matrix.json"
+grep -Fq 'failed attempt: provider limit' "$TMP/provider-limit-matrix.md"
+RUN_ID="swebench-gate-only-test-local__repo-1"
+mkdir -p "$RESULTS_DIR/$RUN_ID/pair"
+cat > "$RESULTS_DIR/$RUN_ID/pair/input.md" <<'EOF'
+Use /devlyn:resolve --verify-only --spec docs/roadmap/phase-1/local__repo-1.md.
+EOF
+cat > "$RESULTS_DIR/$RUN_ID/compare.json" <<'EOF'
+{
+  "solo": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "PASS_WITH_ISSUES", "elapsed_seconds": 100},
+  "pair": {"invoke_exit": 0, "timed_out": false, "verify_verdict": "NEEDS_WORK", "pair_mode": true, "elapsed_seconds": 200},
+  "comparison": {
+    "pair_trigger_missed": false,
+    "pair_verdict_lift": true,
+    "solo_verdict": "PASS_WITH_ISSUES",
+    "pair_verdict": "NEEDS_WORK"
+  }
+}
+EOF
+printf '%s\n' "$RUN_ID" > "$TMP/run-ids.txt"
+bash "$ROOT/benchmark/auto-resolve/scripts/run-swebench-frozen-corpus.sh" \
+  --manifest "$TMP/manifest.json" \
+  --gate-only-run-ids "$TMP/run-ids.txt" \
+  --min-runs 1 \
+  --max-pair-solo-wall-ratio 3 \
+  --run-ids-out "$TMP/gate-run-ids.txt" \
+  --out-json "$TMP/gate.json" \
+  --out-md "$TMP/gate.md" > "$TMP/gate-only.log"
+grep -q '"verdict": "PASS"' "$TMP/gate.json"
+grep -q '"avg_pair_solo_wall_ratio": 2.0' "$TMP/gate.json"
+grep -Fq 'Verdict: **PASS**' "$TMP/gate.md"
+grep -Fq 'Max pair/solo wall ratio: 3.00x' "$TMP/gate.md"
+cmp "$TMP/run-ids.txt" "$TMP/gate-run-ids.txt"
+python3 "$ROOT/benchmark/auto-resolve/scripts/swebench-frozen-matrix.py" \
+  --title "Local SWE-bench Matrix" \
+  --verdict PASS \
+  --gate-json "$TMP/gate.json" \
+  --run-id "$RUN_ID" \
+  --min-gate-rate 1 \
+  --max-trailing-non-gate 0 \
+  --out-json "$TMP/matrix.json" \
+  --out-md "$TMP/matrix.md" > "$TMP/matrix.log"
+grep -q '"runs_total": 1' "$TMP/matrix.json"
+grep -q '"gate_rows": 1' "$TMP/matrix.json"
+grep -q '"gate_rate": 1.0' "$TMP/matrix.json"
+grep -q '"trailing_non_gate_rows": 0' "$TMP/matrix.json"
+grep -q '"yield_verdict": "PASS"' "$TMP/matrix.json"
+grep -Fq 'Local SWE-bench Matrix' "$TMP/matrix.md"
+grep -Fq 'Gate rate: 1.000' "$TMP/matrix.md"
+grep -Fq 'Yield verdict: **PASS**' "$TMP/matrix.md"
+rm -rf /tmp/bench-swebench-frozen-case-test-local__repo-1-solo
+rm -rf /tmp/bench-swebench-frozen-case-test-local__repo-1-pair
+rm -rf /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-solo
+rm -rf /tmp/bench-swebench-frozen-corpus-test-1-local__repo-1-local__repo-1-pair
+rm -rf "$ROOT/benchmark/auto-resolve/results/swebench-frozen-case-test"
+rm -rf "$ROOT/benchmark/auto-resolve/results/swebench-frozen-corpus-test-1-local__repo-1"
+rm -rf "$ROOT/benchmark/auto-resolve/results/swebench-frozen-corpus-fail-test-1-local__repo-1"
+rm -rf "$RESULTS_DIR/$RESUME_RUN_ID"
+rm -rf "$RESULTS_DIR/$PROVIDER_LIMIT_RUN_ID"
+rm -rf "$RESULTS_DIR/$RUN_ID"
+echo "PASS test-swebench-frozen-case"

package/bin/devlyn.js CHANGED Viewed

@@ -19,6 +19,10 @@ const CLI_TARGETS = {
     instructionsFile: 'AGENTS.md',
     baseInstructionsFile: 'AGENTS.md',
     configDir: null, // Codex uses AGENTS.md at project root
+    // Codex auto-loads skills from ~/.codex/skills/ (user-global). Same
+    // SKILL.md format as Claude Code; descriptions must stay ≤1024 chars.
+    skillsDir: path.join(os.homedir(), '.codex', 'skills'),
+    skillsToInstall: ['devlyn:resolve', 'devlyn:ideate', '_shared'],
     detect: () => fs.existsSync(path.join(process.cwd(), 'AGENTS.md')) || fs.existsSync(path.join(process.cwd(), '.codex')),
   },
   gemini: {
@@ -509,6 +513,37 @@ function detectOtherCLIs() {
   return detected;
 }
+// Install /devlyn:resolve + /devlyn:ideate + _shared skills into a CLI's
+// global skills directory (e.g. ~/.codex/skills/). Returns count of skills
+// copied. Skipped silently for CLIs without a skillsDir (e.g. cursor, copilot
+// at the time of writing — they don't have an analogous skill-loader).
+function installSkillsForCLI(cliKey) {
+  const cli = CLI_TARGETS[cliKey];
+  if (!cli || !cli.skillsDir || !cli.skillsToInstall) return 0;
+  const sourceSkillsDir = path.join(CONFIG_SOURCE, 'skills');
+  if (!fs.existsSync(sourceSkillsDir)) return 0;
+  if (!fs.existsSync(cli.skillsDir)) {
+    fs.mkdirSync(cli.skillsDir, { recursive: true });
+  }
+  let copied = 0;
+  for (const skillName of cli.skillsToInstall) {
+    const src = path.join(sourceSkillsDir, skillName);
+    const dest = path.join(cli.skillsDir, skillName);
+    if (!fs.existsSync(src)) continue;
+    // Full replace per cleanManagedSkillDirs semantics: stale files in the
+    // installed mirror would otherwise persist forever.
+    if (fs.existsSync(dest)) {
+      fs.rmSync(dest, { recursive: true, force: true });
+    }
+    copyRecursive(src, dest, cli.skillsDir);
+    copied++;
+    log(`  → ${cli.skillsDir.replace(os.homedir(), '~')}/${skillName}`, 'dim');
+  }
+  return copied;
+}
 function installAgentsForCLI(cliKey) {
   const cli = CLI_TARGETS[cliKey];
   if (!cli) return false;
@@ -561,6 +596,14 @@ function installAgentsForCLI(cliKey) {
     log(`  → ${cli.instructionsFile} (agent instructions appended)`, 'dim');
   }
+  // If this CLI also supports a global skill-loader (currently Codex), install
+  // /devlyn:resolve + /devlyn:ideate + _shared so the same slash commands work
+  // there. Skipped for CLIs without a skillsDir entry.
+  const skillsCopied = installSkillsForCLI(cliKey);
+  if (skillsCopied > 0) {
+    log(`  → ${skillsCopied} skill${skillsCopied > 1 ? 's' : ''} installed (devlyn:resolve / devlyn:ideate / _shared)`, 'dim');
+  }
   return true;
 }
@@ -695,7 +738,7 @@ async function init(skipPrompts = false) {
   // Skip prompts if -y flag or non-interactive
   if (skipPrompts || !process.stdin.isTTY) {
     log('\n💡 Add optional addons later: run `npx devlyn-cli` without -y', 'dim');
-    log('   Add Codex instructions later: run `npx devlyn-cli agents codex`', 'dim');
+    log('   Add Codex instructions + skills later: run `npx devlyn-cli agents codex`', 'dim');
     log(`\n${COLORS.dim}   Enjoying devlyn? Star it on GitHub — it helps others find it:${COLORS.reset}`);
     log(`   ${COLORS.purple}→ https://github.com/fysoul17/devlyn-cli${COLORS.reset}\n`);
     return;
@@ -703,14 +746,17 @@ async function init(skipPrompts = false) {
   // Ask which non-Claude CLIs should receive instruction files.
   log('\n🤖 Optional AI CLI instructions:\n', 'blue');
-  const cliOptions = Object.entries(CLI_TARGETS).map(([key, cli]) => ({
-    key,
-    name: cli.name,
-    desc: cli.configDir
-      ? `Install agents into ${cli.configDir}/`
-      : `Install ${cli.instructionsFile}`,
-    type: 'cli',
-  }));
+  const cliOptions = Object.entries(CLI_TARGETS).map(([key, cli]) => {
+    let desc;
+    if (cli.configDir) {
+      desc = `Install agents into ${cli.configDir}/`;
+    } else if (cli.skillsDir) {
+      desc = `Install ${cli.instructionsFile} + /devlyn:resolve + /devlyn:ideate skills (~/.codex/skills/)`;
+    } else {
+      desc = `Install ${cli.instructionsFile}`;
+    }
+    return { key, name: cli.name, desc, type: 'cli' };
+  });
   const selectedClis = await multiSelect(cliOptions);
   if (selectedClis.length > 0) {
     let agentsInstalled = 0;
@@ -720,7 +766,7 @@ async function init(skipPrompts = false) {
     log(`  ✅ Agent instructions installed for ${agentsInstalled} CLI${agentsInstalled !== 1 ? 's' : ''}`, 'green');
   } else {
     log('💡 No additional CLI instructions selected', 'dim');
-    log('   Run `npx devlyn-cli agents codex` later to install Codex AGENTS.md', 'dim');
+    log('   Run `npx devlyn-cli agents codex` later to install Codex AGENTS.md + /devlyn skills', 'dim');
   }
   // Ask about optional addons (local skills + external packs)

package/config/skills/_shared/archive_run.py CHANGED Viewed

@@ -26,6 +26,7 @@ PER_RUN_PATTERNS = (
     "*.log.md",
     "fix-batch.round-*.json",
     "criteria.generated.md",
+    "risk-probes.jsonl",
     # iter-0019.8: spec-verify carrier artifacts get archived alongside
     # other per-run state. Killed mid-run cleanup is enforced separately
     # by spec-verify-check.py main() — when source markdown has no json
@@ -35,6 +36,7 @@ PER_RUN_PATTERNS = (
     "spec-verify.json",
     "spec-verify.results.json",
     "spec-verify-findings.jsonl",
+    "verify-merge.summary.json",
     # iter-0033a/2026-04-30 archive-fix iter: NEW /devlyn:resolve emits
     # plan.md (PLAN output) + final-report.md (PHASE 6 render) +
     # cumulative.patch (cumulative diff). Smoke 2's archive listing
@@ -52,6 +54,7 @@ PER_RUN_PATTERNS = (
     # ("pair_judge findings archive distinguishable") would false-fail on
     # every paired fixture without this glob.
     "verify-judge-*.md",
+    "codex-judge.*",
 )

package/config/skills/_shared/codex-config.md CHANGED Viewed

@@ -6,7 +6,7 @@ Single source of truth for how every skill calls Codex. **MCP is not used.** Ski
 All long-running Codex calls go through `codex-monitored.sh` — a thin wrapper that closes stdin (codex 0.124.0 hangs when both stdin is open and a prompt arg is given), streams Codex stdout fully (no `tail -n` truncation), and prints a `[codex-monitored] heartbeat` line every 30s so the outer `claude -p` byte-watchdog stays fed during long reasoning gaps. The wrapper passes its arguments through verbatim to the underlying CLI, so the canonical flag set is unchanged from a raw call — only the launcher differs.
-**Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex.
+**Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex. Read-only critique returns findings on stdout; the orchestrator writes any files.
 ```bash
 bash .claude/skills/_shared/codex-monitored.sh \
@@ -51,4 +51,4 @@ The local Codex CLI (fronted by `codex-monitored.sh`) is the primary (and only)
 Skills write the invocation as a Bash command the runtime executes. Example shape from `/devlyn:resolve` PHASE 2 IMPLEMENT when routed to Codex:
-> Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout.
+> Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout. Do not pipe the wrapper; direct capture or file redirection preserves streaming and avoids the pipe-refusal exit.

package/config/skills/_shared/codex-monitored.sh CHANGED Viewed

@@ -41,7 +41,10 @@
 #
 # ENV OVERRIDES:
 #   CODEX_MONITORED_HEARTBEAT      — heartbeat interval seconds (default 30).
-#   CODEX_BIN                      — real codex binary path. Default: `codex`.
+#   CODEX_MONITORED_TIMEOUT_SEC    — optional hard timeout. When >0, kill the
+#                                     codex process group and exit 124.
+#   CODEX_BIN                      — real codex binary path. Default:
+#                                     CODEX_REAL_BIN when set, else `codex`.
 #                                     Set this when the shim has put us first
 #                                     on PATH.
 #   CODEX_MONITORED_ALLOW_PIPED    — set non-empty to skip the pipe-stdout
@@ -63,8 +66,10 @@ if [ -n "${CODEX_BLOCKED:-}" ]; then
 fi
 HEARTBEAT_SEC="${CODEX_MONITORED_HEARTBEAT:-30}"
-CODEX_BIN="${CODEX_BIN:-codex}"
+TIMEOUT_SEC="${CODEX_MONITORED_TIMEOUT_SEC:-0}"
+CODEX_BIN="${CODEX_BIN:-${CODEX_REAL_BIN:-codex}}"
 START=$(date +%s)
+TIMEOUT_FLAG=""
 # --- Pipe-stdout refusal (iter-0009 R2 finding #1) -------------------------
 # `[ -p /dev/stdout ]` is the POSIX test for "is fd 1 a FIFO/pipe". Verified
@@ -106,35 +111,95 @@ heartbeat_loop() {
   done
 }
+timeout_loop() {
+  local pid="$1"
+  local seconds="$2"
+  local flag="$3"
+  [ "$seconds" -gt 0 ] || return 0
+  sleep "$seconds"
+  if kill -0 "$pid" 2>/dev/null; then
+    : > "$flag"
+    printf '[codex-monitored] timeout: elapsed=%ds limit=%ds\n' \
+      "$(( $(date +%s) - START ))" "$seconds" >&2
+    kill -TERM -- "-$pid" 2>/dev/null || kill -TERM "$pid" 2>/dev/null || true
+    sleep 5
+    kill -KILL -- "-$pid" 2>/dev/null || kill -KILL "$pid" 2>/dev/null || true
+  fi
+}
+terminate_process_group() {
+  local pgid="$1"
+  local reason="$2"
+  if ! kill -0 -- "-$pgid" 2>/dev/null; then
+    return 0
+  fi
+  printf '[codex-monitored] reap: reason=%s pgid=%s\n' "$reason" "$pgid" >&2
+  kill -TERM -- "-$pgid" 2>/dev/null || true
+  local i
+  for i in 1 2 3 4 5; do
+    sleep 1
+    if ! kill -0 -- "-$pgid" 2>/dev/null; then
+      return 0
+    fi
+  done
+  kill -KILL -- "-$pgid" 2>/dev/null || true
+}
+# Relay signal `sig` ($1) to codex and stop the helper loops.
+# Codex is signalled through its process group first ("-$CODEX_PID"), falling
+# back to the single PID when the group signal fails. The heartbeat and
+# watchdog helpers always receive TERM, whatever `sig` is. Every step is
+# skipped when the corresponding PID variable is unset or the process is
+# already gone.
 forward_signal() {
   local sig="$1"
   if [ -n "${CODEX_PID:-}" ] && kill -0 "$CODEX_PID" 2>/dev/null; then
-    kill -"$sig" "$CODEX_PID" 2>/dev/null || true
+    kill -"$sig" -- "-$CODEX_PID" 2>/dev/null || kill -"$sig" "$CODEX_PID" 2>/dev/null || true
   fi
   if [ -n "${HB_PID:-}" ] && kill -0 "$HB_PID" 2>/dev/null; then
     kill -TERM "$HB_PID" 2>/dev/null || true
   fi
+  if [ -n "${WATCHDOG_PID:-}" ] && kill -0 "$WATCHDOG_PID" 2>/dev/null; then
+    kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
+  fi
+}
+cleanup() {
+  forward_signal TERM
+  [ -z "$TIMEOUT_FLAG" ] || rm -f "$TIMEOUT_FLAG"
 }
-trap 'forward_signal TERM' TERM
-trap 'forward_signal INT' INT
+# On TERM/INT: relay the signal, then exit with 128 + signal number (143 for
+# TERM, 130 for INT). The EXIT trap runs cleanup on every exit path.
+trap 'forward_signal TERM; exit 143' TERM
+trap 'forward_signal INT; exit 130' INT
+trap cleanup EXIT
-printf '[codex-monitored] start: ts=%s heartbeat=%ds bin=%s\n' \
-  "$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$CODEX_BIN" >&2
+printf '[codex-monitored] start: ts=%s heartbeat=%ds timeout=%ss bin=%s\n' \
+  "$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$TIMEOUT_SEC" "$CODEX_BIN" >&2
 # Launch codex with stdin closed; output streams directly to OUR stdout/stderr.
+# Job control (set -m) is switched on only around the launch; per the bash
+# manual this puts the background job in its own process group, which is what
+# lets "-$CODEX_PID" address codex together with its descendants.
+set -m
 "$CODEX_BIN" exec "$@" < /dev/null &
 CODEX_PID=$!
+set +m
 printf '[codex-monitored] codex pid=%d\n' "$CODEX_PID" >&2
 heartbeat_loop "$CODEX_PID" &
 HB_PID=$!
+# Optional hard timeout. mktemp reserves a unique path and the file is removed
+# again right away: the flag file's existence is the "timed out" signal, and
+# only timeout_loop creates it.
+if [ "$TIMEOUT_SEC" -gt 0 ]; then
+  TIMEOUT_FLAG=$(mktemp "${TMPDIR:-/tmp}/codex-monitored-timeout.XXXXXX")
+  rm -f "$TIMEOUT_FLAG"
+  timeout_loop "$CODEX_PID" "$TIMEOUT_SEC" "$TIMEOUT_FLAG" &
+  WATCHDOG_PID=$!
+fi
 wait "$CODEX_PID"
 EXIT=$?
+# Codex itself has exited; reap anything it left behind in its process group.
+terminate_process_group "$CODEX_PID" "post-exit-descendants"
 kill -TERM "$HB_PID" 2>/dev/null || true
 wait "$HB_PID" 2>/dev/null || true
+if [ -n "${WATCHDOG_PID:-}" ]; then
+  kill -TERM "$WATCHDOG_PID" 2>/dev/null || true
+  wait "$WATCHDOG_PID" 2>/dev/null || true
+fi
+# A flag file means the watchdog fired: report 124 instead of codex's status.
+if [ -n "$TIMEOUT_FLAG" ] && [ -f "$TIMEOUT_FLAG" ]; then
+  EXIT=124
+fi
 printf '[codex-monitored] codex exited: code=%d elapsed=%ds\n' \
   "$EXIT" $(( $(date +%s) - START )) >&2