npm - devlyn-cli - Versions diffs - 1.15.0 → 2.0.0 - Mend

devlyn-cli 1.15.0 → 2.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (158) hide show

package/bin/devlyn.js CHANGED Viewed

@@ -17,6 +17,7 @@ const CLI_TARGETS = {
   codex: {
     name: 'Codex CLI (OpenAI)',
     instructionsFile: 'AGENTS.md',
+    baseInstructionsFile: 'AGENTS.md',
     configDir: null, // Codex uses AGENTS.md at project root
     detect: () => fs.existsSync(path.join(process.cwd(), 'AGENTS.md')) || fs.existsSync(path.join(process.cwd(), '.codex')),
   },
@@ -68,8 +69,15 @@ const DEPRECATED_FILES = [
   'commands/devlyn.pencil-push.md', // migrated to skills/devlyn:pencil-push
 ];
-// Skill directories renamed from devlyn-* to devlyn:* in v0.7.x
+// Skill directories renamed from devlyn-* to devlyn:* in v0.7.x, plus
+// iter-0034 Phase 4 cutover (2026-05-03): 15 user skills deleted and 3 moved
+// to optional-skills/. Listed here so post-cutover `npx devlyn-cli` upgrades
+// force-remove stale legacy skill dirs from downstream `~/.claude/skills/`
+// even though the source dirs no longer exist (cleanManagedSkillDirs only
+// removes target dirs that still exist in source — without this list,
+// deleted-from-source skills persist in user installs forever).
 const DEPRECATED_DIRS = [
+  // v0.7.x rename: devlyn-* → devlyn:*
   'skills/devlyn-clean',
   'skills/devlyn-design-system',
   'skills/devlyn-design-ui',
@@ -87,6 +95,28 @@ const DEPRECATED_DIRS = [
   'skills/devlyn-update-docs',
   'skills/devlyn-pencil-pull',
   'skills/devlyn-pencil-push',
+  // iter-0034 Phase 4 cutover: deleted user skills
+  'skills/devlyn:auto-resolve',
+  'skills/devlyn:browser-validate',
+  'skills/devlyn:clean',
+  'skills/devlyn:design-ui',
+  'skills/devlyn:discover-product',
+  'skills/devlyn:evaluate',
+  'skills/devlyn:feature-spec',
+  'skills/devlyn:implement-ui',
+  'skills/devlyn:preflight',
+  'skills/devlyn:product-spec',
+  'skills/devlyn:recommend-features',
+  'skills/devlyn:review',
+  'skills/devlyn:team-resolve',
+  'skills/devlyn:team-review',
+  'skills/devlyn:update-docs',
+  // iter-0034 Phase 4 cutover: moved to optional-skills/. Force-removed on
+  // upgrade so users only have them if they opt in via the interactive
+  // installer (matches the pencil-pull / pencil-push pattern).
+  'skills/devlyn:reap',
+  'skills/devlyn:design-system',
+  'skills/devlyn:team-design-ui',
 ];
 function getTargetDir() {
@@ -148,6 +178,9 @@ const OPTIONAL_ADDONS = [
   { name: 'dokkit', desc: 'Document template filling for DOCX/HWPX — ingest, fill, review, export', type: 'local' },
   { name: 'devlyn:pencil-pull', desc: 'Pull Pencil designs into code with exact visual fidelity', type: 'local' },
   { name: 'devlyn:pencil-push', desc: 'Push codebase UI to Pencil canvas for design sync', type: 'local' },
+  { name: 'devlyn:reap', desc: 'Safely reap orphaned MCP / codex / Superset child processes left behind by long Claude sessions', type: 'local' },
+  { name: 'devlyn:design-system', desc: 'Extract design tokens from a chosen UI style for exact reproduction (creative power-user)', type: 'local' },
+  { name: 'devlyn:team-design-ui', desc: '5 distinct UI style explorations from a full design team (creative power-user)', type: 'local' },
   // External skill packs (installed via npx skills add)
   { name: 'vercel-labs/agent-skills', desc: 'React, Next.js, React Native best practices', type: 'external' },
   { name: 'supabase/agent-skills', desc: 'Supabase integration patterns', type: 'external' },
@@ -155,8 +188,10 @@ const OPTIONAL_ADDONS = [
   { name: 'anthropics/skills', desc: 'Official Anthropic skill-creator with eval framework and description optimizer', type: 'external' },
   { name: 'Leonxlnx/taste-skill', desc: 'Premium frontend design skills — modern layouts, animations, and visual refinement', type: 'external' },
   // MCP servers (installed via claude mcp add)
-  { name: 'codex-cli', desc: 'Codex MCP server for cross-model evaluation via OpenAI Codex', type: 'mcp', command: 'npx -y codex-mcp-server' },
-  { name: 'playwright', desc: 'Playwright MCP for browser testing — powers devlyn:browser-validate Tier 2', type: 'mcp', command: 'npx -y @anthropic-ai/mcp-playwright' },
+  // Note: the Codex integration uses the local `codex` CLI binary (not MCP).
+  // Install the CLI separately per https://platform.openai.com/docs/codex — the
+  // harness auto-detects availability and downgrades to Claude-only on failure.
+  { name: 'playwright', desc: 'Playwright MCP for browser testing — powers /devlyn:resolve BUILD_GATE browser tier', type: 'mcp', command: 'npx -y @anthropic-ai/mcp-playwright' },
 ];
 function log(msg, color = 'reset') {
@@ -262,7 +297,7 @@ function cleanupDeprecated(targetDir) {
     const fullPath = path.join(targetDir, relPath);
     if (fs.existsSync(fullPath)) {
       fs.rmSync(fullPath, { recursive: true });
-      log(`  ✕ ${relPath}/ (renamed)`, 'dim');
+      log(`  ✕ ${relPath}/ (removed)`, 'dim');
       removed++;
     }
   }
@@ -273,6 +308,8 @@ function copyRecursive(src, dest, baseDir) {
   const stats = fs.statSync(src);
   if (stats.isDirectory()) {
+    // Never install dev workspaces, even when running from source repo.
+    if (UNSHIPPED_SKILL_DIRS.has(path.basename(src))) return;
     if (!fs.existsSync(dest)) {
       fs.mkdirSync(dest, { recursive: true });
     }
@@ -290,6 +327,37 @@ function copyRecursive(src, dest, baseDir) {
   }
 }
+// Dev artifacts that live under config/skills/ but must never ship or install.
+// Mirrors the `!` exclusions in package.json files[].
+const UNSHIPPED_SKILL_DIRS = new Set([
+  'devlyn:auto-resolve-workspace',
+  'devlyn:ideate-workspace',
+  'preflight-workspace',
+  'roadmap-archival-workspace',
+]);
+// Clean managed skill directories before copy to prevent stale-file drift.
+// copyRecursive is a pure overlay: if a file was removed or renamed in source,
+// the installed mirror keeps the old copy. For each top-level dir under
+// config/skills/, remove its counterpart in target/skills/ before the copy so
+// each managed skill is fully replaced on every sync. User-installed skills
+// (e.g. skill-creator from optional addons) are left alone because they have
+// no counterpart in source. Dev workspaces are skipped entirely.
+function cleanManagedSkillDirs(sourceSkillsDir, targetSkillsDir) {
+  if (!fs.existsSync(sourceSkillsDir) || !fs.existsSync(targetSkillsDir)) return 0;
+  let cleaned = 0;
+  for (const entry of fs.readdirSync(sourceSkillsDir, { withFileTypes: true })) {
+    if (!entry.isDirectory()) continue;
+    if (UNSHIPPED_SKILL_DIRS.has(entry.name)) continue;
+    const targetPath = path.join(targetSkillsDir, entry.name);
+    if (fs.existsSync(targetPath)) {
+      fs.rmSync(targetPath, { recursive: true, force: true });
+      cleaned++;
+    }
+  }
+  return cleaned;
+}
 function multiSelect(items) {
   return new Promise((resolve) => {
     const selected = new Set();
@@ -310,8 +378,8 @@ function multiSelect(items) {
         const checkbox = selected.has(i) ? `${COLORS.green}◉${COLORS.reset}` : `${COLORS.dim}○${COLORS.reset}`;
         const pointer = i === cursor ? `${COLORS.cyan}❯${COLORS.reset}` : ' ';
         const name = i === cursor ? `${COLORS.cyan}${item.name}${COLORS.reset}` : item.name;
-        const tagLabel = item.type === 'mcp' ? 'mcp' : item.type === 'local' ? 'skill' : 'pack';
-        const tagColor = item.type === 'mcp' ? COLORS.green : item.type === 'local' ? COLORS.magenta : COLORS.cyan;
+        const tagLabel = item.type === 'mcp' ? 'mcp' : item.type === 'local' ? 'skill' : item.type === 'cli' ? 'cli' : 'pack';
+        const tagColor = item.type === 'mcp' ? COLORS.green : item.type === 'local' ? COLORS.magenta : item.type === 'cli' ? COLORS.blue : COLORS.cyan;
         const tag = `${tagColor}${tagLabel}${COLORS.reset}`;
         console.log(`${pointer} ${checkbox} ${name} ${COLORS.dim}[${tag}${COLORS.dim}]${COLORS.reset}`);
         console.log(`    ${COLORS.dim}${item.desc}${COLORS.reset}`);
@@ -482,6 +550,11 @@ function installAgentsForCLI(cliKey) {
         const sepIdx = existing.lastIndexOf('---', markerIdx);
         existing = existing.slice(0, sepIdx > 0 ? sepIdx : markerIdx).trimEnd();
       }
+    } else if (cli.baseInstructionsFile) {
+      const baseInstructionsSrc = path.join(__dirname, '..', cli.baseInstructionsFile);
+      if (fs.existsSync(baseInstructionsSrc)) {
+        existing = fs.readFileSync(baseInstructionsSrc, 'utf8').trimEnd();
+      }
     }
     fs.writeFileSync(destFile, existing + separator + agentContent + '\n');
@@ -514,6 +587,13 @@ async function init(skipPrompts = false) {
   // Install core config
   const targetDir = getTargetDir();
   log('\n📁 Installing core config to .claude/', 'green');
+  const refreshed = cleanManagedSkillDirs(
+    path.join(CONFIG_SOURCE, 'skills'),
+    path.join(targetDir, 'skills'),
+  );
+  if (refreshed > 0) {
+    log(`  🔄 Refreshing ${refreshed} managed skill director${refreshed === 1 ? 'y' : 'ies'}`, 'dim');
+  }
   copyRecursive(CONFIG_SOURCE, targetDir, targetDir);
   // Remove deprecated files from previous versions
@@ -522,7 +602,8 @@ async function init(skipPrompts = false) {
     log(`\n🧹 Cleaned up ${removed} deprecated file${removed > 1 ? 's' : ''}`, 'yellow');
   }
-  // Copy CLAUDE.md to project root
+  // Copy Claude project instructions to project root. Other CLI instruction
+  // files are installed only when explicitly selected below or via `agents`.
   const claudeMdSrc = path.join(__dirname, '..', 'CLAUDE.md');
   const claudeMdDest = path.join(process.cwd(), 'CLAUDE.md');
   if (fs.existsSync(claudeMdSrc)) {
@@ -609,26 +690,39 @@ async function init(skipPrompts = false) {
     log('  → ~/.claude/settings.json (disabled adaptive thinking, enabled 1h prompt caching)', 'dim');
   }
-  // Install agents for other detected CLIs
-  const detected = detectOtherCLIs();
-  if (detected.length > 0) {
-    log(`\n🔍 Detected other AI CLIs: ${detected.map((k) => CLI_TARGETS[k].name).join(', ')}`, 'blue');
-    const agentsInstalled = installAgentsForAllDetected();
-    if (agentsInstalled > 0) {
-      log(`  ✅ Agent instructions installed for ${agentsInstalled} CLI${agentsInstalled > 1 ? 's' : ''}`, 'green');
-    }
-  }
   log('\n✅ Core config installed!', 'green');
   // Skip prompts if -y flag or non-interactive
   if (skipPrompts || !process.stdin.isTTY) {
     log('\n💡 Add optional addons later: run `npx devlyn-cli` without -y', 'dim');
+    log('   Add Codex instructions later: run `npx devlyn-cli agents codex`', 'dim');
     log(`\n${COLORS.dim}   Enjoying devlyn? Star it on GitHub — it helps others find it:${COLORS.reset}`);
     log(`   ${COLORS.purple}→ https://github.com/fysoul17/devlyn-cli${COLORS.reset}\n`);
     return;
   }
+  // Ask which non-Claude CLIs should receive instruction files.
+  log('\n🤖 Optional AI CLI instructions:\n', 'blue');
+  const cliOptions = Object.entries(CLI_TARGETS).map(([key, cli]) => ({
+    key,
+    name: cli.name,
+    desc: cli.configDir
+      ? `Install agents into ${cli.configDir}/`
+      : `Install ${cli.instructionsFile}`,
+    type: 'cli',
+  }));
+  const selectedClis = await multiSelect(cliOptions);
+  if (selectedClis.length > 0) {
+    let agentsInstalled = 0;
+    for (const selectedCli of selectedClis) {
+      if (installAgentsForCLI(selectedCli.key)) agentsInstalled++;
+    }
+    log(`  ✅ Agent instructions installed for ${agentsInstalled} CLI${agentsInstalled !== 1 ? 's' : ''}`, 'green');
+  } else {
+    log('💡 No additional CLI instructions selected', 'dim');
+    log('   Run `npx devlyn-cli agents codex` later to install Codex AGENTS.md', 'dim');
+  }
   // Ask about optional addons (local skills + external packs)
   log('\n📚 Optional skills & packs:\n', 'blue');
@@ -657,6 +751,9 @@ function showHelp() {
   log('  npx devlyn-cli -y           Install without prompts');
   log('  npx devlyn-cli agents       Install agents for detected CLIs');
   log('  npx devlyn-cli agents all   Install agents for all supported CLIs');
+  log('  npx devlyn-cli benchmark    Run the full A/B benchmark suite vs bare');
+  log('  npx devlyn-cli benchmark --n 3 --bless   Ship-decision run + promote baseline if pass');
+  log('  npx devlyn-cli benchmark --dry-run       Validate suite setup without model invocation');
   log('  npx devlyn-cli --help       Show this help\n');
   log('Optional skills (select during install):', 'green');
   OPTIONAL_ADDONS.filter((a) => a.type === 'local').forEach((skill) => {
@@ -694,6 +791,21 @@ switch (command) {
   case 'ls':
     listContents();
     break;
+  case 'benchmark':
+  case 'bench': {
+    // Delegate to benchmark/auto-resolve/scripts/run-suite.sh with all remaining args.
+    const runSuite = path.join(__dirname, '..', 'benchmark', 'auto-resolve', 'scripts', 'run-suite.sh');
+    if (!fs.existsSync(runSuite)) {
+      log('❌ Benchmark suite runner missing — is this a clean devlyn-cli checkout?', 'yellow');
+      log(`   Expected: ${runSuite}`, 'dim');
+      process.exit(1);
+    }
+    const { spawnSync } = require('child_process');
+    const forwardedArgs = args.slice(1);
+    const res = spawnSync('bash', [runSuite, ...forwardedArgs], { stdio: 'inherit' });
+    process.exit(res.status ?? 1);
+    break;
+  }
   case 'agents': {
     showLogo();
     log('─'.repeat(44), 'dim');

package/config/skills/_shared/adapters/README.md ADDED Viewed

@@ -0,0 +1,64 @@
+# Per-engine prompt adapters
+This folder is the LLM-specific delta layer. The harness's canonical phase prompts (in each skill's `references/phases/<phase>.md`) stay model-neutral and outcome-first. Each adapter file in this folder is a **small delta header** that gets injected BEFORE the canonical body when the phase runs against that specific engine.
+## Why adapters exist
+Anthropic and OpenAI publish official prompt-engineering guides for their flagship models. The two guides converge on outcome-first + decision rules + mechanical validation but **diverge on tactics** (XML structure vs stop-rules format, literal interpretation vs decision-rule phrasing, self-check pattern vs validation-tool primacy). A single canonical prompt can't hit both ceilings.
+The split:
+- **Canonical body** (in `<skill>/references/phases/`) = the contract: goal, output format, invariants, common-ground rules from both guides.
+- **Adapter header** (here) = the per-engine elaboration: model-specific guidance from that engine's official guide.
+This is also the load-bearing piece for **multi-LLM evolution**. When Qwen / Gemini / Gemma are added (Mission 2/3), each gets its own adapter file here. The canonical body never moves.
+## Format
+Each adapter is a single markdown file named `<model-id>.md` (e.g. `opus-4-7.md`, `gpt-5-5.md`). Structure:
+```markdown
+# <Model name> adapter
+> Source: <official-prompt-engineering-guide URL>
+## Identity
+1-2 lines telling the model who it is + which guide governs.
+## Output discipline
+Verbosity, formatting, length conventions specific to this model.
+## Tool-use posture
+When to use tools, when to reason, parallel/sequential preferences.
+## Validation pattern
+How this model verifies its work — mechanical-first vs self-check, etc.
+## Anti-patterns
+Specific patterns the official guide warns about for this model.
+```
+Keep each section to ≤ 8 lines. Adapters are deltas, not full prompts. If an adapter grows past ~80 lines, the content probably belongs in the canonical body.
+## When to add a new adapter
+A new adapter file ships when:
+1. A new LLM is integrated into the pipeline (the engine is now invocable).
+2. An official prompt-engineering guide for that LLM exists (or a vendor-recommended pattern set).
+3. An empirical A/B shows the adapter's specific guidance lifts that model's performance over the canonical body alone.
+Not all models need adapters. If a model performs well on the canonical body without delta, ship without one.
+## What NOT to put here
+- ❌ Universal rules (those go in the canonical body or `_shared/runtime-principles.md`).
+- ❌ Iter-history annotations (`*(iter-0020: F4 evidence...)*` style).
+- ❌ Full phase prompts (defeats the decoupling).
+- ❌ Per-task or per-spec content (adapters are model-scope, not task-scope).
+## Runtime injection
+A skill's phase invocation prepends the resolved engine's adapter file to the canonical body before sending it. The mechanism is left to each skill (a `_shared/adapter-inject.sh` helper may land in a later iter); for now, skills consume the adapter file by reading it directly at phase-spawn time.
+## Standing rule
+Any iter that touches an adapter file MUST cite the corresponding official guide as part of acceptance: "guide section X.Y says Z, this change applies Z." Generic preferences ("feels cleaner") are rejected.

package/config/skills/_shared/adapters/gpt-5-5.md ADDED Viewed

@@ -0,0 +1,29 @@
+# OpenAI GPT-5.5 adapter
+> Source: <https://developers.openai.com/api/docs/guides/prompt-guidance?model=gpt-5.5>
+## Identity
+You are GPT-5.5 by OpenAI. OpenAI's prompt-guidance for this model governs your behavior on top of the canonical phase prompt below. When the canonical body and this header conflict on tactics, the canonical body wins on what to deliver; this header wins on how to deliver it.
+## Output discipline
+Your default is efficient, direct, task-oriented. The canonical body specifies the outcome and constraints; you choose the efficient path. Do not over-specify process steps when an outcome is clearly stated. Use headers, bullets, and bold sparingly — favor short paragraphs and natural transitions unless the canonical body or user requests structure. When `text.verbosity` is `low`, prefer even shorter responses.
+## Tool-use posture
+Resolve the request in the fewest useful tool loops without sacrificing correctness. For retrieval tasks: start with one broad search using short discriminative keywords; make another retrieval call only when the top results don't answer the core question or a required fact / parameter / source is missing. For tool-heavy tasks, start with a brief preamble: a one-line acknowledgment of the request and the first step you'll take.
+## Validation pattern
+Validation is concrete commands and tools, not self-belief. When the canonical body lists verification commands, execute them and trust their output. Do not substitute your judgment for a deterministic check the harness has provided. When validation tools are available (test runners, lint, type-check, the harness's `spec-verify-check.py`), run them before declaring success. Gather the minimum evidence sufficient to answer correctly, cite it precisely, then stop.
+## Anti-patterns
+The official guide warns explicitly about carrying over instructions from older prompt stacks — earlier models needed more help, and process-heavy directives now narrow GPT-5.5's search space.
+1. **Avoid absolute imperatives for judgment calls.** ALWAYS / NEVER / must / only are reserved for true safety invariants and required output fields. For judgment calls, use decision rules with conditions ("when X, do Y"). The canonical body uses this style; do not promote softer guidance to absolute rules.
+2. **Don't over-specify process when the destination is clear.** If the canonical body names the outcome, choose the path; do not narrate every step.
+3. **Stop rules are explicit.** When the canonical body or the harness asks you to stop / abstain / ask, follow the stop rule rather than retrying loops indefinitely. Loop-minimization does not outrank correctness or required citation.
+Do not narrate internal deliberation. State results and decisions directly.

package/config/skills/_shared/adapters/opus-4-7.md ADDED Viewed

@@ -0,0 +1,29 @@
+# Claude Opus 4.7 adapter
+> Source: <https://platform.claude.com/docs/en/build-with-claude/prompt-engineering/claude-prompting-best-practices>
+## Identity
+You are Claude Opus 4.7 by Anthropic. Anthropic's prompt-engineering guide for this model governs your behavior on top of the canonical phase prompt below. When the canonical body and this header conflict on tactics, the canonical body wins on what to deliver; this header wins on how to deliver it.
+## Output discipline
+You calibrate response length to task complexity automatically — keep simple lookups short, scale up only when the task warrants it. Do NOT pad with context the user didn't ask for. When the canonical body sets a structural format (XML, JSON, sections), follow it literally; do not silently restructure.
+## Tool-use posture
+You default to fewer tool calls than prior Claude generations. When the canonical body lists tools, use them when their result would change your answer. Make independent tool calls in parallel; chain only when one depends on another's output. Do not narrate "I'll now call X" preambles unless the canonical body requests progress updates.
+## Validation pattern
+When the canonical body asks you to verify your output before declaring done ("self-check" instructions), execute that step literally — re-read the spec's acceptance criteria, run the listed verification commands if available, list any gap. This is not optional. Mechanical gates owned by the harness (spec-verify-check.py, build-gate.py) are the primary correctness guard; your self-check is the secondary layer that catches what regex cannot.
+## Anti-patterns
+You interpret instructions more literally than prior Claude versions. The official guide is explicit about three failure modes:
+1. **Review-prompt self-filtering**: when the canonical body asks for findings, report every issue you find — including low-severity and low-confidence ones. Do NOT pre-filter for importance; the harness has a separate filter step.
+2. **Subagent over-spawning**: do NOT spawn a subagent for work you can complete in a single response. Spawn only when the canonical body explicitly requests it OR when fanning out across independent items.
+3. **Overengineering**: do NOT add files, abstractions, error handling, validation, or "future flexibility" beyond what the spec asks. A bug fix doesn't need surrounding cleanup. The right complexity is the minimum needed for the current task.
+You do NOT need stronger imperatives ("CRITICAL!", "YOU MUST!") to follow rules. Normal phrasing is sufficient.

package/config/skills/{devlyn:auto-resolve/scripts → _shared}/archive_run.py RENAMED Viewed

@@ -26,6 +26,32 @@ PER_RUN_PATTERNS = (
     "*.log.md",
     "fix-batch.round-*.json",
     "criteria.generated.md",
+    # iter-0019.8: spec-verify carrier artifacts get archived alongside
+    # other per-run state. Killed mid-run cleanup is enforced separately
+    # by spec-verify-check.py main() — when source markdown has no json
+    # block AND BENCH_WORKDIR is unset (real-user mode), the script drops
+    # any pre-existing .devlyn/spec-verify.json so a stale orphan from a
+    # killed prior run cannot poison this run's gate.
+    "spec-verify.json",
+    "spec-verify.results.json",
+    "spec-verify-findings.jsonl",
+    # iter-0033a/2026-04-30 archive-fix iter: NEW /devlyn:resolve emits
+    # plan.md (PLAN output) + final-report.md (PHASE 6 render) +
+    # cumulative.patch (cumulative diff). Smoke 2's archive listing
+    # captured all three; archive_run.py was missing them because the
+    # patterns predated the new skill's artifact set. Added explicitly
+    # so the move is deterministic.
+    "plan.md",
+    "final-report.md",
+    "cumulative.patch",
+    # iter-0033c (Codex R-final-smoke Q2): pair-mode VERIFY emits per-judge
+    # deliberation transcripts (verify-judge-claude.md / verify-judge-codex.md
+    # — and any future-engine analogue via wildcard). Smoke 1a (F2 l2_forced)
+    # surfaced the gap: the orchestrator wrote them and listed them as
+    # artifacts, but archive_run.py left them in .devlyn/. Gate 8
+    # ("pair_judge findings archive distinguishable") would false-fail on
+    # every paired fixture without this glob.
+    "verify-judge-*.md",
 )

package/config/skills/_shared/codex-config.md ADDED Viewed

@@ -0,0 +1,54 @@
+# Shared — Codex Invocation
+Single source of truth for how every skill calls Codex. **MCP is not used.** Skills shell out via the wrapper at `_shared/codex-monitored.sh`, which fronts the local Codex CLI (shipped by the `openai-codex` Claude Code plugin).
+## Canonical invocations
+All long-running Codex calls go through `codex-monitored.sh` — a thin wrapper that closes stdin (codex 0.124.0 hangs when stdin is open and a prompt arg is also given), streams Codex stdout fully (no `tail -n` truncation), and prints a `[codex-monitored] heartbeat` line every 30s so the outer `claude -p` byte-watchdog stays fed during long reasoning gaps. The wrapper passes its arguments through verbatim to the underlying CLI, so the canonical flag set is unchanged from a raw call — only the launcher differs.
+**Read-only critique / adversarial review / debate** (ideate CHALLENGE phase, `/devlyn:resolve` VERIFY pair-mode when triggered). Security review is delegated to the native `security-review` Claude Code skill, invoked from `/devlyn:resolve` BUILD_GATE rather than from Codex.
+```bash
+bash .claude/skills/_shared/codex-monitored.sh \
+  -C <project-root> \
+  -s read-only \
+  -c model_reasoning_effort=xhigh \
+  "<inlined-prompt>"
+```
+**Workspace-write implementation** (`/devlyn:resolve` IMPLEMENT phase when `--engine codex` or `--engine auto` routes to Codex, plus codex-routed `/devlyn:ideate` phases):
+```bash
+bash .claude/skills/_shared/codex-monitored.sh \
+  -C <project-root> \
+  --full-auto \
+  -c model_reasoning_effort=xhigh \
+  "<inlined-prompt>"
+```
+Notes:
+- `-C` — project root so Codex's working directory matches.
+- `-s read-only` / `--full-auto` — sandbox policy. `--full-auto` = `-s workspace-write` with auto-approval of sandboxed commands.
+- `-c model_reasoning_effort=xhigh` — config override for reasoning depth. Required for deep critique; skills may choose `high` or `medium` when thoroughness doesn't warrant xhigh.
+- **Omit `-m <model>`** — Codex CLI uses its configured flagship (currently `gpt-5.5`, automatically whatever ships next). This is the zero-touch mechanism. Only name `-m` when a role explicitly needs a different model (e.g., `gpt-5.3-codex` for SWE-bench-heavy coding tasks, `gpt-5.3-codex-spark` for speed).
+- Raw `codex exec ...` invocations are **forbidden** in skill prompts. The benchmark variant arm runs a PATH shim (`scripts/codex-shim/codex`) that transparently re-routes any raw `codex exec` to the wrapper as a safety net, but skills should always emit the wrapper form directly so the orchestrator's first-attempt has the right shape. Two prior iterations (iter-0006 universal foreground ban, iter-0008 prompt-level kill-shape contract) failed because the orchestrator picked starvation-prone shapes (`codex exec ... 2>&1 | tail -200`) from its own pattern prior — the wrapper plus the shim is the runtime binding layer those iters lacked. See `autoresearch/iterations/0009-wrapper-and-hook.md`.
+## Availability check
+Before the first Codex call in a run, verify the CLI is on PATH:
+```bash
+command -v codex >/dev/null 2>&1
+```
+If the check fails, the skill follows the `_shared/engine-preflight.md` downgrade rule — silently switch to Claude for this run and log `engine downgraded: codex-unavailable` in the final report. Never prompt, never abort.
+## Why CLI over other paths
+The local Codex CLI (fronted by `codex-monitored.sh`) is the primary (and only) integration. It beats alternatives on three dimensions: the model is inherited from the CLI's own default so no skill edits are needed when OpenAI ships a new flagship; flags compose on the command line and the skill docs stay grep-friendly; the invocation has one failure mode (the binary is on PATH or it isn't), which the shared availability check covers cleanly.
+## Invocation from inside a skill prompt
+Skills write the invocation as a Bash command the runtime executes. Example shape from `/devlyn:resolve` PHASE 2 IMPLEMENT when routed to Codex:
+> Run `bash .claude/skills/_shared/codex-monitored.sh -C <state.base_ref.repo_root> --full-auto -c model_reasoning_effort=xhigh "<IMPLEMENT prompt>"`. Omit `-m` so the CLI flagship is auto-selected. Capture stdout as the IMPLEMENT reply; non-zero exit → treat as subagent failure. The wrapper emits `[codex-monitored]` heartbeat and lifecycle lines on **stderr** — stdout stays clean for Codex output, so the orchestrator can parse the reply without filtering. Heartbeat-on-stderr keeps the orchestrator's combined-output stream non-silent (defeats the iter-0008 byte-watchdog kill) without polluting the codex-reply view of stdout.

package/config/skills/_shared/codex-monitored.sh ADDED Viewed

@@ -0,0 +1,141 @@
+#!/usr/bin/env bash
+# codex-monitored.sh — run `codex exec` in a monitored shape that keeps the
+# outer claude -p API stream from going silent during long Codex calls.
+#
+# WHY (iter-0009, post iter-0006/0007/0008):
+#   • iter-0007 isolation proved a single foreground `codex exec` Bash dispatch
+#     can starve the outer API stream of bytes during a 10+ min run; Anthropic's
+#     byte-level idle watchdog fires (~300s) and kills the orchestrator.
+#   • iter-0008 saw the orchestrator pick `codex exec ... 2>&1 | tail -200` from
+#     its own pattern prior — `tail` on a pipe buffers until EOF, suppressing
+#     ALL bytes. Same starvation, amplified.
+#   • iter-0008 also documented codex 0.124.0 reads stdin as a `<stdin>` block
+#     when the prompt is passed as an arg AND stdin is open; without
+#     `< /dev/null` the call hangs indefinitely.
+#
+# WHAT THIS WRAPPER DOES:
+#   1. Refuses to run if stdout is a pipe. Piping wrapper output to text tools
+#      (tail/head/awk/sed/grep without --line-buffered) re-introduces the
+#      iter-0008 starvation mechanism — the downstream tool buffers until EOF
+#      and the outer claude -p byte-watchdog never sees bytes. Exits 64 with a
+#      clear message so the orchestrator can self-correct on retry.
+#      (Round 2 finding #1 fix: shim alone does not defeat `| tail`; the
+#      wrapper must reject the pipe shape directly.)
+#   2. Closes stdin (`< /dev/null`) — kills the codex 0.124.0 stdin hang.
+#   3. Passes codex stdout straight through to OUR stdout (codex inherits our
+#      fd, so the wrapper adds no buffering or filtering) — the orchestrator
+#      reads stdout as the subagent reply (per `_shared/codex-config.md`) so we
+#      MUST NOT swallow it (e.g. `tail -n 200`). codex stderr forwards to OUR
+#      stderr the same way.
+#   4. Emits a `[codex-monitored] heartbeat` line every CODEX_MONITORED_HEARTBEAT
+#      seconds (default 30s) on STDERR while codex is alive. Heartbeat-on-stderr
+#      keeps the orchestrator's combined-output stream non-silent without
+#      polluting the codex-reply view of stdout.
+#   5. Forwards SIGTERM/SIGINT from the outer watchdog to the codex child so a
+#      timeout actually reaps codex (otherwise process group kill races with
+#      backgrounded codex).
+#   6. Preserves codex's exact exit code.
+#
+# USAGE:
+#   bash codex-monitored.sh -C <repo> -s read-only -c model_reasoning_effort=xhigh "<prompt>"
+#   bash codex-monitored.sh resume --last
+#   (Args after the script name are passed verbatim to `codex exec`.)
+#
+# ENV OVERRIDES:
+#   CODEX_MONITORED_HEARTBEAT      — heartbeat interval seconds (default 30).
+#   CODEX_BIN                      — real codex binary path. Default: `codex`.
+#                                     Set this when the shim has put us first
+#                                     on PATH.
+#   CODEX_MONITORED_ALLOW_PIPED    — set non-empty to skip the pipe-stdout
+#                                     refusal. Reserved for tests; don't use
+#                                     in skill prompts.
+# Unset variables are errors and a pipeline fails if any stage fails; `-e` is
+# not enabled, so individual command failures do not abort the script.
+set -u -o pipefail
+# iter-0019 — solo_claude (L1) arm enforcement, second of two enforcement
+# points (the first is scripts/codex-shim/codex). The shim covers PATH-based
+# resolution of `codex`; this guard covers callers that run
+# codex-monitored.sh by direct path and therefore never touch the shim. When
+# CODEX_BLOCKED is non-empty the wrapper refuses to invoke codex at all.
+if [ "${CODEX_BLOCKED:-}" != "" ]; then
+  {
+    printf '[codex-monitored] CODEX_BLOCKED=%s — refusing codex invocation (solo_claude / L1 arm enforcement). args: %s\n' \
+      "${CODEX_BLOCKED}" "$*"
+  } >&2
+  exit 126
+fi
+# Wall-clock start (epoch seconds); elapsed= values are computed against it.
+START="$(date +%s)"
+# Real codex binary to run; callers may pre-set CODEX_BIN to override.
+: "${CODEX_BIN:=codex}"
+# Seconds between heartbeat lines.
+HEARTBEAT_SEC="${CODEX_MONITORED_HEARTBEAT:-30}"
+# --- Pipe-stdout refusal (iter-0009 R2 finding #1) -------------------------
+# `[ -p /dev/stdout ]` asks whether fd 1 is a FIFO/pipe. Per the original
+# verification (macOS, lsof) it separates a piped stdout (`| cat`) from a
+# redirect (`> file`) and from claude-bash-tool capture (regular file). The
+# shape being rejected is `bash WRAPPER ... 2>&1 | tail -200`: the downstream
+# tool would hold all wrapper output — heartbeat included, once `2>&1` merges
+# it — until EOF, which is the iter-0008 byte-watchdog kill all over again.
+# CODEX_MONITORED_ALLOW_PIPED (non-empty) disables the refusal.
+if [ -p /dev/stdout ] && [ -z "${CODEX_MONITORED_ALLOW_PIPED:-}" ]; then
+  cat <<'EOF' >&2
+[codex-monitored] error: stdout is a pipe.
+Piping the wrapper to tail/head/awk/sed/grep buffers wrapper output until EOF,
+which starves the outer claude -p byte-watchdog (iter-0008 starvation mechanism)
+and kills the run after ~300s with empty transcript.
+Fix: invoke the wrapper directly so the bash tool captures its stdout. The
+wrapper streams full Codex output and emits a heartbeat on stderr; you do NOT
+need to truncate.
+  WRONG: bash codex-monitored.sh ... 2>&1 | tail -200
+  RIGHT: bash codex-monitored.sh ...
+If you absolutely must filter, use a line-buffered tool (e.g. `grep --line-buffered`)
+and set CODEX_MONITORED_ALLOW_PIPED=1 in the wrapper's environment.
+EOF
+  exit 64
+fi
+# --- Heartbeat + signal forwarding ----------------------------------------
+heartbeat_loop() {
+  local pid="$1"
+  while kill -0 "$pid" 2>/dev/null; do
+    sleep "$HEARTBEAT_SEC"
+    if kill -0 "$pid" 2>/dev/null; then
+      local elapsed=$(( $(date +%s) - START ))
+      printf '[codex-monitored] heartbeat: elapsed=%ds\n' "$elapsed" >&2
+    fi
+  done
+}
+forward_signal() {
+  local sig="$1"
+  if [ -n "${CODEX_PID:-}" ] && kill -0 "$CODEX_PID" 2>/dev/null; then
+    kill -"$sig" "$CODEX_PID" 2>/dev/null || true
+  fi
+  if [ -n "${HB_PID:-}" ] && kill -0 "$HB_PID" 2>/dev/null; then
+    kill -TERM "$HB_PID" 2>/dev/null || true
+  fi
+}
+trap 'forward_signal TERM' TERM
+trap 'forward_signal INT' INT
+printf '[codex-monitored] start: ts=%s heartbeat=%ds bin=%s\n' \
+  "$(date -u +%FT%TZ)" "$HEARTBEAT_SEC" "$CODEX_BIN" >&2
+# Launch codex with stdin closed; output streams directly to OUR stdout/stderr.
+"$CODEX_BIN" exec "$@" < /dev/null &
+CODEX_PID=$!
+printf '[codex-monitored] codex pid=%d\n' "$CODEX_PID" >&2
+heartbeat_loop "$CODEX_PID" &
+HB_PID=$!
+wait "$CODEX_PID"
+EXIT=$?
+kill -TERM "$HB_PID" 2>/dev/null || true
+wait "$HB_PID" 2>/dev/null || true
+printf '[codex-monitored] codex exited: code=%d elapsed=%ds\n' \
+  "$EXIT" $(( $(date +%s) - START )) >&2
+exit "$EXIT"

package/config/skills/_shared/engine-preflight.md ADDED Viewed

@@ -0,0 +1,35 @@
+# Shared — `--engine` Pre-flight
+Used by `/devlyn:resolve` and `/devlyn:ideate`. One shared availability rule so every skill routes identically.
+## Rule
+Each skill resolves the effective engine from its own SKILL.md default plus any explicit `--engine` flag passed by the user. This pre-flight runs **only when the resolved engine is `auto` or `codex`** — when the resolved engine is `claude` (whether by skill default or explicit flag), the Codex check is skipped entirely.
+When the resolved engine is `auto` or `codex`, on entry (before spawning any phase that could route to Codex):
+1. Check if the Codex CLI is installed: `command -v codex >/dev/null 2>&1` (or equivalent bash test).
+2. On failure → silently set `engine = "claude"` for the remainder of this run AND log `engine downgraded: codex-unavailable` into the skill's final summary/report header.
+3. On success → proceed with the original engine value.
+Never prompt the user. Never abort the run on missing CLI.
+Per-skill defaults: `/devlyn:resolve` defaults to `claude` (post iter-0020 close-out — Codex BUILD/IMPLEMENT below quality floor; iter-0033g + iter-0034 close-out — PLAN-pair research-only until container/sandbox infra justifies a measurement); `/devlyn:ideate` defaults to `auto` for the CHALLENGE phase's cross-model GAN-critic dynamic. Each skill's SKILL.md flag block is the source of truth for that skill's default.
+## Why this is the one permitted silent fallback
+`CLAUDE.md` sets the no-silent-fallback rule for this repo. This downgrade is documented there as the single explicit exception because the hands-free contract — skills the user walks away from — would otherwise fail every run whenever the Codex CLI is absent. The user-visible behavior is identical to an explicit `--engine claude` invocation, and the banner in the final report removes the silence. Any other silent fallback in skills code is a bug.
+## What a skill must log after downgrade
+When the resolved engine was `auto` / `codex` and the Codex CLI was absent, the final user-facing report/summary shows both the requested and effective mode:
+```
+Engine: claude (downgraded from auto — codex-unavailable)
+```
+If no downgrade happened (either Codex was available, or the resolved engine was already `claude`), omit the parenthetical. That single line is the contract — the user can always see why Codex did or did not participate.
+## Canonical Codex invocation
+See `config/skills/_shared/codex-config.md` for the canonical wrapper invocation and flag set skills should use after the availability check passes.