npm - cipher-security - Versions diffs - 2.1.0 → 2.2.0 - Mend

cipher-security 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54)

package/bin/cipher.js +10 -0
package/lib/analyze/consistency.js +566 -0
package/lib/analyze/constitution.js +110 -0
package/lib/analyze/sharding.js +251 -0
package/lib/autonomous/agent-tool.js +165 -0
package/lib/autonomous/framework.js +17 -0
package/lib/autonomous/handoff.js +506 -0
package/lib/autonomous/modes/blue.js +26 -0
package/lib/autonomous/modes/red.js +28 -0
package/lib/benchmark/agent.js +88 -26
package/lib/benchmark/baselines.js +3 -0
package/lib/benchmark/claude-code-solver.js +254 -0
package/lib/benchmark/cognitive.js +283 -0
package/lib/benchmark/index.js +12 -2
package/lib/benchmark/knowledge.js +281 -0
package/lib/benchmark/llm.js +156 -15
package/lib/benchmark/models.js +5 -2
package/lib/benchmark/nyu-ctf.js +192 -0
package/lib/benchmark/overthewire.js +347 -0
package/lib/benchmark/picoctf.js +281 -0
package/lib/benchmark/prompts.js +280 -0
package/lib/benchmark/registry.js +219 -0
package/lib/benchmark/remote-solver.js +356 -0
package/lib/benchmark/remote-target.js +263 -0
package/lib/benchmark/reporter.js +35 -0
package/lib/benchmark/runner.js +174 -10
package/lib/benchmark/sandbox.js +35 -0
package/lib/benchmark/scorer.js +22 -4
package/lib/benchmark/solver.js +34 -1
package/lib/benchmark/tools.js +262 -16
package/lib/commands.js +9 -0
package/lib/execution/council.js +434 -0
package/lib/execution/parallel.js +292 -0
package/lib/gates/circuit-breaker.js +135 -0
package/lib/gates/confidence.js +302 -0
package/lib/gates/corrections.js +219 -0
package/lib/gates/self-check.js +245 -0
package/lib/gateway/commands.js +727 -0
package/lib/guardrails/engine.js +364 -0
package/lib/mcp/server.js +349 -3
package/lib/memory/compressor.js +94 -7
package/lib/pipeline/hooks.js +288 -0
package/lib/pipeline/index.js +11 -0
package/lib/review/budget.js +210 -0
package/lib/review/engine.js +526 -0
package/lib/review/layers/acceptance-auditor.js +279 -0
package/lib/review/layers/blind-hunter.js +500 -0
package/lib/review/layers/defense-in-depth.js +209 -0
package/lib/review/layers/edge-case-hunter.js +266 -0
package/lib/review/panel.js +519 -0
package/lib/review/two-stage.js +244 -0
package/lib/session/cost-tracker.js +203 -0
package/lib/session/logger.js +349 -0
package/package.json +1 -1

package/lib/gateway/commands.js CHANGED Viewed

@@ -1206,3 +1206,730 @@ export async function handleUpdate(args = {}) {
     return { error: true, message: `Update failed: ${err.message}` };
   }
 }
+// ---------------------------------------------------------------------------
+// Review — Multi-Layer Code Review Engine
+// ---------------------------------------------------------------------------
+/**
+ * cipher review <target> [--format json|text] [--min-severity critical|high|medium|low|info]
+ *
+ * Run 3-layer parallel code review: Blind Hunter (pattern-based vuln detection),
+ * Edge Case Hunter (boundary/failure analysis), Acceptance Auditor (security architecture).
+ *
+ * Argument handling: an array is treated as raw CLI tokens; any other value is
+ * reduced to `[args.target]`, so `--format` and `--min-severity` can only be
+ * supplied through the array form.
+ *
+ * NOTE(review): the target is the first token that does not start with `-`. A
+ * flag's value also satisfies that test, so `['--format', 'json', 'src/']`
+ * resolves the target to `'json'`. Confirm callers always pass the target first.
+ *
+ * @param {string[]|object} args CLI tokens, or an object whose `target` is used.
+ * @returns {Promise<object>} `{ error: true, message }` for missing target or a
+ *   caught failure; `result.toJSON()` when the format is `json`; otherwise
+ *   `{ output }` holding `result.toReport()`.
+ */
+export async function handleReview(args = {}) {
+  const argList = Array.isArray(args) ? args : [args.target].filter(Boolean); // object form contributes only its target
+  const target = argList.find((a) => !a.startsWith('-')); // first non-dash token (see NOTE above)
+  if (!target) {
+    return {
+      error: true,
+      message: [
+        'Usage: cipher review <file|dir|code> [options]',
+        '',
+        'Options:',
+        '  --format <text|json>      Output format (default: text)',
+        '  --min-severity <level>    Filter: critical, high, medium, low, info',
+        '',
+        'Runs 3 parallel review layers:',
+        '  • Blind Hunter       — Pattern-based vulnerability detection',
+        '  • Edge Case Hunter   — Boundary condition & failure analysis',
+        '  • Acceptance Auditor — Security architecture review',
+      ].join('\n'),
+    };
+  }
+  const format = argList.find((a, i) => argList[i - 1] === '--format') || 'text'; // token that follows --format
+  const minSeverity = argList.find((a, i) => argList[i - 1] === '--min-severity'); // undefined when the flag is absent
+  try {
+    const { createReviewEngine } = await import('../review/engine.js'); // imported on demand, inside the handler
+    const engine = await createReviewEngine();
+    const result = await engine.review(target, { minSeverity });
+    if (format === 'json') {
+      return result.toJSON();
+    }
+    return { output: result.toReport() };
+  } catch (err) {
+    return { error: true, message: `Review failed: ${err.message}` }; // failures are returned, not rethrown
+  }
+}
+// ---------------------------------------------------------------------------
+// Panel — Expert Panel Security Assessment
+// ---------------------------------------------------------------------------
+/**
+ * cipher panel <target> [--format json|text]
+ *
+ * Run 3-persona expert panel assessment: Red Team, Blue Team, Architect.
+ *
+ * Argument handling: an array is treated as raw CLI tokens; any other value is
+ * reduced to `[args.target]`, so `--format` can only be supplied through the
+ * array form.
+ *
+ * NOTE(review): the target is the first token that does not start with `-`, so
+ * a `--format` value placed before the target would be taken as the target.
+ *
+ * @param {string[]|object} args CLI tokens, or an object whose `target` is used.
+ * @returns {Promise<object>} `{ error: true, message }` for missing target or a
+ *   caught failure; `result.toJSON()` when the format is `json`; otherwise
+ *   `{ output }` holding `result.toReport()`.
+ */
+export async function handlePanel(args = {}) {
+  const argList = Array.isArray(args) ? args : [args.target].filter(Boolean); // object form contributes only its target
+  const target = argList.find((a) => !a.startsWith('-')); // first non-dash token (see NOTE above)
+  if (!target) {
+    return {
+      error: true,
+      message: [
+        'Usage: cipher panel <file|dir|code> [options]',
+        '',
+        'Options:',
+        '  --format <text|json>      Output format (default: text)',
+        '',
+        'Runs 3 simulated expert persona reviews:',
+        '  • Red Team Expert    — Attack surface, exploitation chains',
+        '  • Blue Team Expert   — Detection gaps, logging, monitoring',
+        '  • Architect Expert   — Trust boundaries, auth design, OWASP',
+      ].join('\n'),
+    };
+  }
+  const format = argList.find((a, i) => argList[i - 1] === '--format') || 'text'; // token that follows --format
+  try {
+    const { panelReview } = await import('../review/panel.js'); // imported on demand, inside the handler
+    const result = await panelReview(target, { format }); // the format is also forwarded to the panel itself
+    if (format === 'json') {
+      return result.toJSON();
+    }
+    return { output: result.toReport() };
+  } catch (err) {
+    return { error: true, message: `Panel review failed: ${err.message}` }; // failures are returned, not rethrown
+  }
+}
+// ---------------------------------------------------------------------------
+// Analyze — Cross-Artifact Consistency Analyzer
+// ---------------------------------------------------------------------------
+/**
+ * cipher analyze [--format json|text] [--root <path>]
+ *
+ * Scan CIPHER artifacts for stale references, orphans, mode mismatches,
+ * coverage gaps, and structural issues.
+ *
+ * NOTE(review): when `args` is not an array it is ignored entirely (the token
+ * list becomes empty), so the object form always runs with the text format and
+ * no explicit root.
+ *
+ * @param {string[]|object} args CLI tokens; non-array values are not inspected.
+ * @returns {Promise<object>} `result.toJSON()` when the format is `json`;
+ *   `{ output }` holding `result.toReport()` otherwise; `{ error: true, message }`
+ *   on a caught failure.
+ */
+export async function handleAnalyze(args = {}) {
+  const argList = Array.isArray(args) ? args : []; // object-form args contribute nothing
+  const format = argList.find((a, i) => argList[i - 1] === '--format') || 'text'; // token that follows --format
+  const root = argList.find((a, i) => argList[i - 1] === '--root'); // undefined when the flag is absent
+  try {
+    const { ConsistencyAnalyzer } = await import('../analyze/consistency.js'); // imported on demand, inside the handler
+    const analyzer = new ConsistencyAnalyzer(root || undefined); // an empty root string is passed as undefined
+    const result = analyzer.analyze(); // not awaited — presumably synchronous; TODO confirm
+    if (format === 'json') {
+      return result.toJSON();
+    }
+    return { output: result.toReport() };
+  } catch (err) {
+    return { error: true, message: `Analyze failed: ${err.message}` }; // failures are returned, not rethrown
+  }
+}
+// ---------------------------------------------------------------------------
+// Shard — Semantic Document Sharding
+// ---------------------------------------------------------------------------
+/**
+ * cipher shard <file> [--level N] [--output <dir>] [--dry-run] [--format json]
+ *
+ * Split a large markdown document into semantically coherent chunks.
+ *
+ * Argument handling: an array is treated as raw CLI tokens; any other value is
+ * reduced to `[args.target]`, so the options can only be supplied through the
+ * array form.
+ *
+ * NOTE(review): the target is the first token that does not start with `-`, so
+ * an option value placed before the file (e.g. the number after `--level`)
+ * would be taken as the target. `--level` is parsed without validation.
+ *
+ * @param {string[]|object} args CLI tokens, or an object whose `target` is used.
+ * @returns {Promise<object>} `{ error: true, message }` for missing target or a
+ *   caught failure; `result.toJSON()` when the format is `json`; otherwise
+ *   `{ output }` holding `result.toReport()`.
+ */
+export async function handleShard(args = {}) {
+  const argList = Array.isArray(args) ? args : [args.target].filter(Boolean); // object form contributes only its target
+  const target = argList.find((a) => !a.startsWith('-')); // first non-dash token (see NOTE above)
+  if (!target) {
+    return {
+      error: true,
+      message: [
+        'Usage: cipher shard <file.md> [options]',
+        '',
+        'Options:',
+        '  --level <N>        Heading level to split at (default: 2)',
+        '  --output <dir>     Output directory',
+        '  --dry-run          Preview shards without writing files',
+        '  --format <json>    Output as JSON',
+      ].join('\n'),
+    };
+  }
+  const level = parseInt(argList.find((a, i) => argList[i - 1] === '--level') || '2', 10); // NaN if the value is not numeric
+  const output = argList.find((a, i) => argList[i - 1] === '--output'); // undefined when the flag is absent
+  const dryRun = argList.includes('--dry-run');
+  const format = argList.find((a, i) => argList[i - 1] === '--format') || 'text'; // token that follows --format
+  try {
+    const { shardDocument } = await import('../analyze/sharding.js'); // imported on demand, inside the handler
+    const result = shardDocument(target, { level, output, dryRun }); // not awaited — presumably synchronous; TODO confirm
+    if (format === 'json') {
+      return result.toJSON();
+    }
+    return { output: result.toReport() };
+  } catch (err) {
+    return { error: true, message: `Shard failed: ${err.message}` }; // failures are returned, not rethrown
+  }
+}
+// ---------------------------------------------------------------------------
+// Guardrail — Test Input/Output Guardrails
+// ---------------------------------------------------------------------------
+/**
+ * cipher guardrail <text> [--type input|output|both] [--format json]
+ *
+ * Test text against CIPHER's guardrail tripwire system.
+ *
+ * The audited text is every token that does not start with `-`, joined with
+ * single spaces. For non-array `args`, only `args.text` is used.
+ *
+ * NOTE(review): `--type` is parsed but never passed to `engine.audit`, so it
+ * has no effect inside this function. Values that follow `--type` or `--format`
+ * do not start with `-` and are therefore included in the audited text.
+ *
+ * @param {string[]|object} args CLI tokens, or an object whose `text` is used.
+ * @returns {Promise<object>} `{ error: true, message }` for empty text or a
+ *   caught failure; `{ tripped, tripwires }` when the format is `json`;
+ *   otherwise `{ output }` with a text summary.
+ */
+export async function handleGuardrail(args = {}) {
+  const argList = Array.isArray(args) ? args : [args.text].filter(Boolean); // object form contributes only its text
+  const text = argList.filter((a) => !a.startsWith('-')).join(' '); // all non-dash tokens, flag values included
+  if (!text) {
+    return {
+      error: true,
+      message: [
+        'Usage: cipher guardrail <text> [options]',
+        '',
+        'Options:',
+        '  --type <input|output|both>  Which guardrails to run (default: both)',
+        '  --format <json>             Output as JSON',
+        '',
+        'Tests text against guardrail tripwires:',
+        '  • Prompt injection detection',
+        '  • Scope compliance validation',
+        '  • Dangerous command detection',
+        '  • Data leak prevention',
+      ].join('\n'),
+    };
+  }
+  const type = argList.find((a, i) => argList[i - 1] === '--type') || 'both'; // NOTE(review): unused below
+  const format = argList.find((a, i) => argList[i - 1] === '--format') || 'text'; // token that follows --format
+  try {
+    const { createGuardrailEngine } = await import('../guardrails/engine.js'); // imported on demand, inside the handler
+    const engine = createGuardrailEngine();
+    const results = await engine.audit(text); // used below as an array with one entry per tripped guardrail
+    if (format === 'json') {
+      return {
+        tripped: results.length > 0,
+        tripwires: results.map((r) => ({ // copy only the reported fields of each entry
+          guardrail: r.guardrail,
+          type: r.type,
+          severity: r.severity,
+          reason: r.reason,
+          action: r.action,
+        })),
+      };
+    }
+    if (results.length === 0) {
+      return { output: '✓ No guardrails tripped.' };
+    }
+    const lines = [`✗ ${results.length} guardrail${results.length !== 1 ? 's' : ''} tripped:`, '']; // pluralised header, then a blank line
+    for (const r of results) {
+      lines.push(`  [${r.severity.toUpperCase()}] ${r.guardrail} (${r.type})`);
+      lines.push(`    Reason: ${r.reason}`);
+      lines.push(`    Action: ${r.action}`);
+    }
+    return { output: lines.join('\n') };
+  } catch (err) {
+    return { error: true, message: `Guardrail check failed: ${err.message}` }; // failures are returned, not rethrown
+  }
+}
+// ---------------------------------------------------------------------------
+// Chain — multi-mode agent chain: cipher chain <modes> --task "<task>" [--backend <name>]
+// ---------------------------------------------------------------------------
+export async function handleChain(args = {}) { // resolves to { output } or { error: true, message }; failures are returned, not thrown
+  try {
+    const { initModes, availableModes } = await import('../autonomous/runner.js');
+    const { runChain } = await import('../autonomous/handoff.js');
+    await initModes(); // runs before any argument validation, even when only the usage text is returned
+    // Parse args — can be array or object
+    const argList = Array.isArray(args) ? args : (args._ || []); // object form: tokens are read from args._
+    const modesArg = argList[0] || ''; // first token is the comma-separated mode list
+    const taskIdx = argList.indexOf('--task');
+    const task = taskIdx >= 0 ? argList.slice(taskIdx + 1).join(' ') : ''; // everything after --task, including any later flags
+    const backendIdx = argList.indexOf('--backend');
+    const backend = backendIdx >= 0 ? argList[backendIdx + 1] : null; // null when --backend is absent
+    if (!modesArg) {
+      const available = availableModes().join(', ');
+      return {
+        error: true,
+        message: `Usage: cipher chain <modes> --task "<task>"\n\nModes (comma-separated): ${available}\nExample: cipher chain red,purple,blue --task "assess target"`,
+      };
+    }
+    const modes = modesArg.split(',').map(m => m.trim().toUpperCase()).filter(Boolean); // trimmed, upper-cased, empties dropped
+    const available = new Set(availableModes());
+    for (const mode of modes) { // reject the whole chain on the first unknown mode
+      if (!available.has(mode)) {
+        return {
+          error: true,
+          message: `Unknown mode: '${mode}'. Available: ${[...available].sort().join(', ')}`,
+        };
+      }
+    }
+    if (!task) {
+      return {
+        error: true,
+        message: 'Missing --task flag. Usage: cipher chain <modes> --task "<task description>"',
+      };
+    }
+    const result = await runChain(modes, { task, user_message: task }, { backend }); // the same text is sent as task and user_message
+    // Format output
+    const lines = [];
+    lines.push(`Chain: ${modes.join(' → ')}`);
+    lines.push(`Duration: ${result.totalDurationS.toFixed(2)}s`);
+    lines.push(`Tokens: ${result.totalTokensIn} in / ${result.totalTokensOut} out`);
+    lines.push('');
+    for (let i = 0; i < result.results.length; i++) { // one section per mode result
+      const r = result.results[i];
+      lines.push(`── ${r.mode} ──`);
+      if (r.error) {
+        lines.push(`  Error: ${r.error}`);
+      } else {
+        lines.push(`  ${(r.outputText || '').slice(0, 500)}`); // output is truncated to 500 characters
+      }
+      lines.push('');
+    }
+    if (result.events.length > 0) {
+      lines.push('── Handoff Timeline ──');
+      for (const e of result.events) {
+        const ts = new Date(e.timestamp * 1000).toISOString().slice(11, 19); // timestamp is scaled by 1000 (presumably epoch seconds); keeps HH:MM:SS in UTC
+        lines.push(`  ${ts} ${e.sourceMode} → ${e.targetMode} [${e.status}]`);
+      }
+    }
+    if (result.error) {
+      lines.push(`\nChain error: ${result.error}`); // a chain-level error is appended; the result is still { output }
+    }
+    return { output: lines.join('\n') };
+  } catch (err) {
+    return { error: true, message: `Chain failed: ${err.message}` };
+  }
+}
+// ---------------------------------------------------------------------------
+// Council — multi-model consensus (stub, fully implemented in M016/S04)
+// ---------------------------------------------------------------------------
+export async function handleCouncil(args = {}) { // cipher council "<task>" [--dry-run] [--members N] [--backend <backend>]; resolves to { output } or { error: true, message }
+  try {
+    const { LLMCouncil, runCouncil } = await import('../execution/council.js'); // NOTE(review): LLMCouncil is not used in this function
+    const argList = Array.isArray(args) ? args : (args._ || []); // object form: tokens are read from args._
+    const dryRun = argList.includes('--dry-run');
+    const membersIdx = argList.indexOf('--members');
+    const members = membersIdx >= 0 ? parseInt(argList[membersIdx + 1], 10) : 3; // defaults to 3; NaN if the value is missing or not numeric
+    const backendIdx = argList.indexOf('--backend');
+    const backend = backendIdx >= 0 ? argList[backendIdx + 1] : null; // null when --backend is absent
+    const task = argList.filter(a => !a.startsWith('--') && (membersIdx < 0 || argList.indexOf(a) !== membersIdx + 1) && (backendIdx < 0 || argList.indexOf(a) !== backendIdx + 1)).join(' '); // non-flag tokens minus the two flag values; NOTE(review): indexOf(a) is the FIRST occurrence, so a flag value that also appears earlier is not excluded
+    if (!task) {
+      return { error: true, message: 'Usage: cipher council "<task>" [--dry-run] [--members N] [--backend <backend>]' };
+    }
+    const result = await runCouncil(task, { members, backend, dryRun });
+    return { output: JSON.stringify(result, null, 2) }; // output is the result as 2-space-indented JSON
+  } catch (err) {
+    if (err.code === 'ERR_MODULE_NOT_FOUND' || err.message?.includes('Cannot find module')) { // a missing module is reported as "not yet available"
+      return { error: true, message: 'Council module not yet available. Coming in M016/S04.' };
+    }
+    return { error: true, message: `Council failed: ${err.message}` };
+  }
+}
+// ---------------------------------------------------------------------------
+// Resume — resume interrupted sessions: cipher resume [--list] | <session-id> [--auto] [--backend <name>]
+// ---------------------------------------------------------------------------
+export async function handleResume(args = {}) { // resolves to { output } or { error: true, message }; failures are returned, not thrown
+  try {
+    const { listSessions, loadSession } = await import('../session/logger.js');
+    const argList = Array.isArray(args) ? args : (args._ || []); // object form: tokens are read from args._
+    // cipher resume --list
+    if (argList.includes('--list') || argList.length === 0) { // no arguments behaves like --list
+      const sessions = listSessions({ limit: 20 });
+      if (sessions.length === 0) {
+        return { output: 'No sessions found. Run `cipher <mode> --autonomous "task"` to create one.' };
+      }
+      const lines = ['Recent Sessions:', ''];
+      for (const s of sessions) {
+        const cost = s.totalCostUSD ? ` $${s.totalCostUSD.toFixed(4)}` : ''; // cost column is omitted when totalCostUSD is falsy
+        const tokens = s.totalTokensIn + s.totalTokensOut; // no fallback is applied if either count is missing
+        lines.push(`  ${s.sessionId}  ${(s.mode || '?').padEnd(12)} ${(s.status || '?').padEnd(12)} ${tokens} tokens${cost}`);
+        if (s.task) lines.push(`    Task: ${s.task.slice(0, 80)}`); // task preview is limited to 80 characters
+      }
+      return { output: lines.join('\n') };
+    }
+    // cipher resume <session-id>
+    const sessionId = argList.find(a => !a.startsWith('--')); // first token without a leading "--"; NOTE(review): a --backend value placed before the id would be picked instead
+    if (!sessionId) {
+      return { error: true, message: 'Usage: cipher resume <session-id> | cipher resume --list' };
+    }
+    const session = loadSession(sessionId);
+    if (!session) {
+      return { error: true, message: `Session not found: ${sessionId}` };
+    }
+    const meta = session.metadata;
+    const autoFlag = argList.includes('--auto');
+    // Build resume summary
+    const lines = [
+      `Session: ${meta.sessionId}`,
+      `Mode: ${meta.mode}`,
+      `Task: ${meta.task}`,
+      `Status: ${meta.status}`,
+      `Interactions: ${meta.interactionCount}`,
+      `Tokens: ${meta.totalTokensIn} in / ${meta.totalTokensOut} out`,
+      `Cost: $${(meta.totalCostUSD || 0).toFixed(4)}`,
+      '',
+    ];
+    // Show last few interactions
+    const interactions = session.entries.filter(e => e.type === 'interaction'); // only entries typed 'interaction' are considered
+    if (interactions.length > 0) {
+      lines.push('Last interactions:');
+      const recent = interactions.slice(-3); // the three most recent turns
+      for (const e of recent) {
+        lines.push(`  Turn ${e.turn}: ${e.toolCalls} tool calls, ${e.tokensIn}+${e.tokensOut} tokens`);
+        if (e.outputSummary) lines.push(`    ${e.outputSummary.slice(0, 100)}`); // summary preview is limited to 100 characters
+      }
+      lines.push('');
+    }
+    if (meta.status === 'completed') { // completed sessions are summarised but never re-run, even with --auto
+      lines.push('Session already completed. Nothing to resume.');
+      return { output: lines.join('\n') };
+    }
+    if (autoFlag) {
+      // Auto-continue: re-run the mode with context from prior session
+      const { runAutonomous, initModes } = await import('../autonomous/runner.js');
+      const { SessionLogger } = await import('../session/logger.js');
+      const { CostTracker } = await import('../session/cost-tracker.js');
+      await initModes();
+      const backendIdx = argList.indexOf('--backend');
+      const backend = backendIdx >= 0 ? argList[backendIdx + 1] : null; // null when --backend is absent
+      // Build continuation task with prior context
+      const priorSummaries = interactions.slice(-5).map(e => e.outputSummary).filter(Boolean).join('\n'); // up to five latest non-empty summaries
+      const continuationTask = {
+        task: meta.task,
+        user_message: `[Resuming session ${meta.sessionId}]\n\nPrior progress (${meta.interactionCount} interactions):\n${priorSummaries || '(no summaries)'}\n\nContinue the task: ${meta.task}`,
+      };
+      // Create continuation session logger
+      const logger = new SessionLogger({ sessionId: sessionId + '-cont', mode: meta.mode, task: meta.task }); // the continuation is logged under "<id>-cont"
+      logger.start({ resumedFrom: sessionId });
+      const tracker = new CostTracker({ model: 'default', sessionId: logger.sessionId, logger });
+      try {
+        const result = await runAutonomous(meta.mode, continuationTask, backend);
+        tracker.track(result.tokensIn, result.tokensOut);
+        logger.logInteraction({
+          mode: result.mode,
+          turn: 1, // the whole continuation run is recorded as a single turn
+          toolCalls: result.toolCalls,
+          tokensIn: result.tokensIn,
+          tokensOut: result.tokensOut,
+          costUSD: tracker.totalCost,
+          outputSummary: (result.outputText || '').slice(0, 200), // stored summary is limited to 200 characters
+        });
+        logger.end('completed');
+        lines.push(`Resumed and completed. New session: ${logger.sessionId}`);
+        lines.push(`Output: ${(result.outputText || '').slice(0, 500)}`); // printed output is limited to 500 characters
+      } catch (err) {
+        logger.end('failed', { error: err.message }); // the continuation session is closed as failed
+        lines.push(`Resume failed: ${err.message}`); // reported inside the text; the handler still returns { output }
+      }
+    } else {
+      lines.push('To auto-continue this session, run:');
+      lines.push(`  cipher resume ${sessionId} --auto`);
+    }
+    return { output: lines.join('\n') };
+  } catch (err) {
+    return { error: true, message: `Resume failed: ${err.message}` };
+  }
+}
+// ---------------------------------------------------------------------------
+// Benchmark — status / clone / run sub-commands for the XBOW, NYU CTF, PicoCTF and OverTheWire suites
+// ---------------------------------------------------------------------------
+export async function handleBenchmark(args = {}) { // cipher benchmark <status|clone|run> [options]; resolves to { output } or { error: true, message }
+  try {
+    const argList = Array.isArray(args) ? args : (args._ || []); // object form: tokens are read from args._
+    const subCommand = argList[0] || 'status'; // no arguments means "status"
+    const suite = argList.find((_, i) => argList[i - 1] === '--suite') || 'xbow'; // token that follows --suite, default xbow
+    // cipher benchmark status — show available benchmarks and baselines
+    if (subCommand === 'status' || subCommand === 'list') {
+      const { BenchmarkBuilder, HarnessConfig, ALL_BASELINES } = await import('../benchmark/index.js'); // NOTE(review): HarnessConfig is not used in this function
+      const { enumerateNyuChallenges, NYU_BASELINES, NYU_CTF_CLONE_DIR, NYU_CATEGORIES } = await import('../benchmark/nyu-ctf.js'); // NOTE(review): NYU_CATEGORIES is not used in this function
+      const lines = [];
+      // XBOW
+      const builder = new BenchmarkBuilder();
+      let xbowBenchmarks = [];
+      try { xbowBenchmarks = builder.listBenchmarks(); } catch { /* not cloned */ }
+      if (xbowBenchmarks.length > 0) {
+        const byLevel = {};
+        for (const b of xbowBenchmarks) byLevel[b.level] = (byLevel[b.level] || 0) + 1; // count benchmarks per level
+        lines.push(`XBOW: ${xbowBenchmarks.length} benchmarks`);
+        lines.push(`  Level 1: ${byLevel[1] || 0}, Level 2: ${byLevel[2] || 0}, Level 3: ${byLevel[3] || 0}`); // only levels 1-3 are printed
+        lines.push('  Baselines:');
+        for (const bl of ALL_BASELINES) lines.push(`    ${bl.name}: ${bl.overallPct.toFixed(1)}%`);
+      } else {
+        lines.push('XBOW: not cloned (cipher benchmark clone --suite xbow)');
+      }
+      // NYU CTF
+      const nyuChallenges = enumerateNyuChallenges(NYU_CTF_CLONE_DIR);
+      if (nyuChallenges.length > 0) {
+        const byCat = {};
+        for (const c of nyuChallenges) byCat[c.category] = (byCat[c.category] || 0) + 1; // count challenges per category
+        lines.push('');
+        lines.push(`NYU CTF: ${nyuChallenges.length} challenges`);
+        lines.push(`  Categories: ${Object.entries(byCat).map(([c,n]) => `${c}(${n})`).join(', ')}`);
+        lines.push('  Baselines:');
+        for (const bl of NYU_BASELINES) lines.push(`    ${bl.name}: ${bl.overallPct.toFixed(1)}%`);
+      } else {
+        lines.push('');
+        lines.push('NYU CTF: not cloned (cipher benchmark clone --suite nyu)');
+      }
+      // PicoCTF
+      const { enumeratePicoChallenges, getPicoCatalogStats } = await import('../benchmark/picoctf.js'); // NOTE(review): enumeratePicoChallenges is not used in this branch
+      const picoStats = getPicoCatalogStats();
+      lines.push('');
+      lines.push(`PicoCTF: ${picoStats.total} challenges (bundled catalog)`);
+      lines.push(`  Categories: ${Object.entries(picoStats.byCategory).map(([c,n]) => `${c}(${n})`).join(', ')}`);
+      lines.push(`  Target types: ${Object.entries(picoStats.byTargetType).map(([t,n]) => `${t}(${n})`).join(', ')}`);
+      // OverTheWire
+      const { getOtwCatalogStats, getProgressStats } = await import('../benchmark/overthewire.js');
+      const otwStats = getOtwCatalogStats();
+      const otwProgress = getProgressStats();
+      lines.push('');
+      lines.push(`OverTheWire: ${otwStats.total} levels across ${Object.keys(otwStats.byWargame).length} wargames`);
+      for (const [wg, stats] of Object.entries(otwProgress)) { // the key (wg) is unused; only the stats value is printed
+        lines.push(`  ${stats.name}: ${stats.solved}/${stats.total} solved (${stats.pct}%)`);
+      }
+      return { output: lines.join('\n') };
+    }
+    // cipher benchmark clone — clone benchmark repos
+    if (subCommand === 'clone') {
+      if (suite === 'pico') {
+        const { clonePicoCTF, enumeratePicoChallenges } = await import('../benchmark/picoctf.js');
+        clonePicoCTF();
+        const challenges = enumeratePicoChallenges();
+        return { output: `Initialized PicoCTF catalog: ${challenges.length} challenges` };
+      }
+      if (suite === 'otw') {
+        const { cloneOtw, enumerateOtwChallenges } = await import('../benchmark/overthewire.js');
+        cloneOtw();
+        const challenges = enumerateOtwChallenges();
+        return { output: `Initialized OverTheWire wargames: ${challenges.length} levels` };
+      }
+      if (suite === 'nyu') {
+        const { NYU_CTF_REPO_URL, NYU_CTF_CLONE_DIR, enumerateNyuChallenges } = await import('../benchmark/nyu-ctf.js');
+        const { existsSync } = await import('node:fs');
+        const { spawnSync } = await import('node:child_process');
+        const { mkdirSync } = await import('node:fs');
+        const { join, dirname } = await import('node:path');
+        if (existsSync(join(NYU_CTF_CLONE_DIR, '.git'))) { // existing checkout: fast-forward it instead of re-cloning
+          spawnSync('git', ['-C', NYU_CTF_CLONE_DIR, 'pull', '--ff-only'], { timeout: 120000, stdio: 'pipe' }); // NOTE(review): the spawnSync result is discarded, so a failed pull is not reported
+        } else {
+          mkdirSync(dirname(NYU_CTF_CLONE_DIR), { recursive: true });
+          spawnSync('git', ['clone', '--depth=1', NYU_CTF_REPO_URL, NYU_CTF_CLONE_DIR], { timeout: 300000, stdio: 'pipe' }); // NOTE(review): result discarded here too; a failed clone is not reported
+        }
+        const challenges = enumerateNyuChallenges(NYU_CTF_CLONE_DIR);
+        return { output: `Cloned NYU CTF benchmarks: ${challenges.length} challenges` };
+      }
+      const { BenchmarkBuilder } = await import('../benchmark/index.js'); // any other --suite value falls through to the XBOW clone
+      const builder = new BenchmarkBuilder();
+      builder.clone();
+      const benchmarks = builder.listBenchmarks(true);
+      return { output: `Cloned XBOW benchmarks: ${benchmarks.length} available` };
+    }
+    // cipher benchmark run — execute benchmarks
+    if (subCommand === 'run') {
+      const { BenchmarkBuilder, getSolver, runBenchmarks, generateJsonReport, generateMarkdownReport } = await import('../benchmark/index.js');
+      const { writeFileSync, mkdirSync } = await import('node:fs');
+      const { join } = await import('node:path');
+      const { homedir } = await import('node:os');
+      const all = argList.includes('--all');
+      const solverName = argList.find((_, i) => argList[i - 1] === '--solver') || 'autonomous';
+      const backend = argList.find((_, i) => argList[i - 1] === '--backend') || null;
+      const concurrency = parseInt(argList.find((_, i) => argList[i - 1] === '--concurrency') || '1', 10); // NaN if the value is not numeric
+      const retries = parseInt(argList.find((_, i) => argList[i - 1] === '--retries') || '1', 10); // NaN if the value is not numeric
+      const levelFilter = argList.find((_, i) => argList[i - 1] === '--level');
+      const tagFilter = argList.find((_, i) => argList[i - 1] === '--tag');
+      const categoryFilter = argList.find((_, i) => argList[i - 1] === '--category');
+      const specificNames = argList.filter(a => /^(XBEN-|nyu-)/.test(a)); // only tokens starting with XBEN- or nyu- count as benchmark names
+      if (!all && specificNames.length === 0) { // nothing selected: return the usage text
+        return {
+          error: true,
+          message: [
+            'Usage: cipher benchmark run [options]',
+            '',
+            'Options:',
+            '  --all                Run all benchmarks',
+            '  --suite <name>       Benchmark suite: xbow (default), nyu, pico, otw',
+            '  --solver <name>      Solver: autonomous (default), autonomous-multi, claude-code, remote, stub',
+            '  --backend <name>     LLM backend: ollama, claude',
+            '  --concurrency <n>    Parallel benchmarks (default 1)',
+            '  --retries <n>        Attempts per benchmark / pass@k (default 1)',
+            '  --level <n>          Filter by level (1-5)',
+            '  --tag <tag>          Filter by tag',
+            '  --category <cat>     Filter by category (NYU: web, pwn, rev, crypto, forensics, misc)',
+            '  XBEN-001-24 ...      Specific benchmark names',
+          ].join('\n'),
+        };
+      }
+      const builder = new BenchmarkBuilder(); // NOTE(review): suite is not passed to the builder or to runBenchmarks below; here it only drives labels, file names and the NYU branches
+      const solver = getSolver(solverName, { backend });
+      const traces = [];
+      process.stderr.write(`Starting ${suite} benchmark run: solver=${solverName} backend=${backend || 'auto'} concurrency=${concurrency} retries=${retries}\n`); // progress goes to stderr; the summary is returned
+      // Apply category filter for NYU
+      let effectiveTagFilter = tagFilter || undefined;
+      if (categoryFilter && suite === 'nyu') {
+        effectiveTagFilter = categoryFilter; // for nyu, --category replaces any --tag value; for other suites it is ignored
+      }
+      const report = await runBenchmarks({
+        builder,
+        solver,
+        benchmarkNames: specificNames.length > 0 ? specificNames : undefined,
+        runAll: all,
+        levelFilter: levelFilter ? parseInt(levelFilter, 10) : undefined,
+        tagFilter: effectiveTagFilter,
+        concurrency,
+        retries,
+        onResult: (name, result) => { // per-benchmark progress line on stderr
+          const status = result.passed ? '✅' : result.solverResult.error ? '⚠️' : '❌';
+          process.stderr.write(`  ${status} ${name} (${result.solverResult.durationS.toFixed(1)}s)\n`);
+        },
+        onTrace: (event) => traces.push(event), // collected in memory and written to the traces file below
+      });
+      // Compute pass@k if retries > 1
+      const passAtK = retries > 1 ? `pass@${retries}` : 'pass@1'; // a label for the summary header only; nothing is computed here
+      // Save reports
+      const reportDir = join(homedir(), '.cipher', 'benchmark-reports');
+      mkdirSync(reportDir, { recursive: true });
+      const timestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); // e.g. 2024-01-02T03-04-05 (UTC, milliseconds dropped)
+      const jsonPath = join(reportDir, `${suite}-${timestamp}.json`);
+      writeFileSync(jsonPath, generateJsonReport(report, solverName));
+      const mdPath = join(reportDir, `${suite}-${timestamp}.md`);
+      writeFileSync(mdPath, generateMarkdownReport(report, solverName));
+      const tracePath = join(reportDir, `${suite}-${timestamp}-traces.json`);
+      writeFileSync(tracePath, JSON.stringify(traces, null, 2));
+      // Category breakdown for NYU CTF
+      const categoryLines = [];
+      if (suite === 'nyu') {
+        const byCat = {};
+        for (const r of report.results) {
+          const cat = r.config.category || 'misc'; // results without a category are grouped under "misc"
+          if (!byCat[cat]) byCat[cat] = { total: 0, passed: 0 };
+          byCat[cat].total++;
+          if (r.passed) byCat[cat].passed++;
+        }
+        categoryLines.push('', 'By Category:');
+        for (const [cat, data] of Object.entries(byCat).sort(([a],[b]) => a.localeCompare(b))) { // categories in alphabetical order
+          const pct = data.total > 0 ? (data.passed / data.total * 100).toFixed(1) : '0.0';
+          categoryLines.push(`  ${cat}: ${data.passed}/${data.total} (${pct}%)`);
+        }
+      }
+      // Summary output
+      const lines = [
+        `${suite.toUpperCase()} Benchmark Run Complete (${passAtK})`,
+        `  Total: ${report.total}`,
+        `  Passed: ${report.passed} (${report.passRate.toFixed(1)}%)`,
+        `  Failed: ${report.failed}`,
+        `  Skipped: ${report.skipped}`,
+        `  Duration: ${report.durationS.toFixed(1)}s`,
+        `  Cost: $${report.totalCostUsd.toFixed(4)}`,
+        ...categoryLines,
+        '',
+        `Reports saved to:`,
+        `  JSON: ${jsonPath}`,
+        `  Markdown: ${mdPath}`,
+        `  Traces: ${tracePath}`,
+      ];
+      return { output: lines.join('\n') };
+    }
+    return { // any other sub-command (including a leading flag) gets the usage error
+      error: true,
+      message: 'Usage: cipher benchmark <status|clone|run> [options]',
+    };
+  } catch (err) {
+    return { error: true, message: `Benchmark failed: ${err.message}` };
+  }
+}