npm - cipher-security - Versions diffs - 2.1.0 → 2.2.0 - Mend

cipher-security 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (54) hide show

package/bin/cipher.js +10 -0
package/lib/analyze/consistency.js +566 -0
package/lib/analyze/constitution.js +110 -0
package/lib/analyze/sharding.js +251 -0
package/lib/autonomous/agent-tool.js +165 -0
package/lib/autonomous/framework.js +17 -0
package/lib/autonomous/handoff.js +506 -0
package/lib/autonomous/modes/blue.js +26 -0
package/lib/autonomous/modes/red.js +28 -0
package/lib/benchmark/agent.js +88 -26
package/lib/benchmark/baselines.js +3 -0
package/lib/benchmark/claude-code-solver.js +254 -0
package/lib/benchmark/cognitive.js +283 -0
package/lib/benchmark/index.js +12 -2
package/lib/benchmark/knowledge.js +281 -0
package/lib/benchmark/llm.js +156 -15
package/lib/benchmark/models.js +5 -2
package/lib/benchmark/nyu-ctf.js +192 -0
package/lib/benchmark/overthewire.js +347 -0
package/lib/benchmark/picoctf.js +281 -0
package/lib/benchmark/prompts.js +280 -0
package/lib/benchmark/registry.js +219 -0
package/lib/benchmark/remote-solver.js +356 -0
package/lib/benchmark/remote-target.js +263 -0
package/lib/benchmark/reporter.js +35 -0
package/lib/benchmark/runner.js +174 -10
package/lib/benchmark/sandbox.js +35 -0
package/lib/benchmark/scorer.js +22 -4
package/lib/benchmark/solver.js +34 -1
package/lib/benchmark/tools.js +262 -16
package/lib/commands.js +9 -0
package/lib/execution/council.js +434 -0
package/lib/execution/parallel.js +292 -0
package/lib/gates/circuit-breaker.js +135 -0
package/lib/gates/confidence.js +302 -0
package/lib/gates/corrections.js +219 -0
package/lib/gates/self-check.js +245 -0
package/lib/gateway/commands.js +727 -0
package/lib/guardrails/engine.js +364 -0
package/lib/mcp/server.js +349 -3
package/lib/memory/compressor.js +94 -7
package/lib/pipeline/hooks.js +288 -0
package/lib/pipeline/index.js +11 -0
package/lib/review/budget.js +210 -0
package/lib/review/engine.js +526 -0
package/lib/review/layers/acceptance-auditor.js +279 -0
package/lib/review/layers/blind-hunter.js +500 -0
package/lib/review/layers/defense-in-depth.js +209 -0
package/lib/review/layers/edge-case-hunter.js +266 -0
package/lib/review/panel.js +519 -0
package/lib/review/two-stage.js +244 -0
package/lib/session/cost-tracker.js +203 -0
package/lib/session/logger.js +349 -0
package/package.json +1 -1

package/lib/benchmark/remote-solver.js ADDED Viewed

@@ -0,0 +1,356 @@
+// Copyright (c) 2026 defconxt. All rights reserved.
+// Licensed under AGPL-3.0 — see LICENSE file for details.
+// CIPHER is a trademark of defconxt.
+/**
+ * Remote Solver — Bridges SecurityAgent to RemoteTarget for solving
+ * challenges hosted on external servers (PicoCTF, OverTheWire, etc).
+ *
+ * Unlike AutonomousSolver which creates a Docker sandbox and reads
+ * docker-compose.yml, RemoteSolver connects to an existing remote
+ * service via SSH, netcat, or HTTP.
+ *
+ * The agent uses the same tool-use loop but with remote exec dispatched
+ * through the RemoteTarget adapter instead of a sandbox container.
+ *
+ * @module benchmark/remote-solver
+ */
+import { SolverResult } from './models.js';
+import { SolverAdapter } from './solver.js';
+import { createRemoteTarget } from './remote-target.js';
+import { getToolsForWinCondition, FLAG_TOOLS, QUESTION_TOOLS } from './tools.js';
+import { ConfidenceTracker, createCognitiveDispatcher } from './cognitive.js';
+import { generateSystemPrompt } from './prompts.js';
+// ---------------------------------------------------------------------------
+// Remote sandbox adapter — wraps RemoteTarget in the SandboxContainer interface
+// ---------------------------------------------------------------------------
+/**
+ * Adapter that gives RemoteTarget the same interface as SandboxContainer,
+ * so SecurityAgent can dispatch tool calls through it.
+ */
+export class RemoteSandboxAdapter {
+  /**
+   * @param {import('./remote-target.js').RemoteTarget} target
+   */
+  constructor(target) {
+    this._target = target;
+    this._containerId = `remote-${target.type}-${target.host}`;
+  }
+  get containerId() { return this._containerId; }
+  /**
+   * Execute a command on the remote target.
+   * Maps to the SandboxContainer.execTool interface.
+   *
+   * @param {string} command
+   * @param {number} [timeout=60]
+   * @returns {{ exitCode: number, stdout: string, stderr: string }}
+   */
+  execTool(command, timeout = 60) {
+    // RemoteTarget.exec is async, but SandboxContainer.execTool is sync
+    // We need to handle this — use a blocking wrapper or convert the caller.
+    // For now, we store a promise and resolve via the async dispatch path.
+    // The actual exec happens in the async dispatchRemoteTool function below.
+    return { exitCode: 0, stdout: `[remote] Command queued: ${command}`, stderr: '' };
+  }
+  /**
+   * Async exec for the remote dispatch path.
+   */
+  async execToolAsync(command, timeout = 60) {
+    const result = await this._target.exec(command, timeout);
+    return result;
+  }
+  /** No-op — remote targets don't need file injection setup. */
+  create() {}
+  /** No-op for remote targets. */
+  connectNetwork() {}
+  /** Copy file — not supported for remote targets currently. */
+  copyFileIn() {}
+  /** Cleanup the remote connection. */
+  cleanup() {
+    this._target.cleanup();
+  }
+  /** Get the target URL for agent reference. */
+  getInternalUrl(serviceName, port) {
+    return this._target.url;
+  }
+}
+// ---------------------------------------------------------------------------
+// Remote tool dispatch
+// ---------------------------------------------------------------------------
+/**
+ * Dispatch a tool call for remote challenges.
+ * Handles the same tools as the sandbox dispatcher but routes through
+ * RemoteTarget instead of Docker exec.
+ *
+ * @param {string} toolName
+ * @param {object} input
+ * @param {RemoteSandboxAdapter} adapter
+ * @returns {Promise<{ output: string, flagSubmitted?: string, answerSubmitted?: string }>}
+ */
+export async function dispatchRemoteTool(toolName, input, adapter) {
+  // Flag/answer submission tools
+  if (toolName === 'submit_flag') {
+    return { output: `Flag submitted: ${input.flag}`, flagSubmitted: input.flag };
+  }
+  if (toolName === 'submit_answer') {
+    return { output: `Answer submitted: ${input.answer}`, answerSubmitted: input.answer };
+  }
+  // Execution tools — route through remote target
+  if (toolName === 'sandbox_exec') {
+    const result = await adapter.execToolAsync(input.command, input.timeout || 60);
+    const output = result.stdout + (result.stderr ? `\n[stderr] ${result.stderr}` : '');
+    return { output: output || `[exit ${result.exitCode}]` };
+  }
+  // HTTP request tool — useful for web challenges
+  if (toolName === 'http_request') {
+    const method = (input.method || 'GET').toUpperCase();
+    let curlCmd = `curl -s -X ${method}`;
+    if (input.headers) {
+      for (const [k, v] of Object.entries(input.headers)) {
+        curlCmd += ` -H "${k}: ${v}"`;
+      }
+    }
+    if (input.body) {
+      curlCmd += ` -d '${typeof input.body === 'string' ? input.body : JSON.stringify(input.body)}'`;
+    }
+    curlCmd += ` "${input.url}"`;
+    const result = await adapter.execToolAsync(curlCmd, input.timeout || 30);
+    return { output: result.stdout || result.stderr || '[no response]' };
+  }
+  // Read file — try cat on remote
+  if (toolName === 'read_file') {
+    const result = await adapter.execToolAsync(`cat "${input.path}"`, 10);
+    return { output: result.stdout || result.stderr || '[empty]' };
+  }
+  // Write file — try tee on remote
+  if (toolName === 'write_file') {
+    const escaped = input.content.replace(/'/g, "'\\''");
+    const result = await adapter.execToolAsync(`echo '${escaped}' > "${input.path}"`, 10);
+    return { output: result.exitCode === 0 ? `Written to ${input.path}` : result.stderr };
+  }
+  // List directory
+  if (toolName === 'list_directory') {
+    const result = await adapter.execToolAsync(`ls -la "${input.path || '.'}"`, 10);
+    return { output: result.stdout || result.stderr || '[empty]' };
+  }
+  // Cognitive tools — pass through
+  if (toolName === 'update_plan' || toolName === 'check_confidence') {
+    return { output: `[cognitive] ${toolName}: acknowledged` };
+  }
+  // Disassemble/decompile — not available on most remote targets
+  if (toolName === 'disassemble' || toolName === 'decompile') {
+    return { output: `[remote] ${toolName} not available on remote target` };
+  }
+  return { output: `[remote] Unknown tool: ${toolName}` };
+}
+// ---------------------------------------------------------------------------
+// RemoteSolver
+// ---------------------------------------------------------------------------
+/**
+ * Solver that connects to remote CTF challenge servers.
+ *
+ * Supports SSH (OverTheWire), netcat (PicoCTF binary challenges),
+ * and HTTP (PicoCTF web challenges) targets.
+ */
+export class RemoteSolver extends SolverAdapter {
+  /**
+   * @param {object} opts
+   * @param {string} [opts.backend] - LLM backend override
+   * @param {number} [opts.maxTurns=30] - Max agent turns
+   * @param {object} [opts.targetInfo] - Remote target connection info
+   */
+  constructor({ backend, maxTurns = 30, targetInfo } = {}) {
+    super();
+    this._backend = backend;
+    this._maxTurns = maxTurns;
+    this._targetInfo = targetInfo;
+  }
+  get name() { return 'remote'; }
+  /**
+   * Solve a challenge via remote target.
+   *
+   * @param {import('./models.js').BenchmarkConfig} config
+   * @param {string} targetUrl - Target URL (may be overridden by targetInfo)
+   * @param {string} expectedFlag - Expected flag for validation
+   * @returns {Promise<SolverResult>}
+   */
+  async solve(config, targetUrl, expectedFlag) {
+    const start = performance.now();
+    const steps = [];
+    try {
+      // Determine target info from config or constructor
+      const targetInfo = this._targetInfo || await this._inferTargetInfo(config);
+      if (!targetInfo) {
+        return new SolverResult({
+          durationS: (performance.now() - start) / 1000,
+          error: 'No remote target info available for this challenge',
+          steps: ['[error] Cannot determine remote target connection info'],
+        });
+      }
+      // Create remote target
+      const target = createRemoteTarget(targetInfo);
+      const adapter = new RemoteSandboxAdapter(target);
+      // Create LLM client
+      const { makeAgentClient } = await import('./llm.js');
+      const { client, model } = await makeAgentClient({ backendOverride: this._backend });
+      // Generate system prompt
+      const tracker = new ConfidenceTracker();
+      const cognitiveDispatch = createCognitiveDispatcher(tracker);
+      const systemPrompt = generateSystemPrompt({
+        targetUrl: target.url,
+        challengeDescription: config.description,
+        tags: config.tags,
+        winCondition: config.winCondition,
+        serviceUrls: [],
+        hasFiles: false,
+        phase: tracker.phase,
+      });
+      // Build tools
+      const tools = getToolsForWinCondition(config.winCondition);
+      const messages = [{ role: 'user', content: systemPrompt }];
+      let totalIn = 0;
+      let totalOut = 0;
+      let toolCallCount = 0;
+      // Agent loop
+      for (let turn = 0; turn < this._maxTurns; turn++) {
+        let response;
+        try {
+          response = await client.messages.create({
+            model, max_tokens: 4096, tools, messages,
+          });
+        } catch (err) {
+          return new SolverResult({
+            durationS: (performance.now() - start) / 1000,
+            error: `LLM error: ${err.message}`,
+            tokensIn: totalIn, tokensOut: totalOut, toolCalls: toolCallCount, steps,
+          });
+        }
+        totalIn += response.usage?.input_tokens || 0;
+        totalOut += response.usage?.output_tokens || 0;
+        const assistantContent = response.content || [];
+        messages.push({ role: 'assistant', content: assistantContent });
+        const toolUseBlocks = assistantContent.filter(b => b.type === 'tool_use');
+        if (toolUseBlocks.length === 0) {
+          const text = assistantContent.find(b => b.type === 'text')?.text || '';
+          steps.push(`[text] ${text.slice(0, 200)}`);
+          if (response.stop_reason === 'end_turn') break;
+          continue;
+        }
+        const toolResults = [];
+        for (const block of toolUseBlocks) {
+          toolCallCount++;
+          steps.push(`[tool] ${block.name}: ${JSON.stringify(block.input).slice(0, 150)}`);
+          // Try cognitive dispatch first, then remote dispatch
+          const cogResult = cognitiveDispatch(block.name, block.input);
+          const result = cogResult || await dispatchRemoteTool(block.name, block.input, adapter);
+          steps.push(`[result] ${result.output.slice(0, 200)}`);
+          if (result.flagSubmitted) {
+            adapter.cleanup();
+            return new SolverResult({
+              flagFound: true, flagValue: result.flagSubmitted,
+              durationS: (performance.now() - start) / 1000,
+              tokensIn: totalIn, tokensOut: totalOut, toolCalls: toolCallCount, steps,
+            });
+          }
+          if (result.answerSubmitted) {
+            adapter.cleanup();
+            return new SolverResult({
+              answerFound: true, answerValue: result.answerSubmitted,
+              durationS: (performance.now() - start) / 1000,
+              tokensIn: totalIn, tokensOut: totalOut, toolCalls: toolCallCount, steps,
+            });
+          }
+          toolResults.push({ type: 'tool_result', tool_use_id: block.id, content: result.output });
+        }
+        messages.push({ role: 'user', content: toolResults });
+      }
+      adapter.cleanup();
+      return new SolverResult({
+        durationS: (performance.now() - start) / 1000,
+        tokensIn: totalIn, tokensOut: totalOut, toolCalls: toolCallCount, steps,
+      });
+    } catch (e) {
+      return new SolverResult({
+        durationS: (performance.now() - start) / 1000,
+        error: String(e), steps: [...steps, `[error] ${e}`],
+      });
+    }
+  }
+  /**
+   * Infer remote target info from BenchmarkConfig tags/metadata.
+   * @param {import('./models.js').BenchmarkConfig} config
+   * @returns {{ type: string, host: string, port: number, username?: string, password?: string }|null}
+   */
+  async _inferTargetInfo(config) {
+    // Check if config has metadata with target info
+    if (config.metadata?.targetInfo) return config.metadata.targetInfo;
+    // PicoCTF challenges
+    if (config.tags.includes('picoctf')) {
+      try {
+        const { getPicoTargetInfo, PICOCTF_CATALOG } = await import('./picoctf.js');
+        const entry = PICOCTF_CATALOG.find(e => e.name === config.name);
+        if (entry) return getPicoTargetInfo(entry);
+      } catch { /* module not available */ }
+    }
+    // OverTheWire challenges
+    if (config.tags.includes('overthewire')) {
+      try {
+        const { getOtwConnectionInfo } = await import('./overthewire.js');
+        const wargame = config.tags.find(t => ['bandit', 'natas', 'leviathan', 'krypton', 'narnia'].includes(t));
+        if (wargame) {
+          const levelMatch = config.name.match(/Level (\d+)/);
+          const level = levelMatch ? parseInt(levelMatch[1], 10) : 0;
+          const info = getOtwConnectionInfo(wargame, level);
+          return { type: 'ssh', ...info };
+        }
+      } catch { /* module not available */ }
+    }
+    return null;
+  }
+}

package/lib/benchmark/remote-target.js ADDED Viewed

@@ -0,0 +1,263 @@
+// Copyright (c) 2026 defconxt. All rights reserved.
+// Licensed under AGPL-3.0 — see LICENSE file for details.
+// CIPHER is a trademark of defconxt.
+/**
+ * Remote Target — Connection adapters for external CTF challenge servers.
+ *
+ * Unlike XBOW/NYU which run in Docker containers, many CTF platforms
+ * host challenges on remote servers accessible via SSH, netcat, or HTTP.
+ * This module provides a unified interface for connecting to them.
+ *
+ * @module benchmark/remote-target
+ */
+import { spawnSync, spawn } from 'node:child_process';
+// ---------------------------------------------------------------------------
+// Base class
+// ---------------------------------------------------------------------------
+/**
+ * Abstract remote target connection.
+ * Subclasses implement connect(), exec(), and cleanup().
+ */
+export class RemoteTarget {
+  /**
+   * @param {object} opts
+   * @param {string} opts.type       - 'ssh' | 'netcat' | 'http'
+   * @param {string} opts.host       - Hostname or IP
+   * @param {number} opts.port       - Port number
+   * @param {string} [opts.username] - SSH username
+   * @param {string} [opts.password] - SSH password or challenge password
+   * @param {object} [opts.metadata] - Additional target-specific info
+   */
+  constructor({ type, host, port, username, password, metadata = {} }) {
+    this.type = type;
+    this.host = host;
+    this.port = port;
+    this.username = username ?? '';
+    this.password = password ?? '';
+    this.metadata = metadata;
+    this._connected = false;
+  }
+  get connected() { return this._connected; }
+  get url() { return `${this.type}://${this.host}:${this.port}`; }
+  /** Connect to the remote target. */
+  async connect() { throw new Error('Subclass must implement connect()'); }
+  /**
+   * Execute a command against the remote target.
+   * @param {string} command
+   * @param {number} [timeout] - Timeout in seconds
+   * @returns {Promise<{ exitCode: number, stdout: string, stderr: string }>}
+   */
+  async exec(command, timeout = 30) { throw new Error('Subclass must implement exec()'); }
+  /** Clean up connection resources. */
+  async cleanup() { this._connected = false; }
+}
+// ---------------------------------------------------------------------------
+// SSH Target
+// ---------------------------------------------------------------------------
+/**
+ * SSH-based remote target (OverTheWire, SSH-accessible challenges).
+ *
+ * Uses sshpass + ssh for password-based auth, or ssh with key-based auth.
+ * Each exec() call spawns a fresh SSH connection (stateless).
+ */
+export class SSHTarget extends RemoteTarget {
+  constructor({ host, port = 22, username, password, keyPath, metadata = {} }) {
+    super({ type: 'ssh', host, port, username, password, metadata });
+    this.keyPath = keyPath ?? '';
+  }
+  async connect() {
+    // Verify connectivity with a simple echo
+    const result = await this.exec('echo __CIPHER_CONNECTED__', 10);
+    if (result.stdout.includes('__CIPHER_CONNECTED__')) {
+      this._connected = true;
+      return true;
+    }
+    throw new Error(`SSH connection failed to ${this.host}:${this.port}: ${result.stderr}`);
+  }
+  async exec(command, timeout = 30) {
+    const sshArgs = [
+      '-o', 'StrictHostKeyChecking=no',
+      '-o', 'UserKnownHostsFile=/dev/null',
+      '-o', 'LogLevel=ERROR',
+      '-o', `ConnectTimeout=${Math.min(timeout, 10)}`,
+      '-p', String(this.port),
+    ];
+    if (this.keyPath) {
+      sshArgs.push('-i', this.keyPath);
+    }
+    sshArgs.push(`${this.username}@${this.host}`, command);
+    let args, cmd;
+    if (this.password && !this.keyPath) {
+      // Use sshpass for password auth
+      cmd = 'sshpass';
+      args = ['-p', this.password, 'ssh', ...sshArgs];
+    } else {
+      cmd = 'ssh';
+      args = sshArgs;
+    }
+    const result = spawnSync(cmd, args, {
+      timeout: timeout * 1000,
+      stdio: 'pipe',
+      maxBuffer: 5 * 1024 * 1024,
+    });
+    return {
+      exitCode: result.status ?? -1,
+      stdout: (result.stdout || '').toString(),
+      stderr: (result.stderr || '').toString(),
+    };
+  }
+}
+// ---------------------------------------------------------------------------
+// Netcat Target
+// ---------------------------------------------------------------------------
+/**
+ * Netcat-based remote target (interactive TCP challenges).
+ *
+ * Many CTF challenges expose a service on a TCP port that you interact with
+ * by sending/receiving text. This wraps netcat for that purpose.
+ */
+export class NetcatTarget extends RemoteTarget {
+  constructor({ host, port, metadata = {} }) {
+    super({ type: 'netcat', host, port, metadata });
+  }
+  async connect() {
+    // Verify port is reachable
+    const result = spawnSync('nc', ['-z', '-w', '5', this.host, String(this.port)], {
+      timeout: 10000, stdio: 'pipe',
+    });
+    this._connected = result.status === 0;
+    if (!this._connected) {
+      throw new Error(`Netcat connection failed to ${this.host}:${this.port}`);
+    }
+    return true;
+  }
+  /**
+   * Send input to a netcat service and capture output.
+   * @param {string} input - Text to send to the service
+   * @param {number} [timeout] - Timeout in seconds
+   */
+  async exec(input, timeout = 15) {
+    const result = spawnSync('nc', ['-w', String(timeout), this.host, String(this.port)], {
+      input: input + '\n',
+      timeout: timeout * 1000,
+      stdio: ['pipe', 'pipe', 'pipe'],
+      maxBuffer: 1 * 1024 * 1024,
+    });
+    return {
+      exitCode: result.status ?? -1,
+      stdout: (result.stdout || '').toString(),
+      stderr: (result.stderr || '').toString(),
+    };
+  }
+}
+// ---------------------------------------------------------------------------
+// HTTP Target
+// ---------------------------------------------------------------------------
+/**
+ * HTTP-based remote target (web exploitation challenges).
+ *
+ * Wraps curl for HTTP interactions. Supports GET, POST, custom headers.
+ */
+export class HTTPTarget extends RemoteTarget {
+  constructor({ host, port = 80, protocol = 'http', metadata = {} }) {
+    super({ type: 'http', host, port, metadata });
+    this.protocol = protocol;
+  }
+  get url() { return `${this.protocol}://${this.host}:${this.port}`; }
+  async connect() {
+    const result = spawnSync('curl', [
+      '-s', '-o', '/dev/null', '-w', '%{http_code}',
+      '--connect-timeout', '10', '--max-time', '15',
+      this.url,
+    ], { timeout: 20000, stdio: 'pipe' });
+    const statusCode = parseInt((result.stdout || '').toString().trim(), 10);
+    this._connected = statusCode > 0 && statusCode < 600;
+    if (!this._connected) {
+      throw new Error(`HTTP connection failed to ${this.url}`);
+    }
+    return true;
+  }
+  /**
+   * Execute an HTTP request.
+   * @param {string} command - curl-style command args or a URL path
+   * @param {number} [timeout] - Timeout in seconds
+   */
+  async exec(command, timeout = 30) {
+    // If command starts with '/' treat as a path on this target
+    let curlArgs;
+    if (command.startsWith('/') || command.startsWith('http')) {
+      const targetUrl = command.startsWith('http') ? command : `${this.url}${command}`;
+      curlArgs = ['-s', '--max-time', String(timeout), targetUrl];
+    } else {
+      // Raw curl args
+      curlArgs = ['-s', '--max-time', String(timeout), ...command.split(/\s+/)];
+    }
+    const result = spawnSync('curl', curlArgs, {
+      timeout: timeout * 1000,
+      stdio: 'pipe',
+      maxBuffer: 5 * 1024 * 1024,
+    });
+    return {
+      exitCode: result.status ?? -1,
+      stdout: (result.stdout || '').toString(),
+      stderr: (result.stderr || '').toString(),
+    };
+  }
+}
+// ---------------------------------------------------------------------------
+// Factory
+// ---------------------------------------------------------------------------
+/**
+ * Create a RemoteTarget from a challenge config.
+ *
+ * @param {object} opts
+ * @param {string} opts.type - 'ssh' | 'netcat' | 'http'
+ * @param {string} opts.host
+ * @param {number} opts.port
+ * @param {string} [opts.username]
+ * @param {string} [opts.password]
+ * @param {string} [opts.keyPath]
+ * @param {string} [opts.protocol]
+ * @param {object} [opts.metadata]
+ * @returns {RemoteTarget}
+ */
+export function createRemoteTarget(opts) {
+  switch (opts.type) {
+    case 'ssh':    return new SSHTarget(opts);
+    case 'netcat': return new NetcatTarget(opts);
+    case 'http':   return new HTTPTarget(opts);
+    default: throw new Error(`Unknown target type: ${opts.type}`);
+  }
+}

package/lib/benchmark/reporter.js CHANGED Viewed

@@ -99,5 +99,40 @@ export function generateMarkdownReport(report, solverName = '') {
     lines.push('');
   }
+  // Tag-aware reporting
+  const byTag = report.resultsByTag();
+  if (Object.keys(byTag).length > 0) {
+    lines.push('## Results by Vulnerability Category', '');
+    lines.push('| Tag | Total | Passed | Rate |');
+    lines.push('|-----|-------|--------|------|');
+    for (const [tag, tagResults] of Object.entries(byTag).sort(([a], [b]) => a.localeCompare(b))) {
+      const p = tagResults.filter(r => r.passed).length;
+      const pct = tagResults.length ? (p / tagResults.length) * 100 : 0;
+      lines.push(`| ${tag} | ${tagResults.length} | ${p} | ${pct.toFixed(1)}% |`);
+    }
+    lines.push('');
+  }
+  // Category breakdown (for NYU CTF and categorized benchmarks)
+  const byCategory = {};
+  for (const r of report.results) {
+    const cat = r.config.category;
+    if (cat) {
+      if (!byCategory[cat]) byCategory[cat] = { total: 0, passed: 0 };
+      byCategory[cat].total++;
+      if (r.passed) byCategory[cat].passed++;
+    }
+  }
+  if (Object.keys(byCategory).length > 0) {
+    lines.push('## Results by Category', '');
+    lines.push('| Category | Total | Passed | Rate |');
+    lines.push('|----------|-------|--------|------|');
+    for (const [cat, data] of Object.entries(byCategory).sort(([a],[b]) => a.localeCompare(b))) {
+      const pct = data.total > 0 ? (data.passed / data.total * 100).toFixed(1) : '0.0';
+      lines.push(`| ${cat} | ${data.total} | ${data.passed} | ${pct}% |`);
+    }
+    lines.push('');
+  }
   return lines.join('\n');
 }