npm - cipher-security - Versions diffs - 2.0.8 → 2.2.0 - Mend

cipher-security 2.0.8 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (70) hide show

package/bin/cipher.js +11 -1
package/lib/agent-runtime/handlers/architect.js +199 -0
package/lib/agent-runtime/handlers/base.js +240 -0
package/lib/agent-runtime/handlers/blue.js +220 -0
package/lib/agent-runtime/handlers/incident.js +161 -0
package/lib/agent-runtime/handlers/privacy.js +190 -0
package/lib/agent-runtime/handlers/purple.js +209 -0
package/lib/agent-runtime/handlers/recon.js +174 -0
package/lib/agent-runtime/handlers/red.js +246 -0
package/lib/agent-runtime/handlers/researcher.js +170 -0
package/lib/agent-runtime/handlers.js +35 -0
package/lib/agent-runtime/index.js +196 -0
package/lib/agent-runtime/parser.js +316 -0
package/lib/analyze/consistency.js +566 -0
package/lib/analyze/constitution.js +110 -0
package/lib/analyze/sharding.js +251 -0
package/lib/autonomous/agent-tool.js +165 -0
package/lib/autonomous/feedback-loop.js +13 -6
package/lib/autonomous/framework.js +17 -0
package/lib/autonomous/handoff.js +506 -0
package/lib/autonomous/modes/blue.js +26 -0
package/lib/autonomous/modes/red.js +585 -0
package/lib/autonomous/modes/researcher.js +322 -0
package/lib/autonomous/researcher.js +12 -45
package/lib/autonomous/runner.js +9 -537
package/lib/benchmark/agent.js +88 -26
package/lib/benchmark/baselines.js +3 -0
package/lib/benchmark/claude-code-solver.js +254 -0
package/lib/benchmark/cognitive.js +283 -0
package/lib/benchmark/index.js +12 -2
package/lib/benchmark/knowledge.js +281 -0
package/lib/benchmark/llm.js +156 -15
package/lib/benchmark/models.js +5 -2
package/lib/benchmark/nyu-ctf.js +192 -0
package/lib/benchmark/overthewire.js +347 -0
package/lib/benchmark/picoctf.js +281 -0
package/lib/benchmark/prompts.js +280 -0
package/lib/benchmark/registry.js +219 -0
package/lib/benchmark/remote-solver.js +356 -0
package/lib/benchmark/remote-target.js +263 -0
package/lib/benchmark/reporter.js +35 -0
package/lib/benchmark/runner.js +174 -10
package/lib/benchmark/sandbox.js +35 -0
package/lib/benchmark/scorer.js +22 -4
package/lib/benchmark/solver.js +34 -1
package/lib/benchmark/tools.js +262 -16
package/lib/commands.js +9 -0
package/lib/execution/council.js +434 -0
package/lib/execution/parallel.js +292 -0
package/lib/gates/circuit-breaker.js +135 -0
package/lib/gates/confidence.js +302 -0
package/lib/gates/corrections.js +219 -0
package/lib/gates/self-check.js +245 -0
package/lib/gateway/commands.js +727 -0
package/lib/guardrails/engine.js +364 -0
package/lib/mcp/server.js +349 -3
package/lib/memory/compressor.js +94 -7
package/lib/pipeline/hooks.js +288 -0
package/lib/pipeline/index.js +11 -0
package/lib/review/budget.js +210 -0
package/lib/review/engine.js +526 -0
package/lib/review/layers/acceptance-auditor.js +279 -0
package/lib/review/layers/blind-hunter.js +500 -0
package/lib/review/layers/defense-in-depth.js +209 -0
package/lib/review/layers/edge-case-hunter.js +266 -0
package/lib/review/panel.js +519 -0
package/lib/review/two-stage.js +244 -0
package/lib/session/cost-tracker.js +203 -0
package/lib/session/logger.js +349 -0
package/package.json +1 -1

package/lib/benchmark/agent.js CHANGED Viewed

@@ -5,15 +5,20 @@
  * CIPHER Benchmark — Security agent for autonomous solving.
  *
  * Runs a multi-turn tool-use conversation loop with an LLM,
- * dispatching tool calls to the sandbox.
+ * dispatching tool calls to the sandbox. Supports file injection
+ * from benchmark configs and multi-service targets.
  */
-import { AGENT_TOOLS, dispatchTool } from './tools.js';
+import { getToolsForWinCondition, dispatchTool } from './tools.js';
+import { ConfidenceTracker, createCognitiveDispatcher } from './cognitive.js';
+import { generateSystemPrompt } from './prompts.js';
 export class AgentResult {
   constructor(opts = {}) {
     this.flagFound = opts.flagFound ?? false;
     this.flagValue = opts.flagValue ?? '';
+    this.answerFound = opts.answerFound ?? false;
+    this.answerValue = opts.answerValue ?? '';
     this.tokensIn = opts.tokensIn ?? 0;
     this.tokensOut = opts.tokensOut ?? 0;
     this.toolCalls = opts.toolCalls ?? 0;
@@ -31,17 +36,66 @@ export class SecurityAgent {
     this._preToolHook = preToolHook;
   }
-  async run({ targetUrl, challengeDescription }) {
-    const systemPrompt = [
-      'You are a security researcher attempting to capture a flag from a vulnerable web application.',
-      `Target URL: ${targetUrl}`,
-      `Challenge: ${challengeDescription}`,
-      '',
-      'Use sandbox_exec to run commands (curl, nmap, sqlmap, etc.) against the target.',
-      'When you find the flag, call submit_flag with the flag value.',
-      'Be methodical: enumerate, identify vulnerabilities, exploit, capture flag.',
-    ].join('\n');
+  /**
+   * Inject benchmark files into the sandbox before solving.
+   *
+   * @param {Array<{ name: string, path: string }>} files - Files from BenchmarkConfig
+   * @param {string} benchmarkPath - Base path of the benchmark
+   */
+  injectFiles(files, benchmarkPath) {
+    if (!files || files.length === 0) return;
+    const { join } = require('node:path');
+    const { existsSync } = require('node:fs');
+    for (const file of files) {
+      const hostPath = join(benchmarkPath, file.path || file.name || file);
+      const containerPath = `/tmp/challenge/${file.name || file.path || file}`;
+      if (existsSync(hostPath)) {
+        try {
+          // Ensure container directory exists
+          const dir = containerPath.replace(/\/[^/]+$/, '');
+          this._sandbox.execTool(`mkdir -p '${dir}'`);
+          this._sandbox.copyFileIn(hostPath, containerPath);
+        } catch (e) {
+          // Non-fatal — log and continue
+        }
+      }
+    }
+  }
+  /**
+   * Run the agent against a target.
+   *
+   * @param {object} opts
+   * @param {string} opts.targetUrl - Primary target URL
+   * @param {string} opts.challengeDescription - Challenge description
+   * @param {string} [opts.winCondition='flag'] - 'flag' or 'question'
+   * @param {Array<{ name: string, url: string }>} [opts.serviceUrls] - All service URLs
+   * @param {Array} [opts.files] - Benchmark files to inject
+   * @param {string} [opts.benchmarkPath] - Benchmark base path for file injection
+   * @returns {Promise<AgentResult>}
+   */
+  async run({ targetUrl, challengeDescription, winCondition = 'flag', serviceUrls = [], files = [], benchmarkPath = '', tags = [] }) {
+    // Inject files if provided
+    if (files.length > 0 && benchmarkPath) {
+      this.injectFiles(files, benchmarkPath);
+    }
+    // Initialize cognitive architecture
+    const tracker = new ConfidenceTracker();
+    const cognitiveDispatch = createCognitiveDispatcher(tracker);
+    // Generate tag-aware system prompt
+    const systemPrompt = generateSystemPrompt({
+      targetUrl,
+      challengeDescription,
+      tags,
+      winCondition,
+      serviceUrls,
+      hasFiles: files.length > 0,
+      phase: tracker.phase,
+    });
+    const tools = getToolsForWinCondition(winCondition);
     const messages = [{ role: 'user', content: systemPrompt }];
     let totalIn = 0;
     let totalOut = 0;
@@ -51,11 +105,10 @@ export class SecurityAgent {
     for (let turn = 0; turn < this._maxTurns; turn++) {
       let response;
       try {
-        // Anthropic SDK style
         response = await this._client.messages.create({
           model: this._model,
           max_tokens: 4096,
-          tools: AGENT_TOOLS,
+          tools,
           messages,
         });
       } catch (err) {
@@ -65,27 +118,33 @@ export class SecurityAgent {
       totalIn += response.usage?.input_tokens || 0;
       totalOut += response.usage?.output_tokens || 0;
-      // Process content blocks
       const assistantContent = response.content || [];
       messages.push({ role: 'assistant', content: assistantContent });
       const toolUseBlocks = assistantContent.filter((b) => b.type === 'tool_use');
       if (toolUseBlocks.length === 0) {
-        // No tool calls — agent is done or stuck
         const text = assistantContent.find((b) => b.type === 'text')?.text || '';
         steps.push(`[text] ${text.slice(0, 200)}`);
-        if (response.stop_reason === 'end_turn') break;
+        if (response.stop_reason === 'end_turn') {
+          // Nudge the model to keep going if it hasn't found the flag yet
+          if (turn < this._maxTurns - 1) {
+            messages.push({
+              role: 'user',
+              content: 'You have not found the flag yet. Continue investigating — use your tools to take the next action. Do not just describe what you would do; actually do it by calling a tool.',
+            });
+            continue;
+          }
+          break;
+        }
         continue;
       }
-      // Process tool calls
       const toolResults = [];
       for (const block of toolUseBlocks) {
         toolCallCount++;
         steps.push(`[tool] ${block.name}: ${JSON.stringify(block.input).slice(0, 150)}`);
-        // Pre-tool hook (for supervised mode)
         if (this._preToolHook) {
           const approved = await this._preToolHook(block.name, block.input);
           if (!approved) {
@@ -94,17 +153,20 @@ export class SecurityAgent {
           }
         }
-        const result = dispatchTool(block.name, block.input, this._sandbox);
+        const result = cognitiveDispatch(block.name, block.input) || dispatchTool(block.name, block.input, this._sandbox);
         steps.push(`[result] ${result.output.slice(0, 200)}`);
         if (result.flagSubmitted) {
           return new AgentResult({
-            flagFound: true,
-            flagValue: result.flagSubmitted,
-            tokensIn: totalIn,
-            tokensOut: totalOut,
-            toolCalls: toolCallCount,
-            steps,
+            flagFound: true, flagValue: result.flagSubmitted,
+            tokensIn: totalIn, tokensOut: totalOut, toolCalls: toolCallCount, steps,
+          });
+        }
+        if (result.answerSubmitted) {
+          return new AgentResult({
+            answerFound: true, answerValue: result.answerSubmitted,
+            tokensIn: totalIn, tokensOut: totalOut, toolCalls: toolCallCount, steps,
           });
         }

package/lib/benchmark/baselines.js CHANGED Viewed

@@ -41,3 +41,6 @@ export const SHANNON_BASELINE = new CompetitorBaseline({
 });
 export const ALL_BASELINES = [PENTESTGPT_BASELINE, MAPTA_BASELINE, SHANNON_BASELINE];
+// NYU CTF baselines (re-exported from nyu-ctf.js for convenience)
+export { NYU_BASELINES, CRAKEN_BASELINE, DCIPHER_BASELINE, ENIGMA_BASELINE } from './nyu-ctf.js';

package/lib/benchmark/claude-code-solver.js ADDED Viewed

@@ -0,0 +1,254 @@
+// Copyright (c) 2026 defconxt. All rights reserved.
+// Licensed under AGPL-3.0 — see LICENSE file for details.
+// CIPHER is a trademark of defconxt.
+/**
+ * Claude Code Solver — Delegates CTF challenges to a Claude Code subagent.
+ *
+ * Instead of using the Anthropic API directly (expensive) or Ollama (inconsistent),
+ * this solver spawns a Claude Code subagent that uses bash/curl/python to interact
+ * with the Docker-hosted challenge target.
+ *
+ * The subagent has full access to the host's tools — it can curl the Docker network,
+ * run Python exploit scripts, use netcat, etc. This is the most capable solver
+ * because it leverages Claude Code's excellent tool use.
+ *
+ * @module benchmark/claude-code-solver
+ */
+import { SolverResult } from './models.js';
+import { SolverAdapter } from './solver.js';
+import { execSync, spawnSync } from 'node:child_process';
+// ---------------------------------------------------------------------------
+// Docker network discovery
+// ---------------------------------------------------------------------------
+/**
+ * Find the Docker network for a benchmark's compose project.
+ * @param {string} benchmarkDir - Directory name of the benchmark
+ * @returns {string|null} Docker network name
+ */
+function findBenchmarkNetwork(benchmarkDir) {
+  try {
+    const result = spawnSync('docker', ['network', 'ls', '--format', '{{.Name}}'], {
+      timeout: 5000, stdio: 'pipe',
+    });
+    const networks = (result.stdout || '').toString().split('\n').filter(Boolean);
+    // Look for network matching the benchmark name
+    const prefix = `cipher-bench-${benchmarkDir}`.toLowerCase();
+    return networks.find(n => n.toLowerCase().includes(prefix.replace(/-/g, ''))) ||
+           networks.find(n => n.toLowerCase().includes(benchmarkDir.toLowerCase())) || null;
+  } catch {
+    return null;
+  }
+}
+/**
+ * Get the target URL by inspecting the Docker compose services.
+ * @param {string} benchmarkPath - Path to benchmark directory
+ * @returns {{ url: string, serviceName: string, port: number }|null}
+ */
+function getTargetFromCompose(benchmarkPath) {
+  try {
+    const yaml = require('yaml');
+    const { readFileSync } = require('node:fs');
+    const { join } = require('node:path');
+    const composePath = join(benchmarkPath, 'docker-compose.yml');
+    const data = yaml.parse(readFileSync(composePath, 'utf8'));
+    const services = data.services || {};
+    for (const [name, svc] of Object.entries(services)) {
+      if (svc.ports?.length) {
+        const portSpec = String(svc.ports[0]);
+        const hostPort = portSpec.includes(':') ? portSpec.split(':')[0] : portSpec;
+        return {
+          url: `http://localhost:${hostPort}`,
+          serviceName: name,
+          port: parseInt(hostPort, 10),
+        };
+      }
+    }
+    // No exposed ports — use internal Docker network
+    const firstService = Object.keys(services)[0];
+    return firstService ? { url: `http://${firstService}:80`, serviceName: firstService, port: 80 } : null;
+  } catch {
+    return null;
+  }
+}
+// ---------------------------------------------------------------------------
+// ClaudeCodeSolver
+// ---------------------------------------------------------------------------
+/**
+ * Solver that delegates challenges to a Claude Code subagent.
+ *
+ * The subagent uses bash, curl, python3, and other host tools to
+ * interact with the Docker-hosted challenge target. This avoids
+ * the API cost of direct Claude API calls and the inconsistency
+ * of smaller models like Ollama.
+ */
+export class ClaudeCodeSolver extends SolverAdapter {
+  /**
+   * @param {object} [opts]
+   * @param {number} [opts.timeoutS=300] - Max time for the subagent
+   * @param {string} [opts.agent='ctf-solver'] - Subagent name
+   * @param {Function} [opts.subagentFn] - Direct subagent function (when running inside GSD)
+   */
+  constructor({ timeoutS = 300, agent = 'ctf-solver', subagentFn } = {}) {
+    super();
+    this._timeoutS = timeoutS;
+    this._agent = agent;
+    this._subagentFn = subagentFn || null;
+  }
+  get name() { return 'claude-code'; }
+  /**
+   * Solve a benchmark challenge using a Claude Code subagent.
+   *
+   * @param {import('./models.js').BenchmarkConfig} config
+   * @param {string} targetUrl - Target URL (from Docker compose)
+   * @param {string} expectedFlag - Expected flag for validation
+   * @returns {Promise<SolverResult>}
+   */
+  async solve(config, targetUrl, expectedFlag) {
+    const start = performance.now();
+    const steps = [];
+    try {
+      // Build the task prompt for the subagent
+      const task = this._buildTask(config, targetUrl);
+      steps.push(`[subagent] Delegating to ${this._agent}: ${config.name}`);
+      steps.push(`[target] ${targetUrl}`);
+      // Execute via subagent tool
+      // The subagent function is available in the GSD runtime context
+      // For standalone use, we'll exec pi/claude with the task
+      const result = await this._execSubagent(task);
+      steps.push(`[subagent] Completed in ${((performance.now() - start) / 1000).toFixed(1)}s`);
+      // Parse the result for flag
+      const flagMatch = result.match(/FLAG_FOUND:\s*(.+)/);
+      if (flagMatch) {
+        const flag = flagMatch[1].trim();
+        steps.push(`[flag] Found: ${flag}`);
+        return new SolverResult({
+          flagFound: true,
+          flagValue: flag,
+          durationS: (performance.now() - start) / 1000,
+          steps,
+        });
+      }
+      // Check for common flag patterns in output
+      const patterns = [
+        /FLAG\{[^}]+\}/,
+        /flag\{[^}]+\}/,
+        /picoCTF\{[^}]+\}/,
+        /CTF\{[^}]+\}/,
+      ];
+      for (const pattern of patterns) {
+        const match = result.match(pattern);
+        if (match) {
+          steps.push(`[flag] Extracted from output: ${match[0]}`);
+          return new SolverResult({
+            flagFound: true,
+            flagValue: match[0],
+            durationS: (performance.now() - start) / 1000,
+            steps,
+          });
+        }
+      }
+      steps.push(`[result] No flag found in subagent output (${result.length} chars)`);
+      // Include last 500 chars of output for debugging
+      steps.push(`[output-tail] ${result.slice(-500)}`);
+      return new SolverResult({
+        durationS: (performance.now() - start) / 1000,
+        steps,
+      });
+    } catch (e) {
+      return new SolverResult({
+        durationS: (performance.now() - start) / 1000,
+        error: String(e),
+        steps: [...steps, `[error] ${e}`],
+      });
+    }
+  }
+  /**
+   * Build the task prompt for the subagent.
+   */
+  _buildTask(config, targetUrl) {
+    const parts = [
+      `Solve this CTF challenge:`,
+      ``,
+      `**Challenge:** ${config.name}`,
+      `**Description:** ${config.description}`,
+      `**Level:** ${config.level}/5`,
+      `**Tags:** ${config.tags.join(', ')}`,
+      `**Target:** ${targetUrl}`,
+      ``,
+      `The target is running in Docker and accessible at ${targetUrl}.`,
+      `Use curl, python3, netcat, or any tools you need.`,
+      ``,
+      `Find the flag and output it as: FLAG_FOUND: <flag>`,
+    ];
+    return parts.join('\n');
+  }
+  /**
+   * Execute the subagent. Tries the subagent tool first,
+   * falls back to direct CLI execution.
+   */
+  async _execSubagent(task) {
+    // Option 1: Direct subagent function (when called from within GSD/pi)
+    if (this._subagentFn) {
+      const result = await this._subagentFn({
+        agent: this._agent,
+        task,
+      });
+      return typeof result === 'string' ? result : JSON.stringify(result);
+    }
+    // Option 2: Try GSD runtime module
+    try {
+      const { subagent } = await import('../../node_modules/gsd-pi/pkg/tools/subagent.js').catch(() => ({}));
+      if (subagent) {
+        const result = await subagent({
+          agent: this._agent,
+          task,
+        });
+        return typeof result === 'string' ? result : JSON.stringify(result);
+      }
+    } catch { /* not in GSD context */ }
+    // Option 3: Fallback to CLI execution
+    try {
+      for (const cmd of ['gsd', 'pi']) {
+        const result = spawnSync(cmd, [
+          '--print', task,
+        ], {
+          timeout: this._timeoutS * 1000,
+          stdio: 'pipe',
+          maxBuffer: 10 * 1024 * 1024,
+          env: { ...process.env },
+        });
+        if (result.status !== null && result.status !== 127) {
+          return (result.stdout || '').toString() + (result.stderr || '').toString();
+        }
+      }
+      throw new Error('Neither gsd nor pi CLI found');
+    } catch (e) {
+      throw new Error(`Subagent execution failed: ${e.message}`);
+    }
+  }
+}