cipher-security 2.0.8 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/bin/cipher.js +11 -1
  2. package/lib/agent-runtime/handlers/architect.js +199 -0
  3. package/lib/agent-runtime/handlers/base.js +240 -0
  4. package/lib/agent-runtime/handlers/blue.js +220 -0
  5. package/lib/agent-runtime/handlers/incident.js +161 -0
  6. package/lib/agent-runtime/handlers/privacy.js +190 -0
  7. package/lib/agent-runtime/handlers/purple.js +209 -0
  8. package/lib/agent-runtime/handlers/recon.js +174 -0
  9. package/lib/agent-runtime/handlers/red.js +246 -0
  10. package/lib/agent-runtime/handlers/researcher.js +170 -0
  11. package/lib/agent-runtime/handlers.js +35 -0
  12. package/lib/agent-runtime/index.js +196 -0
  13. package/lib/agent-runtime/parser.js +316 -0
  14. package/lib/analyze/consistency.js +566 -0
  15. package/lib/analyze/constitution.js +110 -0
  16. package/lib/analyze/sharding.js +251 -0
  17. package/lib/autonomous/agent-tool.js +165 -0
  18. package/lib/autonomous/feedback-loop.js +13 -6
  19. package/lib/autonomous/framework.js +17 -0
  20. package/lib/autonomous/handoff.js +506 -0
  21. package/lib/autonomous/modes/blue.js +26 -0
  22. package/lib/autonomous/modes/red.js +585 -0
  23. package/lib/autonomous/modes/researcher.js +322 -0
  24. package/lib/autonomous/researcher.js +12 -45
  25. package/lib/autonomous/runner.js +9 -537
  26. package/lib/benchmark/agent.js +88 -26
  27. package/lib/benchmark/baselines.js +3 -0
  28. package/lib/benchmark/claude-code-solver.js +254 -0
  29. package/lib/benchmark/cognitive.js +283 -0
  30. package/lib/benchmark/index.js +12 -2
  31. package/lib/benchmark/knowledge.js +281 -0
  32. package/lib/benchmark/llm.js +156 -15
  33. package/lib/benchmark/models.js +5 -2
  34. package/lib/benchmark/nyu-ctf.js +192 -0
  35. package/lib/benchmark/overthewire.js +347 -0
  36. package/lib/benchmark/picoctf.js +281 -0
  37. package/lib/benchmark/prompts.js +280 -0
  38. package/lib/benchmark/registry.js +219 -0
  39. package/lib/benchmark/remote-solver.js +356 -0
  40. package/lib/benchmark/remote-target.js +263 -0
  41. package/lib/benchmark/reporter.js +35 -0
  42. package/lib/benchmark/runner.js +174 -10
  43. package/lib/benchmark/sandbox.js +35 -0
  44. package/lib/benchmark/scorer.js +22 -4
  45. package/lib/benchmark/solver.js +34 -1
  46. package/lib/benchmark/tools.js +262 -16
  47. package/lib/commands.js +9 -0
  48. package/lib/execution/council.js +434 -0
  49. package/lib/execution/parallel.js +292 -0
  50. package/lib/gates/circuit-breaker.js +135 -0
  51. package/lib/gates/confidence.js +302 -0
  52. package/lib/gates/corrections.js +219 -0
  53. package/lib/gates/self-check.js +245 -0
  54. package/lib/gateway/commands.js +727 -0
  55. package/lib/guardrails/engine.js +364 -0
  56. package/lib/mcp/server.js +349 -3
  57. package/lib/memory/compressor.js +94 -7
  58. package/lib/pipeline/hooks.js +288 -0
  59. package/lib/pipeline/index.js +11 -0
  60. package/lib/review/budget.js +210 -0
  61. package/lib/review/engine.js +526 -0
  62. package/lib/review/layers/acceptance-auditor.js +279 -0
  63. package/lib/review/layers/blind-hunter.js +500 -0
  64. package/lib/review/layers/defense-in-depth.js +209 -0
  65. package/lib/review/layers/edge-case-hunter.js +266 -0
  66. package/lib/review/panel.js +519 -0
  67. package/lib/review/two-stage.js +244 -0
  68. package/lib/session/cost-tracker.js +203 -0
  69. package/lib/session/logger.js +349 -0
  70. package/package.json +1 -1
@@ -4,20 +4,148 @@
4
4
  /**
5
5
  * CIPHER Benchmark — LLM client factory for the security agent.
6
6
  *
7
- * Auto-detects the best available backend and returns an
8
- * Anthropic-SDK-compatible client for tool-use.
7
+ * Returns an Anthropic-SDK-compatible client for tool-use, wrapping
8
+ * Ollama's OpenAI-compatible API into the Anthropic messages format
9
+ * that SecurityAgent expects.
9
10
  */
10
11
 
12
+ // ---------------------------------------------------------------------------
13
+ // Ollama → Anthropic adapter
14
+ // ---------------------------------------------------------------------------
15
+
16
/**
 * Wraps an OpenAI-compatible client to present an Anthropic-style interface.
 *
 * Exposes `messages.create({ model, max_tokens, tools, messages, system })`
 * and translates in both directions:
 *   - request:  Anthropic tool schemas / content blocks -> OpenAI chat format
 *   - response: OpenAI choice -> Anthropic `{ content, stop_reason, usage }`
 *
 * Only `text`, `tool_use` and `tool_result` content blocks are translated;
 * any other block type is dropped from the outgoing request.
 */
class OllamaAnthropicAdapter {
  /**
   * @param {object} openaiClient - Client exposing `chat.completions.create()`.
   * @param {string} model - Model used when a request does not name one.
   */
  constructor(openaiClient, model) {
    this._client = openaiClient;
    this._model = model;
    // Counter for synthesising ids when the backend omits them on tool calls.
    this._toolCallSeq = 0;
    this.messages = { create: this._create.bind(this) };
  }

  /**
   * Anthropic-style `messages.create` implemented on top of the wrapped client.
   *
   * @param {object} params
   * @param {string} [params.model] - Overrides the adapter's default model.
   * @param {number} [params.max_tokens] - Defaults to 4096.
   * @param {object[]} [params.tools] - Anthropic tool definitions.
   * @param {object[]} params.messages - Anthropic-format conversation.
   * @param {string|object[]} [params.system] - System prompt (string or text blocks).
   * @returns {Promise<{content: object[], stop_reason: string, usage: object}>}
   */
  async _create({ model, max_tokens, tools, messages, system }) {
    const openaiTools = OllamaAnthropicAdapter._toOpenAiTools(tools);
    const openaiMessages = OllamaAnthropicAdapter._toOpenAiMessages(messages, system);

    const response = await this._client.chat.completions.create({
      model: model || this._model,
      max_tokens: max_tokens || 4096,
      tools: openaiTools.length > 0 ? openaiTools : undefined,
      messages: openaiMessages,
    });

    return this._toAnthropicResponse(response);
  }

  /** Convert Anthropic tool schemas to OpenAI function schemas. */
  static _toOpenAiTools(tools) {
    return (tools || []).map((t) => ({
      type: 'function',
      function: {
        name: t.name,
        description: t.description,
        parameters: t.input_schema,
      },
    }));
  }

  /**
   * Reduce Anthropic content (string, block array, or nothing) to a string.
   * Arrays made only of text blocks are joined; anything else is JSON-encoded.
   */
  static _flattenContent(content) {
    if (content == null) return '';
    if (typeof content === 'string') return content;
    if (Array.isArray(content) && content.every((b) => b?.type === 'text')) {
      return content.map((b) => b.text ?? '').join('\n');
    }
    return JSON.stringify(content);
  }

  /** Convert an Anthropic conversation (plus system prompt) to OpenAI messages. */
  static _toOpenAiMessages(messages, system) {
    const out = [];
    const systemText = OllamaAnthropicAdapter._flattenContent(system);
    if (systemText) out.push({ role: 'system', content: systemText });

    for (const msg of messages || []) {
      if (msg.role !== 'user' && msg.role !== 'assistant') continue;

      if (typeof msg.content === 'string') {
        out.push({ role: msg.role, content: msg.content });
        continue;
      }
      if (!Array.isArray(msg.content)) continue;

      if (msg.role === 'user') {
        // Tool results are emitted first so they directly follow the assistant
        // message that requested them; accompanying user text comes after.
        const textParts = [];
        for (const block of msg.content) {
          if (block.type === 'tool_result') {
            out.push({
              role: 'tool',
              tool_call_id: block.tool_use_id,
              content: OllamaAnthropicAdapter._flattenContent(block.content),
            });
          } else if (block.type === 'text') {
            textParts.push(block.text);
          }
        }
        if (textParts.length > 0) {
          out.push({ role: 'user', content: textParts.join('\n') });
        }
      } else {
        let textContent = '';
        const toolCalls = [];
        for (const block of msg.content) {
          if (block.type === 'text') {
            textContent += block.text;
          } else if (block.type === 'tool_use') {
            toolCalls.push({
              id: block.id,
              type: 'function',
              function: { name: block.name, arguments: JSON.stringify(block.input) },
            });
          }
        }
        const assistantMsg = { role: 'assistant', content: textContent || null };
        if (toolCalls.length > 0) assistantMsg.tool_calls = toolCalls;
        out.push(assistantMsg);
      }
    }
    return out;
  }

  /**
   * Decode tool-call arguments into a plain object.
   * Accepts a JSON string or an already-decoded object; anything unparsable
   * (or not an object) becomes `{}` so the caller always gets a valid `input`.
   */
  static _parseToolArguments(raw) {
    if (raw && typeof raw === 'object') return raw;
    if (typeof raw !== 'string' || raw.trim() === '') return {};
    try {
      const parsed = JSON.parse(raw);
      return parsed && typeof parsed === 'object' ? parsed : {};
    } catch {
      return {};
    }
  }

  /** Convert an OpenAI chat completion to the Anthropic response shape. */
  _toAnthropicResponse(response) {
    const usage = {
      input_tokens: response?.usage?.prompt_tokens || 0,
      output_tokens: response?.usage?.completion_tokens || 0,
    };

    const choice = response?.choices?.[0];
    if (!choice) {
      return { content: [{ type: 'text', text: '' }], stop_reason: 'end_turn', usage };
    }

    const content = [];
    if (choice.message?.content) {
      content.push({ type: 'text', text: choice.message.content });
    }

    let toolUseCount = 0;
    for (const tc of choice.message?.tool_calls || []) {
      const name = tc?.function?.name;
      if (!name) continue; // nothing the caller could dispatch
      content.push({
        type: 'tool_use',
        id: tc.id || `call_${++this._toolCallSeq}`,
        name,
        input: OllamaAnthropicAdapter._parseToolArguments(tc.function.arguments),
      });
      toolUseCount += 1;
    }

    // Derive the stop reason from what was actually returned: some backends
    // report finish_reason 'stop' even when the message carries tool calls.
    let stopReason = 'end_turn';
    if (toolUseCount > 0) {
      stopReason = 'tool_use';
    } else if (choice.finish_reason === 'length') {
      stopReason = 'max_tokens';
    }

    return {
      content: content.length > 0 ? content : [{ type: 'text', text: '' }],
      stop_reason: stopReason,
      usage,
    };
  }
}
132
+
133
+ // ---------------------------------------------------------------------------
134
+ // Client factory
135
+ // ---------------------------------------------------------------------------
136
+
11
137
  /**
12
138
  * Create an LLM client for the benchmark agent.
139
+ * Always returns an Anthropic-SDK-compatible interface.
140
+ *
13
141
  * @param {object} [opts]
14
142
  * @param {string} [opts.backendOverride]
15
- * @returns {{ client: object, model: string }}
143
+ * @returns {Promise<{ client: object, model: string }>}
16
144
  */
17
145
  export async function makeAgentClient(opts = {}) {
18
146
  const backendOverride = opts.backendOverride;
19
147
 
20
- // Try explicit override first
148
+ // Try Claude first
21
149
  if (backendOverride === 'claude' || (!backendOverride && process.env.ANTHROPIC_API_KEY)) {
22
150
  try {
23
151
  const { default: Anthropic } = await import('@anthropic-ai/sdk');
@@ -28,26 +156,39 @@ export async function makeAgentClient(opts = {}) {
28
156
  }
29
157
  }
30
158
 
159
+ // Try Ollama via OpenAI-compatible API
31
160
  if (backendOverride === 'ollama' || !backendOverride) {
32
- // Check if Ollama is running
33
161
  try {
34
162
  const { request } = await import('node:http');
35
163
  const alive = await new Promise((resolve) => {
36
164
  const req = request({ hostname: '127.0.0.1', port: 11434, path: '/api/tags', timeout: 2000 }, (res) => {
37
- res.resume();
38
- resolve(res.statusCode === 200);
165
+ let body = '';
166
+ res.on('data', (d) => body += d);
167
+ res.on('end', () => {
168
+ try {
169
+ const data = JSON.parse(body);
170
+ resolve(data);
171
+ } catch { resolve(null); }
172
+ });
39
173
  });
40
- req.on('error', () => resolve(false));
41
- req.on('timeout', () => { req.destroy(); resolve(false); });
174
+ req.on('error', () => resolve(null));
175
+ req.on('timeout', () => { req.destroy(); resolve(null); });
42
176
  req.end();
43
177
  });
178
+
44
179
  if (alive) {
45
- // Use OpenAI SDK pointed at Ollama
46
- try {
47
- const { default: OpenAI } = await import('openai');
48
- const client = new OpenAI({ baseURL: 'http://127.0.0.1:11434/v1', apiKey: 'ollama' });
49
- return { client, model: 'qwen2.5-coder:14b' };
50
- } catch { /* fall through */ }
180
+ // Pick the best available model
181
+ const models = (alive.models || []).map(m => m.name);
182
+ const preferred = ['qwen2.5:32b', 'cipher:latest', 'qwen2.5-coder:14b', 'qwen2.5:14b', 'llama3.1:70b'];
183
+ let selectedModel = models[0] || 'qwen2.5:32b';
184
+ for (const p of preferred) {
185
+ if (models.includes(p)) { selectedModel = p; break; }
186
+ }
187
+
188
+ const { default: OpenAI } = await import('openai');
189
+ const rawClient = new OpenAI({ baseURL: 'http://127.0.0.1:11434/v1', apiKey: 'ollama' });
190
+ const client = new OllamaAnthropicAdapter(rawClient, selectedModel);
191
+ return { client, model: selectedModel };
51
192
  }
52
193
  } catch { /* fall through */ }
53
194
  }
@@ -9,8 +9,8 @@ import { readFileSync } from 'node:fs';
9
9
  import { basename } from 'node:path';
10
10
 
11
11
  export class BenchmarkConfig {
12
- constructor({ name, description, level, tags = [], winCondition = 'flag', files = [], path = '.' }) {
13
- if (![1, 2, 3].includes(level)) throw new Error(`level must be 1, 2, or 3, got ${level}`);
12
+ constructor({ name, description, level, tags = [], winCondition = 'flag', files = [], path = '.', category = '' }) {
13
+ if (level < 1 || level > 5) throw new Error(`level must be 1-5, got ${level}`);
14
14
  if (!['flag', 'question'].includes(winCondition)) throw new Error(`winCondition must be 'flag' or 'question'`);
15
15
  this.name = name;
16
16
  this.description = description;
@@ -19,6 +19,7 @@ export class BenchmarkConfig {
19
19
  this.winCondition = winCondition;
20
20
  this.files = files;
21
21
  this.path = path;
22
+ this.category = category;
22
23
  }
23
24
 
24
25
  get dirName() { return basename(this.path); }
@@ -61,6 +62,8 @@ export class SolverResult {
61
62
  constructor(opts = {}) {
62
63
  this.flagFound = opts.flagFound ?? false;
63
64
  this.flagValue = opts.flagValue ?? '';
65
+ this.answerFound = opts.answerFound ?? false;
66
+ this.answerValue = opts.answerValue ?? '';
64
67
  this.durationS = opts.durationS ?? 0;
65
68
  this.tokensIn = opts.tokensIn ?? 0;
66
69
  this.tokensOut = opts.tokensOut ?? 0;
@@ -0,0 +1,192 @@
1
+ // Copyright (c) 2026 defconxt. All rights reserved.
2
+ // Licensed under AGPL-3.0 — see LICENSE file for details.
3
+ // CIPHER is a trademark of defconxt.
4
+
5
+ /**
6
+ * NYU CTF Bench — Challenge loader for the NYU CTF benchmark suite.
7
+ *
8
+ * Loads challenges from the NYU CTF Bench format (NeurIPS 2024):
9
+ * - challenge.json with name, description, category, difficulty, flag
10
+ * - Per-challenge docker-compose.yaml for target services
11
+ * - 6 categories: web, pwn, rev, crypto, forensics, misc
12
+ * - 255 challenges across 5 difficulty levels
13
+ *
14
+ * @module benchmark/nyu-ctf
15
+ */
16
+
17
import { existsSync, readdirSync, readFileSync, statSync } from 'node:fs';
import { homedir } from 'node:os';
import { basename, dirname, join } from 'node:path';
import { BenchmarkConfig } from './models.js';
import { CompetitorBaseline } from './models.js';
22
+
23
+ // ---------------------------------------------------------------------------
24
+ // Constants
25
+ // ---------------------------------------------------------------------------
26
+
27
+ export const NYU_CTF_REPO_URL = 'https://github.com/NYU-LLM-CTF/LLM_CTF_Database.git'; // Git URL of the challenge database repository
28
+ export const NYU_CTF_CLONE_DIR = join(homedir(), '.cipher', 'benchmarks', 'nyu-ctf-bench'); // <home>/.cipher/benchmarks/nyu-ctf-bench; presumably the local clone target (confirm against caller)
29
+
30
+ /** NYU CTF challenge category names. loadNyuChallenge uses a challenge's category as its first tag. */
31
+ export const NYU_CATEGORIES = ['web', 'pwn', 'rev', 'crypto', 'forensics', 'misc'];
32
+
33
+ /** Maps a lowercase difficulty label (or a numeric string '1'-'5') to an integer level 1-5. */
34
+ const DIFFICULTY_MAP = {
35
+ 'easy': 1,
36
+ 'medium': 2,
37
+ 'hard': 3,
38
+ 'very hard': 4,
39
+ 'expert': 5,
40
+ // Numeric fallbacks: the difficulty is stringified before lookup, so 1-5 arrive as '1'-'5'
41
+ '1': 1, '2': 2, '3': 3, '4': 4, '5': 5,
42
+ };
43
+
44
+ // ---------------------------------------------------------------------------
45
+ // NYU CTF Baselines
46
+ // ---------------------------------------------------------------------------
47
+
48
+ export const CRAKEN_BASELINE = new CompetitorBaseline({
49
+ name: 'CRAKEN',
50
+ overallPct: 22.0, // 56 / 255 is about 22.0%
51
+ overallPassed: 56,
52
+ overallTotal: 255,
53
+ levelPct: {}, // no per-level breakdown recorded here
54
+ medianCostUsd: 0, // NOTE(review): 0 presumably means "not reported" - confirm
55
+ medianTimeS: 0, // NOTE(review): 0 presumably means "not reported" - confirm
56
+ source: 'NYU CTF Bench — CRAKEN (NeurIPS 2024)',
57
+ });
58
+
59
+ export const DCIPHER_BASELINE = new CompetitorBaseline({
60
+ name: 'D-CIPHER',
61
+ overallPct: 19.0, // 48 / 255 is about 18.8%
62
+ overallPassed: 48,
63
+ overallTotal: 255,
64
+ levelPct: {}, // no per-level breakdown recorded here
65
+ medianCostUsd: 0,
66
+ medianTimeS: 0,
67
+ source: 'NYU CTF Bench — D-CIPHER (NeurIPS 2024)',
68
+ });
69
+
70
+ export const ENIGMA_BASELINE = new CompetitorBaseline({
71
+ name: 'EnIGMA',
72
+ overallPct: 13.5, // NOTE(review): 34 / 255 is about 13.3%, not 13.5% - confirm against the source
73
+ overallPassed: 34,
74
+ overallTotal: 255,
75
+ levelPct: {}, // no per-level breakdown recorded here
76
+ medianCostUsd: 0,
77
+ medianTimeS: 0,
78
+ source: 'NYU CTF Bench — EnIGMA (NeurIPS 2024)',
79
+ });
80
+
81
+ export const NYU_BASELINES = [CRAKEN_BASELINE, DCIPHER_BASELINE, ENIGMA_BASELINE]; // all three baselines above, in declaration order
82
+
83
+ // ---------------------------------------------------------------------------
84
+ // Challenge loader
85
+ // ---------------------------------------------------------------------------
86
+
87
/**
 * Parse a NYU CTF challenge.json file into a BenchmarkConfig.
 *
 * Expected format:
 * {
 *   "name": "challenge-name",
 *   "description": "Challenge description",
 *   "category": "web",
 *   "difficulty": "medium",
 *   "flag": "flag{...}",
 *   "files": ["file1.py", "file2.txt"],
 *   "docker_compose": true
 * }
 *
 * Missing fields fall back to defaults: category 'misc', difficulty 'medium'
 * (level 2), name = the containing directory's name, empty description/files.
 *
 * @param {string} configPath - Path to challenge.json
 * @returns {BenchmarkConfig}
 * @throws {Error} If the file cannot be read, is not valid JSON, or does not
 *   contain a JSON object.
 */
export function loadNyuChallenge(configPath) {
  const data = JSON.parse(readFileSync(configPath, 'utf8'));
  if (data === null || typeof data !== 'object' || Array.isArray(data)) {
    throw new TypeError(`challenge config must be a JSON object: ${configPath}`);
  }

  // dirname() handles both path separators and a bare "challenge.json"
  // (yielding '.'), which a "/"-only regex does not.
  const dir = dirname(configPath);

  const category = String(data.category || 'misc').toLowerCase();
  const difficultyStr = String(data.difficulty || 'medium').trim().toLowerCase();

  // Accept only real map entries: a plain lookup would also resolve inherited
  // keys such as "constructor" to a non-numeric value.
  const mapped = DIFFICULTY_MAP[difficultyStr];
  const level = Number.isInteger(mapped) ? mapped : 2;

  // Build tags from category + any explicit tags. A lone string is treated as
  // one tag rather than being spread into individual characters.
  const tags = [category];
  if (Array.isArray(data.tags)) {
    tags.push(...data.tags);
  } else if (typeof data.tags === 'string' && data.tags !== '') {
    tags.push(data.tags);
  }

  // Normalise file entries to objects; bare strings become { name }.
  const fileEntries = Array.isArray(data.files) ? data.files : [];
  const files = fileEntries.map((f) => (typeof f === 'string' ? { name: f } : f));

  return new BenchmarkConfig({
    name: data.name || basename(dir),
    description: data.description || '',
    level,
    tags,
    winCondition: 'flag',
    files,
    path: dir,
    category,
  });
}
130
+
131
/**
 * Enumerate all NYU CTF challenges in a directory.
 *
 * Walks the directory tree looking for challenge.json files.
 * Supports both flat (all in one dir) and nested (category/challenge/) layouts.
 * A directory that contains challenge.json is treated as a leaf: its
 * subdirectories are not scanned. Recursion stops below depth 3.
 *
 * The scan is best-effort: unreadable directories, dangling symlinks and
 * malformed challenge files are skipped instead of aborting the enumeration.
 *
 * @param {string} baseDir - Root directory to scan
 * @returns {BenchmarkConfig[]} Configs in sorted directory-walk order; empty
 *   when baseDir does not exist or is not a readable directory.
 */
export function enumerateNyuChallenges(baseDir) {
  const configs = [];
  if (!existsSync(baseDir)) return configs;

  // statSync follows symlinks and throws on dangling ones; treat those as
  // "not a directory" so one bad entry cannot abort the scan.
  const isDirectory = (path) => {
    try {
      return statSync(path).isDirectory();
    } catch {
      return false;
    }
  };

  function walk(dir, depth = 0) {
    if (depth > 3) return; // Don't recurse too deep

    let entries;
    try {
      entries = readdirSync(dir).sort();
    } catch {
      return; // not a directory, or not readable
    }

    const configPath = join(dir, 'challenge.json');
    if (entries.includes('challenge.json') && !isDirectory(configPath)) {
      try {
        configs.push(loadNyuChallenge(configPath));
      } catch { /* skip malformed */ }
      return; // Don't recurse further in this dir
    }

    for (const entry of entries) {
      const entryPath = join(dir, entry);
      if (isDirectory(entryPath)) walk(entryPath, depth + 1);
    }
  }

  walk(baseDir);
  return configs;
}
167
+
168
/**
 * Get the expected flag for a NYU CTF challenge.
 *
 * Always honours the documented `string|null` contract: a numeric flag is
 * returned as its string form, and any other non-string value (object, array,
 * boolean), an empty string, or a missing field yields null.
 *
 * @param {string} configPath - Path to challenge.json
 * @returns {string|null} The flag, or null if the file is unreadable, is not
 *   valid JSON, or carries no usable flag.
 */
export function getNyuFlag(configPath) {
  try {
    const data = JSON.parse(readFileSync(configPath, 'utf8'));
    const flag = data?.flag;
    if (typeof flag === 'number' && Number.isFinite(flag)) return String(flag);
    return typeof flag === 'string' && flag !== '' ? flag : null;
  } catch {
    // Best-effort lookup: unreadable or malformed config means "no flag".
    return null;
  }
}
182
+
183
/**
 * Check if a directory has a Compose file for the challenge.
 *
 * Recognises the legacy names `docker-compose.yml` / `docker-compose.yaml`
 * as well as the Compose v2 default names `compose.yaml` / `compose.yml`.
 *
 * @param {string} challengeDir
 * @returns {boolean} True when any recognised Compose file exists in the directory.
 */
export function hasDockerCompose(challengeDir) {
  const composeFileNames = [
    'docker-compose.yml',
    'docker-compose.yaml',
    'compose.yaml',
    'compose.yml',
  ];
  return composeFileNames.some((name) => existsSync(join(challengeDir, name)));
}