cipher-security 2.1.0 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/bin/cipher.js +10 -0
  2. package/lib/analyze/consistency.js +566 -0
  3. package/lib/analyze/constitution.js +110 -0
  4. package/lib/analyze/sharding.js +251 -0
  5. package/lib/autonomous/agent-tool.js +165 -0
  6. package/lib/autonomous/framework.js +17 -0
  7. package/lib/autonomous/handoff.js +506 -0
  8. package/lib/autonomous/modes/blue.js +26 -0
  9. package/lib/autonomous/modes/red.js +28 -0
  10. package/lib/benchmark/agent.js +88 -26
  11. package/lib/benchmark/baselines.js +3 -0
  12. package/lib/benchmark/claude-code-solver.js +254 -0
  13. package/lib/benchmark/cognitive.js +283 -0
  14. package/lib/benchmark/index.js +12 -2
  15. package/lib/benchmark/knowledge.js +281 -0
  16. package/lib/benchmark/llm.js +156 -15
  17. package/lib/benchmark/models.js +5 -2
  18. package/lib/benchmark/nyu-ctf.js +192 -0
  19. package/lib/benchmark/overthewire.js +347 -0
  20. package/lib/benchmark/picoctf.js +281 -0
  21. package/lib/benchmark/prompts.js +280 -0
  22. package/lib/benchmark/registry.js +219 -0
  23. package/lib/benchmark/remote-solver.js +356 -0
  24. package/lib/benchmark/remote-target.js +263 -0
  25. package/lib/benchmark/reporter.js +35 -0
  26. package/lib/benchmark/runner.js +174 -10
  27. package/lib/benchmark/sandbox.js +35 -0
  28. package/lib/benchmark/scorer.js +22 -4
  29. package/lib/benchmark/solver.js +34 -1
  30. package/lib/benchmark/tools.js +262 -16
  31. package/lib/commands.js +9 -0
  32. package/lib/execution/council.js +434 -0
  33. package/lib/execution/parallel.js +292 -0
  34. package/lib/gates/circuit-breaker.js +135 -0
  35. package/lib/gates/confidence.js +302 -0
  36. package/lib/gates/corrections.js +219 -0
  37. package/lib/gates/self-check.js +245 -0
  38. package/lib/gateway/commands.js +727 -0
  39. package/lib/guardrails/engine.js +364 -0
  40. package/lib/mcp/server.js +349 -3
  41. package/lib/memory/compressor.js +94 -7
  42. package/lib/pipeline/hooks.js +288 -0
  43. package/lib/pipeline/index.js +11 -0
  44. package/lib/review/budget.js +210 -0
  45. package/lib/review/engine.js +526 -0
  46. package/lib/review/layers/acceptance-auditor.js +279 -0
  47. package/lib/review/layers/blind-hunter.js +500 -0
  48. package/lib/review/layers/defense-in-depth.js +209 -0
  49. package/lib/review/layers/edge-case-hunter.js +266 -0
  50. package/lib/review/panel.js +519 -0
  51. package/lib/review/two-stage.js +244 -0
  52. package/lib/session/cost-tracker.js +203 -0
  53. package/lib/session/logger.js +349 -0
  54. package/package.json +1 -1
@@ -3,6 +3,9 @@
3
3
 
4
4
  /**
5
5
  * CIPHER Benchmark — Runner (orchestrates build → start → solve → score → report).
6
+ *
7
+ * Supports parallel execution with concurrency limits and retry/best-of-N
8
+ * for reliability on flaky benchmarks.
6
9
  */
7
10
 
8
11
  import { BenchmarkBuilder } from './builder.js';
@@ -11,18 +14,18 @@ import { SolverResult } from './models.js';
11
14
  import { aggregateResults, scoreResult } from './scorer.js';
12
15
  import { getSolver } from './solver.js';
13
16
 
14
- export function runSingleBenchmark(builder, solver, name, flag) {
17
+ // ---------------------------------------------------------------------------
18
+ // Single benchmark execution
19
+ // ---------------------------------------------------------------------------
20
+
21
+ export async function runSingleBenchmark(builder, solver, name, flag) {
15
22
  const config = builder.getBenchmark(name);
16
23
  flag = flag || builder.config.generateFlag(name);
17
24
  let targetUrl = '';
18
25
  try {
19
26
  const result = builder.runBenchmark(name, flag);
20
27
  targetUrl = result.targetUrl;
21
- const solverResult = solver.solve(config, targetUrl, result.expectedFlag);
22
- // Handle promise (autonomous) or sync (stub)
23
- if (solverResult && typeof solverResult.then === 'function') {
24
- return solverResult.then((sr) => scoreResult(config, sr, flag, targetUrl));
25
- }
28
+ const solverResult = await solver.solve(config, targetUrl, result.expectedFlag);
26
29
  return scoreResult(config, solverResult, flag, targetUrl);
27
30
  } catch (e) {
28
31
  return scoreResult(config, new SolverResult({ error: String(e) }), flag, targetUrl);
@@ -31,7 +34,85 @@ export function runSingleBenchmark(builder, solver, name, flag) {
31
34
  }
32
35
  }
33
36
 
34
- export async function runBenchmarks({ builder, solver, benchmarkNames, runAll = false, levelFilter, tagFilter }) {
37
+ // ---------------------------------------------------------------------------
38
+ // Retry logic — best-of-N
39
+ // ---------------------------------------------------------------------------
40
+
41
/**
 * Run a single benchmark with retries, keeping the best result.
 *
 * Attempts run sequentially and stop early as soon as one passes. Results
 * are ranked passed > failed-without-error > errored (see _isBetterResult),
 * so the returned value is the best attempt seen.
 *
 * @param {BenchmarkBuilder} builder
 * @param {object} solver
 * @param {string} name
 * @param {object} [opts]
 * @param {number} [opts.retries=1] - Total attempts (1 = no retry; values < 1 are clamped to 1)
 * @param {string} [opts.flag] - Pre-generated flag; defaults to builder.config.generateFlag(name)
 * @param {Function} [opts.onAttempt] - Called with (attemptNum, result) after each attempt
 * @returns {Promise<import('./models.js').BenchmarkResult>} Best result across attempts (never null)
 */
export async function runWithRetry(builder, solver, name, opts = {}) {
  // Clamp to at least one attempt: retries <= 0 previously skipped the loop
  // entirely and resolved to null, which broke downstream aggregation.
  const maxAttempts = Math.max(1, opts.retries ?? 1);
  // Generate the flag once so every attempt targets the same expected value.
  const flag = opts.flag || builder.config.generateFlag(name);
  let bestResult = null;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const result = await runSingleBenchmark(builder, solver, name, flag);

    if (opts.onAttempt) opts.onAttempt(attempt, result);

    // Keep the best result: passed > failed > error
    if (!bestResult || _isBetterResult(result, bestResult)) {
      bestResult = result;
    }

    // If passed, no need to retry
    if (result.passed) break;
  }

  return bestResult;
}
74
+
75
/**
 * Compare two results — passed beats failed, failed beats error.
 * Ties on pass/fail fall through to error status, then to tool-call count
 * as a rough proxy for how far the solver got.
 * @param {import('./models.js').BenchmarkResult} a
 * @param {import('./models.js').BenchmarkResult} b
 * @returns {boolean} True if a is better than b
 */
function _isBetterResult(a, b) {
  const aPassed = Boolean(a.passed);
  const bPassed = Boolean(b.passed);
  if (aPassed !== bPassed) return aPassed;

  const aClean = !a.solverResult.error;
  const bClean = !b.solverResult.error;
  if (aClean !== bClean) return aClean;

  // Same pass/fail and error status — prefer more tool calls (more progress).
  return a.solverResult.toolCalls > b.solverResult.toolCalls;
}
90
+
91
+ // ---------------------------------------------------------------------------
92
+ // Parallel execution with concurrency limit
93
+ // ---------------------------------------------------------------------------
94
+
95
+ /**
96
+ * Run benchmarks with parallel execution and optional retries.
97
+ *
98
+ * @param {object} opts
99
+ * @param {BenchmarkBuilder} opts.builder
100
+ * @param {object} opts.solver
101
+ * @param {string[]} [opts.benchmarkNames]
102
+ * @param {boolean} [opts.runAll=false]
103
+ * @param {number} [opts.levelFilter]
104
+ * @param {string} [opts.tagFilter]
105
+ * @param {number} [opts.concurrency=1] - Max parallel benchmarks
106
+ * @param {number} [opts.retries=1] - Attempts per benchmark
107
+ * @param {Function} [opts.onResult] - Called with (name, result) after each benchmark
108
+ * @param {Function} [opts.onTrace] - Called with trace event objects for observability
109
+ * @returns {Promise<import('./models.js').RunReport>}
110
+ */
111
+ export async function runBenchmarks({
112
+ builder, solver, benchmarkNames, runAll = false,
113
+ levelFilter, tagFilter, concurrency = 1, retries = 1,
114
+ onResult, onTrace,
115
+ }) {
35
116
  let allBenchmarks;
36
117
  try {
37
118
  allBenchmarks = builder.listBenchmarks();
@@ -61,14 +142,94 @@ export async function runBenchmarks({ builder, solver, benchmarkNames, runAll =
61
142
  }
62
143
 
63
144
  const results = [];
64
- for (const name of targets) {
65
- const result = await runSingleBenchmark(builder, solver, name);
66
- results.push(result);
145
+
146
+ if (concurrency <= 1) {
147
+ // Sequential execution (original behavior)
148
+ for (const name of targets) {
149
+ const startTime = performance.now();
150
+ const result = await runWithRetry(builder, solver, name, { retries });
151
+ const durationMs = performance.now() - startTime;
152
+
153
+ results.push(result);
154
+ if (onResult) onResult(name, result);
155
+ if (onTrace) onTrace({
156
+ type: 'benchmark_complete',
157
+ benchmark: name,
158
+ passed: result.passed,
159
+ durationMs: Math.round(durationMs),
160
+ attempts: retries,
161
+ error: result.solverResult.error,
162
+ });
163
+ }
164
+ } else {
165
+ // Parallel execution with concurrency limit
166
+ const inFlight = new Set();
167
+ const queue = [...targets];
168
+
169
+ while (queue.length > 0 || inFlight.size > 0) {
170
+ // Fill up to concurrency limit
171
+ while (queue.length > 0 && inFlight.size < concurrency) {
172
+ const name = queue.shift();
173
+ const startTime = performance.now();
174
+ const promise = runWithRetry(builder, solver, name, { retries })
175
+ .then((result) => {
176
+ const durationMs = performance.now() - startTime;
177
+ results.push(result);
178
+ inFlight.delete(promise);
179
+ if (onResult) onResult(name, result);
180
+ if (onTrace) onTrace({
181
+ type: 'benchmark_complete',
182
+ benchmark: name,
183
+ passed: result.passed,
184
+ durationMs: Math.round(durationMs),
185
+ attempts: retries,
186
+ error: result.solverResult.error,
187
+ });
188
+ })
189
+ .catch((err) => {
190
+ results.push({
191
+ config: builder.getBenchmark(name),
192
+ solverResult: new SolverResult({ error: String(err) }),
193
+ passed: false,
194
+ expectedFlag: '',
195
+ actualFlag: '',
196
+ });
197
+ inFlight.delete(promise);
198
+ if (onTrace) onTrace({
199
+ type: 'benchmark_error',
200
+ benchmark: name,
201
+ error: String(err),
202
+ });
203
+ });
204
+ inFlight.add(promise);
205
+ }
206
+
207
+ // Wait for at least one to complete
208
+ if (inFlight.size > 0) {
209
+ await Promise.race(inFlight);
210
+ }
211
+ }
67
212
  }
213
+
68
214
  return aggregateResults(results);
69
215
  }
70
216
 
217
+ // ---------------------------------------------------------------------------
218
+ // Report serialization
219
+ // ---------------------------------------------------------------------------
220
+
71
221
  export function reportToDict(report) {
222
+ const byTag = report.resultsByTag();
223
+ const tagRates = {};
224
+ for (const [tag, tagResults] of Object.entries(byTag)) {
225
+ const p = tagResults.filter(r => r.passed).length;
226
+ tagRates[tag] = {
227
+ total: tagResults.length,
228
+ passed: p,
229
+ rate: tagResults.length ? Math.round((p / tagResults.length) * 1000) / 10 : 0,
230
+ };
231
+ }
232
+
72
233
  return {
73
234
  summary: {
74
235
  total: report.total,
@@ -81,6 +242,7 @@ export function reportToDict(report) {
81
242
  pass_rate_by_level: Object.fromEntries(
82
243
  Object.entries(report.passRateByLevel()).map(([k, v]) => [k, Math.round(v * 10) / 10]),
83
244
  ),
245
+ pass_rate_by_tag: tagRates,
84
246
  },
85
247
  results: report.results.map((r) => ({
86
248
  benchmark: r.config.dirName,
@@ -101,3 +263,5 @@ export function reportToDict(report) {
101
263
  })),
102
264
  };
103
265
  }
266
+
267
+ export { _isBetterResult };
@@ -77,6 +77,41 @@ export class SandboxContainer {
77
77
  this._connectedNetworks.push(networkName);
78
78
  }
79
79
 
80
+ /**
81
+ * Copy a file from the host into the sandbox container.
82
+ * @param {string} hostPath - Absolute path on the host
83
+ * @param {string} containerPath - Path inside the container (default: /tmp/<filename>)
84
+ */
85
+ copyFileIn(hostPath, containerPath) {
86
+ if (!this._containerId) throw new SandboxError('Container not created.');
87
+ const result = spawnSync('docker', ['cp', hostPath, `${this._containerId}:${containerPath}`], {
88
+ timeout: 30000,
89
+ stdio: 'pipe',
90
+ });
91
+ if (result.status !== 0) {
92
+ throw new SandboxError(`Failed to copy ${hostPath} to container: ${(result.stderr || '').toString().slice(0, 200)}`);
93
+ }
94
+ }
95
+
96
+ /**
97
+ * Parse a docker-compose.yml and return all service URLs.
98
+ * @param {object} services - Parsed docker-compose services object
99
+ * @param {string} networkName - Docker network the sandbox is connected to
100
+ * @returns {Array<{ name: string, url: string, port: number }>}
101
+ */
102
+ static resolveServiceUrls(services, networkName) {
103
+ const urls = [];
104
+ for (const [name, svc] of Object.entries(services)) {
105
+ let port = 80;
106
+ if (svc.ports?.length) {
107
+ port = parseInt(String(svc.ports[0]).split(':').pop(), 10);
108
+ }
109
+ const protocol = [443, 8443].includes(port) ? 'https' : 'http';
110
+ urls.push({ name, url: `${protocol}://${name}:${port}`, port });
111
+ }
112
+ return urls;
113
+ }
114
+
80
115
  getInternalUrl(serviceName, port = 80) {
81
116
  const protocol = [443, 8443].includes(port) ? 'https' : 'http';
82
117
  return `${protocol}://${serviceName}:${port}`;
@@ -13,10 +13,28 @@ export function scoreFlag(expected, actual) {
13
13
  }
14
14
 
15
15
export function scoreResult(config, solverResult, expectedFlag, targetUrl = '') {
  // Question-type challenges are graded on the submitted answer; everything
  // else is graded on the captured flag. Both use the same comparator.
  const isQuestion = config.winCondition === 'question';
  const found = isQuestion ? solverResult.answerFound : solverResult.flagFound;
  const value = isQuestion ? solverResult.answerValue : solverResult.flagValue;

  let passed = false;
  let actualValue = '';
  if (found && value) {
    actualValue = value;
    passed = scoreFlag(expectedFlag, value);
  }

  return new BenchmarkResult({
    config, solverResult, passed, expectedFlag,
    // Record whatever the solver produced, even if it was the wrong kind
    // of submission for this win condition.
    actualFlag: actualValue || solverResult.flagValue || solverResult.answerValue || '',
    targetUrl,
  });
}
21
39
 
22
40
  export function aggregateResults(results) {
@@ -159,8 +159,41 @@ export class MultiAgentSolver extends SolverAdapter {
159
159
 
160
160
  export const SOLVERS = { stub: StubSolver, manual: ManualSolver, autonomous: AutonomousSolver, 'autonomous-multi': MultiAgentSolver };
161
161
 
162
// Lazy-load RemoteSolver to avoid circular imports
export function getSolver(name, opts = {}) {
  switch (name) {
    case 'remote':
      return new RemoteSolverProxy(opts);
    case 'claude-code':
      return new ClaudeCodeSolverProxy(opts);
    default: {
      const SolverClass = SOLVERS[name];
      if (!SolverClass) {
        const available = [...Object.keys(SOLVERS), 'remote', 'claude-code'].join(', ');
        throw new Error(`Unknown solver: ${name}. Available: ${available}`);
      }
      return new SolverClass(opts);
    }
  }
}
174
+
175
/** Proxy that lazy-loads RemoteSolver on first solve(). */
class RemoteSolverProxy extends SolverAdapter {
  constructor(opts) {
    super();
    this._opts = opts;
    this._impl = null;
  }

  get name() {
    return 'remote';
  }

  async solve(config, targetUrl, expectedFlag) {
    // Defer the module load until first use so importing solver.js never
    // pulls in remote-solver.js eagerly (avoids the circular import).
    if (this._impl === null) {
      const mod = await import('./remote-solver.js');
      this._impl = new mod.RemoteSolver(this._opts);
    }
    return this._impl.solve(config, targetUrl, expectedFlag);
  }
}
187
+
188
/** Proxy that lazy-loads ClaudeCodeSolver on first solve(). */
class ClaudeCodeSolverProxy extends SolverAdapter {
  constructor(opts) {
    super();
    this._opts = opts;
    this._impl = null;
  }

  get name() {
    return 'claude-code';
  }

  async solve(config, targetUrl, expectedFlag) {
    // Lazy instantiation mirrors RemoteSolverProxy: load the implementation
    // module only when a solve is actually requested.
    if (this._impl === null) {
      const mod = await import('./claude-code-solver.js');
      this._impl = new mod.ClaudeCodeSolver(this._opts);
    }
    return this._impl.solve(config, targetUrl, expectedFlag);
  }
}
@@ -3,10 +3,23 @@
3
3
 
4
4
  /**
5
5
  * CIPHER Benchmark — Tool definitions for the security agent.
6
+ *
7
+ * 7 tools for autonomous CTF solving:
8
+ * - sandbox_exec: execute shell commands
9
+ * - submit_flag: submit a captured flag (flag win condition)
10
+ * - submit_answer: submit an answer (question win condition)
11
+ * - read_file: read a file from the sandbox filesystem
12
+ * - write_file: write a file to the sandbox filesystem
13
+ * - list_directory: list directory contents in the sandbox
14
+ * - http_request: make structured HTTP requests with headers/body
6
15
  */
7
16
 
8
17
  const MAX_OUTPUT_CHARS = 4000;
9
18
 
19
+ // ---------------------------------------------------------------------------
20
+ // Tool schemas
21
+ // ---------------------------------------------------------------------------
22
+
10
23
  export const SANDBOX_EXEC_SCHEMA = {
11
24
  name: 'sandbox_exec',
12
25
  description: 'Execute a shell command in the security sandbox. Returns stdout, stderr, and exit code.',
@@ -32,31 +45,264 @@ export const SUBMIT_FLAG_SCHEMA = {
32
45
  },
33
46
  };
34
47
 
35
- export const AGENT_TOOLS = [SANDBOX_EXEC_SCHEMA, SUBMIT_FLAG_SCHEMA];
48
// Schema objects below appear to follow the Anthropic tool-use convention
// (name / description / input_schema as JSON Schema) — confirm against the
// client that consumes them.

// Submission channel for question-type win conditions (see scorer.js).
export const SUBMIT_ANSWER_SCHEMA = {
  name: 'submit_answer',
  description: 'Submit an answer to a question-type challenge. Use this instead of submit_flag when the challenge asks a question rather than requiring flag capture.',
  input_schema: {
    type: 'object',
    properties: {
      answer: { type: 'string', description: 'The answer to the challenge question' },
    },
    required: ['answer'],
  },
};

// Filesystem read; dispatchTool implements this via `cat`/`base64` in the sandbox.
export const READ_FILE_SCHEMA = {
  name: 'read_file',
  description: 'Read a file from the sandbox filesystem. Returns the file contents (truncated if too large).',
  input_schema: {
    type: 'object',
    properties: {
      path: { type: 'string', description: 'Absolute path to the file in the sandbox' },
      encoding: { type: 'string', description: 'File encoding (default: utf-8). Use "base64" for binary files.', default: 'utf-8' },
    },
    required: ['path'],
  },
};

// Filesystem write; dispatchTool implements this via mkdir/printf/chmod.
export const WRITE_FILE_SCHEMA = {
  name: 'write_file',
  description: 'Write content to a file in the sandbox filesystem. Creates parent directories if needed.',
  input_schema: {
    type: 'object',
    properties: {
      path: { type: 'string', description: 'Absolute path for the file in the sandbox' },
      content: { type: 'string', description: 'Content to write to the file' },
      mode: { type: 'string', description: 'File permissions (default: 644)', default: '644' },
    },
    required: ['path', 'content'],
  },
};

// Directory listing; no required properties — path defaults to '/'.
export const LIST_DIR_SCHEMA = {
  name: 'list_directory',
  description: 'List files and directories at a path in the sandbox. Returns a detailed listing with permissions and sizes.',
  input_schema: {
    type: 'object',
    properties: {
      path: { type: 'string', description: 'Directory path to list (default: /)', default: '/' },
    },
  },
};

// Structured HTTP; dispatchTool shells out to curl inside the sandbox.
export const HTTP_REQUEST_SCHEMA = {
  name: 'http_request',
  description: 'Make an HTTP request from within the sandbox. Returns status code, headers, and body.',
  input_schema: {
    type: 'object',
    properties: {
      url: { type: 'string', description: 'Target URL' },
      method: { type: 'string', description: 'HTTP method (default: GET)', default: 'GET' },
      headers: { type: 'object', description: 'Request headers as key-value pairs' },
      body: { type: 'string', description: 'Request body (for POST/PUT)' },
      follow_redirects: { type: 'boolean', description: 'Follow redirects (default: true)', default: true },
    },
    required: ['url'],
  },
};

// ---------------------------------------------------------------------------
// Binary analysis tools (Ghidra)
// ---------------------------------------------------------------------------

// Disassembly via objdump (see dispatchTool's 'disassemble' case).
export const DISASSEMBLE_SCHEMA = {
  name: 'disassemble',
  description: 'Disassemble a binary file at a specific function or address. Uses objdump/radare2 in the sandbox. For detailed analysis of binary challenges.',
  input_schema: {
    type: 'object',
    properties: {
      path: { type: 'string', description: 'Path to the binary in the sandbox' },
      function_name: { type: 'string', description: 'Function name to disassemble (e.g. "main")' },
      address: { type: 'string', description: 'Address to disassemble from (e.g. "0x401000")' },
    },
    required: ['path'],
  },
};

// Decompilation: Ghidra headless when present, static analysis otherwise.
export const DECOMPILE_SCHEMA = {
  name: 'decompile',
  description: 'Decompile a binary to pseudo-C code. Uses Ghidra headless analysis if available, or falls back to objdump + strings analysis.',
  input_schema: {
    type: 'object',
    properties: {
      path: { type: 'string', description: 'Path to the binary in the sandbox' },
      function_name: { type: 'string', description: 'Function to decompile (default: all)' },
    },
    required: ['path'],
  },
};

// ---------------------------------------------------------------------------
// Tool collections
// ---------------------------------------------------------------------------

// NOTE(review): mid-file import — ES module imports are hoisted so behavior is
// unchanged, but convention puts this at the top of the file.
import { COGNITIVE_TOOLS } from './cognitive.js';

/** Tools for flag-based challenges (includes cognitive + binary tools). */
export const FLAG_TOOLS = [
  SANDBOX_EXEC_SCHEMA, SUBMIT_FLAG_SCHEMA,
  READ_FILE_SCHEMA, WRITE_FILE_SCHEMA, LIST_DIR_SCHEMA, HTTP_REQUEST_SCHEMA,
  DISASSEMBLE_SCHEMA, DECOMPILE_SCHEMA,
  ...COGNITIVE_TOOLS,
];

/** Tools for question-based challenges (includes cognitive tools). */
export const QUESTION_TOOLS = [
  SANDBOX_EXEC_SCHEMA, SUBMIT_ANSWER_SCHEMA,
  READ_FILE_SCHEMA, WRITE_FILE_SCHEMA, LIST_DIR_SCHEMA, HTTP_REQUEST_SCHEMA,
  ...COGNITIVE_TOOLS,
];

/** All tools (backward compat — includes both submit types + cognitive + binary). */
export const AGENT_TOOLS = [
  SANDBOX_EXEC_SCHEMA, SUBMIT_FLAG_SCHEMA, SUBMIT_ANSWER_SCHEMA,
  READ_FILE_SCHEMA, WRITE_FILE_SCHEMA, LIST_DIR_SCHEMA, HTTP_REQUEST_SCHEMA,
  DISASSEMBLE_SCHEMA, DECOMPILE_SCHEMA,
  ...COGNITIVE_TOOLS,
];
173
+
174
+ /**
175
+ * Get the appropriate tool set for a win condition.
176
+ * @param {'flag' | 'question'} winCondition
177
+ * @returns {object[]}
178
+ */
179
+ export function getToolsForWinCondition(winCondition) {
180
+ return winCondition === 'question' ? QUESTION_TOOLS : FLAG_TOOLS;
181
+ }
182
+
183
+ // ---------------------------------------------------------------------------
184
+ // Tool dispatch
185
+ // ---------------------------------------------------------------------------
36
186
 
37
187
/**
 * Dispatch a tool call within the sandbox.
 *
 * Pure dispatcher: every filesystem/network tool is implemented by shelling
 * out through sandbox.execTool, so all side effects stay inside the
 * container. Unknown tool names are reported in-band rather than thrown.
 *
 * @param {string} toolName - One of the schema names (see AGENT_TOOLS)
 * @param {object} toolInput - Arguments matching the tool's input_schema
 * @param {import('./sandbox.js').SandboxContainer} sandbox
 * @returns {{ output: string, flagSubmitted?: string, answerSubmitted?: string }}
 */
export function dispatchTool(toolName, toolInput, sandbox) {
  switch (toolName) {
    case 'sandbox_exec': {
      const { command, timeout = 60 } = toolInput;
      const result = sandbox.execTool(command, timeout);
      let output = `Exit code: ${result.exitCode}\n`;
      // Truncate both streams independently to keep model context bounded.
      if (result.stdout) output += `STDOUT:\n${result.stdout.slice(0, MAX_OUTPUT_CHARS)}\n`;
      if (result.stderr) output += `STDERR:\n${result.stderr.slice(0, MAX_OUTPUT_CHARS)}\n`;
      if (result.stdout.length > MAX_OUTPUT_CHARS || result.stderr.length > MAX_OUTPUT_CHARS) {
        output += '[output truncated]\n';
      }
      return { output };
    }

    // Submissions carry the value out-of-band (flagSubmitted/answerSubmitted)
    // so the agent loop can detect completion without parsing output text.
    case 'submit_flag':
      return { output: `Flag submitted: ${toolInput.flag}`, flagSubmitted: toolInput.flag };

    case 'submit_answer':
      return { output: `Answer submitted: ${toolInput.answer}`, answerSubmitted: toolInput.answer };

    case 'read_file': {
      const { path, encoding = 'utf-8' } = toolInput;
      // Single-quote the path for the shell; embedded quotes become '\''.
      const catCmd = encoding === 'base64'
        ? `base64 < '${path.replace(/'/g, "'\\''")}'`
        : `cat '${path.replace(/'/g, "'\\''")}'`;
      const result = sandbox.execTool(catCmd);
      if (result.exitCode !== 0) {
        return { output: `Error reading ${path}: ${result.stderr.slice(0, 500)}` };
      }
      const content = result.stdout.slice(0, MAX_OUTPUT_CHARS);
      const truncated = result.stdout.length > MAX_OUTPUT_CHARS ? '\n[truncated]' : '';
      return { output: `File: ${path}\n${content}${truncated}` };
    }

    case 'write_file': {
      const { path, content, mode = '644' } = toolInput;
      // Escape content for shell
      const escaped = content.replace(/'/g, "'\\''");
      // NOTE(review): dirCmd strips the final path component to get the parent
      // directory; for a path with no parent (e.g. "/file") this yields
      // mkdir -p '' which fails — confirm callers always pass nested paths.
      const dirCmd = `mkdir -p '${path.replace(/'/g, "'\\''").replace(/\/[^/]+$/, '')}'`;
      const writeCmd = `${dirCmd} && printf '%s' '${escaped}' > '${path.replace(/'/g, "'\\''")}'`;
      const chmodCmd = `${writeCmd} && chmod ${mode} '${path.replace(/'/g, "'\\''")}'`;
      const result = sandbox.execTool(chmodCmd);
      if (result.exitCode !== 0) {
        return { output: `Error writing ${path}: ${result.stderr.slice(0, 500)}` };
      }
      return { output: `Written ${content.length} bytes to ${path} (mode ${mode})` };
    }

    case 'list_directory': {
      const dir = toolInput.path || '/';
      const result = sandbox.execTool(`ls -la '${dir.replace(/'/g, "'\\''")}'`);
      if (result.exitCode !== 0) {
        return { output: `Error listing ${dir}: ${result.stderr.slice(0, 500)}` };
      }
      return { output: result.stdout.slice(0, MAX_OUTPUT_CHARS) };
    }

    case 'http_request': {
      const { url, method = 'GET', headers = {}, body, follow_redirects = true } = toolInput;
      // -i includes response headers; -s/-S silence the progress meter while
      // keeping real errors on stderr.
      const curlParts = ['curl', '-s', '-S', '-i'];
      if (!follow_redirects) curlParts.push('--max-redirs', '0');
      else curlParts.push('-L');
      curlParts.push('-X', method.toUpperCase());
      for (const [key, value] of Object.entries(headers)) {
        curlParts.push('-H', `'${key}: ${String(value).replace(/'/g, "'\\''")}'`);
      }
      if (body) {
        curlParts.push('-d', `'${body.replace(/'/g, "'\\''")}'`);
      }
      curlParts.push(`'${url.replace(/'/g, "'\\''")}'`);
      const result = sandbox.execTool(curlParts.join(' '));
      if (result.exitCode !== 0) {
        return { output: `HTTP error: ${result.stderr.slice(0, 500)}` };
      }
      return { output: result.stdout.slice(0, MAX_OUTPUT_CHARS) };
    }

    case 'disassemble': {
      const { path, function_name, address } = toolInput;
      let cmd;
      if (function_name) {
        // awk range: from the function's label line to the next blank line.
        cmd = `objdump -d '${path.replace(/'/g, "'\\''")}' | awk '/<${function_name}>:/,/^$/'`;
      } else if (address) {
        cmd = `objdump -d --start-address=${address} '${path.replace(/'/g, "'\\''")}' | head -100`;
      } else {
        cmd = `objdump -d '${path.replace(/'/g, "'\\''")}' | head -200`;
      }
      const result = sandbox.execTool(cmd);
      if (result.exitCode !== 0) {
        // Fallback: try file + strings
        const fallback = sandbox.execTool(`file '${path.replace(/'/g, "'\\''")}' && strings '${path.replace(/'/g, "'\\''")}' | head -100`);
        return { output: `Disassembly failed, fallback:\n${fallback.stdout.slice(0, MAX_OUTPUT_CHARS)}` };
      }
      return { output: result.stdout.slice(0, MAX_OUTPUT_CHARS) };
    }

    case 'decompile': {
      const { path, function_name } = toolInput;
      // Try Ghidra headless if available, otherwise fallback to analysis
      const ghidraCheck = sandbox.execTool('which analyzeHeadless 2>/dev/null');
      if (ghidraCheck.exitCode === 0) {
        const funcArg = function_name ? `-process "${function_name}"` : '';
        const cmd = `analyzeHeadless /tmp/ghidra_project proj -import '${path.replace(/'/g, "'\\''")}' -postScript DecompileAll.java ${funcArg} 2>&1 | tail -200`;
        // 120s budget: headless analysis imports and auto-analyzes the binary.
        const result = sandbox.execTool(cmd, 120);
        return { output: result.stdout.slice(0, MAX_OUTPUT_CHARS) };
      }
      // Fallback: objdump + strings + file analysis
      // NOTE(review): the "\\n" below reaches the shell as a literal
      // backslash-n; plain `echo` (without -e) prints it verbatim — confirm
      // the section separators render as intended.
      const analysis = sandbox.execTool(`echo "=== File Info ===" && file '${path.replace(/'/g, "'\\''")}' && echo "\\n=== Sections ===" && objdump -h '${path.replace(/'/g, "'\\''")}' 2>/dev/null | head -30 && echo "\\n=== Strings ===" && strings '${path.replace(/'/g, "'\\''")}' | head -50`);
      return { output: `Ghidra not available. Static analysis:\n${analysis.stdout.slice(0, MAX_OUTPUT_CHARS)}` };
    }

    default:
      return { output: `Unknown tool: ${toolName}` };
  }
}
package/lib/commands.js CHANGED
@@ -49,10 +49,19 @@ export const COMMAND_MODES = {
49
49
  // ── Passthrough commands (direct Python spawn, full terminal) ──────────
50
50
  // These were Python-dependent — now ported to Node.js.
51
51
  scan: { mode: 'native', description: 'Run a security scan' },
52
+ review: { mode: 'native', description: 'Multi-layer code review (3 parallel analyzers)' },
53
+ panel: { mode: 'native', description: 'Expert panel security assessment (3 personas)' },
54
+ shard: { mode: 'native', description: 'Split large docs into semantic chunks' },
55
+ guardrail: { mode: 'native', description: 'Test input/output guardrails' },
56
+ analyze: { mode: 'native', description: 'Cross-artifact consistency analysis' },
52
57
  bot: { mode: 'native', description: 'Manage bot integrations (long-running service)' },
53
58
  mcp: { mode: 'native', description: 'MCP server tools (long-running service)' },
54
59
  api: { mode: 'native', description: 'API management (long-running server)' },
55
60
  'setup-signal': { mode: 'native', description: 'Configure Signal integration' },
61
+ chain: { mode: 'native', description: 'Run multi-mode agent chain (e.g. red,purple,blue)' },
62
+ council: { mode: 'native', description: 'Multi-model consensus evaluation' },
63
+ resume: { mode: 'native', description: 'Resume interrupted autonomous sessions' },
64
+ benchmark: { mode: 'native', description: 'Run benchmark suites (XBOW, NYU CTF, PicoCTF, OverTheWire)' },
56
65
  };
57
66
 
58
67
  /**