cipher-security 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cipher.js +10 -0
- package/lib/analyze/consistency.js +566 -0
- package/lib/analyze/constitution.js +110 -0
- package/lib/analyze/sharding.js +251 -0
- package/lib/autonomous/agent-tool.js +165 -0
- package/lib/autonomous/framework.js +17 -0
- package/lib/autonomous/handoff.js +506 -0
- package/lib/autonomous/modes/blue.js +26 -0
- package/lib/autonomous/modes/red.js +28 -0
- package/lib/benchmark/agent.js +88 -26
- package/lib/benchmark/baselines.js +3 -0
- package/lib/benchmark/claude-code-solver.js +254 -0
- package/lib/benchmark/cognitive.js +283 -0
- package/lib/benchmark/index.js +12 -2
- package/lib/benchmark/knowledge.js +281 -0
- package/lib/benchmark/llm.js +156 -15
- package/lib/benchmark/models.js +5 -2
- package/lib/benchmark/nyu-ctf.js +192 -0
- package/lib/benchmark/overthewire.js +347 -0
- package/lib/benchmark/picoctf.js +281 -0
- package/lib/benchmark/prompts.js +280 -0
- package/lib/benchmark/registry.js +219 -0
- package/lib/benchmark/remote-solver.js +356 -0
- package/lib/benchmark/remote-target.js +263 -0
- package/lib/benchmark/reporter.js +35 -0
- package/lib/benchmark/runner.js +174 -10
- package/lib/benchmark/sandbox.js +35 -0
- package/lib/benchmark/scorer.js +22 -4
- package/lib/benchmark/solver.js +34 -1
- package/lib/benchmark/tools.js +262 -16
- package/lib/commands.js +9 -0
- package/lib/execution/council.js +434 -0
- package/lib/execution/parallel.js +292 -0
- package/lib/gates/circuit-breaker.js +135 -0
- package/lib/gates/confidence.js +302 -0
- package/lib/gates/corrections.js +219 -0
- package/lib/gates/self-check.js +245 -0
- package/lib/gateway/commands.js +727 -0
- package/lib/guardrails/engine.js +364 -0
- package/lib/mcp/server.js +349 -3
- package/lib/memory/compressor.js +94 -7
- package/lib/pipeline/hooks.js +288 -0
- package/lib/pipeline/index.js +11 -0
- package/lib/review/budget.js +210 -0
- package/lib/review/engine.js +526 -0
- package/lib/review/layers/acceptance-auditor.js +279 -0
- package/lib/review/layers/blind-hunter.js +500 -0
- package/lib/review/layers/defense-in-depth.js +209 -0
- package/lib/review/layers/edge-case-hunter.js +266 -0
- package/lib/review/panel.js +519 -0
- package/lib/review/two-stage.js +244 -0
- package/lib/session/cost-tracker.js +203 -0
- package/lib/session/logger.js +349 -0
- package/package.json +1 -1
package/lib/benchmark/runner.js
CHANGED
|
@@ -3,6 +3,9 @@
|
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* CIPHER Benchmark — Runner (orchestrates build → start → solve → score → report).
|
|
6
|
+
*
|
|
7
|
+
* Supports parallel execution with concurrency limits and retry/best-of-N
|
|
8
|
+
* for reliability on flaky benchmarks.
|
|
6
9
|
*/
|
|
7
10
|
|
|
8
11
|
import { BenchmarkBuilder } from './builder.js';
|
|
@@ -11,18 +14,18 @@ import { SolverResult } from './models.js';
|
|
|
11
14
|
import { aggregateResults, scoreResult } from './scorer.js';
|
|
12
15
|
import { getSolver } from './solver.js';
|
|
13
16
|
|
|
14
|
-
|
|
17
|
+
// ---------------------------------------------------------------------------
|
|
18
|
+
// Single benchmark execution
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
|
|
21
|
+
export async function runSingleBenchmark(builder, solver, name, flag) {
|
|
15
22
|
const config = builder.getBenchmark(name);
|
|
16
23
|
flag = flag || builder.config.generateFlag(name);
|
|
17
24
|
let targetUrl = '';
|
|
18
25
|
try {
|
|
19
26
|
const result = builder.runBenchmark(name, flag);
|
|
20
27
|
targetUrl = result.targetUrl;
|
|
21
|
-
const solverResult = solver.solve(config, targetUrl, result.expectedFlag);
|
|
22
|
-
// Handle promise (autonomous) or sync (stub)
|
|
23
|
-
if (solverResult && typeof solverResult.then === 'function') {
|
|
24
|
-
return solverResult.then((sr) => scoreResult(config, sr, flag, targetUrl));
|
|
25
|
-
}
|
|
28
|
+
const solverResult = await solver.solve(config, targetUrl, result.expectedFlag);
|
|
26
29
|
return scoreResult(config, solverResult, flag, targetUrl);
|
|
27
30
|
} catch (e) {
|
|
28
31
|
return scoreResult(config, new SolverResult({ error: String(e) }), flag, targetUrl);
|
|
@@ -31,7 +34,85 @@ export function runSingleBenchmark(builder, solver, name, flag) {
|
|
|
31
34
|
}
|
|
32
35
|
}
|
|
33
36
|
|
|
34
|
-
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
// Retry logic — best-of-N
|
|
39
|
+
// ---------------------------------------------------------------------------
|
|
40
|
+
|
|
41
|
+
/**
 * Run a single benchmark up to `opts.retries` times and keep the best result.
 *
 * The same flag is used for every attempt. Attempts stop as soon as one
 * passes; otherwise the result ranked highest by `_isBetterResult` is kept.
 *
 * @param {BenchmarkBuilder} builder
 * @param {object} solver
 * @param {string} name
 * @param {object} [opts]
 * @param {number} [opts.retries=1] - Total attempts (1 = no retry). Values
 *   below 1 or non-numeric values are treated as 1.
 * @param {string} [opts.flag] - Flag to use; generated from the builder config when omitted
 * @param {Function} [opts.onAttempt] - Called with (attemptNum, result) after each attempt
 * @returns {Promise<import('./models.js').BenchmarkResult>} Never null: at least one attempt is always made
 */
export async function runWithRetry(builder, solver, name, opts = {}) {
  // Always make at least one attempt. With retries of 0, a negative number or
  // NaN the loop below would not run and callers would receive null, then
  // crash reading `result.passed`.
  const requested = Number(opts.retries ?? 1);
  const maxAttempts = requested >= 1 ? requested : 1;
  const flag = opts.flag || builder.config.generateFlag(name);
  let bestResult = null;

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const result = await runSingleBenchmark(builder, solver, name, flag);

    if (opts.onAttempt) opts.onAttempt(attempt, result);

    // Keep the best result: passed > failed > error
    if (!bestResult || _isBetterResult(result, bestResult)) {
      bestResult = result;
    }

    // A pass cannot be improved on, so stop retrying.
    if (result.passed) break;
  }

  return bestResult;
}
|
|
74
|
+
|
|
75
|
+
/**
 * Compare two results — passed beats failed, failed beats error.
 *
 * Ranking, in order:
 *   1. a passing result beats a non-passing one;
 *   2. with equal pass status, a result without a solver error beats one with an error;
 *   3. otherwise the result with more tool calls wins (more progress).
 *
 * @param {import('./models.js').BenchmarkResult} a
 * @param {import('./models.js').BenchmarkResult} b
 * @returns {boolean} True if a is strictly better than b (ties return false)
 */
function _isBetterResult(a, b) {
  const aPassed = Boolean(a.passed);
  const bPassed = Boolean(b.passed);
  if (aPassed !== bPassed) return aPassed;

  // Same pass/fail status — prefer the result without a solver error.
  const aErrored = Boolean(a.solverResult?.error);
  const bErrored = Boolean(b.solverResult?.error);
  if (aErrored !== bErrored) return bErrored;

  // Same error status — prefer more tool calls. A missing count is treated as
  // 0; comparing against `undefined` is always false, which would stop a
  // result with real progress from ever replacing one with no count at all.
  // NOTE(review): assumes toolCalls is a numeric count — confirm against SolverResult.
  const aCalls = a.solverResult?.toolCalls ?? 0;
  const bCalls = b.solverResult?.toolCalls ?? 0;
  return aCalls > bCalls;
}
|
|
90
|
+
|
|
91
|
+
// ---------------------------------------------------------------------------
|
|
92
|
+
// Parallel execution with concurrency limit
|
|
93
|
+
// ---------------------------------------------------------------------------
|
|
94
|
+
|
|
95
|
+
/**
|
|
96
|
+
* Run benchmarks with parallel execution and optional retries.
|
|
97
|
+
*
|
|
98
|
+
* @param {object} opts
|
|
99
|
+
* @param {BenchmarkBuilder} opts.builder
|
|
100
|
+
* @param {object} opts.solver
|
|
101
|
+
* @param {string[]} [opts.benchmarkNames]
|
|
102
|
+
* @param {boolean} [opts.runAll=false]
|
|
103
|
+
* @param {number} [opts.levelFilter]
|
|
104
|
+
* @param {string} [opts.tagFilter]
|
|
105
|
+
* @param {number} [opts.concurrency=1] - Max parallel benchmarks
|
|
106
|
+
* @param {number} [opts.retries=1] - Attempts per benchmark
|
|
107
|
+
* @param {Function} [opts.onResult] - Called with (name, result) after each benchmark
|
|
108
|
+
* @param {Function} [opts.onTrace] - Called with trace event objects for observability
|
|
109
|
+
* @returns {Promise<import('./models.js').RunReport>}
|
|
110
|
+
*/
|
|
111
|
+
export async function runBenchmarks({
|
|
112
|
+
builder, solver, benchmarkNames, runAll = false,
|
|
113
|
+
levelFilter, tagFilter, concurrency = 1, retries = 1,
|
|
114
|
+
onResult, onTrace,
|
|
115
|
+
}) {
|
|
35
116
|
let allBenchmarks;
|
|
36
117
|
try {
|
|
37
118
|
allBenchmarks = builder.listBenchmarks();
|
|
@@ -61,14 +142,94 @@ export async function runBenchmarks({ builder, solver, benchmarkNames, runAll =
|
|
|
61
142
|
}
|
|
62
143
|
|
|
63
144
|
const results = [];
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
145
|
+
|
|
146
|
+
if (concurrency <= 1) {
|
|
147
|
+
// Sequential execution (original behavior)
|
|
148
|
+
for (const name of targets) {
|
|
149
|
+
const startTime = performance.now();
|
|
150
|
+
const result = await runWithRetry(builder, solver, name, { retries });
|
|
151
|
+
const durationMs = performance.now() - startTime;
|
|
152
|
+
|
|
153
|
+
results.push(result);
|
|
154
|
+
if (onResult) onResult(name, result);
|
|
155
|
+
if (onTrace) onTrace({
|
|
156
|
+
type: 'benchmark_complete',
|
|
157
|
+
benchmark: name,
|
|
158
|
+
passed: result.passed,
|
|
159
|
+
durationMs: Math.round(durationMs),
|
|
160
|
+
attempts: retries,
|
|
161
|
+
error: result.solverResult.error,
|
|
162
|
+
});
|
|
163
|
+
}
|
|
164
|
+
} else {
|
|
165
|
+
// Parallel execution with concurrency limit
|
|
166
|
+
const inFlight = new Set();
|
|
167
|
+
const queue = [...targets];
|
|
168
|
+
|
|
169
|
+
while (queue.length > 0 || inFlight.size > 0) {
|
|
170
|
+
// Fill up to concurrency limit
|
|
171
|
+
while (queue.length > 0 && inFlight.size < concurrency) {
|
|
172
|
+
const name = queue.shift();
|
|
173
|
+
const startTime = performance.now();
|
|
174
|
+
const promise = runWithRetry(builder, solver, name, { retries })
|
|
175
|
+
.then((result) => {
|
|
176
|
+
const durationMs = performance.now() - startTime;
|
|
177
|
+
results.push(result);
|
|
178
|
+
inFlight.delete(promise);
|
|
179
|
+
if (onResult) onResult(name, result);
|
|
180
|
+
if (onTrace) onTrace({
|
|
181
|
+
type: 'benchmark_complete',
|
|
182
|
+
benchmark: name,
|
|
183
|
+
passed: result.passed,
|
|
184
|
+
durationMs: Math.round(durationMs),
|
|
185
|
+
attempts: retries,
|
|
186
|
+
error: result.solverResult.error,
|
|
187
|
+
});
|
|
188
|
+
})
|
|
189
|
+
.catch((err) => {
|
|
190
|
+
results.push({
|
|
191
|
+
config: builder.getBenchmark(name),
|
|
192
|
+
solverResult: new SolverResult({ error: String(err) }),
|
|
193
|
+
passed: false,
|
|
194
|
+
expectedFlag: '',
|
|
195
|
+
actualFlag: '',
|
|
196
|
+
});
|
|
197
|
+
inFlight.delete(promise);
|
|
198
|
+
if (onTrace) onTrace({
|
|
199
|
+
type: 'benchmark_error',
|
|
200
|
+
benchmark: name,
|
|
201
|
+
error: String(err),
|
|
202
|
+
});
|
|
203
|
+
});
|
|
204
|
+
inFlight.add(promise);
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
// Wait for at least one to complete
|
|
208
|
+
if (inFlight.size > 0) {
|
|
209
|
+
await Promise.race(inFlight);
|
|
210
|
+
}
|
|
211
|
+
}
|
|
67
212
|
}
|
|
213
|
+
|
|
68
214
|
return aggregateResults(results);
|
|
69
215
|
}
|
|
70
216
|
|
|
217
|
+
// ---------------------------------------------------------------------------
|
|
218
|
+
// Report serialization
|
|
219
|
+
// ---------------------------------------------------------------------------
|
|
220
|
+
|
|
71
221
|
export function reportToDict(report) {
|
|
222
|
+
const byTag = report.resultsByTag();
|
|
223
|
+
const tagRates = {};
|
|
224
|
+
for (const [tag, tagResults] of Object.entries(byTag)) {
|
|
225
|
+
const p = tagResults.filter(r => r.passed).length;
|
|
226
|
+
tagRates[tag] = {
|
|
227
|
+
total: tagResults.length,
|
|
228
|
+
passed: p,
|
|
229
|
+
rate: tagResults.length ? Math.round((p / tagResults.length) * 1000) / 10 : 0,
|
|
230
|
+
};
|
|
231
|
+
}
|
|
232
|
+
|
|
72
233
|
return {
|
|
73
234
|
summary: {
|
|
74
235
|
total: report.total,
|
|
@@ -81,6 +242,7 @@ export function reportToDict(report) {
|
|
|
81
242
|
pass_rate_by_level: Object.fromEntries(
|
|
82
243
|
Object.entries(report.passRateByLevel()).map(([k, v]) => [k, Math.round(v * 10) / 10]),
|
|
83
244
|
),
|
|
245
|
+
pass_rate_by_tag: tagRates,
|
|
84
246
|
},
|
|
85
247
|
results: report.results.map((r) => ({
|
|
86
248
|
benchmark: r.config.dirName,
|
|
@@ -101,3 +263,5 @@ export function reportToDict(report) {
|
|
|
101
263
|
})),
|
|
102
264
|
};
|
|
103
265
|
}
|
|
266
|
+
|
|
267
|
+
export { _isBetterResult };
|
package/lib/benchmark/sandbox.js
CHANGED
|
@@ -77,6 +77,41 @@ export class SandboxContainer {
|
|
|
77
77
|
this._connectedNetworks.push(networkName);
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
+
/**
|
|
81
|
+
* Copy a file from the host into the sandbox container.
|
|
82
|
+
* @param {string} hostPath - Absolute path on the host
|
|
83
|
+
* @param {string} containerPath - Path inside the container (default: /tmp/<filename>)
|
|
84
|
+
*/
|
|
85
|
+
copyFileIn(hostPath, containerPath) {
|
|
86
|
+
if (!this._containerId) throw new SandboxError('Container not created.');
|
|
87
|
+
const result = spawnSync('docker', ['cp', hostPath, `${this._containerId}:${containerPath}`], {
|
|
88
|
+
timeout: 30000,
|
|
89
|
+
stdio: 'pipe',
|
|
90
|
+
});
|
|
91
|
+
if (result.status !== 0) {
|
|
92
|
+
throw new SandboxError(`Failed to copy ${hostPath} to container: ${(result.stderr || '').toString().slice(0, 200)}`);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
|
|
96
|
+
/**
|
|
97
|
+
* Parse a docker-compose.yml and return all service URLs.
|
|
98
|
+
* @param {object} services - Parsed docker-compose services object
|
|
99
|
+
* @param {string} networkName - Docker network the sandbox is connected to
|
|
100
|
+
* @returns {Array<{ name: string, url: string, port: number }>}
|
|
101
|
+
*/
|
|
102
|
+
static resolveServiceUrls(services, networkName) {
|
|
103
|
+
const urls = [];
|
|
104
|
+
for (const [name, svc] of Object.entries(services)) {
|
|
105
|
+
let port = 80;
|
|
106
|
+
if (svc.ports?.length) {
|
|
107
|
+
port = parseInt(String(svc.ports[0]).split(':').pop(), 10);
|
|
108
|
+
}
|
|
109
|
+
const protocol = [443, 8443].includes(port) ? 'https' : 'http';
|
|
110
|
+
urls.push({ name, url: `${protocol}://${name}:${port}`, port });
|
|
111
|
+
}
|
|
112
|
+
return urls;
|
|
113
|
+
}
|
|
114
|
+
|
|
80
115
|
getInternalUrl(serviceName, port = 80) {
|
|
81
116
|
const protocol = [443, 8443].includes(port) ? 'https' : 'http';
|
|
82
117
|
return `${protocol}://${serviceName}:${port}`;
|
package/lib/benchmark/scorer.js
CHANGED
|
@@ -13,10 +13,28 @@ export function scoreFlag(expected, actual) {
|
|
|
13
13
|
}
|
|
14
14
|
|
|
15
15
|
/**
 * Score a solver result against the expected flag or answer.
 *
 * For `config.winCondition === 'question'` the solver's answer is checked;
 * for every other win condition the captured flag is checked. A result only
 * passes when the solver both reported a find and supplied a value that
 * `scoreFlag` accepts.
 *
 * @param {object} config - Benchmark config (reads `winCondition`)
 * @param {object} solverResult - Solver output (reads flagFound/flagValue/answerFound/answerValue)
 * @param {string} expectedFlag - Expected flag or answer
 * @param {string} [targetUrl=''] - URL the benchmark was served on
 * @returns {BenchmarkResult}
 */
export function scoreResult(config, solverResult, expectedFlag, targetUrl = '') {
  const isQuestion = config.winCondition === 'question';
  const found = isQuestion ? solverResult.answerFound : solverResult.flagFound;
  const submitted = isQuestion ? solverResult.answerValue : solverResult.flagValue;
  const otherValue = isQuestion ? solverResult.flagValue : solverResult.answerValue;

  let passed = false;
  let actualValue = '';
  if (found && submitted) {
    actualValue = submitted;
    passed = scoreFlag(expectedFlag, submitted);
  }

  return new BenchmarkResult({
    config, solverResult, passed, expectedFlag,
    // For reporting, fall back to whatever the solver produced, preferring the
    // value that matches the win condition (the answer for question-type
    // challenges, the flag otherwise).
    actualFlag: actualValue || submitted || otherValue || '',
    targetUrl,
  });
}
|
|
21
39
|
|
|
22
40
|
export function aggregateResults(results) {
|
package/lib/benchmark/solver.js
CHANGED
|
@@ -159,8 +159,41 @@ export class MultiAgentSolver extends SolverAdapter {
|
|
|
159
159
|
|
|
160
160
|
export const SOLVERS = { stub: StubSolver, manual: ManualSolver, autonomous: AutonomousSolver, 'autonomous-multi': MultiAgentSolver };
|
|
161
161
|
|
|
162
|
+
// Lazy-load RemoteSolver to avoid circular imports
|
|
162
163
|
/**
 * Create a solver instance by name.
 *
 * 'remote' and 'claude-code' return lazy-loading proxies; every other name
 * must be a key of SOLVERS.
 *
 * @param {string} name - Solver name
 * @param {object} [opts] - Options passed to the solver constructor
 * @returns {object} Solver instance
 * @throws {Error} If the name is not a known solver
 */
export function getSolver(name, opts = {}) {
  if (name === 'remote') {
    return new RemoteSolverProxy(opts);
  }
  if (name === 'claude-code') {
    return new ClaudeCodeSolverProxy(opts);
  }
  // Only accept SOLVERS' own keys. A plain lookup also finds inherited
  // properties, so a name such as "constructor" would resolve to Object and
  // silently return `opts` instead of reporting an unknown solver.
  const Cls = Object.prototype.hasOwnProperty.call(SOLVERS, name) ? SOLVERS[name] : undefined;
  if (!Cls) throw new Error(`Unknown solver: ${name}. Available: ${[...Object.keys(SOLVERS), 'remote', 'claude-code'].join(', ')}`);
  return new Cls(opts);
}
|
|
174
|
+
|
|
175
|
+
/**
 * Proxy that lazy-loads RemoteSolver on first solve().
 *
 * The module is imported on demand and a single RemoteSolver instance is
 * created and reused for all later calls.
 */
class RemoteSolverProxy extends SolverAdapter {
  constructor(opts) {
    super();
    this._opts = opts;
    this._impl = null;
    this._loading = null;
  }

  get name() { return 'remote'; }

  /**
   * Resolve the shared RemoteSolver instance, importing the module if needed.
   * Concurrent callers share one in-flight load, so only one instance is
   * ever constructed even when solve() is called in parallel.
   * @returns {Promise<object>}
   */
  _load() {
    if (this._impl) return Promise.resolve(this._impl);
    if (!this._loading) {
      this._loading = import('./remote-solver.js')
        .then(({ RemoteSolver }) => {
          this._impl = new RemoteSolver(this._opts);
          return this._impl;
        })
        // Clear the marker either way: on success _impl takes over, on failure
        // the next solve() retries the import.
        .finally(() => { this._loading = null; });
    }
    return this._loading;
  }

  async solve(config, targetUrl, expectedFlag) {
    const impl = await this._load();
    return impl.solve(config, targetUrl, expectedFlag);
  }
}
|
|
187
|
+
|
|
188
|
+
/**
 * Proxy that lazy-loads ClaudeCodeSolver on first solve().
 *
 * The module is imported on demand and a single ClaudeCodeSolver instance is
 * created and reused for all later calls.
 */
class ClaudeCodeSolverProxy extends SolverAdapter {
  constructor(opts) {
    super();
    this._opts = opts;
    this._impl = null;
    this._loading = null;
  }

  get name() { return 'claude-code'; }

  /**
   * Resolve the shared ClaudeCodeSolver instance, importing the module if
   * needed. Concurrent callers share one in-flight load, so only one instance
   * is ever constructed even when solve() is called in parallel.
   * @returns {Promise<object>}
   */
  _load() {
    if (this._impl) return Promise.resolve(this._impl);
    if (!this._loading) {
      this._loading = import('./claude-code-solver.js')
        .then(({ ClaudeCodeSolver }) => {
          this._impl = new ClaudeCodeSolver(this._opts);
          return this._impl;
        })
        // Clear the marker either way: on success _impl takes over, on failure
        // the next solve() retries the import.
        .finally(() => { this._loading = null; });
    }
    return this._loading;
  }

  async solve(config, targetUrl, expectedFlag) {
    const impl = await this._load();
    return impl.solve(config, targetUrl, expectedFlag);
  }
}
|
package/lib/benchmark/tools.js
CHANGED
|
@@ -3,10 +3,23 @@
|
|
|
3
3
|
|
|
4
4
|
/**
|
|
5
5
|
* CIPHER Benchmark — Tool definitions for the security agent.
|
|
6
|
+
*
|
|
7
|
+
* Core tools for autonomous CTF solving (the disassemble/decompile and cognitive tools are defined further below):
|
|
8
|
+
* - sandbox_exec: execute shell commands
|
|
9
|
+
* - submit_flag: submit a captured flag (flag win condition)
|
|
10
|
+
* - submit_answer: submit an answer (question win condition)
|
|
11
|
+
* - read_file: read a file from the sandbox filesystem
|
|
12
|
+
* - write_file: write a file to the sandbox filesystem
|
|
13
|
+
* - list_directory: list directory contents in the sandbox
|
|
14
|
+
* - http_request: make structured HTTP requests with headers/body
|
|
6
15
|
*/
|
|
7
16
|
|
|
8
17
|
const MAX_OUTPUT_CHARS = 4000;
|
|
9
18
|
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Tool schemas
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
|
|
10
23
|
export const SANDBOX_EXEC_SCHEMA = {
|
|
11
24
|
name: 'sandbox_exec',
|
|
12
25
|
description: 'Execute a shell command in the security sandbox. Returns stdout, stderr, and exit code.',
|
|
@@ -32,31 +45,264 @@ export const SUBMIT_FLAG_SCHEMA = {
|
|
|
32
45
|
},
|
|
33
46
|
};
|
|
34
47
|
|
|
35
|
-
export const
|
|
48
|
+
export const SUBMIT_ANSWER_SCHEMA = {
|
|
49
|
+
name: 'submit_answer',
|
|
50
|
+
description: 'Submit an answer to a question-type challenge. Use this instead of submit_flag when the challenge asks a question rather than requiring flag capture.',
|
|
51
|
+
input_schema: {
|
|
52
|
+
type: 'object',
|
|
53
|
+
properties: {
|
|
54
|
+
answer: { type: 'string', description: 'The answer to the challenge question' },
|
|
55
|
+
},
|
|
56
|
+
required: ['answer'],
|
|
57
|
+
},
|
|
58
|
+
};
|
|
59
|
+
|
|
60
|
+
export const READ_FILE_SCHEMA = {
|
|
61
|
+
name: 'read_file',
|
|
62
|
+
description: 'Read a file from the sandbox filesystem. Returns the file contents (truncated if too large).',
|
|
63
|
+
input_schema: {
|
|
64
|
+
type: 'object',
|
|
65
|
+
properties: {
|
|
66
|
+
path: { type: 'string', description: 'Absolute path to the file in the sandbox' },
|
|
67
|
+
encoding: { type: 'string', description: 'File encoding (default: utf-8). Use "base64" for binary files.', default: 'utf-8' },
|
|
68
|
+
},
|
|
69
|
+
required: ['path'],
|
|
70
|
+
},
|
|
71
|
+
};
|
|
72
|
+
|
|
73
|
+
export const WRITE_FILE_SCHEMA = {
|
|
74
|
+
name: 'write_file',
|
|
75
|
+
description: 'Write content to a file in the sandbox filesystem. Creates parent directories if needed.',
|
|
76
|
+
input_schema: {
|
|
77
|
+
type: 'object',
|
|
78
|
+
properties: {
|
|
79
|
+
path: { type: 'string', description: 'Absolute path for the file in the sandbox' },
|
|
80
|
+
content: { type: 'string', description: 'Content to write to the file' },
|
|
81
|
+
mode: { type: 'string', description: 'File permissions (default: 644)', default: '644' },
|
|
82
|
+
},
|
|
83
|
+
required: ['path', 'content'],
|
|
84
|
+
},
|
|
85
|
+
};
|
|
86
|
+
|
|
87
|
+
export const LIST_DIR_SCHEMA = {
|
|
88
|
+
name: 'list_directory',
|
|
89
|
+
description: 'List files and directories at a path in the sandbox. Returns a detailed listing with permissions and sizes.',
|
|
90
|
+
input_schema: {
|
|
91
|
+
type: 'object',
|
|
92
|
+
properties: {
|
|
93
|
+
path: { type: 'string', description: 'Directory path to list (default: /)', default: '/' },
|
|
94
|
+
},
|
|
95
|
+
},
|
|
96
|
+
};
|
|
97
|
+
|
|
98
|
+
export const HTTP_REQUEST_SCHEMA = {
|
|
99
|
+
name: 'http_request',
|
|
100
|
+
description: 'Make an HTTP request from within the sandbox. Returns status code, headers, and body.',
|
|
101
|
+
input_schema: {
|
|
102
|
+
type: 'object',
|
|
103
|
+
properties: {
|
|
104
|
+
url: { type: 'string', description: 'Target URL' },
|
|
105
|
+
method: { type: 'string', description: 'HTTP method (default: GET)', default: 'GET' },
|
|
106
|
+
headers: { type: 'object', description: 'Request headers as key-value pairs' },
|
|
107
|
+
body: { type: 'string', description: 'Request body (for POST/PUT)' },
|
|
108
|
+
follow_redirects: { type: 'boolean', description: 'Follow redirects (default: true)', default: true },
|
|
109
|
+
},
|
|
110
|
+
required: ['url'],
|
|
111
|
+
},
|
|
112
|
+
};
|
|
113
|
+
|
|
114
|
+
// ---------------------------------------------------------------------------
|
|
115
|
+
// Binary analysis tools (Ghidra)
|
|
116
|
+
// ---------------------------------------------------------------------------
|
|
117
|
+
|
|
118
|
+
export const DISASSEMBLE_SCHEMA = {
|
|
119
|
+
name: 'disassemble',
|
|
120
|
+
description: 'Disassemble a binary file at a specific function or address. Uses objdump/radare2 in the sandbox. For detailed analysis of binary challenges.',
|
|
121
|
+
input_schema: {
|
|
122
|
+
type: 'object',
|
|
123
|
+
properties: {
|
|
124
|
+
path: { type: 'string', description: 'Path to the binary in the sandbox' },
|
|
125
|
+
function_name: { type: 'string', description: 'Function name to disassemble (e.g. "main")' },
|
|
126
|
+
address: { type: 'string', description: 'Address to disassemble from (e.g. "0x401000")' },
|
|
127
|
+
},
|
|
128
|
+
required: ['path'],
|
|
129
|
+
},
|
|
130
|
+
};
|
|
131
|
+
|
|
132
|
+
export const DECOMPILE_SCHEMA = {
|
|
133
|
+
name: 'decompile',
|
|
134
|
+
description: 'Decompile a binary to pseudo-C code. Uses Ghidra headless analysis if available, or falls back to objdump + strings analysis.',
|
|
135
|
+
input_schema: {
|
|
136
|
+
type: 'object',
|
|
137
|
+
properties: {
|
|
138
|
+
path: { type: 'string', description: 'Path to the binary in the sandbox' },
|
|
139
|
+
function_name: { type: 'string', description: 'Function to decompile (default: all)' },
|
|
140
|
+
},
|
|
141
|
+
required: ['path'],
|
|
142
|
+
},
|
|
143
|
+
};
|
|
144
|
+
|
|
145
|
+
// ---------------------------------------------------------------------------
|
|
146
|
+
// Tool collections
|
|
147
|
+
// ---------------------------------------------------------------------------
|
|
148
|
+
|
|
149
|
+
import { COGNITIVE_TOOLS } from './cognitive.js';
|
|
150
|
+
|
|
151
|
+
/** Tools for flag-based challenges (includes cognitive + binary tools). */
|
|
152
|
+
export const FLAG_TOOLS = [
|
|
153
|
+
SANDBOX_EXEC_SCHEMA, SUBMIT_FLAG_SCHEMA,
|
|
154
|
+
READ_FILE_SCHEMA, WRITE_FILE_SCHEMA, LIST_DIR_SCHEMA, HTTP_REQUEST_SCHEMA,
|
|
155
|
+
DISASSEMBLE_SCHEMA, DECOMPILE_SCHEMA,
|
|
156
|
+
...COGNITIVE_TOOLS,
|
|
157
|
+
];
|
|
158
|
+
|
|
159
|
+
/** Tools for question-based challenges (includes cognitive tools). */
|
|
160
|
+
export const QUESTION_TOOLS = [
|
|
161
|
+
SANDBOX_EXEC_SCHEMA, SUBMIT_ANSWER_SCHEMA,
|
|
162
|
+
READ_FILE_SCHEMA, WRITE_FILE_SCHEMA, LIST_DIR_SCHEMA, HTTP_REQUEST_SCHEMA,
|
|
163
|
+
...COGNITIVE_TOOLS,
|
|
164
|
+
];
|
|
165
|
+
|
|
166
|
+
/** All tools (backward compat — includes both submit types + cognitive + binary). */
|
|
167
|
+
export const AGENT_TOOLS = [
|
|
168
|
+
SANDBOX_EXEC_SCHEMA, SUBMIT_FLAG_SCHEMA, SUBMIT_ANSWER_SCHEMA,
|
|
169
|
+
READ_FILE_SCHEMA, WRITE_FILE_SCHEMA, LIST_DIR_SCHEMA, HTTP_REQUEST_SCHEMA,
|
|
170
|
+
DISASSEMBLE_SCHEMA, DECOMPILE_SCHEMA,
|
|
171
|
+
...COGNITIVE_TOOLS,
|
|
172
|
+
];
|
|
173
|
+
|
|
174
|
+
/**
 * Select the tool set that matches a challenge's win condition.
 *
 * @param {'flag' | 'question'} winCondition
 * @returns {object[]} QUESTION_TOOLS for 'question', FLAG_TOOLS for anything else
 */
export function getToolsForWinCondition(winCondition) {
  if (winCondition === 'question') {
    return QUESTION_TOOLS;
  }
  return FLAG_TOOLS;
}
|
|
182
|
+
|
|
183
|
+
// ---------------------------------------------------------------------------
|
|
184
|
+
// Tool dispatch
|
|
185
|
+
// ---------------------------------------------------------------------------
|
|
36
186
|
|
|
37
187
|
/**
 * Quote a value so a POSIX shell reads it as exactly one literal word.
 * Embedded single quotes are closed, escaped and reopened ('\'').
 * @param {*} value - Converted with String() before quoting
 * @returns {string}
 */
function _shellQuote(value) {
  return `'${String(value).replace(/'/g, "'\\''")}'`;
}

/**
 * Dispatch a tool call within the sandbox.
 *
 * Every tool except submit_flag / submit_answer is translated into a shell
 * command and run through `sandbox.execTool`. All caller-supplied values are
 * shell-quoted before being placed in a command line. Output is capped at
 * MAX_OUTPUT_CHARS characters (error text at 500).
 *
 * @param {string} toolName
 * @param {object} toolInput
 * @param {import('./sandbox.js').SandboxContainer} sandbox
 * @returns {{ output: string, flagSubmitted?: string, answerSubmitted?: string }}
 */
export function dispatchTool(toolName, toolInput, sandbox) {
  const input = toolInput ?? {};
  // Tolerate missing stdout/stderr on a result instead of throwing on .slice().
  const clip = (text, limit = MAX_OUTPUT_CHARS) => String(text ?? '').slice(0, limit);

  switch (toolName) {
    case 'sandbox_exec': {
      const { command, timeout = 60 } = input;
      const result = sandbox.execTool(command, timeout);
      const stdout = String(result.stdout ?? '');
      const stderr = String(result.stderr ?? '');
      let output = `Exit code: ${result.exitCode}\n`;
      if (stdout) output += `STDOUT:\n${clip(stdout)}\n`;
      if (stderr) output += `STDERR:\n${clip(stderr)}\n`;
      if (stdout.length > MAX_OUTPUT_CHARS || stderr.length > MAX_OUTPUT_CHARS) {
        output += '[output truncated]\n';
      }
      return { output };
    }

    case 'submit_flag':
      return { output: `Flag submitted: ${input.flag}`, flagSubmitted: input.flag };

    case 'submit_answer':
      return { output: `Answer submitted: ${input.answer}`, answerSubmitted: input.answer };

    case 'read_file': {
      const { path, encoding = 'utf-8' } = input;
      const reader = encoding === 'base64' ? 'base64 <' : 'cat';
      const result = sandbox.execTool(`${reader} ${_shellQuote(path)}`);
      if (result.exitCode !== 0) {
        return { output: `Error reading ${path}: ${clip(result.stderr, 500)}` };
      }
      const stdout = String(result.stdout ?? '');
      const truncated = stdout.length > MAX_OUTPUT_CHARS ? '\n[truncated]' : '';
      return { output: `File: ${path}\n${clip(stdout)}${truncated}` };
    }

    case 'write_file': {
      const { path, content, mode = '644' } = input;
      const target = String(path);
      const text = String(content ?? '');
      // Parent directory: "/" for root-level files and "." for bare file
      // names. Stripping the last path segment with a regex yields '' for
      // "/flag.txt" (mkdir fails) and leaves "notes.txt" unchanged (mkdir
      // would create a directory with the file's own name).
      const lastSlash = target.lastIndexOf('/');
      let parentDir = '.';
      if (lastSlash === 0) parentDir = '/';
      else if (lastSlash > 0) parentDir = target.slice(0, lastSlash);

      const quotedTarget = _shellQuote(target);
      const cmd = `mkdir -p ${_shellQuote(parentDir)}`
        + ` && printf '%s' ${_shellQuote(text)} > ${quotedTarget}`
        + ` && chmod ${_shellQuote(mode)} ${quotedTarget}`;
      const result = sandbox.execTool(cmd);
      if (result.exitCode !== 0) {
        return { output: `Error writing ${path}: ${clip(result.stderr, 500)}` };
      }
      // Report real bytes; string length counts UTF-16 code units.
      return { output: `Written ${Buffer.byteLength(text, 'utf8')} bytes to ${path} (mode ${mode})` };
    }

    case 'list_directory': {
      const dir = input.path || '/';
      const result = sandbox.execTool(`ls -la ${_shellQuote(dir)}`);
      if (result.exitCode !== 0) {
        return { output: `Error listing ${dir}: ${clip(result.stderr, 500)}` };
      }
      return { output: clip(result.stdout) };
    }

    case 'http_request': {
      const { url, method = 'GET', headers, body, follow_redirects = true } = input;
      const curlParts = ['curl', '-s', '-S', '-i'];
      if (follow_redirects) curlParts.push('-L');
      else curlParts.push('--max-redirs', '0');
      curlParts.push('-X', _shellQuote(String(method).toUpperCase()));
      for (const [key, value] of Object.entries(headers ?? {})) {
        // Quote the whole "Name: value" pair, not just the value.
        curlParts.push('-H', _shellQuote(`${key}: ${value}`));
      }
      if (body) {
        curlParts.push('-d', _shellQuote(typeof body === 'string' ? body : JSON.stringify(body)));
      }
      curlParts.push(_shellQuote(url));
      const result = sandbox.execTool(curlParts.join(' '));
      if (result.exitCode !== 0) {
        return { output: `HTTP error: ${clip(result.stderr, 500)}` };
      }
      return { output: clip(result.stdout) };
    }

    case 'disassemble': {
      const { path, function_name, address } = input;
      const binary = _shellQuote(path);
      let cmd;
      if (function_name) {
        // Print from the "<name>:" label line through the next blank line.
        // The symbol reaches awk through the environment and is matched
        // literally with index(), so shell or regex metacharacters in the
        // name cannot alter the command or the match.
        const awkProgram = 'index($0, ENVIRON["CIPHER_SYM"]) { on = 1 } on { print } on && /^$/ { on = 0 }';
        cmd = `objdump -d ${binary} | CIPHER_SYM=${_shellQuote(`<${function_name}>:`)} awk ${_shellQuote(awkProgram)}`;
      } else if (address) {
        cmd = `objdump -d --start-address=${_shellQuote(address)} ${binary} | head -100`;
      } else {
        cmd = `objdump -d ${binary} | head -200`;
      }
      // NOTE: a pipeline's exit status is that of its last stage (awk/head),
      // so an objdump failure usually shows up as empty output, not exitCode.
      const result = sandbox.execTool(cmd);
      if (result.exitCode !== 0) {
        // Fallback: try file + strings
        const fallback = sandbox.execTool(`file ${binary} && strings ${binary} | head -100`);
        return { output: `Disassembly failed, fallback:\n${clip(fallback.stdout)}` };
      }
      return { output: clip(result.stdout) };
    }

    case 'decompile': {
      const { path, function_name } = input;
      const binary = _shellQuote(path);
      // Try Ghidra headless if available, otherwise fall back to static analysis.
      const ghidraCheck = sandbox.execTool('which analyzeHeadless 2>/dev/null');
      if (ghidraCheck.exitCode === 0) {
        // NOTE(review): analyzeHeadless documents -process as taking a project
        // file name — confirm this is how DecompileAll.java expects to receive
        // the function filter.
        const funcArg = function_name ? `-process ${_shellQuote(function_name)}` : '';
        const cmd = `analyzeHeadless /tmp/ghidra_project proj -import ${binary} -postScript DecompileAll.java ${funcArg} 2>&1 | tail -200`;
        const result = sandbox.execTool(cmd, 120);
        return { output: clip(result.stdout) };
      }
      // Fallback: file + section headers + strings. printf is used for the
      // headings because `echo "\n..."` prints a literal backslash-n in bash.
      const analysis = sandbox.execTool(
        `printf '=== File Info ===\\n' && file ${binary}`
        + ` && printf '\\n=== Sections ===\\n' && objdump -h ${binary} 2>/dev/null | head -30`
        + ` && printf '\\n=== Strings ===\\n' && strings ${binary} | head -50`,
      );
      return { output: `Ghidra not available. Static analysis:\n${clip(analysis.stdout)}` };
    }

    default:
      return { output: `Unknown tool: ${toolName}` };
  }
}
|
package/lib/commands.js
CHANGED
|
@@ -49,10 +49,19 @@ export const COMMAND_MODES = {
|
|
|
49
49
|
// ── Passthrough commands (direct Python spawn, full terminal) ──────────
|
|
50
50
|
// These were Python-dependent — now ported to Node.js.
|
|
51
51
|
scan: { mode: 'native', description: 'Run a security scan' },
|
|
52
|
+
review: { mode: 'native', description: 'Multi-layer code review (3 parallel analyzers)' },
|
|
53
|
+
panel: { mode: 'native', description: 'Expert panel security assessment (3 personas)' },
|
|
54
|
+
shard: { mode: 'native', description: 'Split large docs into semantic chunks' },
|
|
55
|
+
guardrail: { mode: 'native', description: 'Test input/output guardrails' },
|
|
56
|
+
analyze: { mode: 'native', description: 'Cross-artifact consistency analysis' },
|
|
52
57
|
bot: { mode: 'native', description: 'Manage bot integrations (long-running service)' },
|
|
53
58
|
mcp: { mode: 'native', description: 'MCP server tools (long-running service)' },
|
|
54
59
|
api: { mode: 'native', description: 'API management (long-running server)' },
|
|
55
60
|
'setup-signal': { mode: 'native', description: 'Configure Signal integration' },
|
|
61
|
+
chain: { mode: 'native', description: 'Run multi-mode agent chain (e.g. red,purple,blue)' },
|
|
62
|
+
council: { mode: 'native', description: 'Multi-model consensus evaluation' },
|
|
63
|
+
resume: { mode: 'native', description: 'Resume interrupted autonomous sessions' },
|
|
64
|
+
benchmark: { mode: 'native', description: 'Run benchmark suites (XBOW, NYU CTF, PicoCTF, OverTheWire)' },
|
|
56
65
|
};
|
|
57
66
|
|
|
58
67
|
/**
|