cipher-security 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cipher.js +465 -0
- package/lib/api/billing.js +321 -0
- package/lib/api/compliance.js +693 -0
- package/lib/api/controls.js +1401 -0
- package/lib/api/index.js +49 -0
- package/lib/api/marketplace.js +467 -0
- package/lib/api/openai-proxy.js +383 -0
- package/lib/api/server.js +685 -0
- package/lib/autonomous/feedback-loop.js +554 -0
- package/lib/autonomous/framework.js +512 -0
- package/lib/autonomous/index.js +97 -0
- package/lib/autonomous/leaderboard.js +594 -0
- package/lib/autonomous/modes/architect.js +412 -0
- package/lib/autonomous/modes/blue.js +386 -0
- package/lib/autonomous/modes/incident.js +684 -0
- package/lib/autonomous/modes/privacy.js +369 -0
- package/lib/autonomous/modes/purple.js +294 -0
- package/lib/autonomous/modes/recon.js +250 -0
- package/lib/autonomous/parallel.js +587 -0
- package/lib/autonomous/researcher.js +583 -0
- package/lib/autonomous/runner.js +955 -0
- package/lib/autonomous/scheduler.js +615 -0
- package/lib/autonomous/task-parser.js +127 -0
- package/lib/autonomous/validators/forensic.js +266 -0
- package/lib/autonomous/validators/osint.js +216 -0
- package/lib/autonomous/validators/privacy.js +296 -0
- package/lib/autonomous/validators/purple.js +298 -0
- package/lib/autonomous/validators/sigma.js +248 -0
- package/lib/autonomous/validators/threat-model.js +363 -0
- package/lib/benchmark/agent.js +119 -0
- package/lib/benchmark/baselines.js +43 -0
- package/lib/benchmark/builder.js +143 -0
- package/lib/benchmark/config.js +35 -0
- package/lib/benchmark/coordinator.js +91 -0
- package/lib/benchmark/index.js +20 -0
- package/lib/benchmark/llm.js +58 -0
- package/lib/benchmark/models.js +137 -0
- package/lib/benchmark/reporter.js +103 -0
- package/lib/benchmark/runner.js +103 -0
- package/lib/benchmark/sandbox.js +96 -0
- package/lib/benchmark/scorer.js +32 -0
- package/lib/benchmark/solver.js +166 -0
- package/lib/benchmark/tools.js +62 -0
- package/lib/bot/bot.js +130 -0
- package/lib/commands.js +99 -0
- package/lib/complexity.js +377 -0
- package/lib/config.js +213 -0
- package/lib/gateway/client.js +309 -0
- package/lib/gateway/commands.js +830 -0
- package/lib/gateway/config-validate.js +109 -0
- package/lib/gateway/gateway.js +367 -0
- package/lib/gateway/index.js +62 -0
- package/lib/gateway/mode.js +309 -0
- package/lib/gateway/plugins.js +222 -0
- package/lib/gateway/prompt.js +214 -0
- package/lib/mcp/server.js +262 -0
- package/lib/memory/compressor.js +425 -0
- package/lib/memory/engine.js +763 -0
- package/lib/memory/evolution.js +668 -0
- package/lib/memory/index.js +58 -0
- package/lib/memory/orchestrator.js +506 -0
- package/lib/memory/retriever.js +515 -0
- package/lib/memory/synthesizer.js +333 -0
- package/lib/pipeline/async-scanner.js +510 -0
- package/lib/pipeline/binary-analysis.js +1043 -0
- package/lib/pipeline/dom-xss-scanner.js +435 -0
- package/lib/pipeline/github-actions.js +792 -0
- package/lib/pipeline/index.js +124 -0
- package/lib/pipeline/osint.js +498 -0
- package/lib/pipeline/sarif.js +373 -0
- package/lib/pipeline/scanner.js +880 -0
- package/lib/pipeline/template-manager.js +525 -0
- package/lib/pipeline/xss-scanner.js +353 -0
- package/lib/setup-wizard.js +229 -0
- package/package.json +30 -0
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* CIPHER Benchmark — Report generation (JSON + Markdown with gap analysis).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { ALL_BASELINES, PENTESTGPT_BASELINE } from './baselines.js';
|
|
9
|
+
import { reportToDict } from './runner.js';
|
|
10
|
+
|
|
11
|
+
/**
 * Serialize a benchmark run to a pretty-printed JSON document.
 *
 * The document is the `reportToDict` payload extended with:
 *  - `metadata`: generation timestamp (ISO 8601) and solver name,
 *  - `baselines`: one entry per item of `ALL_BASELINES`, keyed by its name,
 *  - `gap_analysis.vs_pentestgpt`: overall and per-level pass-rate deltas
 *    against `PENTESTGPT_BASELINE`, rounded to one decimal place.
 *
 * @param {object} report - Run report exposing `passRate` and `passRateByLevel()`.
 * @param {string} [solverName=''] - Solver label recorded in the metadata.
 * @returns {string} JSON text indented with two spaces.
 */
export function generateJsonReport(report, solverName = '') {
  const toTenths = (value) => Math.round(value * 10) / 10;

  const payload = reportToDict(report);
  payload.metadata = { generated_at: new Date().toISOString(), solver: solverName };

  const baselines = {};
  for (const baseline of ALL_BASELINES) {
    const levelPctCopy = Object.fromEntries(
      Object.entries(baseline.levelPct).map(([lvl, pct]) => [String(lvl), pct]),
    );
    baselines[baseline.name] = {
      overall_pct: baseline.overallPct,
      overall_passed: baseline.overallPassed,
      overall_total: baseline.overallTotal,
      level_pct: levelPctCopy,
      median_cost_usd: baseline.medianCostUsd,
      source: baseline.source,
    };
  }
  payload.baselines = baselines;

  const cipherByLevel = report.passRateByLevel();
  const levelDeltas = {};
  payload.gap_analysis = {
    vs_pentestgpt: {
      overall_delta: toTenths(report.passRate - PENTESTGPT_BASELINE.overallPct),
      level_deltas: levelDeltas,
    },
  };

  // Only levels present in the PentestGPT baseline are compared; a level the
  // run has no figure for is treated as 0.
  for (const [lvl, baselinePct] of Object.entries(PENTESTGPT_BASELINE.levelPct)) {
    const ours = cipherByLevel[lvl] || 0;
    levelDeltas[lvl] = {
      cipher: toTenths(ours),
      pentestgpt: baselinePct,
      delta: toTenths(ours - baselinePct),
    };
  }

  return JSON.stringify(payload, null, 2);
}
|
|
42
|
+
|
|
43
|
+
/**
 * Median per-benchmark API cost of a run.
 *
 * @param {object} report - Run report whose `results` entries carry
 *   `solverResult.apiCostUsd`.
 * @returns {number} The median cost, or 0 when the report has no results.
 */
function medianCost(report) {
  const sortedCosts = report.results.map((entry) => entry.solverResult.apiCostUsd);
  sortedCosts.sort((lhs, rhs) => lhs - rhs);

  const count = sortedCosts.length;
  if (count === 0) {
    return 0;
  }

  const upper = Math.floor(count / 2);
  if (count % 2 !== 0) {
    return sortedCosts[upper];
  }
  // Even count: average the two middle values.
  return (sortedCosts[upper - 1] + sortedCosts[upper]) / 2;
}
|
|
49
|
+
|
|
50
|
+
/**
 * Render a benchmark run as a Markdown document.
 *
 * Sections, in order: summary table, comparison table against every entry of
 * `ALL_BASELINES`, the overall delta versus `PENTESTGPT_BASELINE`, and one
 * result table per difficulty level reported by `report.passRateByLevel()`.
 *
 * @param {object} report - Run report (`total`, `passed`, `failed`, `skipped`,
 *   `passRate`, `durationS`, `totalCostUsd`, `results`, `passRateByLevel()`).
 * @param {string} [solverName=''] - Solver label shown in the header.
 * @returns {string} The Markdown text, lines joined with `\n`.
 */
export function generateMarkdownReport(report, solverName = '') {
  const levelLabels = { 1: 'Easy', 2: 'Medium', 3: 'Hard' };
  // A measured 0% is a real value and must be shown; only a missing figure
  // is rendered as a dash.
  const pctCell = (value) => (value != null ? `${value.toFixed(1)}%` : '—');

  const lines = [];
  lines.push('# CIPHER Benchmark Report', '');
  lines.push(`**Generated:** ${new Date().toISOString().slice(0, 16).replace('T', ' ')} UTC`);
  lines.push(`**Solver:** ${solverName}`, '');
  lines.push('## Summary', '');
  lines.push('| Metric | Value |', '|--------|-------|');
  lines.push(`| Total benchmarks | ${report.total} |`);
  lines.push(`| Passed | ${report.passed} |`);
  lines.push(`| Failed | ${report.failed} |`);
  lines.push(`| Skipped (errors) | ${report.skipped} |`);
  lines.push(`| **Pass rate** | **${report.passRate.toFixed(1)}%** |`);
  lines.push(`| Total duration | ${report.durationS.toFixed(1)}s |`);
  lines.push(`| Total cost | $${report.totalCostUsd.toFixed(4)} |`, '');

  lines.push('## Gap Analysis vs Competitors', '');
  lines.push('| System | Overall | L1 (Easy) | L2 (Medium) | L3 (Hard) | Median Cost |');
  lines.push('|--------|---------|-----------|-------------|-----------|-------------|');
  const byLevel = report.passRateByLevel();
  const mc = medianCost(report);
  lines.push(`| **CIPHER** | **${report.passRate.toFixed(1)}%** | ${(byLevel[1] || 0).toFixed(1)}% | ${(byLevel[2] || 0).toFixed(1)}% | ${(byLevel[3] || 0).toFixed(1)}% | $${mc.toFixed(2)} |`);
  for (const bl of ALL_BASELINES) {
    // A zero or missing median cost is shown as a dash (unchanged behavior).
    const cost = bl.medianCostUsd ? `$${bl.medianCostUsd.toFixed(2)}` : '—';
    lines.push(`| ${bl.name} | ${bl.overallPct.toFixed(1)}% | ${pctCell(bl.levelPct[1])} | ${pctCell(bl.levelPct[2])} | ${pctCell(bl.levelPct[3])} | ${cost} |`);
  }
  lines.push('');

  // Round before classifying so the direction label always agrees with the
  // one-decimal figure that is printed; `+ 0` normalizes -0 to 0.
  const delta = Math.round((report.passRate - PENTESTGPT_BASELINE.overallPct) * 10) / 10 + 0;
  const dir = delta > 0 ? 'ahead' : delta < 0 ? 'behind' : 'tied';
  lines.push('### Delta vs PentestGPT', '');
  lines.push(`**Overall:** ${delta >= 0 ? '+' : ''}${delta.toFixed(1)}pp (${dir})`, '');

  lines.push('## Results by Difficulty Level', '');
  // Sort level keys numerically; a plain string sort would put "10" before "2".
  const levelKeys = Object.keys(byLevel).sort((a, b) => Number(a) - Number(b));
  for (const level of levelKeys) {
    const levelNumber = Number.parseInt(level, 10);
    const lr = report.results.filter((r) => r.config.level === levelNumber);
    const p = lr.filter((r) => r.passed).length;
    const pct = lr.length ? (p / lr.length) * 100 : 0;
    const label = levelLabels[level] || `L${level}`;
    lines.push(`### Level ${level} — ${label} (${p}/${lr.length} = ${pct.toFixed(1)}%)`, '');
    lines.push('| Benchmark | Tags | Result | Duration | Cost |');
    lines.push('|-----------|------|--------|----------|------|');
    // `lr` is a fresh array from filter(), so sorting in place is safe.
    for (const r of lr.sort((a, b) => a.config.dirName.localeCompare(b.config.dirName))) {
      let status = r.passed ? '✅ PASS' : '❌ FAIL';
      // An error outranks pass/fail in the status column.
      if (r.solverResult.error) status = '⚠️ ERROR';
      lines.push(`| ${r.config.dirName} | ${r.config.tags.join(', ')} | ${status} | ${r.solverResult.durationS.toFixed(1)}s | $${r.solverResult.apiCostUsd.toFixed(4)} |`);
    }
    lines.push('');
  }

  return lines.join('\n');
}
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* CIPHER Benchmark — Runner (orchestrates build → start → solve → score → report).
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { BenchmarkBuilder } from './builder.js';
|
|
9
|
+
import { HarnessConfig } from './config.js';
|
|
10
|
+
import { SolverResult } from './models.js';
|
|
11
|
+
import { aggregateResults, scoreResult } from './scorer.js';
|
|
12
|
+
import { getSolver } from './solver.js';
|
|
13
|
+
|
|
14
|
+
/**
 * Start one benchmark, run the solver against it, score the outcome, and
 * stop the benchmark again.
 *
 * The solver may answer synchronously or with a promise. In the promise case
 * the benchmark is kept running until the promise settles; stopping it
 * earlier would tear the target down underneath the running solver. A
 * rejected promise is scored as an errored result, exactly like a synchronous
 * throw.
 *
 * @param {object} builder - Provides `getBenchmark`, `runBenchmark`, `stop`
 *   and `config.generateFlag`.
 * @param {object} solver - Object with `solve(config, targetUrl, expectedFlag)`.
 * @param {string} name - Benchmark name.
 * @param {string} [flag] - Flag to inject; generated from the builder config
 *   when omitted or empty.
 * @returns {object|Promise<object>} The scored result, or a promise of it when
 *   the solver is asynchronous.
 */
export function runSingleBenchmark(builder, solver, name, flag) {
  const config = builder.getBenchmark(name);
  const activeFlag = flag || builder.config.generateFlag(name);
  let targetUrl = '';

  const stopBenchmark = () => {
    try { builder.stop(name); } catch { /* best-effort teardown */ }
  };
  const scoreFailure = (err) =>
    scoreResult(config, new SolverResult({ error: String(err) }), activeFlag, targetUrl);

  let pending = null;
  try {
    const started = builder.runBenchmark(name, activeFlag);
    targetUrl = started.targetUrl;
    const solverResult = solver.solve(config, targetUrl, started.expectedFlag);
    if (!solverResult || typeof solverResult.then !== 'function') {
      // Synchronous solver (e.g. the stub): score right away.
      return scoreResult(config, solverResult, activeFlag, targetUrl);
    }
    pending = solverResult;
  } catch (e) {
    return scoreFailure(e);
  } finally {
    // Tear down here only when no asynchronous solve is still in flight.
    if (pending === null) stopBenchmark();
  }

  return Promise.resolve(pending)
    .then((sr) => scoreResult(config, sr, activeFlag, targetUrl))
    .catch(scoreFailure)
    .finally(stopBenchmark);
}
|
|
33
|
+
|
|
34
|
+
/**
 * Run a selection of benchmarks one after another and aggregate the results.
 *
 * The catalog comes from `builder.listBenchmarks()`; if that throws, the
 * builder is asked to `clone()` and the listing is retried with `true`.
 * Requested names that are not in the catalog are dropped without notice.
 *
 * @param {object} options
 * @param {object} options.builder - Benchmark builder.
 * @param {object} options.solver - Solver passed to `runSingleBenchmark`.
 * @param {string[]} [options.benchmarkNames] - Explicit benchmark names.
 * @param {boolean} [options.runAll=false] - Run every benchmark in the catalog.
 * @param {number} [options.levelFilter] - Keep only benchmarks whose `level`
 *   is strictly equal to this value.
 * @param {string} [options.tagFilter] - Keep only benchmarks carrying this tag.
 * @returns {Promise<object>} Whatever `aggregateResults` returns for the run.
 * @throws {Error} When neither `benchmarkNames` nor `runAll` is given.
 */
export async function runBenchmarks({ builder, solver, benchmarkNames, runAll = false, levelFilter, tagFilter }) {
  let catalog;
  try {
    catalog = builder.listBenchmarks();
  } catch {
    builder.clone();
    catalog = builder.listBenchmarks(true);
  }

  const hasExplicitNames = Boolean(benchmarkNames?.length);
  if (!hasExplicitNames && !runAll) {
    throw new Error('Must specify benchmarkNames or runAll');
  }

  const catalogNames = catalog.map((entry) => entry.dirName);
  let selected = catalogNames;
  if (hasExplicitNames) {
    const known = new Set(catalogNames);
    selected = benchmarkNames.filter((candidate) => known.has(candidate));
  }

  if (levelFilter != null || tagFilter) {
    const lookup = Object.fromEntries(catalog.map((entry) => [entry.dirName, entry]));
    selected = selected.filter((candidate) => {
      const entry = lookup[candidate];
      return (levelFilter == null || entry.level === levelFilter)
        && (!tagFilter || entry.tags.includes(tagFilter));
    });
  }

  // Benchmarks run strictly in sequence; each is awaited before the next starts.
  const outcomes = [];
  for (const benchName of selected) {
    outcomes.push(await runSingleBenchmark(builder, solver, benchName));
  }
  return aggregateResults(outcomes);
}
|
|
70
|
+
|
|
71
|
+
/**
 * Convert a run report into a plain, JSON-serializable object.
 *
 * Summary figures are rounded: pass rates and total duration to one decimal,
 * total cost to four decimals, per-result duration to three decimals.
 *
 * @param {object} report - Run report with summary fields, `results`, and
 *   `passRateByLevel()`.
 * @returns {{summary: object, results: object[]}} Snake-cased summary and
 *   per-benchmark rows.
 */
export function reportToDict(report) {
  const roundTo = (value, scale) => Math.round(value * scale) / scale;

  const summary = {
    total: report.total,
    passed: report.passed,
    failed: report.failed,
    skipped: report.skipped,
    pass_rate: roundTo(report.passRate, 10),
    duration_s: roundTo(report.durationS, 10),
    total_cost_usd: roundTo(report.totalCostUsd, 10000),
    pass_rate_by_level: Object.fromEntries(
      Object.entries(report.passRateByLevel()).map(([lvl, rate]) => [lvl, roundTo(rate, 10)]),
    ),
  };

  const results = report.results.map((entry) => {
    const { config: cfg, solverResult: solved } = entry;
    return {
      benchmark: cfg.dirName,
      name: cfg.name,
      level: cfg.level,
      tags: cfg.tags,
      passed: entry.passed,
      expected_flag: entry.expectedFlag,
      actual_flag: entry.actualFlag,
      target_url: entry.targetUrl,
      duration_s: roundTo(solved.durationS, 1000),
      tokens_in: solved.tokensIn,
      tokens_out: solved.tokensOut,
      api_cost_usd: solved.apiCostUsd,
      tool_calls: solved.toolCalls,
      steps: solved.steps,
      error: solved.error,
    };
  });

  return { summary, results };
}
|
|
@@ -0,0 +1,96 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* CIPHER Benchmark — Docker sandbox for autonomous agent execution.
|
|
6
|
+
*
|
|
7
|
+
* Uses child_process to exec commands in a Docker container instead of
|
|
8
|
+
* dockerode, keeping the dependency footprint minimal.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { execSync, spawnSync } from 'node:child_process';
|
|
12
|
+
import { randomBytes } from 'node:crypto';
|
|
13
|
+
|
|
14
|
+
const DEFAULT_IMAGE = 'python:3.12-slim';
|
|
15
|
+
const CONTAINER_PREFIX = 'cipher-sandbox';
|
|
16
|
+
|
|
17
|
+
/**
 * Error raised for sandbox container failures; `name` is set to
 * 'SandboxError'.
 */
export class SandboxError extends Error {
  /**
   * @param {string} msg - Human-readable failure description.
   */
  constructor(msg) {
    super(msg);
    this.name = 'SandboxError';
  }
}
|
|
20
|
+
|
|
21
|
+
/**
 * A Docker container used as the agent's sandbox, driven through the
 * `docker` CLI with synchronous child-process calls.
 *
 * Lifecycle: `create()` -> optional `connectNetwork()` -> `execTool()` ->
 * `cleanup()`.
 */
export class SandboxContainer {
  /**
   * @param {object} [options]
   * @param {string} [options.image=DEFAULT_IMAGE] - Image to run.
   * @param {string} [options.nameSuffix=''] - Container name suffix; a random
   *   hex suffix is generated when empty.
   */
  constructor({ image = DEFAULT_IMAGE, nameSuffix = '' } = {}) {
    this._image = image;
    this._containerId = null;
    this._connectedNetworks = [];
    this._nameSuffix = nameSuffix;
    this._setupDone = false;
  }

  /** @returns {string|null} Short container id, or null before create() / after cleanup(). */
  get containerId() { return this._containerId; }

  /**
   * Start the container (detached, `sleep infinity`, 512 MB memory, half a
   * CPU, bridge network) and install the base tools.
   *
   * @throws {SandboxError} When `docker run` does not exit with status 0.
   */
  create() {
    const name = this._nameSuffix
      ? `${CONTAINER_PREFIX}-${this._nameSuffix}`
      : `${CONTAINER_PREFIX}-${randomBytes(4).toString('hex')}`;

    const result = spawnSync('docker', [
      'run', '-d', '--name', name,
      '--memory=512m', '--cpus=0.5',
      '--network=bridge',
      this._image, 'sleep', 'infinity',
    ], { timeout: 30000, stdio: 'pipe' });

    if (result.status !== 0) {
      throw new SandboxError(`Failed to create sandbox: ${this._describeFailure(result).slice(0, 300)}`);
    }
    // `docker run -d` prints the full container id; keep the 12-char short form.
    this._containerId = (result.stdout || '').toString().trim().slice(0, 12);
    this._installBaseTools();
  }

  /**
   * Failure text for a spawnSync result: stderr when there is any, otherwise
   * the spawn-level error (docker binary missing, timeout, ...), which
   * produces no stderr of its own.
   */
  _describeFailure(result) {
    const stderrText = (result.stderr || '').toString();
    return stderrText || result.error?.message || '';
  }

  /** Install curl and nmap inside the container (best effort, at most once). */
  _installBaseTools() {
    if (this._setupDone) return;
    const { exitCode } = this.execTool('apt-get update -qq && apt-get install -y -qq curl nmap > /dev/null 2>&1', 120);
    // Record the setup as done only when the install actually succeeded.
    this._setupDone = exitCode === 0;
  }

  /**
   * Run a shell command inside the container via `docker exec ... /bin/sh -c`.
   *
   * @param {string} command - Shell command line.
   * @param {number} [timeout=60] - Timeout in seconds.
   * @returns {{exitCode: number, stdout: string, stderr: string}} `exitCode`
   *   is -1 when the process produced no exit status (e.g. it timed out). A
   *   spawn-level error is appended to `stderr` as a `[sandbox] ...` line.
   * @throws {SandboxError} When the container has not been created.
   */
  execTool(command, timeout = 60) {
    if (!this._containerId) throw new SandboxError('Container not created. Call create() first.');
    const result = spawnSync('docker', ['exec', this._containerId, '/bin/sh', '-c', command], {
      timeout: timeout * 1000,
      stdio: 'pipe',
      maxBuffer: 10 * 1024 * 1024,
    });
    const spawnFailure = result.error ? `[sandbox] ${result.error.message}\n` : '';
    return {
      exitCode: result.status ?? -1,
      stdout: (result.stdout || '').toString(),
      stderr: `${(result.stderr || '').toString()}${spawnFailure}`,
    };
  }

  /**
   * Attach the container to an additional Docker network.
   *
   * @param {string} networkName
   * @throws {SandboxError} When the container does not exist or
   *   `docker network connect` fails.
   */
  connectNetwork(networkName) {
    if (!this._containerId) throw new SandboxError('Container not created.');
    const result = spawnSync('docker', ['network', 'connect', networkName, this._containerId], { timeout: 10000, stdio: 'pipe' });
    if (result.status !== 0) {
      throw new SandboxError(`Failed to connect to network ${networkName}: ${this._describeFailure(result).slice(0, 200)}`);
    }
    // Remembered so cleanup() can disconnect it again.
    this._connectedNetworks.push(networkName);
  }

  /**
   * Build the URL of a service as addressed by its service name.
   *
   * @param {string} serviceName
   * @param {number} [port=80] - Ports 443 and 8443 select `https`.
   * @returns {string}
   */
  getInternalUrl(serviceName, port = 80) {
    const protocol = [443, 8443].includes(port) ? 'https' : 'http';
    return `${protocol}://${serviceName}:${port}`;
  }

  /**
   * Disconnect every recorded network and force-remove the container.
   * Failures are ignored; does nothing when no container exists.
   */
  cleanup() {
    if (!this._containerId) return;
    for (const net of this._connectedNetworks) {
      try { spawnSync('docker', ['network', 'disconnect', '-f', net, this._containerId], { timeout: 5000, stdio: 'pipe' }); }
      catch { /* ignore */ }
    }
    this._connectedNetworks = [];
    try { spawnSync('docker', ['rm', '-f', this._containerId], { timeout: 10000, stdio: 'pipe' }); }
    catch { /* ignore */ }
    this._containerId = null;
  }
}
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* CIPHER Benchmark — Scoring logic.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { BenchmarkResult, RunReport } from './models.js';
|
|
9
|
+
|
|
10
|
+
/**
 * Compare a submitted flag with the expected one, ignoring surrounding
 * whitespace and letter case.
 *
 * Values are converted to strings first, so a non-string submission (for
 * example a number handed over by a tool call) is compared instead of
 * throwing a TypeError.
 *
 * @param {*} expected - Expected flag.
 * @param {*} actual - Submitted flag.
 * @returns {boolean} True on a match; false when either value is empty/falsy.
 */
export function scoreFlag(expected, actual) {
  if (!expected || !actual) return false;
  const normalize = (value) => String(value).trim().toLowerCase();
  return normalize(expected) === normalize(actual);
}
|
|
14
|
+
|
|
15
|
+
/**
 * Score a solver result against the expected flag.
 *
 * A result passes only when the solver reports a flag as found, supplies a
 * flag value, and that value matches according to `scoreFlag`.
 *
 * @param {object} config - Benchmark configuration.
 * @param {object} solverResult - Solver output (`flagFound`, `flagValue`, ...).
 * @param {string} expectedFlag - Flag injected into the benchmark.
 * @param {string} [targetUrl=''] - URL the benchmark was served on.
 * @returns {BenchmarkResult}
 */
export function scoreResult(config, solverResult, expectedFlag, targetUrl = '') {
  let passed = false;
  if (solverResult.flagFound && solverResult.flagValue) {
    passed = scoreFlag(expectedFlag, solverResult.flagValue);
  }

  return new BenchmarkResult({
    config,
    solverResult,
    passed,
    expectedFlag,
    actualFlag: solverResult.flagValue,
    targetUrl,
  });
}
|
|
21
|
+
|
|
22
|
+
/**
 * Fold individual benchmark results into a run report.
 *
 * A result counts as "skipped" when its solver result carries a truthy
 * `error`, and as "failed" when it neither passed nor errored. Both counters
 * use the same error test, so one result can never land in both buckets
 * (an empty-string error used to be counted as failed AND skipped).
 *
 * @param {object[]} results - Entries with `passed` and `solverResult`
 *   (`error`, `durationS`, `apiCostUsd`).
 * @returns {RunReport}
 */
export function aggregateResults(results) {
  const hasError = (r) => Boolean(r.solverResult.error);
  return new RunReport({
    total: results.length,
    passed: results.filter((r) => r.passed).length,
    failed: results.filter((r) => !r.passed && !hasError(r)).length,
    skipped: results.filter(hasError).length,
    durationS: results.reduce((sum, r) => sum + r.solverResult.durationS, 0),
    totalCostUsd: results.reduce((sum, r) => sum + r.solverResult.apiCostUsd, 0),
    results,
  });
}
|
|
@@ -0,0 +1,166 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* CIPHER Benchmark — Solver adapters.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { SolverResult } from './models.js';
|
|
9
|
+
|
|
10
|
+
/**
 * Abstract solver base. Subclasses must override both `name` and `solve()`;
 * the versions here only throw.
 */
export class SolverAdapter {
  /**
   * Solver identifier.
   * @returns {string}
   * @throws {Error} Always, in the base class.
   */
  get name() {
    throw new Error('not implemented');
  }

  /**
   * Attempt to capture the flag of one benchmark.
   *
   * @param {object} config - Benchmark configuration.
   * @param {string} targetUrl - URL of the running target.
   * @param {string} expectedFlag - The injected flag.
   * @throws {Error} Always, in the base class.
   */
  solve(config, targetUrl, expectedFlag) {
    throw new Error('not implemented');
  }
}
|
|
15
|
+
|
|
16
|
+
/**
 * Testing solver — hands back the expected flag immediately, without
 * touching the target.
 */
export class StubSolver extends SolverAdapter {
  get name() {
    return 'stub';
  }

  /**
   * @param {object} config - Benchmark configuration (unused).
   * @param {string} targetUrl - Target URL (unused).
   * @param {string} expectedFlag - Flag echoed back as the "found" flag.
   * @returns {SolverResult} A result with `flagFound: true`.
   */
  solve(config, targetUrl, expectedFlag) {
    const startedAt = performance.now();
    const elapsedSeconds = (performance.now() - startedAt) / 1000;
    return new SolverResult({
      flagFound: true,
      flagValue: expectedFlag,
      durationS: elapsedSeconds,
      steps: ['stub: returned injected flag'],
    });
  }
}
|
|
29
|
+
|
|
30
|
+
/** Interactive solver — prints the challenge and prompts for the flag on stdin. */
export class ManualSolver extends SolverAdapter {
  get name() { return 'manual'; }

  /**
   * Show the challenge details and wait for the operator to type a flag.
   *
   * @param {object} config - Benchmark configuration (`name`, `level`,
   *   `tags`, `description`).
   * @param {string} targetUrl - URL shown to the operator.
   * @param {string} expectedFlag - Unused; the operator has to find the flag.
   * @returns {Promise<SolverResult>} A "skipped" result for empty input or
   *   "skip" (any case, surrounding whitespace ignored); otherwise a result
   *   carrying the trimmed answer as the flag.
   */
  async solve(config, targetUrl, expectedFlag) {
    // This file is an ES module, where `require` is not defined; load
    // readline with a dynamic import, as the other solvers do for their
    // dependencies.
    const { createInterface } = await import('node:readline');
    const rl = createInterface({ input: process.stdin, output: process.stdout });
    return new Promise((resolve) => {
      console.log(`\n${'='.repeat(60)}`);
      console.log(`  Challenge: ${config.name}`);
      console.log(`  Level: ${config.level}`);
      console.log(`  Tags: ${config.tags.join(', ')}`);
      console.log(`  Description: ${config.description}`);
      console.log(`  Target: ${targetUrl}`);
      console.log(`${'='.repeat(60)}\n`);
      const start = performance.now();
      rl.question('Enter the flag (or "skip"): ', (answer) => {
        rl.close();
        const duration = (performance.now() - start) / 1000;
        // Trim before testing so " skip " or a whitespace-only line is not
        // submitted as a flag.
        const flag = (answer || '').trim();
        if (!flag || flag.toLowerCase() === 'skip') {
          resolve(new SolverResult({ durationS: duration, steps: ['manual: skipped'] }));
        } else {
          resolve(new SolverResult({ flagFound: true, flagValue: flag, durationS: duration, steps: ['manual: user provided flag'] }));
        }
      });
    });
  }
}
|
|
57
|
+
|
|
58
|
+
/** AI-powered solver using SecurityAgent in a Docker sandbox. */
export class AutonomousSolver extends SolverAdapter {
  /**
   * @param {object} [options]
   * @param {*} [options.backend] - Passed to `makeAgentClient` as `backendOverride`.
   * @param {number} [options.maxTurns=30] - Turn budget handed to the agent.
   */
  constructor({ backend, maxTurns = 30 } = {}) {
    super();
    this._backend = backend;
    this._maxTurns = maxTurns;
    this._supervisedHook = null;
  }

  get name() { return 'autonomous'; }

  /**
   * Choose the compose service to attack and its container-side port.
   *
   * The first service that declares `ports` wins; otherwise the first service
   * (or 'web') on port 80. Short-syntax entries ("8080:80", "80/tcp") and
   * long-syntax objects (`{ target: 80, ... }`) are both understood; an entry
   * that yields no usable port number falls back to 80.
   *
   * @param {object} services - The `services` mapping of the compose file.
   * @returns {{serviceName: string, internalPort: number}}
   */
  _resolveTarget(services) {
    let serviceName = Object.keys(services)[0] || 'web';
    let internalPort = 80;
    for (const [name, svc] of Object.entries(services)) {
      if (!svc?.ports?.length) continue;
      serviceName = name;
      const first = svc.ports[0];
      const raw = first !== null && typeof first === 'object'
        ? first.target
        : String(first).split(':').pop();
      const parsed = Number.parseInt(raw, 10);
      if (Number.isInteger(parsed) && parsed > 0) internalPort = parsed;
      break;
    }
    return { serviceName, internalPort };
  }

  /**
   * Run the agent against the benchmark from inside a sandbox container that
   * is attached to the benchmark's compose network.
   *
   * @param {object} config - Benchmark configuration (`path`, `dirName`, `description`).
   * @param {string} targetUrl - Host-side URL (unused; the agent is given the
   *   URL built from the compose service name and port).
   * @param {string} expectedFlag - Unused; never revealed to the agent.
   * @returns {Promise<SolverResult>} Any failure after module loading,
   *   including an unreadable compose file or a client setup error, is
   *   reported through the result's `error` field rather than as a rejection.
   */
  async solve(config, targetUrl, expectedFlag) {
    const { SecurityAgent } = await import('./agent.js');
    const { makeAgentClient } = await import('./llm.js');
    const { SandboxContainer } = await import('./sandbox.js');
    const { readFileSync } = await import('node:fs');
    const yaml = await import('yaml');

    const start = performance.now();
    let sandbox = null;
    try {
      const composePath = `${config.path}/docker-compose.yml`;
      const data = yaml.parse(readFileSync(composePath, 'utf8'));
      const { serviceName, internalPort } = this._resolveTarget(data?.services || {});
      const networkName = `cipher-bench-${config.dirName}_default`.toLowerCase();
      const { client, model } = await makeAgentClient({ backendOverride: this._backend });

      sandbox = new SandboxContainer({ nameSuffix: config.dirName.toLowerCase() });
      sandbox.create();
      sandbox.connectNetwork(networkName);
      const internalUrl = sandbox.getInternalUrl(serviceName, internalPort);
      const agent = new SecurityAgent({ client, model, sandbox, maxTurns: this._maxTurns, preToolHook: this._supervisedHook });
      const result = await agent.run({ targetUrl: internalUrl, challengeDescription: config.description });
      return new SolverResult({
        flagFound: result.flagFound, flagValue: result.flagValue,
        durationS: (performance.now() - start) / 1000,
        tokensIn: result.tokensIn, tokensOut: result.tokensOut,
        toolCalls: result.toolCalls, steps: result.steps, error: result.error,
      });
    } catch (e) {
      return new SolverResult({ durationS: (performance.now() - start) / 1000, error: String(e), steps: [`[error] ${e}`] });
    } finally {
      // The sandbox may not exist yet if setup failed early.
      sandbox?.cleanup();
    }
  }
}
|
|
110
|
+
|
|
111
|
+
/** Multi-agent solver using phased coordinator. */
export class MultiAgentSolver extends SolverAdapter {
  /**
   * @param {object} [options]
   * @param {*} [options.backend] - Passed to `makeAgentClient` as `backendOverride`.
   * @param {number} [options.reconTurns=10] - Turn budget handed to the coordinator for recon.
   * @param {number} [options.exploitTurns=25] - Turn budget handed to the coordinator for exploitation.
   */
  constructor({ backend, reconTurns = 10, exploitTurns = 25 } = {}) {
    super();
    this._backend = backend;
    this._reconTurns = reconTurns;
    this._exploitTurns = exploitTurns;
    this._supervisedHook = null;
  }

  get name() { return 'autonomous-multi'; }

  /**
   * Choose the compose service to attack and its container-side port.
   *
   * The first service that declares `ports` wins; otherwise the first service
   * (or 'web') on port 80. Short-syntax entries ("8080:80", "80/tcp") and
   * long-syntax objects (`{ target: 80, ... }`) are both understood; an entry
   * that yields no usable port number falls back to 80.
   *
   * @param {object} services - The `services` mapping of the compose file.
   * @returns {{serviceName: string, internalPort: number}}
   */
  _resolveTarget(services) {
    let serviceName = Object.keys(services)[0] || 'web';
    let internalPort = 80;
    for (const [name, svc] of Object.entries(services)) {
      if (!svc?.ports?.length) continue;
      serviceName = name;
      const first = svc.ports[0];
      const raw = first !== null && typeof first === 'object'
        ? first.target
        : String(first).split(':').pop();
      const parsed = Number.parseInt(raw, 10);
      if (Number.isInteger(parsed) && parsed > 0) internalPort = parsed;
      break;
    }
    return { serviceName, internalPort };
  }

  /**
   * Run the phased coordinator against the benchmark from inside a sandbox
   * container that is attached to the benchmark's compose network.
   *
   * @param {object} config - Benchmark configuration (`path`, `dirName`, `description`).
   * @param {string} targetUrl - Host-side URL (unused; the coordinator is
   *   given the URL built from the compose service name and port).
   * @param {string} expectedFlag - Unused; never revealed to the agents.
   * @returns {Promise<SolverResult>} Any failure after module loading,
   *   including an unreadable compose file or a client setup error, is
   *   reported through the result's `error` field rather than as a rejection.
   */
  async solve(config, targetUrl, expectedFlag) {
    const { Coordinator } = await import('./coordinator.js');
    const { makeAgentClient } = await import('./llm.js');
    const { SandboxContainer } = await import('./sandbox.js');
    const { readFileSync } = await import('node:fs');
    const yaml = await import('yaml');

    const start = performance.now();
    let sandbox = null;
    try {
      const composePath = `${config.path}/docker-compose.yml`;
      const data = yaml.parse(readFileSync(composePath, 'utf8'));
      const { serviceName, internalPort } = this._resolveTarget(data?.services || {});
      const networkName = `cipher-bench-${config.dirName}_default`.toLowerCase();
      const { client, model } = await makeAgentClient({ backendOverride: this._backend });

      sandbox = new SandboxContainer({ nameSuffix: config.dirName.toLowerCase() });
      sandbox.create();
      sandbox.connectNetwork(networkName);
      const internalUrl = sandbox.getInternalUrl(serviceName, internalPort);
      const coordinator = new Coordinator({ client, model, sandbox, reconTurns: this._reconTurns, exploitTurns: this._exploitTurns, preToolHook: this._supervisedHook });
      const result = await coordinator.solve({ targetUrl: internalUrl, challengeDescription: config.description });
      return new SolverResult({
        flagFound: result.flagFound, flagValue: result.flagValue,
        durationS: (performance.now() - start) / 1000,
        tokensIn: result.tokensIn, tokensOut: result.tokensOut,
        toolCalls: result.toolCalls, steps: result.steps, error: result.error,
      });
    } catch (e) {
      return new SolverResult({ durationS: (performance.now() - start) / 1000, error: String(e), steps: [`[error] ${e}`] });
    } finally {
      // The sandbox may not exist yet if setup failed early.
      sandbox?.cleanup();
    }
  }
}
|
|
159
|
+
|
|
160
|
+
/** Registry mapping each solver name to its adapter class. */
export const SOLVERS = {
  stub: StubSolver,
  manual: ManualSolver,
  autonomous: AutonomousSolver,
  'autonomous-multi': MultiAgentSolver,
};
|
|
161
|
+
|
|
162
|
+
/**
 * Instantiate a solver by its registry name.
 *
 * Only keys defined directly on `SOLVERS` are accepted, so inherited object
 * properties such as "constructor" or "toString" are rejected as unknown
 * instead of being called as if they were solver classes.
 *
 * @param {string} name - Registry key, e.g. 'stub' or 'autonomous'.
 * @param {object} [opts={}] - Options forwarded to the solver constructor.
 * @returns {object} A new solver instance.
 * @throws {Error} When `name` is not a registered solver.
 */
export function getSolver(name, opts = {}) {
  const Cls = Object.hasOwn(SOLVERS, name) ? SOLVERS[name] : undefined;
  if (!Cls) throw new Error(`Unknown solver: ${name}. Available: ${Object.keys(SOLVERS).join(', ')}`);
  return new Cls(opts);
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
|
|
4
|
+
/**
|
|
5
|
+
* CIPHER Benchmark — Tool definitions for the security agent.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
const MAX_OUTPUT_CHARS = 4000;
|
|
9
|
+
|
|
10
|
+
/**
 * Tool definition for `sandbox_exec`: takes a required `command` string and
 * an optional integer `timeout` in seconds.
 */
export const SANDBOX_EXEC_SCHEMA = {
  name: 'sandbox_exec',
  description: 'Execute a shell command in the security sandbox. Returns stdout, stderr, and exit code.',
  input_schema: {
    type: 'object',
    properties: {
      command: {
        type: 'string',
        description: 'Shell command to execute',
      },
      timeout: {
        type: 'integer',
        description: 'Timeout in seconds (default 60)',
        default: 60,
      },
    },
    required: ['command'],
  },
};
|
|
22
|
+
|
|
23
|
+
/** Tool definition for `submit_flag`: takes a single required `flag` string. */
export const SUBMIT_FLAG_SCHEMA = {
  name: 'submit_flag',
  description: 'Submit a captured flag. Call this when you have found the flag.',
  input_schema: {
    type: 'object',
    properties: {
      flag: {
        type: 'string',
        description: 'The captured flag value',
      },
    },
    required: ['flag'],
  },
};
|
|
34
|
+
|
|
35
|
+
export const AGENT_TOOLS = [SANDBOX_EXEC_SCHEMA, SUBMIT_FLAG_SCHEMA];
|
|
36
|
+
|
|
37
|
+
/**
 * Dispatch a tool call within the sandbox.
 *
 * The tool input is validated before use rather than trusted:
 *  - `sandbox_exec` needs a non-empty string `command`; otherwise an error
 *    message is returned and nothing is executed. A `timeout` that is not a
 *    positive finite number falls back to 60 seconds.
 *  - `submit_flag` needs a non-empty `flag`; the submitted value is converted
 *    to a string.
 *
 * stdout and stderr are each cut to MAX_OUTPUT_CHARS characters, with a
 * trailing "[output truncated]" marker when either was cut.
 *
 * @param {string} toolName
 * @param {object} toolInput
 * @param {import('./sandbox.js').SandboxContainer} sandbox
 * @returns {{ output: string, flagSubmitted?: string }}
 */
export function dispatchTool(toolName, toolInput, sandbox) {
  const input = toolInput ?? {};

  if (toolName === 'sandbox_exec') {
    const { command, timeout } = input;
    if (typeof command !== 'string' || command.trim() === '') {
      return { output: 'Error: sandbox_exec requires a non-empty string "command".' };
    }
    const requested = Number(timeout);
    const timeoutS = Number.isFinite(requested) && requested > 0 ? requested : 60;

    const result = sandbox.execTool(command, timeoutS);
    let output = `Exit code: ${result.exitCode}\n`;
    if (result.stdout) output += `STDOUT:\n${result.stdout.slice(0, MAX_OUTPUT_CHARS)}\n`;
    if (result.stderr) output += `STDERR:\n${result.stderr.slice(0, MAX_OUTPUT_CHARS)}\n`;
    if (result.stdout.length > MAX_OUTPUT_CHARS || result.stderr.length > MAX_OUTPUT_CHARS) {
      output += '[output truncated]\n';
    }
    return { output };
  }

  if (toolName === 'submit_flag') {
    const { flag } = input;
    if (flag == null || String(flag).trim() === '') {
      return { output: 'Error: submit_flag requires a non-empty "flag".' };
    }
    const submitted = String(flag);
    return { output: `Flag submitted: ${submitted}`, flagSubmitted: submitted };
  }

  return { output: `Unknown tool: ${toolName}` };
}
|