cipher-security 2.1.0 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cipher.js +10 -0
- package/lib/analyze/consistency.js +566 -0
- package/lib/analyze/constitution.js +110 -0
- package/lib/analyze/sharding.js +251 -0
- package/lib/autonomous/agent-tool.js +165 -0
- package/lib/autonomous/framework.js +17 -0
- package/lib/autonomous/handoff.js +506 -0
- package/lib/autonomous/modes/blue.js +26 -0
- package/lib/autonomous/modes/red.js +28 -0
- package/lib/benchmark/agent.js +88 -26
- package/lib/benchmark/baselines.js +3 -0
- package/lib/benchmark/claude-code-solver.js +254 -0
- package/lib/benchmark/cognitive.js +283 -0
- package/lib/benchmark/index.js +12 -2
- package/lib/benchmark/knowledge.js +281 -0
- package/lib/benchmark/llm.js +156 -15
- package/lib/benchmark/models.js +5 -2
- package/lib/benchmark/nyu-ctf.js +192 -0
- package/lib/benchmark/overthewire.js +347 -0
- package/lib/benchmark/picoctf.js +281 -0
- package/lib/benchmark/prompts.js +280 -0
- package/lib/benchmark/registry.js +219 -0
- package/lib/benchmark/remote-solver.js +356 -0
- package/lib/benchmark/remote-target.js +263 -0
- package/lib/benchmark/reporter.js +35 -0
- package/lib/benchmark/runner.js +174 -10
- package/lib/benchmark/sandbox.js +35 -0
- package/lib/benchmark/scorer.js +22 -4
- package/lib/benchmark/solver.js +34 -1
- package/lib/benchmark/tools.js +262 -16
- package/lib/commands.js +9 -0
- package/lib/execution/council.js +434 -0
- package/lib/execution/parallel.js +292 -0
- package/lib/gates/circuit-breaker.js +135 -0
- package/lib/gates/confidence.js +302 -0
- package/lib/gates/corrections.js +219 -0
- package/lib/gates/self-check.js +245 -0
- package/lib/gateway/commands.js +727 -0
- package/lib/guardrails/engine.js +364 -0
- package/lib/mcp/server.js +349 -3
- package/lib/memory/compressor.js +94 -7
- package/lib/pipeline/hooks.js +288 -0
- package/lib/pipeline/index.js +11 -0
- package/lib/review/budget.js +210 -0
- package/lib/review/engine.js +526 -0
- package/lib/review/layers/acceptance-auditor.js +279 -0
- package/lib/review/layers/blind-hunter.js +500 -0
- package/lib/review/layers/defense-in-depth.js +209 -0
- package/lib/review/layers/edge-case-hunter.js +266 -0
- package/lib/review/panel.js +519 -0
- package/lib/review/two-stage.js +244 -0
- package/lib/session/cost-tracker.js +203 -0
- package/lib/session/logger.js +349 -0
- package/package.json +1 -1
|
@@ -0,0 +1,283 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
// CIPHER is a trademark of defconxt.
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Cognitive Architecture — Plan tool, confidence tracker, and reasoning support.
|
|
7
|
+
*
|
|
8
|
+
* Gives the benchmark agent structured reasoning capabilities:
|
|
9
|
+
* - PlanTool: persists attack plans as external memory across turns
|
|
10
|
+
* - ConfidenceTracker: tracks confidence level and recommends next actions
|
|
11
|
+
*
|
|
12
|
+
* These tools help the agent avoid going in circles and make deliberate
|
|
13
|
+
* decisions about when to explore vs. exploit.
|
|
14
|
+
*
|
|
15
|
+
* @module benchmark/cognitive
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
// ---------------------------------------------------------------------------
|
|
19
|
+
// Plan Tool — external memory for attack plans
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
|
|
22
|
+
/**
 * Tool schema for `update_plan`.
 *
 * Declares one required string (`plan`) plus two optional fields: `phase`
 * (restricted to the five listed phase names) and a numeric `confidence`.
 */
export const UPDATE_PLAN_SCHEMA = {
  name: 'update_plan',
  description: "Write or update your attack plan. Use this to organize your approach, track what you've tried, and plan next steps. The plan persists across turns as your external memory.",
  input_schema: {
    type: 'object',
    properties: {
      // Free-form plan text; the only required field.
      plan: {
        type: 'string',
        description: 'Your current attack plan in structured format. Include: objective, findings so far, current hypothesis, next steps, and what has been tried and failed.',
      },
      // Optional phase label.
      phase: {
        type: 'string',
        enum: [
          'recon',
          'enumerate',
          'exploit',
          'post-exploit',
          'pivot',
        ],
        description: 'Current attack phase',
      },
      // Optional self-assessed confidence.
      confidence: {
        type: 'number',
        description: 'Your confidence level (0.0 to 1.0) that you can solve this challenge',
      },
    },
    required: ['plan'],
  },
};
|
|
45
|
+
|
|
46
|
+
/**
 * Tool schema for `check_confidence`.
 *
 * Takes a single optional string, `stuck_reason`; no field is required.
 */
export const CHECK_CONFIDENCE_SCHEMA = {
  name: 'check_confidence',
  description: 'Check your current confidence level and get guidance on what to do next. Call this when you feel stuck or want to reassess your approach.',
  input_schema: {
    type: 'object',
    properties: {
      // Optional explanation of why the caller feels stuck.
      stuck_reason: {
        type: 'string',
        description: 'Why you feel stuck (optional — helps generate better guidance)',
      },
    },
  },
};
|
|
59
|
+
|
|
60
|
+
// ---------------------------------------------------------------------------
|
|
61
|
+
// ConfidenceTracker
|
|
62
|
+
// ---------------------------------------------------------------------------
|
|
63
|
+
|
|
64
|
+
/**
 * Tracks the agent's current plan, attack phase and self-reported confidence,
 * and turns that state into plain-text guidance.
 *
 * State is held in underscore-prefixed fields and exposed read-only through
 * getters and {@link ConfidenceTracker#getState}.
 */
export class ConfidenceTracker {
  constructor() {
    /** @type {string} Most recent plan text. */
    this._currentPlan = '';
    /** @type {string} Current phase; starts at 'recon'. */
    this._currentPhase = 'recon';
    /** @type {number} Confidence clamped to [0, 1]; starts at 0.5. */
    this._confidence = 0.5;
    /** @type {number} Plan updates since the last phase change. */
    this._turnsInPhase = 0;
    /** @type {number} Total number of updatePlan() calls. */
    this._totalTurns = 0;
    /** @type {string[]} Every plan passed to updatePlan(), in order. */
    this._planHistory = [];
    /** @type {string[]} Approaches recorded via recordFailure(). */
    this._failedApproaches = [];
  }

  /** Current confidence level (0-1). */
  get confidence() { return this._confidence; }

  /** Current attack phase. */
  get phase() { return this._currentPhase; }

  /** Current plan text. */
  get plan() { return this._currentPlan; }

  /** Number of plan updates. */
  get planUpdates() { return this._planHistory.length; }

  /**
   * Update the attack plan and, optionally, the phase and confidence.
   *
   * A phase different from the current one resets the turns-in-phase counter;
   * otherwise the counter is incremented. A numeric confidence is clamped to
   * [0, 1]; `NaN` and non-numbers leave the stored confidence untouched.
   *
   * @param {object} opts
   * @param {string} opts.plan
   * @param {string} [opts.phase]
   * @param {number} [opts.confidence]
   * @returns {string} Acknowledgment with guidance (newline-separated)
   */
  updatePlan(opts) {
    this._totalTurns++;

    const prevPhase = this._currentPhase;
    this._currentPlan = opts.plan;

    if (opts.phase && opts.phase !== prevPhase) {
      this._currentPhase = opts.phase;
      this._turnsInPhase = 0;
    } else {
      this._turnsInPhase++;
    }

    // typeof NaN === 'number', and clamping NaN yields NaN, which would make
    // every later threshold comparison false and render as "NaN%".
    if (typeof opts.confidence === 'number' && !Number.isNaN(opts.confidence)) {
      this._confidence = Math.max(0, Math.min(1, opts.confidence));
    }

    this._planHistory.push(opts.plan);

    // Generate guidance based on state
    const guidance = [];
    guidance.push(`Plan updated. Phase: ${this._currentPhase}. Confidence: ${(this._confidence * 100).toFixed(0)}%.`);

    if (this._turnsInPhase > 5) {
      guidance.push(`WARNING: ${this._turnsInPhase} turns in ${this._currentPhase} phase without progress. Consider changing approach.`);
    }

    if (this._confidence < 0.2) {
      guidance.push('LOW CONFIDENCE: Consider stepping back to enumerate more broadly or trying a completely different attack vector.');
    } else if (this._confidence > 0.8) {
      guidance.push('HIGH CONFIDENCE: Focus on execution. You likely have the right approach.');
    }

    if (this._planHistory.length > 3) {
      guidance.push(`You've updated your plan ${this._planHistory.length} times. Make sure each iteration adds new information.`);
    }

    return guidance.join('\n');
  }

  /**
   * Summarize the current state and append phase-specific guidance.
   *
   * Unknown phases fall back to the recon guidance. When `stuckReason` is
   * truthy it is echoed back and matched (case-insensitively) against a few
   * keywords to pick a suggestion list.
   *
   * @param {string} [stuckReason]
   * @returns {string} Structured guidance (newline-separated)
   */
  checkConfidence(stuckReason) {
    const lines = [];
    lines.push(`Current state:`);
    lines.push(`  Phase: ${this._currentPhase}`);
    lines.push(`  Confidence: ${(this._confidence * 100).toFixed(0)}%`);
    lines.push(`  Turns in phase: ${this._turnsInPhase}`);
    lines.push(`  Total turns: ${this._totalTurns}`);
    lines.push(`  Plan updates: ${this._planHistory.length}`);

    if (this._currentPlan) {
      lines.push(`\nCurrent plan:\n${this._currentPlan}`);
    }

    lines.push('');

    // Phase-specific guidance
    const phaseGuidance = {
      recon: [
        'RECON: Focus on service identification, technology stack, and attack surface.',
        'Try: nmap, curl headers, directory brute-force, robots.txt, source code review.',
        'Move to ENUMERATE when you know what services are running.',
      ],
      enumerate: [
        'ENUMERATE: Focus on finding specific vulnerabilities.',
        'Try: parameter fuzzing, SQL injection probes, XSS testing, auth bypass, file inclusion.',
        'Move to EXPLOIT when you have a confirmed vulnerability.',
      ],
      exploit: [
        'EXPLOIT: Focus on weaponizing your finding.',
        'Try: craft specific payloads, chain vulnerabilities, escalate access.',
        'Move to POST-EXPLOIT if you have code execution but need to find the flag.',
      ],
      'post-exploit': [
        'POST-EXPLOIT: Focus on finding the flag.',
        'Try: search the filesystem (find / -name "flag*"), check environment variables, check databases.',
        'The flag is usually in a file, environment variable, or database.',
      ],
      pivot: [
        'PIVOT: Try a completely different approach.',
        'Reassess what you know. Was there a different service? A different vulnerability class?',
        'Consider: SSRF, race conditions, deserialization, template injection.',
      ],
    };

    // Own-property lookup only: a phase such as "constructor" or "toString"
    // would otherwise resolve to an inherited function and break the spread.
    const knownPhase = Object.prototype.hasOwnProperty.call(phaseGuidance, this._currentPhase);
    const guidance = knownPhase ? phaseGuidance[this._currentPhase] : phaseGuidance.recon;
    lines.push(...guidance);

    if (stuckReason) {
      lines.push(`\nYou said you're stuck because: "${stuckReason}"`);
      lines.push('Suggestions:');

      // Coerce once: tool input is not guaranteed to be a string.
      const reason = String(stuckReason).toLowerCase();
      const mentions = (...needles) => needles.some((needle) => reason.includes(needle));

      if (mentions('no response', 'timeout')) {
        lines.push('  - Check if the service is on a different port');
        lines.push('  - Try a different protocol (HTTP vs HTTPS)');
        lines.push('  - Check if the service needs specific headers');
      } else if (mentions('filter', 'blocked', 'waf')) {
        lines.push('  - Try encoding payloads (URL encoding, double encoding, unicode)');
        lines.push('  - Try alternative injection vectors');
        lines.push('  - Look for bypass techniques specific to the WAF/filter');
      } else {
        lines.push('  - Go back to enumeration and look for something you missed');
        lines.push('  - Try a completely different vulnerability class');
        lines.push('  - Read the challenge description again for hints');
      }
    }

    // Recommend phase change if stuck too long
    if (this._turnsInPhase > 7 && this._confidence < 0.3) {
      lines.push('\nSTRONGLY RECOMMEND: Update your plan with phase:"pivot" and try a fundamentally different approach.');
    }

    return lines.join('\n');
  }

  /**
   * Record a failed approach for future reference.
   * @param {string} approach
   */
  recordFailure(approach) {
    this._failedApproaches.push(approach);
  }

  /**
   * Get a summary of the tracker state (for serialization).
   *
   * `failedApproaches` is a copy, so mutating the returned object does not
   * alter the tracker.
   *
   * @returns {{ phase: string, confidence: number, turnsInPhase: number,
   *   totalTurns: number, planUpdates: number, currentPlan: string,
   *   failedApproaches: string[] }}
   */
  getState() {
    return {
      phase: this._currentPhase,
      confidence: this._confidence,
      turnsInPhase: this._turnsInPhase,
      totalTurns: this._totalTurns,
      planUpdates: this._planHistory.length,
      currentPlan: this._currentPlan,
      failedApproaches: [...this._failedApproaches],
    };
  }
}
|
|
249
|
+
|
|
250
|
+
// ---------------------------------------------------------------------------
|
|
251
|
+
// Dispatch integration
|
|
252
|
+
// ---------------------------------------------------------------------------
|
|
253
|
+
|
|
254
|
+
/**
 * Create a dispatcher for cognitive tools.
 *
 * The returned function routes `update_plan` to `tracker.updatePlan()` and
 * `check_confidence` to `tracker.checkConfidence()`, wrapping the returned
 * text as `{ output }`. Any other tool name yields `null` so the caller can
 * try another dispatcher.
 *
 * @param {ConfidenceTracker} tracker
 * @returns {(toolName: string, toolInput: object) => { output: string } | null}
 */
export function createCognitiveDispatcher(tracker) {
  return (toolName, toolInput) => {
    // `check_confidence` has no required fields, so a call may arrive with
    // no input object at all; treat that as an empty input, not a crash.
    const input = toolInput ?? {};

    switch (toolName) {
      case 'update_plan':
        return {
          output: tracker.updatePlan({
            plan: input.plan,
            phase: input.phase,
            confidence: input.confidence,
          }),
        };

      case 'check_confidence':
        return { output: tracker.checkConfidence(input.stuck_reason) };

      default:
        return null; // Not a cognitive tool
    }
  };
}
|
|
279
|
+
|
|
280
|
+
/**
 * All cognitive tool schemas, in the order plan-update then confidence-check,
 * ready to be merged into an agent's tool list.
 */
export const COGNITIVE_TOOLS = [
  UPDATE_PLAN_SCHEMA,
  CHECK_CONFIDENCE_SCHEMA,
];
|
package/lib/benchmark/index.js
CHANGED
|
@@ -11,10 +11,20 @@ export { scoreFlag, scoreResult, aggregateResults } from './scorer.js';
|
|
|
11
11
|
export { ALL_BASELINES, PENTESTGPT_BASELINE, MAPTA_BASELINE, SHANNON_BASELINE } from './baselines.js';
|
|
12
12
|
export { BenchmarkBuilder, enumerateBenchmarks } from './builder.js';
|
|
13
13
|
export { SandboxContainer, SandboxError } from './sandbox.js';
|
|
14
|
-
export { AGENT_TOOLS, dispatchTool } from './tools.js';
|
|
14
|
+
export { AGENT_TOOLS, FLAG_TOOLS, QUESTION_TOOLS, getToolsForWinCondition, dispatchTool } from './tools.js';
|
|
15
15
|
export { makeAgentClient } from './llm.js';
|
|
16
16
|
export { SecurityAgent, AgentResult } from './agent.js';
|
|
17
17
|
export { Coordinator } from './coordinator.js';
|
|
18
18
|
export { SolverAdapter, StubSolver, ManualSolver, AutonomousSolver, MultiAgentSolver, SOLVERS, getSolver } from './solver.js';
|
|
19
|
-
export { runSingleBenchmark, runBenchmarks, reportToDict } from './runner.js';
|
|
19
|
+
export { runSingleBenchmark, runBenchmarks, runWithRetry, reportToDict } from './runner.js';
|
|
20
20
|
export { generateJsonReport, generateMarkdownReport } from './reporter.js';
|
|
21
|
+
export { ConfidenceTracker, createCognitiveDispatcher, COGNITIVE_TOOLS } from './cognitive.js';
|
|
22
|
+
export { generateSystemPrompt, orderToolsByPhase } from './prompts.js';
|
|
23
|
+
export { loadNyuChallenge, enumerateNyuChallenges, getNyuFlag, hasDockerCompose, NYU_BASELINES, NYU_CATEGORIES, CRAKEN_BASELINE, DCIPHER_BASELINE, ENIGMA_BASELINE } from './nyu-ctf.js';
|
|
24
|
+
export { KnowledgeSolver, KnowledgeQuestion, KnowledgeResult, PIIEntry, PIIResult, calculateAccuracy, calculateFScore } from './knowledge.js';
|
|
25
|
+
export { RemoteTarget, SSHTarget, NetcatTarget, HTTPTarget, createRemoteTarget } from './remote-target.js';
|
|
26
|
+
export { ChallengeRegistry, SuiteInfo, getDefaultRegistry } from './registry.js';
|
|
27
|
+
export { PICOCTF_CATALOG, PICO_CATEGORIES, loadPicoChallenge, enumeratePicoChallenges, savePicoCatalog, clonePicoCTF, getPicoTargetInfo, getPicoCatalogStats, PICOCTF_DATA_DIR } from './picoctf.js';
|
|
28
|
+
export { OTW_WARGAMES, loadOtwWargame, enumerateOtwChallenges, getOtwConnectionInfo, loadProgress, saveProgress, getProgressStats, cloneOtw, getOtwCatalogStats, OTW_DATA_DIR } from './overthewire.js';
|
|
29
|
+
export { RemoteSolver, RemoteSandboxAdapter, dispatchRemoteTool } from './remote-solver.js';
|
|
30
|
+
export { ClaudeCodeSolver } from './claude-code-solver.js';
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
// CIPHER is a trademark of defconxt.
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Knowledge Solver — MCQ evaluation for security knowledge benchmarks.
|
|
7
|
+
*
|
|
8
|
+
* Evaluates LLM knowledge against multiple-choice question sets:
|
|
9
|
+
* - SecEval: security domain knowledge (2K+ questions)
|
|
10
|
+
* - CyberMetric: cybersecurity metrics and concepts (5K+ questions)
|
|
11
|
+
* - CTIBench: cyber threat intelligence (10K+ questions)
|
|
12
|
+
*
|
|
13
|
+
* Scoring: accuracy (correct/total), with per-category breakdown.
|
|
14
|
+
*
|
|
15
|
+
* @module benchmark/knowledge
|
|
16
|
+
*/
|
|
17
|
+
|
|
18
|
+
import { SolverResult } from './models.js';
|
|
19
|
+
|
|
20
|
+
// ---------------------------------------------------------------------------
|
|
21
|
+
// KnowledgeQuestion
|
|
22
|
+
// ---------------------------------------------------------------------------
|
|
23
|
+
|
|
24
|
+
/**
 * Value object describing one multiple-choice question.
 */
export class KnowledgeQuestion {
  /**
   * Every field is optional; a missing (null/undefined) field falls back to
   * an empty string, or an empty array for `choices`.
   *
   * @param {object} opts
   * @param {string} opts.id - Question identifier
   * @param {string} opts.question - Question text
   * @param {string[]} opts.choices - Answer choices (e.g. ['A: ...', 'B: ...', ...])
   * @param {string} opts.correctAnswer - Correct answer letter or text
   * @param {string} [opts.category] - Question category
   * @param {string} [opts.difficulty] - Difficulty level
   * @param {string} [opts.source] - Source benchmark (SecEval, CyberMetric, CTIBench)
   */
  constructor(opts = {}) {
    const {
      id,
      question,
      choices,
      correctAnswer,
      category,
      difficulty,
      source,
    } = opts;

    // `??` rather than destructuring defaults so that null also falls back.
    this.id = id ?? '';
    this.question = question ?? '';
    this.choices = choices ?? [];
    this.correctAnswer = correctAnswer ?? '';
    this.category = category ?? '';
    this.difficulty = difficulty ?? '';
    this.source = source ?? '';
  }
}
|
|
48
|
+
|
|
49
|
+
// ---------------------------------------------------------------------------
|
|
50
|
+
// KnowledgeResult
|
|
51
|
+
// ---------------------------------------------------------------------------
|
|
52
|
+
|
|
53
|
+
/**
 * Result of evaluating a set of knowledge questions.
 */
export class KnowledgeResult {
  /**
   * Missing (null/undefined) counters default to 0, `byCategory` to `{}` and
   * `details` to `[]`.
   *
   * @param {object} opts
   * @param {number} opts.total
   * @param {number} opts.correct
   * @param {number} opts.incorrect
   * @param {number} opts.skipped
   * @param {Record<string, { total: number, correct: number }>} [opts.byCategory]
   * @param {Array<{ id: string, correct: boolean, expected: string, actual: string }>} [opts.details]
   */
  constructor(opts = {}) {
    this.total = opts.total ?? 0;
    this.correct = opts.correct ?? 0;
    this.incorrect = opts.incorrect ?? 0;
    this.skipped = opts.skipped ?? 0;
    this.byCategory = opts.byCategory ?? {};
    this.details = opts.details ?? [];
  }

  /** Accuracy as a percentage (correct / total * 100); 0 when total is not positive. */
  get accuracy() {
    return this.total > 0 ? (this.correct / this.total) * 100 : 0;
  }

  /**
   * Accuracy by category, as percentages keyed by category name.
   *
   * Built with Object.fromEntries so that every own key of `byCategory`
   * becomes an own key of the result. Plain `result[key] = value` assignment
   * would silently discard a category literally named "__proto__".
   *
   * @returns {Record<string, number>}
   */
  get categoryAccuracy() {
    return Object.fromEntries(
      Object.entries(this.byCategory).map(([cat, data]) => [
        cat,
        data.total > 0 ? (data.correct / data.total) * 100 : 0,
      ]),
    );
  }
}
|
|
89
|
+
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
// KnowledgeSolver
|
|
92
|
+
// ---------------------------------------------------------------------------
|
|
93
|
+
|
|
94
|
+
/**
 * Evaluates an LLM on multiple-choice security knowledge questions.
 *
 * The model is reached through an injected `agentRunner`; without one, every
 * question fails and is counted as skipped.
 */
export class KnowledgeSolver {
  /**
   * @param {object} [opts]
   * @param {Function} [opts.agentRunner] - Injectable runner for testing;
   *   called as `agentRunner('ARCHITECT', { task, user_message })` and
   *   expected to resolve to an object with an `outputText` string.
   */
  constructor(opts = {}) {
    this._agentRunner = opts.agentRunner;
  }

  /**
   * Evaluate a set of questions sequentially.
   *
   * A question whose answer lookup throws is recorded with
   * `actual: 'ERROR'` (plus the error message under `error`) and counted as
   * skipped; it still counts toward the overall and per-category totals.
   *
   * @param {KnowledgeQuestion[]} questions
   * @returns {Promise<KnowledgeResult>}
   */
  async evaluate(questions) {
    const details = [];
    // A Map avoids inherited-key collisions: with a plain object, a category
    // named "__proto__" or "constructor" would resolve to Object.prototype
    // members and the counters would be written onto them.
    const tallies = new Map();
    let correct = 0;
    let incorrect = 0;
    let skipped = 0;

    for (const q of questions) {
      const cat = String(q.category || 'uncategorized');
      let tally = tallies.get(cat);
      if (!tally) {
        tally = { total: 0, correct: 0 };
        tallies.set(cat, tally);
      }
      tally.total++;

      let answer;
      try {
        answer = await this._answerQuestion(q);
      } catch (err) {
        // Best effort: keep going, but preserve the reason for diagnosis.
        skipped++;
        details.push({
          id: q.id,
          correct: false,
          expected: q.correctAnswer,
          actual: 'ERROR',
          error: err instanceof Error ? err.message : String(err),
        });
        continue;
      }

      const isCorrect = this._checkAnswer(answer, q.correctAnswer);
      if (isCorrect) {
        correct++;
        tally.correct++;
      } else {
        incorrect++;
      }

      details.push({
        id: q.id,
        correct: isCorrect,
        expected: q.correctAnswer,
        actual: answer,
      });
    }

    return new KnowledgeResult({
      total: questions.length,
      correct,
      incorrect,
      skipped,
      // fromEntries defines own properties, so any category name is safe.
      byCategory: Object.fromEntries(tallies),
      details,
    });
  }

  /**
   * Answer a single question using the LLM.
   *
   * Returns one upper-cased character. Wrappers the model commonly adds
   * despite the instruction — "(B)", "**B**", "Answer: B", "The answer is B."
   * — are unwrapped; otherwise the first character of the reply is used.
   *
   * @param {KnowledgeQuestion} question
   * @returns {Promise<string>}
   * @throws {Error} When no agent runner is configured
   */
  async _answerQuestion(question) {
    if (!this._agentRunner) {
      throw new Error('No agent runner configured');
    }

    const result = await this._agentRunner('ARCHITECT', {
      task: 'Answer this security knowledge question',
      user_message: `${question.question}\n\nChoices:\n${question.choices.join('\n')}\n\nRespond with ONLY the correct answer letter (A, B, C, or D).`,
    });

    const text = String(result.outputText || '').trim();
    // Optional leading punctuation/markdown, optional "[the] [correct] answer
    // [is]" preamble, then a single stand-alone letter.
    const standalone = /^[^A-Za-z0-9]*(?:(?:the\s+)?(?:correct\s+)?answer(?:\s+is)?[^A-Za-z0-9]*)?([A-Za-z])(?![A-Za-z0-9])/i.exec(text);
    const picked = standalone ? standalone[1] : text.charAt(0);
    return picked.toUpperCase();
  }

  /**
   * Check if the answer matches the correct answer.
   *
   * Comparison is case-insensitive and whitespace-trimmed. Two values match
   * when they are equal as text, or when both are in "choice letter" form
   * ("B", "(B)", "B: ...", "B. ...", "B) ...", "B - ...") with the same
   * letter. A bare letter never matches free text that merely starts with
   * that letter ("B" vs "Buffer overflow").
   *
   * NOTE(review): _answerQuestion only ever yields a letter, so datasets
   * whose correctAnswer is full text presumably need a letter mapping
   * upstream — confirm against the loaders.
   *
   * @param {string} answer
   * @param {string} correct
   * @returns {boolean}
   */
  _checkAnswer(answer, correct) {
    if (!answer || !correct) return false;

    const a = String(answer).trim().toUpperCase();
    const c = String(correct).trim().toUpperCase();
    if (a === c) return true;

    // Extract the choice letter when the text is in letter form, else null.
    const choiceLetter = (text) => {
      const m = /^(?:\(([A-Z])\)|([A-Z])(?:$|\s*[:.)]|\s+[-–—]))/.exec(text);
      return m ? (m[1] || m[2]) : null;
    };

    const answerLetter = choiceLetter(a);
    return answerLetter !== null && answerLetter === choiceLetter(c);
  }
}
|
|
189
|
+
|
|
190
|
+
// ---------------------------------------------------------------------------
|
|
191
|
+
// Privacy Solver — PII detection scoring
|
|
192
|
+
// ---------------------------------------------------------------------------
|
|
193
|
+
|
|
194
|
+
/**
 * One text sample together with the PII entities expected to be found in it.
 */
export class PIIEntry {
  /**
   * Missing (null/undefined) fields fall back to '' for `id` and `text`, and
   * to an empty array for `expectedPII`.
   *
   * @param {object} opts
   * @param {string} opts.id
   * @param {string} opts.text - Text containing potential PII
   * @param {Array<{ type: string, value: string, start: number, end: number }>} opts.expectedPII
   */
  constructor(opts = {}) {
    const { id, text, expectedPII } = opts;

    // `??` rather than destructuring defaults so that null also falls back.
    this.id = id ?? '';
    this.text = text ?? '';
    this.expectedPII = expectedPII ?? [];
  }
}
|
|
210
|
+
|
|
211
|
+
/**
 * Confusion counts from a PII detection run, with derived precision, recall
 * and F-scores.
 */
export class PIIResult {
  /**
   * Any missing (null/undefined) count defaults to 0.
   *
   * @param {object} opts
   * @param {number} opts.total - Total PII entities expected
   * @param {number} opts.truePositives
   * @param {number} opts.falsePositives
   * @param {number} opts.falseNegatives
   */
  constructor(opts = {}) {
    const { total, truePositives, falsePositives, falseNegatives } = opts;

    this.total = total ?? 0;
    this.truePositives = truePositives ?? 0;
    this.falsePositives = falsePositives ?? 0;
    this.falseNegatives = falseNegatives ?? 0;
  }

  /** Precision: TP / (TP + FP); 0 when nothing was predicted. */
  get precision() {
    const predicted = this.truePositives + this.falsePositives;
    if (predicted > 0) {
      return this.truePositives / predicted;
    }
    return 0;
  }

  /** Recall: TP / (TP + FN); 0 when nothing was expected. */
  get recall() {
    const relevant = this.truePositives + this.falseNegatives;
    if (relevant > 0) {
      return this.truePositives / relevant;
    }
    return 0;
  }

  /** F1 score: harmonic mean of precision and recall; 0 when both are 0. */
  get f1() {
    const p = this.precision;
    const r = this.recall;
    if (!((p + r) > 0)) {
      return 0;
    }
    return (2 * p * r) / (p + r);
  }

  /** F2 score: weighted harmonic mean emphasizing recall; 0 when both are 0. */
  get f2() {
    const p = this.precision;
    const r = this.recall;
    if (!((p + r) > 0)) {
      return 0;
    }
    return (5 * p * r) / (4 * p + r);
  }
}
|
|
255
|
+
|
|
256
|
+
// ---------------------------------------------------------------------------
|
|
257
|
+
// Accuracy and F-score metrics for scorer.js
|
|
258
|
+
// ---------------------------------------------------------------------------
|
|
259
|
+
|
|
260
|
+
/**
 * Convert a correct/total pair into a percentage.
 *
 * @param {number} correct - Number of correct answers
 * @param {number} total - Number of questions asked
 * @returns {number} Accuracy as percentage (0-100); 0 when `total` is not positive
 */
export function calculateAccuracy(correct, total) {
  if (!(total > 0)) {
    return 0;
  }
  const fraction = correct / total;
  return fraction * 100;
}
|
|
269
|
+
|
|
270
|
+
/**
 * Calculate the F-beta score:
 * (1 + beta^2) * precision * recall / (beta^2 * precision + recall).
 *
 * Returns 0 whenever the score is undefined: when precision and recall are
 * both 0, and also when the denominator is 0 on its own (for example
 * beta = 0 with recall = 0), which would otherwise produce NaN.
 *
 * @param {number} precision
 * @param {number} recall
 * @param {number} [beta=1] - Beta weight (1 = F1, 2 = F2)
 * @returns {number}
 */
export function calculateFScore(precision, recall, beta = 1) {
  if (precision + recall === 0) return 0;

  const b2 = beta * beta;
  const denominator = b2 * precision + recall;
  // Guard the actual divisor, not just precision + recall.
  if (denominator === 0) return 0;

  return ((1 + b2) * precision * recall) / denominator;
}
|