cipher-security 2.0.8 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. package/bin/cipher.js +11 -1
  2. package/lib/agent-runtime/handlers/architect.js +199 -0
  3. package/lib/agent-runtime/handlers/base.js +240 -0
  4. package/lib/agent-runtime/handlers/blue.js +220 -0
  5. package/lib/agent-runtime/handlers/incident.js +161 -0
  6. package/lib/agent-runtime/handlers/privacy.js +190 -0
  7. package/lib/agent-runtime/handlers/purple.js +209 -0
  8. package/lib/agent-runtime/handlers/recon.js +174 -0
  9. package/lib/agent-runtime/handlers/red.js +246 -0
  10. package/lib/agent-runtime/handlers/researcher.js +170 -0
  11. package/lib/agent-runtime/handlers.js +35 -0
  12. package/lib/agent-runtime/index.js +196 -0
  13. package/lib/agent-runtime/parser.js +316 -0
  14. package/lib/analyze/consistency.js +566 -0
  15. package/lib/analyze/constitution.js +110 -0
  16. package/lib/analyze/sharding.js +251 -0
  17. package/lib/autonomous/agent-tool.js +165 -0
  18. package/lib/autonomous/feedback-loop.js +13 -6
  19. package/lib/autonomous/framework.js +17 -0
  20. package/lib/autonomous/handoff.js +506 -0
  21. package/lib/autonomous/modes/blue.js +26 -0
  22. package/lib/autonomous/modes/red.js +585 -0
  23. package/lib/autonomous/modes/researcher.js +322 -0
  24. package/lib/autonomous/researcher.js +12 -45
  25. package/lib/autonomous/runner.js +9 -537
  26. package/lib/benchmark/agent.js +88 -26
  27. package/lib/benchmark/baselines.js +3 -0
  28. package/lib/benchmark/claude-code-solver.js +254 -0
  29. package/lib/benchmark/cognitive.js +283 -0
  30. package/lib/benchmark/index.js +12 -2
  31. package/lib/benchmark/knowledge.js +281 -0
  32. package/lib/benchmark/llm.js +156 -15
  33. package/lib/benchmark/models.js +5 -2
  34. package/lib/benchmark/nyu-ctf.js +192 -0
  35. package/lib/benchmark/overthewire.js +347 -0
  36. package/lib/benchmark/picoctf.js +281 -0
  37. package/lib/benchmark/prompts.js +280 -0
  38. package/lib/benchmark/registry.js +219 -0
  39. package/lib/benchmark/remote-solver.js +356 -0
  40. package/lib/benchmark/remote-target.js +263 -0
  41. package/lib/benchmark/reporter.js +35 -0
  42. package/lib/benchmark/runner.js +174 -10
  43. package/lib/benchmark/sandbox.js +35 -0
  44. package/lib/benchmark/scorer.js +22 -4
  45. package/lib/benchmark/solver.js +34 -1
  46. package/lib/benchmark/tools.js +262 -16
  47. package/lib/commands.js +9 -0
  48. package/lib/execution/council.js +434 -0
  49. package/lib/execution/parallel.js +292 -0
  50. package/lib/gates/circuit-breaker.js +135 -0
  51. package/lib/gates/confidence.js +302 -0
  52. package/lib/gates/corrections.js +219 -0
  53. package/lib/gates/self-check.js +245 -0
  54. package/lib/gateway/commands.js +727 -0
  55. package/lib/guardrails/engine.js +364 -0
  56. package/lib/mcp/server.js +349 -3
  57. package/lib/memory/compressor.js +94 -7
  58. package/lib/pipeline/hooks.js +288 -0
  59. package/lib/pipeline/index.js +11 -0
  60. package/lib/review/budget.js +210 -0
  61. package/lib/review/engine.js +526 -0
  62. package/lib/review/layers/acceptance-auditor.js +279 -0
  63. package/lib/review/layers/blind-hunter.js +500 -0
  64. package/lib/review/layers/defense-in-depth.js +209 -0
  65. package/lib/review/layers/edge-case-hunter.js +266 -0
  66. package/lib/review/panel.js +519 -0
  67. package/lib/review/two-stage.js +244 -0
  68. package/lib/session/cost-tracker.js +203 -0
  69. package/lib/session/logger.js +349 -0
  70. package/package.json +1 -1
@@ -0,0 +1,283 @@
1
+ // Copyright (c) 2026 defconxt. All rights reserved.
2
+ // Licensed under AGPL-3.0 — see LICENSE file for details.
3
+ // CIPHER is a trademark of defconxt.
4
+
5
+ /**
6
+ * Cognitive Architecture — Plan tool, confidence tracker, and reasoning support.
7
+ *
8
+ * Gives the benchmark agent structured reasoning capabilities:
9
+ * - PlanTool: persists attack plans as external memory across turns
10
+ * - ConfidenceTracker: tracks confidence level and recommends next actions
11
+ *
12
+ * These tools help the agent avoid going in circles and make deliberate
13
+ * decisions about when to explore vs. exploit.
14
+ *
15
+ * @module benchmark/cognitive
16
+ */
17
+
18
+ // ---------------------------------------------------------------------------
19
+ // Plan Tool — external memory for attack plans
20
+ // ---------------------------------------------------------------------------
21
+
22
/**
 * Tool schema for `update_plan`.
 *
 * Only `plan` is required; `phase` is restricted to the listed enum values and
 * `confidence` is described as a number between 0.0 and 1.0.
 */
export const UPDATE_PLAN_SCHEMA = {
  name: 'update_plan',
  description: "Write or update your attack plan. Use this to organize your approach, track what you've tried, and plan next steps. The plan persists across turns as your external memory.",
  input_schema: {
    type: 'object',
    properties: {
      // Free-form plan text (the only required field).
      plan: {
        type: 'string',
        description: 'Your current attack plan in structured format. Include: objective, findings so far, current hypothesis, next steps, and what has been tried and failed.',
      },
      // Optional phase marker.
      phase: {
        type: 'string',
        enum: ['recon', 'enumerate', 'exploit', 'post-exploit', 'pivot'],
        description: 'Current attack phase',
      },
      // Optional self-assessed confidence.
      confidence: {
        type: 'number',
        description: 'Your confidence level (0.0 to 1.0) that you can solve this challenge',
      },
    },
    required: ['plan'],
  },
};
45
+
46
/**
 * Tool schema for `check_confidence`.
 *
 * Takes a single optional string field, `stuck_reason`; no field is required.
 */
export const CHECK_CONFIDENCE_SCHEMA = {
  name: 'check_confidence',
  description: 'Check your current confidence level and get guidance on what to do next. Call this when you feel stuck or want to reassess your approach.',
  input_schema: {
    type: 'object',
    properties: {
      // Optional explanation of why the agent feels stuck.
      stuck_reason: {
        type: 'string',
        description: 'Why you feel stuck (optional — helps generate better guidance)',
      },
    },
  },
};
59
+
60
+ // ---------------------------------------------------------------------------
61
+ // ConfidenceTracker
62
+ // ---------------------------------------------------------------------------
63
+
64
// Guidance lines per attack phase. A Map (rather than an object literal) is
// used so that looking up a caller-supplied phase name can never resolve to an
// inherited Object.prototype member such as "constructor" or "toString".
const PHASE_GUIDANCE = new Map([
  ['recon', [
    'RECON: Focus on service identification, technology stack, and attack surface.',
    'Try: nmap, curl headers, directory brute-force, robots.txt, source code review.',
    'Move to ENUMERATE when you know what services are running.',
  ]],
  ['enumerate', [
    'ENUMERATE: Focus on finding specific vulnerabilities.',
    'Try: parameter fuzzing, SQL injection probes, XSS testing, auth bypass, file inclusion.',
    'Move to EXPLOIT when you have a confirmed vulnerability.',
  ]],
  ['exploit', [
    'EXPLOIT: Focus on weaponizing your finding.',
    'Try: craft specific payloads, chain vulnerabilities, escalate access.',
    'Move to POST-EXPLOIT if you have code execution but need to find the flag.',
  ]],
  ['post-exploit', [
    'POST-EXPLOIT: Focus on finding the flag.',
    'Try: search the filesystem (find / -name "flag*"), check environment variables, check databases.',
    'The flag is usually in a file, environment variable, or database.',
  ]],
  ['pivot', [
    'PIVOT: Try a completely different approach.',
    'Reassess what you know. Was there a different service? A different vulnerability class?',
    'Consider: SSRF, race conditions, deserialization, template injection.',
  ]],
]);

/**
 * Tracks the agent's plan, attack phase and self-reported confidence, and
 * turns that state into textual guidance.
 */
export class ConfidenceTracker {
  constructor() {
    /** @type {string} Most recent plan text. */
    this._currentPlan = '';
    /** @type {string} Current attack phase; starts at "recon". */
    this._currentPhase = 'recon';
    /** @type {number} Confidence in [0, 1]; starts at 0.5. */
    this._confidence = 0.5;
    /** @type {number} Plan updates since the phase last changed. */
    this._turnsInPhase = 0;
    /** @type {number} Total number of updatePlan() calls. */
    this._totalTurns = 0;
    /** @type {string[]} Every plan text passed to updatePlan(), in order. */
    this._planHistory = [];
    /** @type {string[]} Approaches recorded through recordFailure(). */
    this._failedApproaches = [];
  }

  /** Current confidence level (0-1). */
  get confidence() { return this._confidence; }

  /** Current attack phase. */
  get phase() { return this._currentPhase; }

  /** Current plan text. */
  get plan() { return this._currentPlan; }

  /** Number of plan updates. */
  get planUpdates() { return this._planHistory.length; }

  /**
   * Update the attack plan and, optionally, the phase and confidence.
   *
   * Tool input comes from a model, so each field is validated before it is
   * stored: a missing plan becomes an empty string, a phase is only accepted
   * when it is a non-empty string, and a confidence is only accepted when it
   * is a number other than NaN (it is then clamped to [0, 1]).
   *
   * @param {object} opts
   * @param {string} opts.plan
   * @param {string} [opts.phase]
   * @param {number} [opts.confidence]
   * @returns {string} Acknowledgment with guidance, one message per line
   */
  updatePlan(opts) {
    const { plan, phase, confidence } = opts ?? {};
    this._totalTurns++;

    const planText = typeof plan === 'string' ? plan : String(plan ?? '');
    this._currentPlan = planText;
    this._planHistory.push(planText);

    // A phase change resets the per-phase counter; anything else (including
    // an omitted or unusable phase) counts as another turn in the same phase.
    const phaseChanged = typeof phase === 'string' && phase !== '' && phase !== this._currentPhase;
    if (phaseChanged) {
      this._currentPhase = phase;
      this._turnsInPhase = 0;
    } else {
      this._turnsInPhase++;
    }

    // NaN has typeof "number" but would survive the clamp and render as "NaN%".
    if (typeof confidence === 'number' && !Number.isNaN(confidence)) {
      this._confidence = Math.max(0, Math.min(1, confidence));
    }

    const guidance = [
      `Plan updated. Phase: ${this._currentPhase}. Confidence: ${(this._confidence * 100).toFixed(0)}%.`,
    ];

    if (this._turnsInPhase > 5) {
      guidance.push(`WARNING: ${this._turnsInPhase} turns in ${this._currentPhase} phase without progress. Consider changing approach.`);
    }

    if (this._confidence < 0.2) {
      guidance.push('LOW CONFIDENCE: Consider stepping back to enumerate more broadly or trying a completely different attack vector.');
    } else if (this._confidence > 0.8) {
      guidance.push('HIGH CONFIDENCE: Focus on execution. You likely have the right approach.');
    }

    if (this._planHistory.length > 3) {
      guidance.push(`You've updated your plan ${this._planHistory.length} times. Make sure each iteration adds new information.`);
    }

    return guidance.join('\n');
  }

  /**
   * Summarize the tracker state and provide phase-specific guidance.
   *
   * @param {string} [stuckReason] - Why the agent feels stuck. Falsy values are
   *   ignored; any other value is converted to a string before matching.
   * @returns {string} Structured guidance, one message per line
   */
  checkConfidence(stuckReason) {
    const lines = [
      `Current state:`,
      ` Phase: ${this._currentPhase}`,
      ` Confidence: ${(this._confidence * 100).toFixed(0)}%`,
      ` Turns in phase: ${this._turnsInPhase}`,
      ` Total turns: ${this._totalTurns}`,
      ` Plan updates: ${this._planHistory.length}`,
    ];

    if (this._currentPlan) {
      lines.push(`\nCurrent plan:\n${this._currentPlan}`);
    }

    lines.push('');

    // Phases without dedicated guidance fall back to the recon advice.
    const guidance = PHASE_GUIDANCE.get(this._currentPhase) ?? PHASE_GUIDANCE.get('recon');
    lines.push(...guidance);

    const reason = stuckReason ? String(stuckReason) : '';
    if (reason) {
      lines.push(`\nYou said you're stuck because: "${reason}"`);
      lines.push('Suggestions:');

      const lowered = reason.toLowerCase();
      const mentions = (...keywords) => keywords.some((keyword) => lowered.includes(keyword));

      if (mentions('no response', 'timeout')) {
        lines.push(' - Check if the service is on a different port');
        lines.push(' - Try a different protocol (HTTP vs HTTPS)');
        lines.push(' - Check if the service needs specific headers');
      } else if (mentions('filter', 'blocked', 'waf')) {
        lines.push(' - Try encoding payloads (URL encoding, double encoding, unicode)');
        lines.push(' - Try alternative injection vectors');
        lines.push(' - Look for bypass techniques specific to the WAF/filter');
      } else {
        lines.push(' - Go back to enumeration and look for something you missed');
        lines.push(' - Try a completely different vulnerability class');
        lines.push(' - Read the challenge description again for hints');
      }
    }

    // Recommend a phase change when stuck for long with low confidence.
    if (this._turnsInPhase > 7 && this._confidence < 0.3) {
      lines.push('\nSTRONGLY RECOMMEND: Update your plan with phase:"pivot" and try a fundamentally different approach.');
    }

    return lines.join('\n');
  }

  /**
   * Record a failed approach for future reference.
   * @param {string} approach
   */
  recordFailure(approach) {
    this._failedApproaches.push(approach);
  }

  /**
   * Get a summary of the tracker state (for serialization).
   *
   * @returns {{ phase: string, confidence: number, turnsInPhase: number,
   *   totalTurns: number, planUpdates: number, currentPlan: string,
   *   failedApproaches: string[] }} Snapshot; `failedApproaches` is a copy, so
   *   mutating it does not affect the tracker.
   */
  getState() {
    return {
      phase: this._currentPhase,
      confidence: this._confidence,
      turnsInPhase: this._turnsInPhase,
      totalTurns: this._totalTurns,
      planUpdates: this._planHistory.length,
      currentPlan: this._currentPlan,
      failedApproaches: [...this._failedApproaches],
    };
  }
}
249
+
250
+ // ---------------------------------------------------------------------------
251
+ // Dispatch integration
252
+ // ---------------------------------------------------------------------------
253
+
254
/**
 * Create a dispatcher for cognitive tools.
 *
 * The returned function routes `update_plan` to `tracker.updatePlan()` and
 * `check_confidence` to `tracker.checkConfidence()`, wrapping the returned
 * text as `{ output }`. Any other tool name yields `null` so the caller can
 * try a different dispatcher. A missing tool input (null/undefined) is treated
 * as an empty object instead of throwing on property access.
 *
 * @param {ConfidenceTracker} tracker
 * @returns {(toolName: string, toolInput: object) => { output: string } | null}
 */
export function createCognitiveDispatcher(tracker) {
  return (toolName, toolInput) => {
    const input = toolInput ?? {};

    switch (toolName) {
      case 'update_plan': {
        // Forward only the fields the tracker understands.
        const { plan, phase, confidence } = input;
        return { output: tracker.updatePlan({ plan, phase, confidence }) };
      }
      case 'check_confidence':
        return { output: tracker.checkConfidence(input.stuck_reason) };
      default:
        return null; // Not a cognitive tool
    }
  };
}
279
+
280
/**
 * The cognitive tool schemas (plan update first, confidence check second),
 * ready to be added to an agent's tool set.
 */
export const COGNITIVE_TOOLS = [
  UPDATE_PLAN_SCHEMA,
  CHECK_CONFIDENCE_SCHEMA,
];
@@ -11,10 +11,20 @@ export { scoreFlag, scoreResult, aggregateResults } from './scorer.js';
11
11
  export { ALL_BASELINES, PENTESTGPT_BASELINE, MAPTA_BASELINE, SHANNON_BASELINE } from './baselines.js';
12
12
  export { BenchmarkBuilder, enumerateBenchmarks } from './builder.js';
13
13
  export { SandboxContainer, SandboxError } from './sandbox.js';
14
- export { AGENT_TOOLS, dispatchTool } from './tools.js';
14
+ export { AGENT_TOOLS, FLAG_TOOLS, QUESTION_TOOLS, getToolsForWinCondition, dispatchTool } from './tools.js';
15
15
  export { makeAgentClient } from './llm.js';
16
16
  export { SecurityAgent, AgentResult } from './agent.js';
17
17
  export { Coordinator } from './coordinator.js';
18
18
  export { SolverAdapter, StubSolver, ManualSolver, AutonomousSolver, MultiAgentSolver, SOLVERS, getSolver } from './solver.js';
19
- export { runSingleBenchmark, runBenchmarks, reportToDict } from './runner.js';
19
+ export { runSingleBenchmark, runBenchmarks, runWithRetry, reportToDict } from './runner.js';
20
20
  export { generateJsonReport, generateMarkdownReport } from './reporter.js';
21
+ export { ConfidenceTracker, createCognitiveDispatcher, COGNITIVE_TOOLS } from './cognitive.js';
22
+ export { generateSystemPrompt, orderToolsByPhase } from './prompts.js';
23
+ export { loadNyuChallenge, enumerateNyuChallenges, getNyuFlag, hasDockerCompose, NYU_BASELINES, NYU_CATEGORIES, CRAKEN_BASELINE, DCIPHER_BASELINE, ENIGMA_BASELINE } from './nyu-ctf.js';
24
+ export { KnowledgeSolver, KnowledgeQuestion, KnowledgeResult, PIIEntry, PIIResult, calculateAccuracy, calculateFScore } from './knowledge.js';
25
+ export { RemoteTarget, SSHTarget, NetcatTarget, HTTPTarget, createRemoteTarget } from './remote-target.js';
26
+ export { ChallengeRegistry, SuiteInfo, getDefaultRegistry } from './registry.js';
27
+ export { PICOCTF_CATALOG, PICO_CATEGORIES, loadPicoChallenge, enumeratePicoChallenges, savePicoCatalog, clonePicoCTF, getPicoTargetInfo, getPicoCatalogStats, PICOCTF_DATA_DIR } from './picoctf.js';
28
+ export { OTW_WARGAMES, loadOtwWargame, enumerateOtwChallenges, getOtwConnectionInfo, loadProgress, saveProgress, getProgressStats, cloneOtw, getOtwCatalogStats, OTW_DATA_DIR } from './overthewire.js';
29
+ export { RemoteSolver, RemoteSandboxAdapter, dispatchRemoteTool } from './remote-solver.js';
30
+ export { ClaudeCodeSolver } from './claude-code-solver.js';
@@ -0,0 +1,281 @@
1
+ // Copyright (c) 2026 defconxt. All rights reserved.
2
+ // Licensed under AGPL-3.0 — see LICENSE file for details.
3
+ // CIPHER is a trademark of defconxt.
4
+
5
+ /**
6
+ * Knowledge Solver — MCQ evaluation for security knowledge benchmarks.
7
+ *
8
+ * Evaluates LLM knowledge against multiple-choice question sets:
9
+ * - SecEval: security domain knowledge (2K+ questions)
10
+ * - CyberMetric: cybersecurity metrics and concepts (5K+ questions)
11
+ * - CTIBench: cyber threat intelligence (10K+ questions)
12
+ *
13
+ * Scoring: accuracy (correct/total), with per-category breakdown.
14
+ *
15
+ * @module benchmark/knowledge
16
+ */
17
+
18
+ import { SolverResult } from './models.js';
19
+
20
+ // ---------------------------------------------------------------------------
21
+ // KnowledgeQuestion
22
+ // ---------------------------------------------------------------------------
23
+
24
/**
 * One multiple-choice question used for knowledge evaluation.
 *
 * Every field is optional; a missing (null/undefined) field is replaced by an
 * empty string, or by an empty array for `choices`.
 */
export class KnowledgeQuestion {
  /**
   * @param {object} opts
   * @param {string} opts.id - Question identifier
   * @param {string} opts.question - Question text
   * @param {string[]} opts.choices - Answer choices (e.g. ['A: ...', 'B: ...', ...])
   * @param {string} opts.correctAnswer - Correct answer letter or text
   * @param {string} [opts.category] - Question category
   * @param {string} [opts.difficulty] - Difficulty level
   * @param {string} [opts.source] - Source benchmark (SecEval, CyberMetric, CTIBench)
   */
  constructor(opts = {}) {
    const { id, question, choices, correctAnswer, category, difficulty, source } = opts;

    Object.assign(this, {
      id: id ?? '',
      question: question ?? '',
      choices: choices ?? [],
      correctAnswer: correctAnswer ?? '',
      category: category ?? '',
      difficulty: difficulty ?? '',
      source: source ?? '',
    });
  }
}
48
+
49
+ // ---------------------------------------------------------------------------
50
+ // KnowledgeResult
51
+ // ---------------------------------------------------------------------------
52
+
53
/**
 * Aggregated outcome of evaluating a set of knowledge questions.
 *
 * Missing (null/undefined) counters default to 0, `byCategory` to an empty
 * object and `details` to an empty array.
 */
export class KnowledgeResult {
  /**
   * @param {object} opts
   * @param {number} opts.total
   * @param {number} opts.correct
   * @param {number} opts.incorrect
   * @param {number} opts.skipped
   * @param {Record<string, { total: number, correct: number }>} [opts.byCategory]
   * @param {Array<{ id: string, correct: boolean, expected: string, actual: string }>} [opts.details]
   */
  constructor(opts = {}) {
    const { total, correct, incorrect, skipped, byCategory, details } = opts;

    this.total = total ?? 0;
    this.correct = correct ?? 0;
    this.incorrect = incorrect ?? 0;
    this.skipped = skipped ?? 0;
    this.byCategory = byCategory ?? {};
    this.details = details ?? [];
  }

  /** Overall accuracy as a percentage; 0 when `total` is not positive. */
  get accuracy() {
    if (this.total > 0) {
      return (this.correct / this.total) * 100;
    }
    return 0;
  }

  /** Per-category accuracy percentages, keyed like `byCategory`. */
  get categoryAccuracy() {
    return Object.entries(this.byCategory).reduce((percentages, [name, stats]) => {
      percentages[name] = stats.total > 0 ? (stats.correct / stats.total) * 100 : 0;
      return percentages;
    }, {});
  }
}
89
+
90
+ // ---------------------------------------------------------------------------
91
+ // KnowledgeSolver
92
+ // ---------------------------------------------------------------------------
93
+
94
// Matches a choice label at the start of upper-cased text: "B", "B:", "B.",
// "B)" or "(B)". A letter followed by more word characters ("BUFFER") is
// ordinary text, not a label.
const CHOICE_LABEL_PATTERN = /^\(?([A-Z])(?:[:.)]|$)/;

// Matches the label prefix of a choice such as "A: " or "(A) " for stripping.
const CHOICE_PREFIX_PATTERN = /^\(?[A-Z][:.)]\s*/;

// Matches a reply that starts (after any punctuation/markdown) with a single
// letter that is not part of a longer word, e.g. "b", "B.", "(B)" or "**B**".
const LEADING_LETTER_PATTERN = /^[^A-Za-z0-9]*([A-Za-z])(?![A-Za-z0-9])/;

/** Number of answer letters to assume: the choice count (max 26), or 4 when unknown. */
function effectiveChoiceCount(choiceCount) {
  return Number.isInteger(choiceCount) && choiceCount > 0 ? Math.min(choiceCount, 26) : 4;
}

/** Render the valid answer letters for the prompt, e.g. "A, B, C, or D". */
function describeChoiceLetters(choiceCount) {
  const count = effectiveChoiceCount(choiceCount);
  const letters = Array.from({ length: count }, (_, i) => String.fromCharCode(65 + i));
  if (count === 1) return letters[0];
  if (count === 2) return `${letters[0]} or ${letters[1]}`;
  return `${letters.slice(0, -1).join(', ')}, or ${letters[count - 1]}`;
}

/** Return the label letter at the start of upper-cased text, or null if there is none. */
function choiceLabel(upperText) {
  const match = CHOICE_LABEL_PATTERN.exec(upperText);
  return match ? match[1] : null;
}

/** Find the letter of the choice whose text equals `expectedText` (upper-cased), or null. */
function labelForChoiceText(expectedText, choices) {
  if (!Array.isArray(choices)) return null;
  for (const [index, choice] of choices.entries()) {
    const normalized = String(choice).trim().toUpperCase();
    // Unlabelled choices are lettered by position.
    const label = choiceLabel(normalized) ?? String.fromCharCode(65 + index);
    const body = normalized.replace(CHOICE_PREFIX_PATTERN, '');
    if (normalized === expectedText || body === expectedText) return label;
  }
  return null;
}

/** Pull the answer letter out of a model reply; falls back to its first character. */
function extractAnswerLetter(outputText, choiceCount) {
  const text = String(outputText || '').trim();
  const lastLetter = String.fromCharCode(64 + effectiveChoiceCount(choiceCount));

  // Preferred: the reply starts with the letter ("B", "b.", "(B)", "**B**").
  const leading = LEADING_LETTER_PATTERN.exec(text);
  if (leading) {
    const letter = leading[1].toUpperCase();
    if (letter <= lastLetter) return letter;
  }

  // Otherwise take the first stand-alone upper-case letter in the valid range
  // ("The answer is B"). lastLetter is always a single A-Z character.
  const standalone = new RegExp(`(?<![A-Za-z0-9])[A-${lastLetter}](?![A-Za-z0-9])`).exec(text);
  if (standalone) return standalone[0];

  // Last resort: first character of the reply, upper-cased.
  return text.charAt(0).toUpperCase();
}

/**
 * Evaluates LLM on MCQ security knowledge questions.
 */
export class KnowledgeSolver {
  /**
   * @param {object} [opts]
   * @param {Function} [opts.agentRunner] - Injectable runner for testing
   */
  constructor(opts = {}) {
    this._agentRunner = opts.agentRunner;
  }

  /**
   * Evaluate a set of questions, one at a time.
   *
   * A question whose answer lookup throws is counted as skipped; its detail
   * entry has `actual: 'ERROR'` plus an `error` message so the cause is not
   * lost. Per-category stats are collected in a Map and converted to a plain
   * object at the end, so category names such as "__proto__" or "constructor"
   * become ordinary keys instead of touching inherited members.
   *
   * @param {KnowledgeQuestion[]} questions
   * @returns {Promise<KnowledgeResult>}
   */
  async evaluate(questions) {
    const details = [];
    const categoryStats = new Map();
    let correct = 0;
    let incorrect = 0;
    let skipped = 0;

    for (const q of questions) {
      const cat = q.category || 'uncategorized';
      let stats = categoryStats.get(cat);
      if (!stats) {
        stats = { total: 0, correct: 0 };
        categoryStats.set(cat, stats);
      }
      stats.total++;

      let answer;
      try {
        answer = await this._answerQuestion(q);
      } catch (err) {
        skipped++;
        details.push({
          id: q.id,
          correct: false,
          expected: q.correctAnswer,
          actual: 'ERROR',
          error: err instanceof Error ? err.message : String(err),
        });
        continue;
      }

      const isCorrect = this._checkAnswer(answer, q.correctAnswer, q.choices);
      if (isCorrect) {
        stats.correct++;
        correct++;
      } else {
        incorrect++;
      }

      details.push({
        id: q.id,
        correct: isCorrect,
        expected: q.correctAnswer,
        actual: answer,
      });
    }

    return new KnowledgeResult({
      total: questions.length,
      correct,
      incorrect,
      skipped,
      byCategory: Object.fromEntries(categoryStats),
      details,
    });
  }

  /**
   * Answer a single question using the configured agent runner.
   *
   * The prompt lists the letters that match the number of choices (for four
   * choices, or when no choices are given, "A, B, C, or D").
   *
   * @param {KnowledgeQuestion} question
   * @returns {Promise<string>} The answer letter extracted from the reply
   * @throws {Error} When no agent runner is configured
   */
  async _answerQuestion(question) {
    if (!this._agentRunner) {
      throw new Error('No agent runner configured');
    }

    const choices = Array.isArray(question.choices) ? question.choices : [];
    const result = await this._agentRunner('ARCHITECT', {
      task: 'Answer this security knowledge question',
      user_message: `${question.question}\n\nChoices:\n${choices.join('\n')}\n\nRespond with ONLY the correct answer letter (${describeChoiceLetters(choices.length)}).`,
    });
    return extractAnswerLetter(result.outputText, choices.length);
  }

  /**
   * Check if the answer matches the correct answer.
   *
   * An exact (case-insensitive) match always counts. Otherwise the answer's
   * label letter is compared with the correct answer's label letter. When the
   * correct answer is given as text rather than a label, its letter is looked
   * up in `choices`; a shared first letter alone is not treated as a match.
   *
   * @param {string} answer
   * @param {string} correct
   * @param {string[]} [choices] - Choices used to resolve a text answer to its letter
   * @returns {boolean}
   */
  _checkAnswer(answer, correct, choices = []) {
    if (!answer || !correct) return false;

    const given = String(answer).trim().toUpperCase();
    const expected = String(correct).trim().toUpperCase();
    if (given === expected) return true;

    const givenLabel = choiceLabel(given);
    if (givenLabel === null) return false;

    const expectedLabel = choiceLabel(expected) ?? labelForChoiceText(expected, choices);
    return givenLabel === expectedLabel;
  }
}
189
+
190
+ // ---------------------------------------------------------------------------
191
+ // Privacy Solver — PII detection scoring
192
+ // ---------------------------------------------------------------------------
193
+
194
/**
 * One text sample together with the PII entities expected to be found in it.
 *
 * Missing (null/undefined) fields default to an empty string, or to an empty
 * array for `expectedPII`.
 */
export class PIIEntry {
  /**
   * @param {object} opts
   * @param {string} opts.id
   * @param {string} opts.text - Text containing potential PII
   * @param {Array<{ type: string, value: string, start: number, end: number }>} opts.expectedPII
   */
  constructor(opts = {}) {
    const { id, text, expectedPII } = opts;

    Object.assign(this, {
      id: id ?? '',
      text: text ?? '',
      expectedPII: expectedPII ?? [],
    });
  }
}
210
+
211
/**
 * Confusion counts from a PII detection run, with derived precision, recall
 * and F-scores. Each derived metric is 0 when its denominator is not positive.
 */
export class PIIResult {
  /**
   * @param {object} opts
   * @param {number} opts.total - Total PII entities expected
   * @param {number} opts.truePositives
   * @param {number} opts.falsePositives
   * @param {number} opts.falseNegatives
   */
  constructor(opts = {}) {
    const { total, truePositives, falsePositives, falseNegatives } = opts;

    this.total = total ?? 0;
    this.truePositives = truePositives ?? 0;
    this.falsePositives = falsePositives ?? 0;
    this.falseNegatives = falseNegatives ?? 0;
  }

  /** Precision: TP / (TP + FP) */
  get precision() {
    const predicted = this.truePositives + this.falsePositives;
    if (predicted > 0) {
      return this.truePositives / predicted;
    }
    return 0;
  }

  /** Recall: TP / (TP + FN) */
  get recall() {
    const actual = this.truePositives + this.falseNegatives;
    if (actual > 0) {
      return this.truePositives / actual;
    }
    return 0;
  }

  /** F1 score: harmonic mean of precision and recall */
  get f1() {
    const { precision, recall } = this;
    if ((precision + recall) > 0) {
      return (2 * precision * recall) / (precision + recall);
    }
    return 0;
  }

  /** F2 score: weighted harmonic mean emphasizing recall */
  get f2() {
    const { precision, recall } = this;
    if ((precision + recall) > 0) {
      return (5 * precision * recall) / (4 * precision + recall);
    }
    return 0;
  }
}
255
+
256
+ // ---------------------------------------------------------------------------
257
+ // Accuracy and F-score metrics for scorer.js
258
+ // ---------------------------------------------------------------------------
259
+
260
/**
 * Convert correct/total counts into an accuracy percentage.
 *
 * @param {number} correct - Number of correct answers
 * @param {number} total - Number of questions
 * @returns {number} Accuracy as percentage; 0 when `total` is not greater than 0
 */
export function calculateAccuracy(correct, total) {
  if (!(total > 0)) {
    return 0;
  }
  const ratio = correct / total;
  return ratio * 100;
}
269
+
270
/**
 * Calculate F-beta score: (1 + beta^2) * P * R / (beta^2 * P + R).
 *
 * Returns 0 whenever the score is undefined because the denominator is zero.
 * That covers precision and recall both being 0, and also beta = 0 with a
 * recall of 0, which would otherwise evaluate 0 / 0 and yield NaN.
 *
 * @param {number} precision
 * @param {number} recall
 * @param {number} [beta=1] - Beta weight (1 = F1, 2 = F2)
 * @returns {number}
 */
export function calculateFScore(precision, recall, beta = 1) {
  if (precision + recall === 0) return 0;

  const b2 = beta * beta;
  const denominator = b2 * precision + recall;
  if (denominator === 0) return 0;

  return ((1 + b2) * precision * recall) / denominator;
}