cipher-security 2.0.8 → 2.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. package/bin/cipher.js +11 -1
  2. package/lib/agent-runtime/handlers/architect.js +199 -0
  3. package/lib/agent-runtime/handlers/base.js +240 -0
  4. package/lib/agent-runtime/handlers/blue.js +220 -0
  5. package/lib/agent-runtime/handlers/incident.js +161 -0
  6. package/lib/agent-runtime/handlers/privacy.js +190 -0
  7. package/lib/agent-runtime/handlers/purple.js +209 -0
  8. package/lib/agent-runtime/handlers/recon.js +174 -0
  9. package/lib/agent-runtime/handlers/red.js +246 -0
  10. package/lib/agent-runtime/handlers/researcher.js +170 -0
  11. package/lib/agent-runtime/handlers.js +35 -0
  12. package/lib/agent-runtime/index.js +196 -0
  13. package/lib/agent-runtime/parser.js +316 -0
  14. package/lib/analyze/consistency.js +566 -0
  15. package/lib/analyze/constitution.js +110 -0
  16. package/lib/analyze/sharding.js +251 -0
  17. package/lib/autonomous/agent-tool.js +165 -0
  18. package/lib/autonomous/feedback-loop.js +13 -6
  19. package/lib/autonomous/framework.js +17 -0
  20. package/lib/autonomous/handoff.js +506 -0
  21. package/lib/autonomous/modes/blue.js +26 -0
  22. package/lib/autonomous/modes/red.js +585 -0
  23. package/lib/autonomous/modes/researcher.js +322 -0
  24. package/lib/autonomous/researcher.js +12 -45
  25. package/lib/autonomous/runner.js +9 -537
  26. package/lib/benchmark/agent.js +88 -26
  27. package/lib/benchmark/baselines.js +3 -0
  28. package/lib/benchmark/claude-code-solver.js +254 -0
  29. package/lib/benchmark/cognitive.js +283 -0
  30. package/lib/benchmark/index.js +12 -2
  31. package/lib/benchmark/knowledge.js +281 -0
  32. package/lib/benchmark/llm.js +156 -15
  33. package/lib/benchmark/models.js +5 -2
  34. package/lib/benchmark/nyu-ctf.js +192 -0
  35. package/lib/benchmark/overthewire.js +347 -0
  36. package/lib/benchmark/picoctf.js +281 -0
  37. package/lib/benchmark/prompts.js +280 -0
  38. package/lib/benchmark/registry.js +219 -0
  39. package/lib/benchmark/remote-solver.js +356 -0
  40. package/lib/benchmark/remote-target.js +263 -0
  41. package/lib/benchmark/reporter.js +35 -0
  42. package/lib/benchmark/runner.js +174 -10
  43. package/lib/benchmark/sandbox.js +35 -0
  44. package/lib/benchmark/scorer.js +22 -4
  45. package/lib/benchmark/solver.js +34 -1
  46. package/lib/benchmark/tools.js +262 -16
  47. package/lib/commands.js +9 -0
  48. package/lib/execution/council.js +434 -0
  49. package/lib/execution/parallel.js +292 -0
  50. package/lib/gates/circuit-breaker.js +135 -0
  51. package/lib/gates/confidence.js +302 -0
  52. package/lib/gates/corrections.js +219 -0
  53. package/lib/gates/self-check.js +245 -0
  54. package/lib/gateway/commands.js +727 -0
  55. package/lib/guardrails/engine.js +364 -0
  56. package/lib/mcp/server.js +349 -3
  57. package/lib/memory/compressor.js +94 -7
  58. package/lib/pipeline/hooks.js +288 -0
  59. package/lib/pipeline/index.js +11 -0
  60. package/lib/review/budget.js +210 -0
  61. package/lib/review/engine.js +526 -0
  62. package/lib/review/layers/acceptance-auditor.js +279 -0
  63. package/lib/review/layers/blind-hunter.js +500 -0
  64. package/lib/review/layers/defense-in-depth.js +209 -0
  65. package/lib/review/layers/edge-case-hunter.js +266 -0
  66. package/lib/review/panel.js +519 -0
  67. package/lib/review/two-stage.js +244 -0
  68. package/lib/session/cost-tracker.js +203 -0
  69. package/lib/session/logger.js +349 -0
  70. package/package.json +1 -1
@@ -0,0 +1,434 @@
1
+ // Copyright (c) 2026 defconxt. All rights reserved.
2
+ // Licensed under AGPL-3.0 — see LICENSE file for details.
3
+ // CIPHER is a trademark of defconxt.
4
+
5
+ /**
6
+ * LLM Council — Multi-model consensus via parallel evaluation and synthesis.
7
+ *
8
+ * 3-stage pipeline:
9
+ * 1. Parallel evaluation: N members independently evaluate the task
10
+ * 2. Cross-ranking: each successful member scores every successful response (its own included)
11
+ * 3. Synthesis: highest-ranked response used as basis for final output
12
+ *
13
+ * Designed for high-stakes decisions where single-model bias is a risk:
14
+ * exploit/no-exploit, severity ratings, compliance findings, threat models.
15
+ *
16
+ * Cost: 2N+1 API calls per invocation (N evaluate + N cross-rank + 1 synthesize).
17
+ *
18
+ * @module execution/council
19
+ */
20
+
21
+ import { ModeAgentResult } from '../autonomous/framework.js';
22
+
23
+ // ---------------------------------------------------------------------------
24
+ // Types
25
+ // ---------------------------------------------------------------------------
26
+
27
/**
 * Outcome of one council run.
 *
 * Every field falls back to a neutral default when the corresponding option
 * is `null` or `undefined`, so a bare `new CouncilResult()` is a valid,
 * empty result.
 */
export class CouncilResult {
  /**
   * @param {object} opts
   * @param {string} opts.task - Original task
   * @param {CouncilMemberResponse[]} [opts.responses] - Individual member responses
   * @param {number[][]} [opts.rankings] - Cross-ranking matrix (member × member scores)
   * @param {string} [opts.synthesis] - Final synthesized output
   * @param {number} [opts.confidence] - Consensus confidence 0-1
   * @param {number} [opts.totalTokensIn] - Total input tokens
   * @param {number} [opts.totalTokensOut] - Total output tokens
   * @param {number} [opts.totalDurationS] - Wall-clock seconds
   * @param {number} [opts.estimatedCostUSD] - Estimated cost
   * @param {string|null} [opts.error] - Top-level error
   */
  constructor(opts = {}) {
    const {
      task,
      responses,
      rankings,
      synthesis,
      confidence,
      totalTokensIn,
      totalTokensOut,
      totalDurationS,
      estimatedCostUSD,
      error,
    } = opts;

    // Assigned in one pass; key order here is the instance's property order.
    Object.assign(this, {
      task: task ?? '',
      responses: responses ?? [],
      rankings: rankings ?? [],
      synthesis: synthesis ?? '',
      confidence: confidence ?? 0,
      totalTokensIn: totalTokensIn ?? 0,
      totalTokensOut: totalTokensOut ?? 0,
      totalDurationS: totalDurationS ?? 0,
      estimatedCostUSD: estimatedCostUSD ?? 0,
      error: error ?? null,
    });
  }

  /** How many member responses this result carries (length of `responses`). */
  get memberCount() {
    const { length } = this.responses;
    return length;
  }
}
64
+
65
/**
 * What a single council member produced for a task.
 *
 * Missing (`null`/`undefined`) options fall back to zero, the empty string,
 * or `null` for `error`.
 */
export class CouncilMemberResponse {
  /**
   * @param {object} opts
   * @param {number} opts.memberId - 0-indexed member number
   * @param {string} opts.response - Member's evaluation text
   * @param {number} opts.tokensIn - Input tokens used by this member
   * @param {number} opts.tokensOut - Output tokens produced by this member
   * @param {number} opts.durationS - Seconds spent on this member's call
   * @param {string|null} [opts.error] - Error message, or null on success
   */
  constructor(opts = {}) {
    const { memberId, response, tokensIn, tokensOut, durationS, error } = opts;

    // Assigned in one pass; key order here is the instance's property order.
    Object.assign(this, {
      memberId: memberId ?? 0,
      response: response ?? '',
      tokensIn: tokensIn ?? 0,
      tokensOut: tokensOut ?? 0,
      durationS: durationS ?? 0,
      error: error ?? null,
    });
  }
}
87
+
88
+ // ---------------------------------------------------------------------------
89
+ // Cost estimation
90
+ // ---------------------------------------------------------------------------
91
+
92
/**
 * Rough token costs in USD per 1M tokens (input/output) by model family.
 * Frozen because the table is shared by every estimate.
 */
const MODEL_COSTS = Object.freeze({
  'claude': Object.freeze({ input: 3.0, output: 15.0 }), // Claude Sonnet
  'gpt-4': Object.freeze({ input: 5.0, output: 15.0 }),
  'ollama': Object.freeze({ input: 0, output: 0 }), // Local, no API cost
  'default': Object.freeze({ input: 3.0, output: 15.0 }),
});

/**
 * Estimate council cost before execution.
 *
 * Heuristics: roughly four characters per token for the task text, plus a
 * flat 500 input tokens and 800 output tokens for each API call. The call
 * count is 2N+1 (N evaluate + N cross-rank + 1 synthesize).
 *
 * The backend is matched case-insensitively by substring against the keys of
 * the cost table (first matching key wins); unknown backends use the
 * `default` rates. A missing or non-string task is treated as text via
 * `String()` (`null`/`undefined` become the empty string), and a negative or
 * non-numeric member count is treated as zero, so the estimate never throws
 * and is never negative.
 *
 * @param {string} task - Task text
 * @param {object} [opts]
 * @param {number} [opts.members=3] - Number of council members
 * @param {string} [opts.backend='default'] - Backend for cost lookup
 * @returns {{ estimatedTokensIn: number, estimatedTokensOut: number, estimatedCostUSD: number, apiCalls: number }}
 */
export function estimateCouncilCost(task, opts = {}) {
  const requestedMembers = Number(opts?.members ?? 3);
  const members = Number.isFinite(requestedMembers) ? Math.max(0, requestedMembers) : 0;
  const backend = String(opts?.backend ?? 'default').toLowerCase();
  const taskText = typeof task === 'string' ? task : String(task ?? '');

  // Rough heuristic: task tokens + system prompt ≈ 500 tokens per call
  const taskTokens = Math.ceil(taskText.length / 4);
  const perCallIn = taskTokens + 500;
  const perCallOut = 800; // average response length

  // 2N+1 calls: N evaluate + N cross-rank + 1 synthesize
  const apiCalls = 2 * members + 1;
  const totalIn = perCallIn * apiCalls;
  const totalOut = perCallOut * apiCalls;

  const costKey = Object.keys(MODEL_COSTS).find((key) => backend.includes(key)) ?? 'default';
  const costs = MODEL_COSTS[costKey];
  const costUSD = (totalIn / 1_000_000) * costs.input + (totalOut / 1_000_000) * costs.output;

  return {
    estimatedTokensIn: totalIn,
    estimatedTokensOut: totalOut,
    // Rounded to four decimal places (hundredths of a cent).
    estimatedCostUSD: Math.round(costUSD * 10000) / 10000,
    apiCalls,
  };
}
134
+
135
+ // ---------------------------------------------------------------------------
136
+ // LLMCouncil
137
+ // ---------------------------------------------------------------------------
138
+
139
/**
 * Multi-model consensus engine.
 *
 * Runs a three-stage pipeline against an injectable runner:
 *   1. every member evaluates the task independently (in parallel),
 *   2. every successful member scores all successful responses,
 *   3. the best-scored response is handed back to the runner for synthesis.
 *
 * The runner is always invoked as `runner('ARCHITECT', taskInput, backend, null)`
 * and its result is read for `outputText`, `tokensIn`, `tokensOut` and `error`.
 */
export class LLMCouncil {
  /**
   * @param {object} opts
   * @param {number} [opts.members=3] - Number of council members
   * @param {Function} [opts.agentRunner] - Injectable runner for evaluation calls
   */
  constructor(opts = {}) {
    this._members = opts.members ?? 3;
    this._agentRunner = opts.agentRunner ?? null;
  }

  /**
   * Run full council evaluation: evaluate → cross-rank → synthesize.
   *
   * With `opts.dryRun` no runner call is made; the result only carries the
   * cost/token estimate. If every member fails, the result carries the failed
   * member responses and a top-level `error`, and stages 2 and 3 are skipped.
   * A runner failure during synthesis is not caught and rejects the promise.
   *
   * @param {string} task
   * @param {object} [opts]
   * @param {string|null} [opts.backend] - LLM backend override
   * @param {boolean} [opts.dryRun=false] - Return cost estimate only
   * @returns {Promise<CouncilResult>}
   */
  async evaluate(task, opts = {}) {
    if (opts.dryRun) {
      const estimate = estimateCouncilCost(task, {
        members: this._members,
        backend: opts.backend || 'default',
      });
      return new CouncilResult({
        task,
        estimatedCostUSD: estimate.estimatedCostUSD,
        totalTokensIn: estimate.estimatedTokensIn,
        totalTokensOut: estimate.estimatedTokensOut,
      });
    }

    const startTime = performance.now() / 1000;
    // Load the default runner lazily so an injected runner never triggers the import.
    const runner = this._agentRunner || (await import('../autonomous/runner.js')).runAutonomous;

    // Stage 1: Parallel evaluation
    const evaluationPromises = [];
    for (let i = 0; i < this._members; i++) {
      evaluationPromises.push(
        this._evaluateMember(i, task, runner, opts.backend)
      );
    }
    const settledEvaluations = await Promise.allSettled(evaluationPromises);
    const memberResponses = settledEvaluations.map((settled, i) => {
      if (settled.status === 'fulfilled') return settled.value;
      // A thrown runner error becomes an errored member instead of aborting the council.
      return new CouncilMemberResponse({
        memberId: i,
        error: settled.reason?.message || 'Unknown error',
      });
    });

    // Check if we have enough successful responses
    const successfulResponses = memberResponses.filter((r) => !r.error);
    if (successfulResponses.length === 0) {
      return new CouncilResult({
        task,
        responses: memberResponses,
        error: 'All council members failed',
        totalDurationS: performance.now() / 1000 - startTime,
      });
    }

    // Stage 2: Cross-ranking (token usage of the ranking calls is collected in rankingUsage)
    const rankingUsage = { tokensIn: 0, tokensOut: 0 };
    const rankings = await this._crossRank(
      successfulResponses, task, runner, opts.backend, rankingUsage
    );

    // Stage 3: Synthesis
    const synthesis = await this._synthesize(successfulResponses, rankings, task, runner, opts.backend);

    // Aggregate metrics across all three stages
    let totalIn = 0;
    let totalOut = 0;
    for (const r of memberResponses) {
      totalIn += r.tokensIn;
      totalOut += r.tokensOut;
    }
    totalIn += rankingUsage.tokensIn + (synthesis.tokensIn || 0);
    totalOut += rankingUsage.tokensOut + (synthesis.tokensOut || 0);

    const confidence = this._computeConfidence(rankings, successfulResponses.length);

    return new CouncilResult({
      task,
      responses: memberResponses,
      rankings,
      synthesis: synthesis.text,
      confidence,
      totalTokensIn: totalIn,
      totalTokensOut: totalOut,
      totalDurationS: performance.now() / 1000 - startTime,
      // Cost stays a pre-computed heuristic; it is not derived from the measured tokens.
      estimatedCostUSD: estimateCouncilCost(task, {
        members: this._members,
        backend: opts.backend || 'default',
      }).estimatedCostUSD,
    });
  }

  // -- internal -----------------------------------------------------------

  /**
   * Evaluate a single member.
   *
   * Rejects if the runner throws; `evaluate` converts that into an errored
   * member response.
   *
   * @param {number} memberId - 0-indexed member number
   * @param {string} task
   * @param {Function} runner
   * @param {string|null} backend
   * @returns {Promise<CouncilMemberResponse>}
   */
  async _evaluateMember(memberId, task, runner, backend) {
    const startTime = performance.now() / 1000;

    const taskInput = {
      task,
      user_message: `[Council Member ${memberId + 1}] Evaluate the following independently:\n\n${task}`,
    };

    const result = await runner('ARCHITECT', taskInput, backend, null);

    return new CouncilMemberResponse({
      memberId,
      response: result.outputText,
      tokensIn: result.tokensIn,
      tokensOut: result.tokensOut,
      durationS: performance.now() / 1000 - startTime,
      error: result.error,
    });
  }

  /**
   * Cross-rank: each member scores every response (its own included).
   * Returns a matrix where rankings[i][j] = member i's score for response j.
   * A ranker whose runner call rejects keeps an all-zero row.
   *
   * @param {CouncilMemberResponse[]} responses
   * @param {string} task
   * @param {Function} runner
   * @param {string|null} backend
   * @param {{ tokensIn: number, tokensOut: number }|null} [usage=null] - Optional
   *   accumulator; token counts of every ranking call are added to it.
   * @returns {Promise<number[][]>}
   */
  async _crossRank(responses, task, runner, backend, usage = null) {
    const n = responses.length;
    const rankings = Array.from({ length: n }, () => Array(n).fill(0));

    const rankingPromises = [];
    for (let i = 0; i < n; i++) {
      rankingPromises.push(
        this._rankByMember(i, responses, task, runner, backend, usage)
      );
    }

    const results = await Promise.allSettled(rankingPromises);
    results.forEach((settled, i) => {
      if (settled.status === 'fulfilled') {
        rankings[i] = settled.value;
      }
    });

    return rankings;
  }

  /**
   * Single member ranks all responses.
   *
   * The runner output is scanned for bracketed numeric lists; the first one
   * that parses as JSON and has exactly one score per response is used, with
   * each score clamped to 0-10. Lists of any other length (for example a
   * citation marker such as `[1]`) are ignored so the ranking matrix always
   * stays rectangular. When nothing usable is found, every response gets the
   * neutral score 5.
   *
   * @param {number} rankerId - Index of the ranking member (not used in the prompt)
   * @param {CouncilMemberResponse[]} responses
   * @param {string} task
   * @param {Function} runner
   * @param {string|null} backend
   * @param {{ tokensIn: number, tokensOut: number }|null} [usage=null] - Optional
   *   accumulator; this call's token counts are added to it.
   * @returns {Promise<number[]>}
   */
  async _rankByMember(rankerId, responses, task, runner, backend, usage = null) {
    const expectedCount = responses.length;
    const responseTexts = responses.map((r, j) =>
      `Response ${j + 1}:\n${r.response}\n`
    ).join('\n---\n\n');

    const taskInput = {
      task: `Rank these ${expectedCount} responses to: "${task}"`,
      user_message: `Score each response from 0 to 10 based on accuracy, completeness, and actionability.\n\n${responseTexts}\n\nReturn ONLY a JSON array of scores, e.g. [8, 6, 9]`,
    };

    const result = await runner('ARCHITECT', taskInput, backend, null);

    if (usage) {
      usage.tokensIn += Number(result?.tokensIn) || 0;
      usage.tokensOut += Number(result?.tokensOut) || 0;
    }

    // Parse scores from output
    const outputText = typeof result?.outputText === 'string' ? result.outputText : '';
    for (const candidate of outputText.matchAll(/\[[\d\s,.]+\]/g)) {
      let parsed;
      try {
        parsed = JSON.parse(candidate[0]);
      } catch {
        // Bracketed text that is not valid JSON (e.g. "[1 2]"); try the next candidate.
        continue;
      }
      if (Array.isArray(parsed) && parsed.length === expectedCount) {
        return parsed.map((s) => Math.min(10, Math.max(0, Number(s) || 0)));
      }
    }

    // Fallback: equal scores
    return Array(expectedCount).fill(5);
  }

  /**
   * Synthesize final output from the highest-ranked response.
   *
   * Scores are summed per response over all rankers; on a tie the earliest
   * response wins. If the runner returns no text, the best response itself
   * is used as the synthesis.
   *
   * @param {CouncilMemberResponse[]} responses
   * @param {number[][]} rankings
   * @param {string} task
   * @param {Function} runner
   * @param {string|null} backend
   * @returns {Promise<{ text: string, tokensIn: number, tokensOut: number }>}
   */
  async _synthesize(responses, rankings, task, runner, backend) {
    // Compute aggregate score per response; only columns that map to a
    // response are read, and non-numeric cells are skipped.
    const n = responses.length;
    const aggregateScores = Array(n).fill(0);
    for (const row of rankings) {
      for (let j = 0; j < n; j++) {
        const score = Number(row?.[j]);
        if (Number.isFinite(score)) aggregateScores[j] += score;
      }
    }

    // Find highest-scored response
    let bestIdx = 0;
    let bestScore = aggregateScores[0];
    for (let j = 1; j < n; j++) {
      if (aggregateScores[j] > bestScore) {
        bestScore = aggregateScores[j];
        bestIdx = j;
      }
    }

    const taskInput = {
      task: `Synthesize a consensus from council evaluation`,
      user_message: `Original task: "${task}"\n\nBest-ranked response (score ${bestScore}):\n${responses[bestIdx].response}\n\nProvide a refined, authoritative response incorporating the strongest analysis.`,
    };

    const result = await runner('ARCHITECT', taskInput, backend, null);

    return {
      text: result.outputText || responses[bestIdx].response,
      tokensIn: result.tokensIn,
      tokensOut: result.tokensOut,
    };
  }

  /**
   * Compute consensus confidence from cross-rankings.
   * Higher agreement between rankers → higher confidence.
   *
   * For every response the population standard deviation of its scores
   * across rankers is computed; the mean of those deviations is scaled by 5
   * (the largest deviation possible for scores in 0-10) and subtracted from 1.
   * Identical rows give 1; rankers split between 0 and 10 give 0.
   *
   * All-zero rows are ignored: `_crossRank` leaves such a row for a ranker
   * whose call failed. Cells beyond `responseCount` are ignored and missing
   * or non-numeric cells count as 0, so the result is always a finite number.
   *
   * @param {number[][]} rankings
   * @param {number} responseCount
   * @returns {number} 0-1 confidence score
   */
  _computeConfidence(rankings, responseCount) {
    if (!Array.isArray(rankings) || rankings.length === 0 || !(responseCount > 0)) return 0;

    const MAX_DEVIATION = 5; // std-dev of scores split evenly between 0 and 10

    // Normalise every row to exactly responseCount clamped scores; drop abstentions.
    const votes = [];
    for (const row of rankings) {
      const scores = [];
      for (let j = 0; j < responseCount; j++) {
        const score = Number(row?.[j]);
        scores.push(Number.isFinite(score) ? Math.min(10, Math.max(0, score)) : 0);
      }
      if (scores.some((s) => s > 0)) votes.push(scores);
    }

    if (votes.length === 0) return 0;

    let deviationSum = 0;
    for (let j = 0; j < responseCount; j++) {
      let mean = 0;
      for (const vote of votes) mean += vote[j];
      mean /= votes.length;

      let variance = 0;
      for (const vote of votes) variance += (vote[j] - mean) ** 2;
      variance /= votes.length;

      deviationSum += Math.sqrt(variance);
    }

    const meanDeviation = deviationSum / responseCount;
    return Math.min(1, Math.max(0, 1 - meanDeviation / MAX_DEVIATION));
  }
}
412
+
413
+ // ---------------------------------------------------------------------------
414
+ // runCouncil — convenience entry point
415
+ // ---------------------------------------------------------------------------
416
+
417
/**
 * One-shot helper: build an `LLMCouncil` and evaluate a task with it.
 *
 * `members` and `agentRunner` configure the council; the complete `opts`
 * object is also forwarded to `evaluate`, which reads `backend` and `dryRun`.
 *
 * @param {string} task
 * @param {object} [opts]
 * @param {number} [opts.members=3]
 * @param {string|null} [opts.backend]
 * @param {boolean} [opts.dryRun=false]
 * @param {Function} [opts.agentRunner]
 * @returns {Promise<CouncilResult>}
 */
export async function runCouncil(task, opts = {}) {
  const { members, agentRunner } = opts;
  const council = new LLMCouncil({ members: members ?? 3, agentRunner });
  return council.evaluate(task, opts);
}