cipher-security 2.0.8 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cipher.js +11 -1
- package/lib/agent-runtime/handlers/architect.js +199 -0
- package/lib/agent-runtime/handlers/base.js +240 -0
- package/lib/agent-runtime/handlers/blue.js +220 -0
- package/lib/agent-runtime/handlers/incident.js +161 -0
- package/lib/agent-runtime/handlers/privacy.js +190 -0
- package/lib/agent-runtime/handlers/purple.js +209 -0
- package/lib/agent-runtime/handlers/recon.js +174 -0
- package/lib/agent-runtime/handlers/red.js +246 -0
- package/lib/agent-runtime/handlers/researcher.js +170 -0
- package/lib/agent-runtime/handlers.js +35 -0
- package/lib/agent-runtime/index.js +196 -0
- package/lib/agent-runtime/parser.js +316 -0
- package/lib/analyze/consistency.js +566 -0
- package/lib/analyze/constitution.js +110 -0
- package/lib/analyze/sharding.js +251 -0
- package/lib/autonomous/agent-tool.js +165 -0
- package/lib/autonomous/feedback-loop.js +13 -6
- package/lib/autonomous/framework.js +17 -0
- package/lib/autonomous/handoff.js +506 -0
- package/lib/autonomous/modes/blue.js +26 -0
- package/lib/autonomous/modes/red.js +585 -0
- package/lib/autonomous/modes/researcher.js +322 -0
- package/lib/autonomous/researcher.js +12 -45
- package/lib/autonomous/runner.js +9 -537
- package/lib/benchmark/agent.js +88 -26
- package/lib/benchmark/baselines.js +3 -0
- package/lib/benchmark/claude-code-solver.js +254 -0
- package/lib/benchmark/cognitive.js +283 -0
- package/lib/benchmark/index.js +12 -2
- package/lib/benchmark/knowledge.js +281 -0
- package/lib/benchmark/llm.js +156 -15
- package/lib/benchmark/models.js +5 -2
- package/lib/benchmark/nyu-ctf.js +192 -0
- package/lib/benchmark/overthewire.js +347 -0
- package/lib/benchmark/picoctf.js +281 -0
- package/lib/benchmark/prompts.js +280 -0
- package/lib/benchmark/registry.js +219 -0
- package/lib/benchmark/remote-solver.js +356 -0
- package/lib/benchmark/remote-target.js +263 -0
- package/lib/benchmark/reporter.js +35 -0
- package/lib/benchmark/runner.js +174 -10
- package/lib/benchmark/sandbox.js +35 -0
- package/lib/benchmark/scorer.js +22 -4
- package/lib/benchmark/solver.js +34 -1
- package/lib/benchmark/tools.js +262 -16
- package/lib/commands.js +9 -0
- package/lib/execution/council.js +434 -0
- package/lib/execution/parallel.js +292 -0
- package/lib/gates/circuit-breaker.js +135 -0
- package/lib/gates/confidence.js +302 -0
- package/lib/gates/corrections.js +219 -0
- package/lib/gates/self-check.js +245 -0
- package/lib/gateway/commands.js +727 -0
- package/lib/guardrails/engine.js +364 -0
- package/lib/mcp/server.js +349 -3
- package/lib/memory/compressor.js +94 -7
- package/lib/pipeline/hooks.js +288 -0
- package/lib/pipeline/index.js +11 -0
- package/lib/review/budget.js +210 -0
- package/lib/review/engine.js +526 -0
- package/lib/review/layers/acceptance-auditor.js +279 -0
- package/lib/review/layers/blind-hunter.js +500 -0
- package/lib/review/layers/defense-in-depth.js +209 -0
- package/lib/review/layers/edge-case-hunter.js +266 -0
- package/lib/review/panel.js +519 -0
- package/lib/review/two-stage.js +244 -0
- package/lib/session/cost-tracker.js +203 -0
- package/lib/session/logger.js +349 -0
- package/package.json +1 -1
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
// Copyright (c) 2026 defconxt. All rights reserved.
|
|
2
|
+
// Licensed under AGPL-3.0 — see LICENSE file for details.
|
|
3
|
+
// CIPHER is a trademark of defconxt.
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* LLM Council — Multi-model consensus via parallel evaluation and synthesis.
|
|
7
|
+
*
|
|
8
|
+
* 3-stage pipeline:
|
|
9
|
+
* 1. Parallel evaluation: N members independently evaluate the task
|
|
10
|
+
* 2. Cross-ranking: each member scores every response (its own included)
|
|
11
|
+
* 3. Synthesis: highest-ranked response used as basis for final output
|
|
12
|
+
*
|
|
13
|
+
* Designed for high-stakes decisions where single-model bias is a risk:
|
|
14
|
+
* exploit/no-exploit, severity ratings, compliance findings, threat models.
|
|
15
|
+
*
|
|
16
|
+
* Cost: 2N+1 API calls per invocation (N evaluate + N cross-rank + 1 synthesize).
|
|
17
|
+
*
|
|
18
|
+
* @module execution/council
|
|
19
|
+
*/
|
|
20
|
+
|
|
21
|
+
import { ModeAgentResult } from '../autonomous/framework.js';
|
|
22
|
+
|
|
23
|
+
// ---------------------------------------------------------------------------
|
|
24
|
+
// Types
|
|
25
|
+
// ---------------------------------------------------------------------------
|
|
26
|
+
|
|
27
|
+
/**
 * Outcome of one council run.
 *
 * Plain data holder: every constructor option is copied onto the instance,
 * and any option that is `null` or `undefined` is replaced by its default.
 */
export class CouncilResult {
  /**
   * @param {object} opts
   * @param {string} opts.task - Original task
   * @param {CouncilMemberResponse[]} [opts.responses] - Individual member responses
   * @param {number[][]} [opts.rankings] - Cross-ranking matrix (member × member scores)
   * @param {string} [opts.synthesis] - Final synthesized output
   * @param {number} [opts.confidence] - Consensus confidence 0-1
   * @param {number} [opts.totalTokensIn] - Total input tokens
   * @param {number} [opts.totalTokensOut] - Total output tokens
   * @param {number} [opts.totalDurationS] - Wall-clock seconds
   * @param {number} [opts.estimatedCostUSD] - Estimated cost
   * @param {string|null} [opts.error] - Top-level error
   */
  constructor(opts = {}) {
    // Built per call so the array defaults are never shared between instances.
    // Key order here is the property order on the instance.
    const fallbacks = {
      task: '',
      responses: [],
      rankings: [],
      synthesis: '',
      confidence: 0,
      totalTokensIn: 0,
      totalTokensOut: 0,
      totalDurationS: 0,
      estimatedCostUSD: 0,
      error: null,
    };

    for (const [field, fallback] of Object.entries(fallbacks)) {
      // `??` (not a destructuring default) so an explicit null also falls back.
      this[field] = opts[field] ?? fallback;
    }
  }

  /** Number of council members, i.e. how many entries `responses` holds. */
  get memberCount() {
    const { length } = this.responses;
    return length;
  }
}
|
|
64
|
+
|
|
65
|
+
/**
 * What a single council member produced for a task.
 *
 * Plain data holder; options that are `null` or `undefined` take the
 * defaults shown in the constructor.
 */
export class CouncilMemberResponse {
  /**
   * @param {object} opts
   * @param {number} opts.memberId - 0-indexed member number
   * @param {string} opts.response - Member's evaluation text
   * @param {number} opts.tokensIn
   * @param {number} opts.tokensOut
   * @param {number} opts.durationS
   * @param {string|null} [opts.error]
   */
  constructor(opts = {}) {
    const { memberId, response, tokensIn, tokensOut, durationS, error } = opts;

    // `??` is applied after destructuring so that null, not only undefined,
    // is replaced by the default.
    Object.assign(this, {
      memberId: memberId ?? 0,
      response: response ?? '',
      tokensIn: tokensIn ?? 0,
      tokensOut: tokensOut ?? 0,
      durationS: durationS ?? 0,
      error: error ?? null,
    });
  }
}
|
|
87
|
+
|
|
88
|
+
// ---------------------------------------------------------------------------
|
|
89
|
+
// Cost estimation
|
|
90
|
+
// ---------------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
/**
 * Rough token costs in USD per 1M tokens (input/output) by model family.
 * Keys are matched as substrings of the backend name; `default` is used when
 * nothing else matches. Frozen because the table is shared module state.
 */
const MODEL_COSTS = Object.freeze({
  'claude': { input: 3.0, output: 15.0 }, // Claude Sonnet
  'gpt-4': { input: 5.0, output: 15.0 },
  'ollama': { input: 0, output: 0 }, // Local, no API cost
  'default': { input: 3.0, output: 15.0 },
});

/**
 * Estimate council cost before execution.
 *
 * The estimate is a flat heuristic: every one of the 2N+1 calls is costed at
 * (task tokens + 500) input tokens and 800 output tokens, with task tokens
 * approximated as one token per four characters.
 *
 * @param {string} task - Task text. A missing or non-string task is treated
 *   as its string form (empty for null/undefined) instead of throwing.
 * @param {object} [opts]
 * @param {number} [opts.members=3] - Number of council members
 * @param {string} [opts.backend='default'] - Backend for cost lookup; matched
 *   case-insensitively against the model-family keys
 * @returns {{ estimatedTokensIn: number, estimatedTokensOut: number, estimatedCostUSD: number, apiCalls: number }}
 */
export function estimateCouncilCost(task, opts = {}) {
  const members = opts.members ?? 3;
  // Normalise so that e.g. 'Ollama' or 'GPT-4o' still hit their price row.
  const backend = String(opts.backend ?? 'default').toLowerCase();

  // Guard the length read: a dry run with no task should yield an estimate,
  // not a TypeError.
  const taskText = typeof task === 'string' ? task : String(task ?? '');

  // Rough heuristic: task tokens + system prompt ≈ 500 tokens per call
  const taskTokens = Math.ceil(taskText.length / 4);
  const perCallIn = taskTokens + 500;
  const perCallOut = 800; // average response length

  // 2N+1 calls: N evaluate + N cross-rank + 1 synthesize
  const apiCalls = 2 * members + 1;
  const totalIn = perCallIn * apiCalls;
  const totalOut = perCallOut * apiCalls;

  const costKey = Object.keys(MODEL_COSTS).find((family) => backend.includes(family)) ?? 'default';
  const { input, output } = MODEL_COSTS[costKey];
  const costUSD = (totalIn / 1_000_000) * input + (totalOut / 1_000_000) * output;

  return {
    estimatedTokensIn: totalIn,
    estimatedTokensOut: totalOut,
    // Rounded to 4 decimal places.
    estimatedCostUSD: Math.round(costUSD * 10000) / 10000,
    apiCalls,
  };
}
|
|
134
|
+
|
|
135
|
+
// ---------------------------------------------------------------------------
|
|
136
|
+
// LLMCouncil
|
|
137
|
+
// ---------------------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
/** Upper bound of the 0-10 scale that rankers are asked to score on. */
const COUNCIL_MAX_SCORE = 10;

/** Score substituted when a ranker's output is missing or cannot be parsed. */
const COUNCIL_NEUTRAL_SCORE = 5;

/**
 * Multi-model consensus engine.
 *
 * Runs the same task through N members, has every member score all successful
 * responses, and asks for one final synthesis based on the best-scored
 * response. Every model call goes through the injected (or lazily imported)
 * runner as `runner('ARCHITECT', taskInput, backend, null)`.
 */
export class LLMCouncil {
  /**
   * @param {object} opts
   * @param {number} [opts.members=3] - Number of council members
   * @param {Function} [opts.agentRunner] - Injectable runner for evaluation calls
   */
  constructor(opts = {}) {
    this._members = opts.members ?? 3;
    this._agentRunner = opts.agentRunner ?? null;
  }

  /**
   * Run full council evaluation: evaluate → cross-rank → synthesize.
   *
   * Failure handling: a member whose evaluation rejects or reports an error is
   * kept in `responses` but excluded from ranking and synthesis. If every
   * member fails the result carries `error` and no synthesis. If only the
   * synthesis call throws, the best-ranked member response is returned as the
   * synthesis and the failure is reported in `error`.
   *
   * Note that `rankings` is indexed over the successful responses only, so its
   * indices differ from `responses` whenever a member failed.
   *
   * @param {string} task
   * @param {object} [opts]
   * @param {string|null} [opts.backend] - LLM backend override
   * @param {boolean} [opts.dryRun=false] - Return cost estimate only
   * @returns {Promise<CouncilResult>}
   */
  async evaluate(task, opts = {}) {
    const costOpts = { members: this._members, backend: opts.backend || 'default' };

    if (opts.dryRun) {
      const estimate = estimateCouncilCost(task, costOpts);
      return new CouncilResult({
        task,
        estimatedCostUSD: estimate.estimatedCostUSD,
        totalTokensIn: estimate.estimatedTokensIn,
        totalTokensOut: estimate.estimatedTokensOut,
      });
    }

    const startTime = performance.now() / 1000;
    // Imported lazily so the default runner is only loaded when none is injected.
    const runner = this._agentRunner || (await import('../autonomous/runner.js')).runAutonomous;

    // Stage 1: Parallel evaluation
    const evaluationPromises = [];
    for (let i = 0; i < this._members; i++) {
      evaluationPromises.push(this._evaluateMember(i, task, runner, opts.backend));
    }
    const settled = await Promise.allSettled(evaluationPromises);
    const memberResponses = settled.map((outcome, i) => {
      if (outcome.status === 'fulfilled') return outcome.value;
      return new CouncilMemberResponse({
        memberId: i,
        error: outcome.reason?.message || 'Unknown error',
      });
    });

    // Check if we have enough successful responses
    const successfulResponses = memberResponses.filter((r) => !r.error);
    if (successfulResponses.length === 0) {
      return new CouncilResult({
        task,
        responses: memberResponses,
        error: 'All council members failed',
        totalDurationS: performance.now() / 1000 - startTime,
      });
    }

    // Stage 2: Cross-ranking. Token usage of the ranking calls is collected in
    // `rankingUsage` so it can be included in the totals below.
    const rankingUsage = { tokensIn: 0, tokensOut: 0 };
    const rankings = await this._crossRank(
      successfulResponses, task, runner, opts.backend, rankingUsage,
    );

    // Stage 3: Synthesis
    const synthesis = await this._synthesize(
      successfulResponses, rankings, task, runner, opts.backend,
    );

    // Aggregate metrics: member evaluations + ranking calls + synthesis call.
    let totalIn = rankingUsage.tokensIn + (synthesis.tokensIn || 0);
    let totalOut = rankingUsage.tokensOut + (synthesis.tokensOut || 0);
    for (const r of memberResponses) {
      totalIn += r.tokensIn;
      totalOut += r.tokensOut;
    }

    const confidence = this._computeConfidence(rankings, successfulResponses.length);

    return new CouncilResult({
      task,
      responses: memberResponses,
      rankings,
      synthesis: synthesis.text,
      confidence,
      totalTokensIn: totalIn,
      totalTokensOut: totalOut,
      totalDurationS: performance.now() / 1000 - startTime,
      estimatedCostUSD: estimateCouncilCost(task, costOpts).estimatedCostUSD,
      error: synthesis.error ?? null,
    });
  }

  // -- internal -----------------------------------------------------------

  /**
   * Evaluate a single member.
   *
   * Rejects if the runner rejects; `evaluate` converts that into an errored
   * response. An `error` reported on the runner result is passed through.
   *
   * @param {number} memberId
   * @param {string} task
   * @param {Function} runner
   * @param {string|null} backend
   * @returns {Promise<CouncilMemberResponse>}
   */
  async _evaluateMember(memberId, task, runner, backend) {
    const startTime = performance.now() / 1000;

    const taskInput = {
      task,
      user_message: `[Council Member ${memberId + 1}] Evaluate the following independently:\n\n${task}`,
    };

    const result = await runner('ARCHITECT', taskInput, backend, null);

    return new CouncilMemberResponse({
      memberId,
      response: result.outputText,
      tokensIn: result.tokensIn,
      tokensOut: result.tokensOut,
      durationS: performance.now() / 1000 - startTime,
      error: result.error,
    });
  }

  /**
   * Cross-rank: every member scores all responses (its own included).
   * Returns a matrix where rankings[i][j] = member i's score for response j.
   * A ranker whose call rejects leaves its row as all zeros.
   *
   * @param {CouncilMemberResponse[]} responses
   * @param {string} task
   * @param {Function} runner
   * @param {string|null} backend
   * @param {{ tokensIn: number, tokensOut: number }|null} [usage] - Optional
   *   accumulator that receives the token usage of the ranking calls
   * @returns {Promise<number[][]>}
   */
  async _crossRank(responses, task, runner, backend, usage = null) {
    const n = responses.length;
    const rankings = Array.from({ length: n }, () => Array(n).fill(0));

    const rankingPromises = [];
    for (let i = 0; i < n; i++) {
      rankingPromises.push(this._rankByMember(i, responses, task, runner, backend, usage));
    }

    const results = await Promise.allSettled(rankingPromises);
    results.forEach((outcome, i) => {
      if (outcome.status === 'fulfilled') rankings[i] = outcome.value;
    });

    return rankings;
  }

  /**
   * Single member ranks all responses.
   *
   * Always resolves to exactly `responses.length` scores, each clamped to
   * 0-10. If the model returns too many scores the extras are dropped; if it
   * returns too few, or nothing parseable, the gaps are filled with the
   * neutral score. This keeps every row aligned with the response list.
   *
   * @param {number} rankerId - Index of the ranking member; not used in the
   *   prompt, which is identical for every ranker
   * @param {CouncilMemberResponse[]} responses
   * @param {string} task
   * @param {Function} runner
   * @param {string|null} backend
   * @param {{ tokensIn: number, tokensOut: number }|null} [usage] - Optional
   *   accumulator that receives this call's token usage
   * @returns {Promise<number[]>}
   */
  async _rankByMember(rankerId, responses, task, runner, backend, usage = null) {
    const expected = responses.length;
    const responseTexts = responses
      .map((r, j) => `Response ${j + 1}:\n${r.response}\n`)
      .join('\n---\n\n');

    const taskInput = {
      task: `Rank these ${responses.length} responses to: "${task}"`,
      user_message: `Score each response from 0 to 10 based on accuracy, completeness, and actionability.\n\n${responseTexts}\n\nReturn ONLY a JSON array of scores, e.g. [8, 6, 9]`,
    };

    const result = await runner('ARCHITECT', taskInput, backend, null);

    if (usage) {
      usage.tokensIn += Number(result?.tokensIn) || 0;
      usage.tokensOut += Number(result?.tokensOut) || 0;
    }

    // Parse scores from output: first bracketed run of digits, dots, commas.
    let parsed = null;
    const outputText = result?.outputText;
    const match = typeof outputText === 'string' ? outputText.match(/\[[\d\s,\.]+\]/) : null;
    if (match) {
      try {
        parsed = JSON.parse(match[0]);
      } catch {
        // Bracketed text that is not valid JSON (e.g. "[8,,9]") is treated
        // like a missing answer: best-effort fallback to neutral scores.
        parsed = null;
      }
    }

    if (!Array.isArray(parsed)) {
      // Fallback: equal scores
      return Array(expected).fill(COUNCIL_NEUTRAL_SCORE);
    }

    return Array.from({ length: expected }, (_, j) => {
      if (j >= parsed.length) return COUNCIL_NEUTRAL_SCORE;
      return Math.min(COUNCIL_MAX_SCORE, Math.max(0, Number(parsed[j]) || 0));
    });
  }

  /**
   * Sum the scores each response received across all rankers.
   * Only the first `responseCount` columns and only finite numbers are
   * counted, so a malformed row cannot inject NaN or widen the result.
   *
   * @param {number[][]} rankings
   * @param {number} responseCount
   * @returns {number[]} One total per response
   */
  _aggregateScores(rankings, responseCount) {
    const totals = Array(responseCount).fill(0);
    for (const row of rankings) {
      for (let j = 0; j < responseCount; j++) {
        const score = row?.[j];
        if (Number.isFinite(score)) totals[j] += score;
      }
    }
    return totals;
  }

  /**
   * Pick the highest total; ties go to the lowest index.
   *
   * @param {number[]} totals
   * @returns {{ index: number, score: number }}
   */
  _pickBest(totals) {
    let index = 0;
    for (let j = 1; j < totals.length; j++) {
      if (totals[j] > totals[index]) index = j;
    }
    return { index, score: totals[index] };
  }

  /**
   * Synthesize final output from the highest-ranked response.
   *
   * If the runner returns no text, or throws, the best-ranked response is
   * returned unchanged. A thrown error is reported in `error` rather than
   * propagated, so the already-completed evaluation work is not lost.
   *
   * @param {CouncilMemberResponse[]} responses
   * @param {number[][]} rankings
   * @param {string} task
   * @param {Function} runner
   * @param {string|null} backend
   * @returns {Promise<{ text: string, tokensIn: number, tokensOut: number, error: string|null }>}
   */
  async _synthesize(responses, rankings, task, runner, backend) {
    const { index: bestIdx, score: bestScore } = this._pickBest(
      this._aggregateScores(rankings, responses.length),
    );
    const best = responses[bestIdx];

    const taskInput = {
      task: `Synthesize a consensus from council evaluation`,
      user_message: `Original task: "${task}"\n\nBest-ranked response (score ${bestScore}):\n${best.response}\n\nProvide a refined, authoritative response incorporating the strongest analysis.`,
    };

    let result = null;
    let error = null;
    try {
      result = await runner('ARCHITECT', taskInput, backend, null);
    } catch (err) {
      error = `Synthesis failed: ${err?.message || 'Unknown error'}`;
    }

    return {
      text: result?.outputText || best.response,
      tokensIn: result?.tokensIn ?? 0,
      tokensOut: result?.tokensOut ?? 0,
      error,
    };
  }

  /**
   * Compute consensus confidence from cross-rankings.
   * Higher agreement between rankers → higher confidence.
   *
   * confidence = agreement × quality, where
   *   - agreement is the share of rankers that gave the aggregate winner
   *     their own top score (a tie for top counts as agreeing), and
   *   - quality is the winner's mean score divided by the maximum score.
   * Rows without any positive score (e.g. the all-zero row left by a failed
   * ranker) are ignored. The previous max/mean ratio was always ≥ 1 and so
   * clamped to 1 for every non-empty ranking.
   *
   * @param {number[][]} rankings
   * @param {number} responseCount
   * @returns {number} 0-1 confidence score
   */
  _computeConfidence(rankings, responseCount) {
    if (rankings.length === 0 || responseCount === 0) return 0;

    const { index: winner } = this._pickBest(this._aggregateScores(rankings, responseCount));

    let voters = 0;
    let agreeing = 0;
    let winnerScoreSum = 0;
    for (const row of rankings) {
      // Same sanitising as _aggregateScores: fixed width, finite values only.
      const scores = Array.from({ length: responseCount }, (_, j) => {
        const score = row?.[j];
        return Number.isFinite(score) ? score : 0;
      });
      const rowMax = Math.max(...scores);
      if (rowMax <= 0) continue;

      voters += 1;
      winnerScoreSum += scores[winner];
      if (scores[winner] === rowMax) agreeing += 1;
    }

    if (voters === 0) return 0;

    const agreement = agreeing / voters;
    const quality = winnerScoreSum / voters / COUNCIL_MAX_SCORE;
    return Math.min(1, Math.max(0, agreement * quality));
  }
}
|
|
412
|
+
|
|
413
|
+
// ---------------------------------------------------------------------------
|
|
414
|
+
// runCouncil — convenience entry point
|
|
415
|
+
// ---------------------------------------------------------------------------
|
|
416
|
+
|
|
417
|
+
/**
 * Convenience entry point: build an `LLMCouncil` from `opts` and run one
 * evaluation of `task` on it.
 *
 * `members` and `agentRunner` configure the council; the full `opts` object
 * is also passed to `evaluate`, which reads `backend` and `dryRun` from it.
 *
 * @param {string} task
 * @param {object} [opts]
 * @param {number} [opts.members=3]
 * @param {string|null} [opts.backend]
 * @param {boolean} [opts.dryRun=false]
 * @param {Function} [opts.agentRunner]
 * @returns {Promise<CouncilResult>}
 */
export async function runCouncil(task, opts = {}) {
  const { members, agentRunner } = opts;
  const council = new LLMCouncil({ members: members ?? 3, agentRunner });
  return council.evaluate(task, opts);
}
|