mcp-rubber-duck 1.2.5 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.desktop.example +1 -1
- package/.env.pi.example +1 -1
- package/.env.template +1 -1
- package/.eslintrc.json +1 -0
- package/CHANGELOG.md +19 -0
- package/README.md +238 -44
- package/assets/mcp-rubber-duck.png +0 -0
- package/audit-ci.json +2 -1
- package/config/config.example.json +4 -4
- package/dist/config/config.js +4 -4
- package/dist/config/config.js.map +1 -1
- package/dist/config/types.d.ts +78 -0
- package/dist/config/types.d.ts.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +150 -0
- package/dist/server.js.map +1 -1
- package/dist/services/consensus.d.ts +28 -0
- package/dist/services/consensus.d.ts.map +1 -0
- package/dist/services/consensus.js +257 -0
- package/dist/services/consensus.js.map +1 -0
- package/dist/tools/duck-debate.d.ts +16 -0
- package/dist/tools/duck-debate.d.ts.map +1 -0
- package/dist/tools/duck-debate.js +272 -0
- package/dist/tools/duck-debate.js.map +1 -0
- package/dist/tools/duck-iterate.d.ts +14 -0
- package/dist/tools/duck-iterate.d.ts.map +1 -0
- package/dist/tools/duck-iterate.js +195 -0
- package/dist/tools/duck-iterate.js.map +1 -0
- package/dist/tools/duck-judge.d.ts +15 -0
- package/dist/tools/duck-judge.d.ts.map +1 -0
- package/dist/tools/duck-judge.js +208 -0
- package/dist/tools/duck-judge.js.map +1 -0
- package/dist/tools/duck-vote.d.ts +14 -0
- package/dist/tools/duck-vote.d.ts.map +1 -0
- package/dist/tools/duck-vote.js +46 -0
- package/dist/tools/duck-vote.js.map +1 -0
- package/docker-compose.yml +1 -1
- package/package.json +1 -1
- package/src/config/config.ts +4 -4
- package/src/config/types.ts +92 -0
- package/src/server.ts +154 -0
- package/src/services/consensus.ts +324 -0
- package/src/tools/duck-debate.ts +383 -0
- package/src/tools/duck-iterate.ts +253 -0
- package/src/tools/duck-judge.ts +301 -0
- package/src/tools/duck-vote.ts +87 -0
- package/tests/consensus.test.ts +282 -0
- package/tests/duck-debate.test.ts +286 -0
- package/tests/duck-iterate.test.ts +249 -0
- package/tests/duck-judge.test.ts +296 -0
- package/tests/duck-vote.test.ts +250 -0
|
@@ -0,0 +1,253 @@
|
|
|
1
|
+
import { ProviderManager } from '../providers/manager.js';
|
|
2
|
+
import { IterationRound, IterationResult } from '../config/types.js';
|
|
3
|
+
import { logger } from '../utils/logger.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Arguments accepted by the duck-iterate tool.
 */
export interface DuckIterateArgs {
  /** Task that both providers work on; must be a non-empty string. */
  prompt: string;
  /** Number of rounds to run; the tool defaults this and accepts 1 through 10. */
  iterations?: number;
  /** Exactly two provider names; the first one produces the initial response. */
  providers: [string, string];
  /** `refine` improves the answer every round; `critique-improve` alternates critique and improvement. */
  mode: 'refine' | 'critique-improve';
}
|
|
11
|
+
|
|
12
|
+
/** Number of rounds run when the caller does not pass `iterations`. */
const DEFAULT_ITERATIONS = 3;
|
|
13
|
+
/** Word-set overlap (shared words / all words) that two responses must exceed to count as converged. */
const CONVERGENCE_THRESHOLD = 0.8; // 80% similarity indicates convergence
|
|
14
|
+
|
|
15
|
+
/**
 * Runs an iterative refinement session between two providers.
 *
 * Round 1 is always generated by the first provider. Later rounds alternate
 * between the two providers: in `refine` mode every round rewrites the
 * previous answer; in `critique-improve` mode even rounds critique and odd
 * rounds produce an improved answer. The loop stops early once two
 * successive answers are judged to have converged.
 *
 * @param providerManager - Source of providers; used to ask questions and look up nicknames.
 * @param args - Untyped tool arguments, expected to match {@link DuckIterateArgs}.
 * @returns An object whose `content` holds one text item with the formatted report.
 * @throws Error if the prompt is missing, `providers` is not exactly two known
 *   provider names, `mode` is invalid, or `iterations` is not a number in 1..10.
 */
export async function duckIterateTool(
  providerManager: ProviderManager,
  args: Record<string, unknown>
) {
  const {
    prompt,
    iterations = DEFAULT_ITERATIONS,
    providers,
    mode,
  } = args as unknown as DuckIterateArgs;

  // Validate inputs
  if (!prompt || typeof prompt !== 'string') {
    throw new Error('Prompt is required');
  }

  if (!providers || !Array.isArray(providers) || providers.length !== 2) {
    throw new Error('Exactly 2 providers are required for iteration');
  }

  if (!mode || !['refine', 'critique-improve'].includes(mode)) {
    throw new Error('Mode must be either "refine" or "critique-improve"');
  }

  // `args` is untyped at runtime: NaN (or an object) would pass both range
  // comparisons below, so reject anything that is not a real number first.
  if (
    typeof iterations !== 'number' ||
    Number.isNaN(iterations) ||
    iterations < 1 ||
    iterations > 10
  ) {
    throw new Error('Iterations must be between 1 and 10');
  }

  // Validate providers exist
  const providerNames = providerManager.getProviderNames();
  for (const p of providers) {
    if (!providerNames.includes(p)) {
      throw new Error(`Provider "${p}" not found`);
    }
  }

  logger.info(`Starting ${mode} iteration with ${providers.join(' <-> ')} for ${iterations} rounds`);

  const rounds: IterationRound[] = [];
  let converged = false;

  // Round 1: Initial generation by provider A
  const initialResponse = await providerManager.askDuck(providers[0], prompt);
  const providerAInfo = providerManager.getProvider(providers[0]);

  rounds.push({
    round: 1,
    provider: providers[0],
    nickname: providerAInfo.nickname,
    role: 'generator',
    content: initialResponse.content,
    timestamp: new Date(),
  });

  // `lastResponse` is whatever the previous round produced (answer or critique)
  // and feeds the next prompt. `lastAnswer` only ever holds an actual answer,
  // so convergence and the final result never mistake a critique for one.
  let lastResponse = initialResponse.content;
  let lastAnswer = initialResponse.content;
  logger.info(`Round 1: ${providers[0]} generated initial response`);

  // Subsequent rounds: Alternate between providers
  for (let i = 2; i <= iterations; i++) {
    const isProviderA = i % 2 === 1;
    const currentProvider = isProviderA ? providers[0] : providers[1];
    const providerInfo = providerManager.getProvider(currentProvider);
    const role = mode === 'refine' ? 'refiner' : (i % 2 === 0 ? 'critic' : 'refiner');

    const iterationPrompt = buildIterationPrompt(prompt, lastResponse, mode, i, rounds);

    const response = await providerManager.askDuck(currentProvider, iterationPrompt);

    // Check for convergence between successive answers only; a critique is
    // not comparable to the answer it reviews.
    if (role !== 'critic') {
      if (checkConvergence(lastAnswer, response.content)) {
        converged = true;
        logger.info(`Convergence detected at round ${i}`);
      }
      lastAnswer = response.content;
    }

    rounds.push({
      round: i,
      provider: currentProvider,
      nickname: providerInfo.nickname,
      role,
      content: response.content,
      timestamp: new Date(),
    });

    lastResponse = response.content;
    logger.info(`Round ${i}: ${currentProvider} ${role === 'critic' ? 'critiqued' : 'refined'}`);

    if (converged) {
      break;
    }
  }

  const result: IterationResult = {
    prompt,
    mode,
    providers,
    rounds,
    // Always the most recent answer, even when the last round was a critique.
    finalResponse: lastAnswer,
    totalIterations: rounds.length,
    converged,
  };

  // Format output
  const formattedOutput = formatIterationResult(result);

  logger.info(`Iteration completed: ${rounds.length} rounds, converged: ${converged}`);

  return {
    content: [
      {
        type: 'text',
        text: formattedOutput,
      },
    ],
  };
}
|
|
132
|
+
|
|
133
|
+
/**
 * Builds the prompt sent to a provider for round 2 and later.
 *
 * - `refine` mode: asks for an improved version of `previousResponse`.
 * - `critique-improve` mode, even round: asks for a critique of `previousResponse`.
 * - `critique-improve` mode, odd round: asks for an improved answer, using the
 *   last recorded round as the critique and the round before it as the answer
 *   being improved (falling back to `previousResponse` when that is missing).
 *
 * @param originalPrompt - The task the user originally asked for.
 * @param previousResponse - Content produced by the preceding round.
 * @param mode - Iteration mode.
 * @param round - 1-based number of the round being prompted.
 * @param previousRounds - Rounds recorded so far, oldest first.
 * @returns The prompt text.
 */
function buildIterationPrompt(
  originalPrompt: string,
  previousResponse: string,
  mode: 'refine' | 'critique-improve',
  round: number,
  previousRounds: IterationRound[]
): string {
  if (mode === 'refine') {
    return `You are refining a response through iterative improvement.

ORIGINAL TASK:
${originalPrompt}

PREVIOUS RESPONSE (Round ${round - 1}):
${previousResponse}

YOUR TASK:
Improve upon the previous response. Make it:
- More accurate
- More complete
- Clearer and better structured
- More practical and actionable

Provide your improved version directly. Do not explain what you changed - just give the improved response.`;
  }

  // critique-improve mode: even rounds critique, odd rounds improve
  const criticTurn = round % 2 === 0;

  if (criticTurn) {
    return `You are a critical reviewer evaluating a response.

ORIGINAL TASK:
${originalPrompt}

RESPONSE TO CRITIQUE:
${previousResponse}

YOUR TASK:
Provide a thorough critique of this response:
1. Identify specific weaknesses, errors, or gaps
2. Point out unclear or confusing parts
3. Suggest concrete improvements
4. Note any missing considerations

Be constructive but thorough. Format as a bulleted critique.`;
  }

  // Improvement round: the newest round is the critique, the one before it
  // is the answer that critique was written about.
  const total = previousRounds.length;
  const critiqueText = previousRounds[total - 1]?.content || '';
  const answerText = previousRounds[total - 2]?.content || previousResponse;

  return `You are improving a response based on critical feedback.

ORIGINAL TASK:
${originalPrompt}

PREVIOUS RESPONSE:
${answerText}

CRITIQUE RECEIVED:
${critiqueText}

YOUR TASK:
Create an improved response that addresses the critique points while maintaining the strengths of the original. Provide only the improved response, not meta-commentary.`;
}
|
|
200
|
+
|
|
201
|
+
/**
 * Decides whether two successive responses are similar enough to stop iterating.
 *
 * Similarity is the overlap of the two case-insensitive word sets (shared
 * words divided by all distinct words). The responses count as converged when
 * that overlap exceeds `CONVERGENCE_THRESHOLD` and the shorter text is more
 * than 80% of the length of the longer one.
 *
 * @param previous - The earlier response.
 * @param current - The newer response.
 * @returns `true` when the responses are considered converged.
 */
function checkConvergence(previous: string, current: string): boolean {
  // Identical texts are converged by definition. This also covers two empty
  // strings, where the length ratio below would be 0 / 0 (NaN).
  if (previous === current) {
    return true;
  }

  // Drop the empty tokens that leading/trailing whitespace would otherwise
  // add to the word sets.
  const toWordSet = (text: string): Set<string> =>
    new Set(text.toLowerCase().split(/\s+/).filter(word => word.length > 0));

  const prevWords = toWordSet(previous);
  const currWords = toWordSet(current);

  const sharedCount = [...prevWords].filter(word => currWords.has(word)).length;
  const union = new Set([...prevWords, ...currWords]);

  // Two texts without any words have nothing that differs word-wise.
  const similarity = union.size === 0 ? 1 : sharedCount / union.size;

  // Also check if lengths are similar. The strings differ here, so at least
  // one of them is non-empty and the divisor cannot be zero.
  const lengthRatio =
    Math.min(previous.length, current.length) / Math.max(previous.length, current.length);

  return similarity > CONVERGENCE_THRESHOLD && lengthRatio > 0.8;
}
|
|
216
|
+
|
|
217
|
+
/**
 * Renders an iteration result as the markdown-style report returned to the caller.
 *
 * The report lists the mode, providers and round count, then each round with
 * its content cut to 500 characters, then the untruncated final response.
 *
 * @param result - The completed iteration result.
 * @returns The formatted report text.
 */
function formatIterationResult(result: IterationResult): string {
  const heavyRule = `═══════════════════════════════════════`;
  const lightRule = `─────────────────────────────────────`;
  const chunks: string[] = [];

  // Header
  chunks.push(`🔄 **Iterative Refinement Results**\n`);
  chunks.push(`${heavyRule}\n\n`);
  chunks.push(`**Mode:** ${result.mode}\n`);
  chunks.push(`**Providers:** ${result.providers.join(' ↔ ')}\n`);
  chunks.push(`**Iterations:** ${result.totalIterations}`);
  if (result.converged) {
    chunks.push(` (converged early ✓)`);
  }
  chunks.push(`\n\n`);

  // Per-round history
  chunks.push(`**Iteration History:**\n`);
  chunks.push(`${lightRule}\n`);

  for (const entry of result.rounds) {
    let roleEmoji = '✨';
    if (entry.role === 'generator') {
      roleEmoji = '🎯';
    } else if (entry.role === 'critic') {
      roleEmoji = '🔍';
    }
    chunks.push(`\n${roleEmoji} **Round ${entry.round}: ${entry.nickname}** (${entry.role})\n`);

    // Long content is shortened in the history; the final response below is not.
    let shown = entry.content;
    if (entry.content.length > 500) {
      shown = entry.content.substring(0, 500) + '...[truncated]';
    }
    chunks.push(`${shown}\n`);
  }

  // Final response and footer
  chunks.push(`\n${heavyRule}\n`);
  chunks.push(`🏁 **Final Response:**\n`);
  chunks.push(`${lightRule}\n`);
  chunks.push(`${result.finalResponse}\n`);
  chunks.push(`\n${heavyRule}\n`);
  chunks.push(`📊 ${result.totalIterations} rounds completed\n`);

  return chunks.join('');
}
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
import { ProviderManager } from '../providers/manager.js';
|
|
2
|
+
import { DuckResponse, JudgeEvaluation } from '../config/types.js';
|
|
3
|
+
import { logger } from '../utils/logger.js';
|
|
4
|
+
|
|
5
|
+
/**
 * Arguments accepted by the duck-judge tool.
 */
export interface DuckJudgeArgs {
  /** Responses to compare; the tool requires at least two. */
  responses: DuckResponse[];
  /** Provider that acts as judge; the tool picks the first available provider when omitted. */
  judge?: string;
  /** Names of the criteria to score against; the tool supplies defaults when omitted. */
  criteria?: string[];
  /** Optional role description placed at the start of the judge prompt. */
  persona?: string;
}
|
|
11
|
+
|
|
12
|
+
/**
 * Shape of the JSON object the judge is asked to return.
 * Field names mirror the JSON keys requested in the judge prompt.
 */
interface ParsedJudgment {
  /** One entry per judged response, as ordered by the judge. */
  rankings: {
    provider: string;
    score: number;
    justification: string;
  }[];
  /** Per-provider map of criterion name to score; optional in the reply. */
  criteria_scores?: Record<string, Record<string, number>>;
  /** The judge's overall assessment. */
  summary: string;
}
|
|
21
|
+
|
|
22
|
+
/** Criteria the judge scores against when the caller does not supply any. */
const DEFAULT_CRITERIA = ['accuracy', 'completeness', 'clarity'];
|
|
23
|
+
|
|
24
|
+
/**
 * Asks one provider to act as judge and rank a set of responses.
 *
 * Builds a judging prompt, sends it to the judge provider, parses the reply
 * into rankings and returns a formatted report.
 *
 * @param providerManager - Source of providers; used to pick and query the judge.
 * @param args - Untyped tool arguments, expected to match {@link DuckJudgeArgs}.
 * @returns An object whose `content` holds one text item with the formatted evaluation.
 * @throws Error if fewer than two responses are supplied or no judge provider is available.
 */
export async function duckJudgeTool(
  providerManager: ProviderManager,
  args: Record<string, unknown>
) {
  const {
    responses,
    judge,
    criteria,
    persona,
  } = args as unknown as DuckJudgeArgs;

  // Validate inputs
  if (!responses || !Array.isArray(responses) || responses.length === 0) {
    throw new Error('At least one response is required to judge');
  }

  if (responses.length === 1) {
    throw new Error('At least two responses are required for comparison');
  }

  // `args` is untyped at runtime. A destructuring default only covers
  // `undefined`, so `null`, a non-array or an empty/blank list would either
  // crash in `.map` or yield a prompt without criteria. Fall back to the
  // defaults in all of those cases.
  const usableCriteria = Array.isArray(criteria)
    ? criteria.filter(c => typeof c === 'string' && c.trim() !== '')
    : [];
  const effectiveCriteria = usableCriteria.length > 0 ? usableCriteria : DEFAULT_CRITERIA;

  // Determine judge provider
  const judgeProvider = judge || providerManager.getProviderNames()[0];
  if (!judgeProvider) {
    throw new Error('No judge provider available');
  }

  logger.info(`Starting judgment with ${judgeProvider} on ${responses.length} responses`);

  // Build the judgment prompt
  const prompt = buildJudgePrompt(responses, effectiveCriteria, persona);

  // Get judgment from the judge duck
  const judgeResponse = await providerManager.askDuck(judgeProvider, prompt);

  // Parse the judgment
  const evaluation = parseJudgment(
    judgeResponse.content,
    judgeResponse.provider,
    judgeResponse.nickname,
    responses,
    effectiveCriteria
  );

  // Format output
  const formattedOutput = formatJudgeResult(evaluation);

  logger.info(
    `Judgment completed by ${judgeProvider}: #1 is ${evaluation.rankings[0]?.provider || 'unknown'}`
  );

  return {
    content: [
      {
        type: 'text',
        text: formattedOutput,
      },
    ],
  };
}
|
|
83
|
+
|
|
84
|
+
/**
 * Builds the prompt that asks the judge to score and rank the responses.
 *
 * The prompt lists every response with its nickname and provider, numbers the
 * criteria, and asks for a JSON-only reply with `rankings`, `criteria_scores`
 * and `summary` keys.
 *
 * @param responses - The responses to be judged.
 * @param criteria - Criterion names to evaluate against.
 * @param persona - Optional role description; when given, it opens the prompt.
 * @returns The prompt text.
 */
function buildJudgePrompt(
  responses: DuckResponse[],
  criteria: string[],
  persona?: string
): string {
  const numberedCriteria = criteria
    .map((name, position) => `${position + 1}. ${name}`)
    .join('\n');

  const responseSections = responses
    .map(
      (entry, position) =>
        `--- Response ${position + 1} (${entry.nickname} / ${entry.provider}) ---\n${entry.content}\n`
    )
    .join('\n');

  let personaIntro = '';
  if (persona) {
    personaIntro = `You are a ${persona} evaluating these responses.\n\n`;
  }

  // Example entry for the per-criterion scores of one provider.
  const criteriaExample = criteria.map(name => `"${name}": <0-100>`).join(', ');

  return `${personaIntro}You are a judge evaluating ${responses.length} responses to the same prompt.

RESPONSES TO EVALUATE:
${responseSections}

EVALUATION CRITERIA:
${numberedCriteria}

INSTRUCTIONS:
1. Evaluate each response against ALL criteria
2. Assign a score from 0-100 for each response
3. Rank responses from best to worst
4. Provide a brief justification for each ranking
5. Give a final summary

Respond with ONLY a JSON object in this exact format:
{
"rankings": [
{"provider": "<provider name>", "score": <0-100>, "justification": "<brief explanation>"},
{"provider": "<provider name>", "score": <0-100>, "justification": "<brief explanation>"}
],
"criteria_scores": {
"<provider>": {${criteriaExample}}
},
"summary": "<overall assessment and recommendation>"
}

IMPORTANT:
- Rankings must be ordered from highest score to lowest
- Use the exact provider names from the responses
- Do NOT include any text before or after the JSON
- Do NOT use markdown code blocks`;
}
|
|
132
|
+
|
|
133
|
+
/**
 * Maps a provider name written by the judge back to one of the original responses.
 *
 * Matching is case-insensitive and tried in this order:
 * 1. the name equals a response's provider;
 * 2. the name equals a response's nickname;
 * 3. the name contains a provider or nickname, where the longest contained
 *    identifier wins, so "gpt4" is preferred over "gpt" for "Response 2 (gpt4)".
 *
 * Empty or non-string names never match, and empty identifiers on a response
 * are ignored so they cannot match every name.
 *
 * @param judgeProviderName - Name as it appeared in the judge's reply.
 * @param originalResponses - The responses that were submitted for judging.
 * @returns The matching response, or `undefined` when nothing matches.
 */
function matchProvider(
  judgeProviderName: string,
  originalResponses: DuckResponse[]
): DuckResponse | undefined {
  // The name comes from model-generated JSON, so it may not be a string at all.
  if (typeof judgeProviderName !== 'string') {
    return undefined;
  }

  const nameLower = judgeProviderName.trim().toLowerCase();
  if (nameLower === '') {
    return undefined;
  }

  // Non-string identifiers normalize to '', which can never equal `nameLower`.
  const normalize = (value: unknown): string =>
    typeof value === 'string' ? value.toLowerCase() : '';

  // Try exact provider match first
  const exactMatch = originalResponses.find(r => normalize(r.provider) === nameLower);
  if (exactMatch) return exactMatch;

  // Then exact nickname match
  const nicknameMatch = originalResponses.find(r => normalize(r.nickname) === nameLower);
  if (nicknameMatch) return nicknameMatch;

  // Finally look for an identifier contained in the judge's text and keep the
  // longest one, so a short name that is a prefix of another does not win.
  let bestMatch: DuckResponse | undefined;
  let bestLength = 0;
  for (const candidate of originalResponses) {
    for (const identifier of [normalize(candidate.provider), normalize(candidate.nickname)]) {
      if (identifier.length > bestLength && nameLower.includes(identifier)) {
        bestMatch = candidate;
        bestLength = identifier.length;
      }
    }
  }

  return bestMatch;
}
|
|
158
|
+
|
|
159
|
+
/**
 * Converts the judge's raw reply into a `JudgeEvaluation`.
 *
 * The first `{ ... }` span in the reply is parsed as JSON. Ranking entries are
 * matched back to the original responses; entries that are malformed, cannot
 * be matched, or repeat an already ranked provider are skipped. Any original
 * response the judge did not rank is appended with a score of 0. When no JSON
 * is found or parsing fails, a fallback evaluation is returned instead.
 *
 * @param response - Raw text returned by the judge.
 * @param judgeProvider - Provider name of the judge.
 * @param judgeNickname - Nickname of the judge.
 * @param originalResponses - The responses that were submitted for judging.
 * @param criteria - Criterion names used for the evaluation.
 * @returns The parsed (or fallback) evaluation.
 */
function parseJudgment(
  response: string,
  judgeProvider: string,
  judgeNickname: string,
  originalResponses: DuckResponse[],
  criteria: string[]
): JudgeEvaluation {
  const evaluation: JudgeEvaluation = {
    judge: judgeProvider,
    judgeNickname: judgeNickname,
    prompt: '', // Will be filled by caller if needed
    criteria,
    rankings: [],
    criteriaScores: {},
    summary: '',
    rawResponse: response,
  };

  try {
    // Try to extract JSON from the response
    const jsonMatch = response.match(/\{[\s\S]*\}/);
    if (!jsonMatch) {
      logger.warn(`No JSON found in judge response from ${judgeProvider}`);
      return createFallbackEvaluation(evaluation, originalResponses, response);
    }

    // The reply is model-generated, so every field is treated as possibly
    // missing or of the wrong type.
    const parsed = JSON.parse(jsonMatch[0]) as Partial<ParsedJudgment>;
    const matchedProviders = new Set<string>();

    // Parse rankings
    if (Array.isArray(parsed.rankings)) {
      for (const entry of parsed.rankings) {
        // Skip a malformed entry instead of discarding the whole judgment.
        if (!entry || typeof entry.provider !== 'string') {
          continue;
        }

        const matched = matchProvider(entry.provider, originalResponses);
        if (!matched || matchedProviders.has(matched.provider)) {
          continue;
        }
        matchedProviders.add(matched.provider);

        // Accept numeric strings such as "85"; anything else scores 0.
        const rawScore: unknown = entry.score;
        const numericScore =
          typeof rawScore === 'string' ? Number.parseFloat(rawScore) : rawScore;
        const score =
          typeof numericScore === 'number' && Number.isFinite(numericScore)
            ? Math.max(0, Math.min(100, numericScore))
            : 0;

        evaluation.rankings.push({
          provider: matched.provider,
          nickname: matched.nickname,
          // Rank by accepted position, not by the judge's array index, so that
          // skipped entries leave no gaps and appended entries cannot collide.
          rank: evaluation.rankings.length + 1,
          score,
          justification: entry.justification?.toString() || '',
        });
      }
    }

    // Parse criteria scores, keeping only providers whose value is a plain
    // object so the formatter can safely iterate over it.
    const rawCriteriaScores: unknown = parsed.criteria_scores;
    if (
      rawCriteriaScores &&
      typeof rawCriteriaScores === 'object' &&
      !Array.isArray(rawCriteriaScores)
    ) {
      for (const [providerName, scores] of Object.entries(rawCriteriaScores)) {
        if (scores && typeof scores === 'object' && !Array.isArray(scores)) {
          evaluation.criteriaScores[providerName] = scores as Record<string, number>;
        }
      }
    }

    // Parse summary
    if (parsed.summary) {
      evaluation.summary = String(parsed.summary);
    }

  } catch (error) {
    logger.warn(`Failed to parse JSON judgment from ${judgeProvider}:`, error);
    return createFallbackEvaluation(evaluation, originalResponses, response);
  }

  // Ensure all original responses are represented
  const rankedProviders = new Set(evaluation.rankings.map(r => r.provider));
  for (const resp of originalResponses) {
    if (!rankedProviders.has(resp.provider)) {
      evaluation.rankings.push({
        provider: resp.provider,
        nickname: resp.nickname,
        rank: evaluation.rankings.length + 1,
        score: 0,
        justification: 'Not evaluated by judge',
      });
    }
  }

  return evaluation;
}
|
|
236
|
+
|
|
237
|
+
/**
 * Fills in a neutral evaluation for use when the judge's reply cannot be parsed.
 *
 * Every original response is ranked in its submitted order with a score of 50.
 * The passed-in `evaluation` object is modified in place and returned.
 *
 * @param evaluation - Evaluation to overwrite.
 * @param originalResponses - The responses that were submitted for judging.
 * @param rawResponse - The judge's unparsed reply, kept for review.
 * @returns The same `evaluation` object.
 */
function createFallbackEvaluation(
  evaluation: JudgeEvaluation,
  originalResponses: DuckResponse[],
  rawResponse: string
): JudgeEvaluation {
  const neutralRankings: JudgeEvaluation['rankings'] = [];

  for (const [position, original] of originalResponses.entries()) {
    neutralRankings.push({
      provider: original.provider,
      nickname: original.nickname,
      rank: position + 1,
      score: 50,
      justification: 'Unable to parse judge response',
    });
  }

  evaluation.rankings = neutralRankings;
  evaluation.summary = `Judge evaluation parsing failed. Raw response available for review.`;
  evaluation.rawResponse = rawResponse;

  return evaluation;
}
|
|
254
|
+
|
|
255
|
+
/**
 * Renders a judge evaluation as the markdown-style report returned to the caller.
 *
 * The report contains the judge and criteria, one entry per ranking with a
 * ten-segment score bar, an optional per-criterion breakdown, and the summary.
 *
 * @param evaluation - The evaluation to render.
 * @returns The formatted report text.
 */
function formatJudgeResult(evaluation: JudgeEvaluation): string {
  let output = `⚖️ **Judge Evaluation**\n`;
  output += `═══════════════════════════════════════\n\n`;
  output += `**Judge:** ${evaluation.judgeNickname} (${evaluation.judge})\n`;
  output += `**Criteria:** ${evaluation.criteria.join(', ')}\n\n`;

  // Rankings
  output += `**Rankings:**\n`;
  output += `─────────────────────────────────────\n`;

  for (const ranking of evaluation.rankings) {
    const medal = ranking.rank === 1 ? '🥇' : ranking.rank === 2 ? '🥈' : ranking.rank === 3 ? '🥉' : ' ';

    // Keep the number of filled segments within 0..10. `String.repeat`
    // throws a RangeError for a negative count, which an out-of-range or
    // NaN score would otherwise produce.
    const tenths = Math.max(0, Math.min(10, Math.floor(ranking.score / 10)));
    const filled = Number.isNaN(tenths) ? 0 : tenths;
    const bar = '█'.repeat(filled);
    const emptyBar = '░'.repeat(10 - filled);

    output += `${medal} **#${ranking.rank} ${ranking.nickname}** (${ranking.provider})\n`;
    output += ` Score: ${bar}${emptyBar} ${ranking.score}/100\n`;
    output += ` 💭 "${ranking.justification}"\n\n`;
  }

  // Criteria breakdown if available. Providers whose scores are not an object
  // (for example `null` in the judge's JSON) are left out, because
  // `Object.entries` would throw on them.
  const breakdown = Object.entries(evaluation.criteriaScores ?? {}).filter(
    ([, scores]) => scores !== null && typeof scores === 'object'
  );

  if (breakdown.length > 0) {
    output += `**Criteria Breakdown:**\n`;
    output += `─────────────────────────────────────\n`;

    for (const [provider, scores] of breakdown) {
      output += `📊 **${provider}:**\n`;
      for (const [criterion, score] of Object.entries(scores)) {
        const criterionScore = typeof score === 'number' ? score : 0;
        output += ` • ${criterion}: ${criterionScore}/100\n`;
      }
      output += `\n`;
    }
  }

  // Summary
  if (evaluation.summary) {
    output += `**Summary:**\n`;
    output += `─────────────────────────────────────\n`;
    output += `${evaluation.summary}\n\n`;
  }

  output += `═══════════════════════════════════════\n`;
  output += `📋 Evaluated ${evaluation.rankings.length} responses\n`;

  return output;
}
|
|
@@ -0,0 +1,87 @@
|
|
|
1
|
+
import { ProviderManager } from '../providers/manager.js';
|
|
2
|
+
import { ConsensusService } from '../services/consensus.js';
|
|
3
|
+
import { VoteResult } from '../config/types.js';
|
|
4
|
+
import { logger } from '../utils/logger.js';
|
|
5
|
+
|
|
6
|
+
/**
 * Arguments accepted by the duck-vote tool.
 */
export interface DuckVoteArgs {
  /** The question being voted on; must be a non-empty string. */
  question: string;
  /** The choices to vote between; the tool accepts 2 to 10 of them. */
  options: string[];
  /** Provider names that vote; all providers vote when omitted or empty. */
  voters?: string[];
  /** Whether voters are asked to explain their choice; the tool defaults this to `true`. */
  require_reasoning?: boolean;
}
|
|
12
|
+
|
|
13
|
+
/**
 * Lets several providers vote on a question and reports the aggregated outcome.
 *
 * The vote prompt is sent to all voters through `compareDucks`; each reply is
 * parsed into a vote, the votes are aggregated, and the formatted result is returned.
 *
 * @param providerManager - Source of providers; supplies the voter list and collects replies.
 * @param args - Untyped tool arguments, expected to match {@link DuckVoteArgs}.
 * @returns An object whose `content` holds one text item with the formatted vote result.
 * @throws Error if the question is missing, the number of options is outside
 *   2..10, or no voters are available.
 */
export async function duckVoteTool(
  providerManager: ProviderManager,
  args: Record<string, unknown>
) {
  const voteArgs = args as unknown as DuckVoteArgs;
  const { question, options, voters } = voteArgs;
  const { require_reasoning = true } = voteArgs;

  // Validate inputs
  if (!question || typeof question !== 'string') {
    throw new Error('Question is required');
  }

  const hasEnoughOptions = Boolean(options) && Array.isArray(options) && options.length >= 2;
  if (!hasEnoughOptions) {
    throw new Error('At least 2 options are required');
  }

  if (options.length > 10) {
    throw new Error('Maximum 10 options allowed');
  }

  // Get voters (all providers if not specified)
  let voterNames: string[];
  if (voters && voters.length > 0) {
    voterNames = voters;
  } else {
    voterNames = providerManager.getProviderNames();
  }

  if (voterNames.length === 0) {
    throw new Error('No voters available');
  }

  logger.info(`Starting vote with ${voterNames.length} voters on: "${question}"`);

  const consensus = new ConsensusService();
  const ballotPrompt = consensus.buildVotePrompt(question, options, require_reasoning);

  // Get votes from all ducks in parallel
  const replies = await providerManager.compareDucks(ballotPrompt, voterNames);

  // Parse votes
  const votes: VoteResult[] = replies.map(reply =>
    consensus.parseVote(reply.content, reply.provider, reply.nickname, options)
  );

  // Aggregate results
  const tally = consensus.aggregateVotes(question, options, votes);

  // Format output
  const formattedOutput = consensus.formatVoteResult(tally);

  logger.info(
    `Vote completed: ${tally.consensusLevel} consensus, ` +
    `winner: ${tally.winner || 'none'}`
  );

  return {
    content: [
      {
        type: 'text',
        text: formattedOutput,
      },
    ],
  };
}
|