mcp-rubber-duck 1.2.5 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.env.desktop.example +1 -1
- package/.env.pi.example +1 -1
- package/.env.template +1 -1
- package/.eslintrc.json +1 -0
- package/CHANGELOG.md +19 -0
- package/README.md +238 -44
- package/assets/mcp-rubber-duck.png +0 -0
- package/audit-ci.json +2 -1
- package/config/config.example.json +4 -4
- package/dist/config/config.js +4 -4
- package/dist/config/config.js.map +1 -1
- package/dist/config/types.d.ts +78 -0
- package/dist/config/types.d.ts.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +150 -0
- package/dist/server.js.map +1 -1
- package/dist/services/consensus.d.ts +28 -0
- package/dist/services/consensus.d.ts.map +1 -0
- package/dist/services/consensus.js +257 -0
- package/dist/services/consensus.js.map +1 -0
- package/dist/tools/duck-debate.d.ts +16 -0
- package/dist/tools/duck-debate.d.ts.map +1 -0
- package/dist/tools/duck-debate.js +272 -0
- package/dist/tools/duck-debate.js.map +1 -0
- package/dist/tools/duck-iterate.d.ts +14 -0
- package/dist/tools/duck-iterate.d.ts.map +1 -0
- package/dist/tools/duck-iterate.js +195 -0
- package/dist/tools/duck-iterate.js.map +1 -0
- package/dist/tools/duck-judge.d.ts +15 -0
- package/dist/tools/duck-judge.d.ts.map +1 -0
- package/dist/tools/duck-judge.js +208 -0
- package/dist/tools/duck-judge.js.map +1 -0
- package/dist/tools/duck-vote.d.ts +14 -0
- package/dist/tools/duck-vote.d.ts.map +1 -0
- package/dist/tools/duck-vote.js +46 -0
- package/dist/tools/duck-vote.js.map +1 -0
- package/docker-compose.yml +1 -1
- package/package.json +1 -1
- package/src/config/config.ts +4 -4
- package/src/config/types.ts +92 -0
- package/src/server.ts +154 -0
- package/src/services/consensus.ts +324 -0
- package/src/tools/duck-debate.ts +383 -0
- package/src/tools/duck-iterate.ts +253 -0
- package/src/tools/duck-judge.ts +301 -0
- package/src/tools/duck-vote.ts +87 -0
- package/tests/consensus.test.ts +282 -0
- package/tests/duck-debate.test.ts +286 -0
- package/tests/duck-iterate.test.ts +249 -0
- package/tests/duck-judge.test.ts +296 -0
- package/tests/duck-vote.test.ts +250 -0
|
@@ -0,0 +1,249 @@
|
|
|
1
|
+
import { describe, it, expect, jest, beforeEach } from '@jest/globals';
|
|
2
|
+
|
|
3
|
+
// The OpenAI SDK must be replaced BEFORE the provider modules are imported,
// so every client they construct is backed by the shared `mockCreate` spy.
const mockCreate = jest.fn();

jest.mock('openai', () => ({
  __esModule: true,
  // Stand-in for the SDK's default export: each constructed client exposes
  // only `chat.completions.create`, wired to the shared spy.
  default: jest.fn().mockImplementation(() => ({
    chat: { completions: { create: mockCreate } },
  })),
}));

// Auto-mock the config manager and the logger.
jest.mock('../src/config/config');
jest.mock('../src/utils/logger');
|
|
22
|
+
|
|
23
|
+
import { duckIterateTool } from '../src/tools/duck-iterate';
|
|
24
|
+
import { ProviderManager } from '../src/providers/manager';
|
|
25
|
+
import { ConfigManager } from '../src/config/config';
|
|
26
|
+
|
|
27
|
+
/**
 * Tests for `duckIterateTool`.
 *
 * Two providers ("openai" and "gemini") are built from a stubbed config and
 * both have their chat-completion `create` call pointed at the shared
 * `mockCreate` spy, so every model reply is scripted by the individual test.
 */
describe('duckIterateTool', () => {
  let mockProviderManager: ProviderManager;
  let mockConfigManager: jest.Mocked<ConfigManager>;

  /**
   * Builds the chat-completion payload shape used by every test in this suite.
   *
   * @param content - Text placed in the single choice's message.
   * @param model - Model name reported in the payload.
   * @returns A payload with one `stop` choice and fixed token usage.
   */
  const completion = (content: string, model: string) => ({
    choices: [{ message: { content }, finish_reason: 'stop' }],
    usage: { prompt_tokens: 10, completion_tokens: 20, total_tokens: 30 },
    model,
  });

  /**
   * Queues one scripted reply per `[content, model]` pair, in call order.
   *
   * @param replies - Replies to hand out on successive `create` calls.
   */
  const queueReplies = (...replies: Array<[string, string]>): void => {
    for (const [content, model] of replies) {
      mockCreate.mockResolvedValueOnce(completion(content, model));
    }
  };

  beforeEach(() => {
    jest.clearAllMocks();
    // clearAllMocks() only wipes recorded calls/results. Replies queued with
    // mockResolvedValueOnce() that a test never consumed would otherwise be
    // served to the next test, so reset the shared spy explicitly.
    mockCreate.mockReset();

    // Partial stub: only getConfig() is provided, hence the cast.
    mockConfigManager = {
      getConfig: jest.fn().mockReturnValue({
        providers: {
          openai: {
            api_key: 'key1',
            base_url: 'https://api.openai.com/v1',
            default_model: 'gpt-4',
            nickname: 'GPT-4',
            models: ['gpt-4'],
          },
          gemini: {
            api_key: 'key2',
            base_url: 'https://api.gemini.com/v1',
            default_model: 'gemini-pro',
            nickname: 'Gemini',
            models: ['gemini-pro'],
          },
        },
        default_provider: 'openai',
        cache_ttl: 300,
        enable_failover: true,
        default_temperature: 0.7,
      }),
    } as any;

    mockProviderManager = new ProviderManager(mockConfigManager);

    // Point both providers' `create` call at the shared spy.
    const provider1 = mockProviderManager.getProvider('openai');
    const provider2 = mockProviderManager.getProvider('gemini');
    provider1['client'].chat.completions.create = mockCreate;
    provider2['client'].chat.completions.create = mockCreate;
  });

  it('should throw error when prompt is missing', async () => {
    await expect(
      duckIterateTool(mockProviderManager, { providers: ['openai', 'gemini'], mode: 'refine' })
    ).rejects.toThrow('Prompt is required');
  });

  it('should throw error when providers count is not 2', async () => {
    // Too few providers.
    await expect(
      duckIterateTool(mockProviderManager, { prompt: 'Test', providers: ['openai'], mode: 'refine' })
    ).rejects.toThrow('Exactly 2 providers are required');

    // Too many providers.
    await expect(
      duckIterateTool(mockProviderManager, {
        prompt: 'Test',
        providers: ['openai', 'gemini', 'another'],
        mode: 'refine',
      })
    ).rejects.toThrow('Exactly 2 providers are required');
  });

  it('should throw error when mode is invalid', async () => {
    await expect(
      duckIterateTool(mockProviderManager, { prompt: 'Test', providers: ['openai', 'gemini'], mode: 'invalid' })
    ).rejects.toThrow('Mode must be either "refine" or "critique-improve"');
  });

  it('should throw error when iterations out of range', async () => {
    // Below the lower bound.
    await expect(
      duckIterateTool(mockProviderManager, {
        prompt: 'Test',
        providers: ['openai', 'gemini'],
        mode: 'refine',
        iterations: 0,
      })
    ).rejects.toThrow('Iterations must be between 1 and 10');

    // Above the upper bound.
    await expect(
      duckIterateTool(mockProviderManager, {
        prompt: 'Test',
        providers: ['openai', 'gemini'],
        mode: 'refine',
        iterations: 11,
      })
    ).rejects.toThrow('Iterations must be between 1 and 10');
  });

  it('should throw error when provider does not exist', async () => {
    await expect(
      duckIterateTool(mockProviderManager, {
        prompt: 'Test',
        providers: ['openai', 'nonexistent'],
        mode: 'refine',
      })
    ).rejects.toThrow('Provider "nonexistent" not found');
  });

  it('should perform refine iteration', async () => {
    queueReplies(
      ['Initial response about sorting', 'gpt-4'],
      ['Refined response with better explanation', 'gemini-pro'],
      ['Further refined with examples', 'gpt-4']
    );

    const result = await duckIterateTool(mockProviderManager, {
      prompt: 'Write a sorting algorithm',
      providers: ['openai', 'gemini'],
      mode: 'refine',
      iterations: 3,
    });

    expect(result.content).toHaveLength(1);
    expect(result.content[0].type).toBe('text');

    const text = result.content[0].text;
    expect(text).toContain('Iterative Refinement');
    expect(text).toContain('refine');
    expect(text).toContain('Round 1');
    expect(text).toContain('Round 2');
    expect(text).toContain('Round 3');
    expect(text).toContain('Final Response');
  });

  it('should perform critique-improve iteration', async () => {
    queueReplies(
      ['Initial implementation', 'gpt-4'],
      ['Critique: Missing edge cases, no error handling', 'gemini-pro'],
      ['Improved with edge cases and error handling', 'gpt-4']
    );

    const result = await duckIterateTool(mockProviderManager, {
      prompt: 'Write a function',
      providers: ['openai', 'gemini'],
      mode: 'critique-improve',
      iterations: 3,
    });

    const text = result.content[0].text;
    expect(text).toContain('critique-improve');
    expect(text).toContain('generator');
    expect(text).toContain('critic');
  });

  it('should use default iterations when not specified', async () => {
    queueReplies(['Response 1', 'gpt-4'], ['Response 2', 'gemini-pro'], ['Response 3', 'gpt-4']);

    const result = await duckIterateTool(mockProviderManager, {
      prompt: 'Test prompt',
      providers: ['openai', 'gemini'],
      mode: 'refine',
    });

    // Default is 3 iterations
    expect(mockCreate).toHaveBeenCalledTimes(3);
    expect(result.content[0].text).toContain('3 rounds completed');
  });

  it('should detect convergence and stop early', async () => {
    // Return very similar responses to trigger convergence
    const similarResponse =
      'This is the exact same response content that will be repeated to trigger convergence detection.';

    queueReplies([similarResponse, 'gpt-4'], [similarResponse, 'gemini-pro']);

    const result = await duckIterateTool(mockProviderManager, {
      prompt: 'Test',
      providers: ['openai', 'gemini'],
      mode: 'refine',
      iterations: 5,
    });

    const text = result.content[0].text;
    expect(text).toContain('converged');
    // Should stop at 2 rounds due to convergence, not 5
    expect(mockCreate).toHaveBeenCalledTimes(2);
  });

  it('should handle single iteration', async () => {
    queueReplies(['Single response', 'gpt-4']);

    const result = await duckIterateTool(mockProviderManager, {
      prompt: 'Test',
      providers: ['openai', 'gemini'],
      mode: 'refine',
      iterations: 1,
    });

    expect(mockCreate).toHaveBeenCalledTimes(1);
    expect(result.content[0].text).toContain('1 rounds completed');
  });
});
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
import { describe, it, expect, jest, beforeEach } from '@jest/globals';
|
|
2
|
+
|
|
3
|
+
// The OpenAI SDK must be replaced BEFORE the provider modules are imported,
// so every client they construct is backed by the shared `mockCreate` spy.
const mockCreate = jest.fn();

jest.mock('openai', () => ({
  __esModule: true,
  // Stand-in for the SDK's default export: each constructed client exposes
  // only `chat.completions.create`, wired to the shared spy.
  default: jest.fn().mockImplementation(() => ({
    chat: { completions: { create: mockCreate } },
  })),
}));

// Auto-mock the config manager and the logger.
jest.mock('../src/config/config');
jest.mock('../src/utils/logger');
|
|
22
|
+
|
|
23
|
+
import { duckJudgeTool } from '../src/tools/duck-judge';
|
|
24
|
+
import { ProviderManager } from '../src/providers/manager';
|
|
25
|
+
import { ConfigManager } from '../src/config/config';
|
|
26
|
+
import { DuckResponse } from '../src/config/types';
|
|
27
|
+
|
|
28
|
+
/**
 * Tests for `duckJudgeTool`.
 *
 * Two providers ("openai" and "gemini") are built from a stubbed config and
 * both have their chat-completion `create` call pointed at the shared
 * `mockCreate` spy, so the judge's reply is scripted by the individual test.
 */
describe('duckJudgeTool', () => {
  let mockProviderManager: ProviderManager;
  let mockConfigManager: jest.Mocked<ConfigManager>;

  // The two candidate answers handed to the judge in most tests.
  const mockResponses: DuckResponse[] = [
    {
      provider: 'openai',
      nickname: 'GPT-4',
      model: 'gpt-4',
      content: 'Response from GPT-4 about error handling using try-catch blocks.',
      latency: 1000,
      cached: false,
    },
    {
      provider: 'gemini',
      nickname: 'Gemini',
      model: 'gemini-pro',
      content: 'Response from Gemini about error handling using Result types.',
      latency: 1500,
      cached: false,
    },
  ];

  /**
   * Builds the chat-completion payload shape used for every judge reply.
   *
   * @param content - Raw text the judge "returns" (JSON or otherwise).
   * @param model - Model name reported in the payload.
   * @returns A payload with one `stop` choice and fixed token usage.
   */
  const judgeCompletion = (content: string, model = 'gpt-4') => ({
    choices: [
      {
        message: { content },
        finish_reason: 'stop',
      },
    ],
    usage: { prompt_tokens: 100, completion_tokens: 50, total_tokens: 150 },
    model,
  });

  /**
   * Queues a single judge reply for the next `create` call.
   *
   * @param content - Raw text the judge "returns".
   * @param model - Model name reported in the payload.
   */
  const queueJudgeReply = (content: string, model = 'gpt-4'): void => {
    mockCreate.mockResolvedValueOnce(judgeCompletion(content, model));
  };

  beforeEach(() => {
    jest.clearAllMocks();
    // clearAllMocks() only wipes recorded calls/results. Replies queued with
    // mockResolvedValueOnce() that a test never consumed would otherwise be
    // served to the next test, so reset the shared spy explicitly.
    mockCreate.mockReset();

    // Partial stub: only getConfig() is provided, hence the cast.
    mockConfigManager = {
      getConfig: jest.fn().mockReturnValue({
        providers: {
          openai: {
            api_key: 'key1',
            base_url: 'https://api.openai.com/v1',
            default_model: 'gpt-4',
            nickname: 'GPT-4',
            models: ['gpt-4'],
          },
          gemini: {
            api_key: 'key2',
            base_url: 'https://api.gemini.com/v1',
            default_model: 'gemini-pro',
            nickname: 'Gemini',
            models: ['gemini-pro'],
          },
        },
        default_provider: 'openai',
        cache_ttl: 300,
        enable_failover: true,
        default_temperature: 0.7,
      }),
    } as any;

    mockProviderManager = new ProviderManager(mockConfigManager);

    // Point both providers' `create` call at the shared spy.
    const provider1 = mockProviderManager.getProvider('openai');
    const provider2 = mockProviderManager.getProvider('gemini');
    provider1['client'].chat.completions.create = mockCreate;
    provider2['client'].chat.completions.create = mockCreate;
  });

  it('should throw error when responses are missing', async () => {
    await expect(
      duckJudgeTool(mockProviderManager, {})
    ).rejects.toThrow('At least one response is required');
  });

  it('should throw error when responses is empty array', async () => {
    await expect(
      duckJudgeTool(mockProviderManager, { responses: [] })
    ).rejects.toThrow('At least one response is required');
  });

  it('should throw error when only one response provided', async () => {
    await expect(
      duckJudgeTool(mockProviderManager, { responses: [mockResponses[0]] })
    ).rejects.toThrow('At least two responses are required');
  });

  it('should evaluate responses and return rankings', async () => {
    const judgeResponse = JSON.stringify({
      rankings: [
        { provider: 'gemini', score: 85, justification: 'Better type safety explanation' },
        { provider: 'openai', score: 75, justification: 'Good but less comprehensive' },
      ],
      criteria_scores: {
        gemini: { accuracy: 85, completeness: 90, clarity: 80 },
        openai: { accuracy: 75, completeness: 70, clarity: 80 },
      },
      summary: 'Gemini provided a more comprehensive response with better type safety coverage.',
    });

    queueJudgeReply(judgeResponse);

    const result = await duckJudgeTool(mockProviderManager, {
      responses: mockResponses,
    });

    expect(result.content).toHaveLength(1);
    expect(result.content[0].type).toBe('text');

    const text = result.content[0].text;
    expect(text).toContain('Judge Evaluation');
    expect(text).toContain('#1');
    expect(text).toContain('#2');
    expect(text).toContain('gemini');
    expect(text).toContain('85/100');
  });

  it('should use specified judge provider', async () => {
    const judgeResponse = JSON.stringify({
      rankings: [
        { provider: 'openai', score: 80, justification: 'Good response' },
        { provider: 'gemini', score: 70, justification: 'Okay response' },
      ],
      summary: 'OpenAI wins.',
    });

    queueJudgeReply(judgeResponse, 'gemini-pro');

    const result = await duckJudgeTool(mockProviderManager, {
      responses: mockResponses,
      judge: 'gemini',
    });

    const text = result.content[0].text;
    expect(text).toContain('Gemini');
  });

  it('should use custom criteria', async () => {
    const judgeResponse = JSON.stringify({
      rankings: [
        { provider: 'openai', score: 90, justification: 'Most secure' },
        { provider: 'gemini', score: 85, justification: 'Good security' },
      ],
      summary: 'Security focused evaluation.',
    });

    queueJudgeReply(judgeResponse);

    const result = await duckJudgeTool(mockProviderManager, {
      responses: mockResponses,
      criteria: ['security', 'performance', 'maintainability'],
    });

    const text = result.content[0].text;
    expect(text).toContain('security');
    expect(text).toContain('performance');
    expect(text).toContain('maintainability');
  });

  it('should handle persona parameter', async () => {
    const judgeResponse = JSON.stringify({
      rankings: [
        { provider: 'openai', score: 85, justification: 'Senior approved' },
        { provider: 'gemini', score: 80, justification: 'Good for juniors' },
      ],
      summary: 'From a senior perspective.',
    });

    queueJudgeReply(judgeResponse);

    const result = await duckJudgeTool(mockProviderManager, {
      responses: mockResponses,
      persona: 'senior engineer',
    });

    expect(result.content[0].text).toContain('Judge Evaluation');
    expect(mockCreate).toHaveBeenCalledTimes(1);
  });

  it('should handle invalid JSON gracefully with fallback', async () => {
    queueJudgeReply('This is not valid JSON at all, just some random text.');

    const result = await duckJudgeTool(mockProviderManager, {
      responses: mockResponses,
    });

    const text = result.content[0].text;
    expect(text).toContain('Judge Evaluation');
    expect(text).toContain('Unable to parse');
  });

  it('should handle JSON with extra text around it', async () => {
    // The JSON object is wrapped in surrounding chatter on separate lines.
    const judgeResponse = `Here is my evaluation:
{"rankings": [{"provider": "openai", "score": 90, "justification": "Best"}], "summary": "Done"}
Hope this helps!`;

    queueJudgeReply(judgeResponse);

    const result = await duckJudgeTool(mockProviderManager, {
      responses: mockResponses,
    });

    const text = result.content[0].text;
    expect(text).toContain('90/100');
    expect(text).toContain('openai');
  });

  it('should include missing providers in rankings', async () => {
    // Judge only ranks one provider
    const judgeResponse = JSON.stringify({
      rankings: [
        { provider: 'openai', score: 85, justification: 'Good' },
      ],
      summary: 'Only evaluated one.',
    });

    queueJudgeReply(judgeResponse);

    const result = await duckJudgeTool(mockProviderManager, {
      responses: mockResponses,
    });

    const text = result.content[0].text;
    // Should include both providers even though only one was ranked
    expect(text).toContain('openai');
    expect(text).toContain('gemini');
    expect(text).toContain('Not evaluated');
  });
});
|