@yeaft/webchat-agent 0.1.410 → 0.1.412
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/unify/config.js +36 -0
- package/unify/engine.js +124 -16
- package/unify/eval/cases/e2e.js +154 -0
- package/unify/eval/cases/memory.js +182 -0
- package/unify/eval/cases/skills.js +51 -0
- package/unify/eval/cases/tool-use.js +356 -0
- package/unify/eval/run-eval.js +250 -0
- package/unify/eval/runner.js +525 -0
- package/unify/index.js +2 -1
- package/unify/prompts.js +6 -0
- package/unify/session.js +191 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* eval/cases/tool-use.js — Tool use eval cases
|
|
3
|
+
*
|
|
4
|
+
* Tests whether the model correctly decides when and how to call tools.
|
|
5
|
+
* These are the most important evals — they catch regressions in:
|
|
6
|
+
* - Tool selection (right tool for the job)
|
|
7
|
+
* - Parameter extraction (correct input from natural language)
|
|
8
|
+
* - Tool avoidance (not calling tools when unnecessary)
|
|
9
|
+
* - Multi-tool orchestration (using multiple tools in sequence)
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { defineTool } from '../../tools/types.js';
|
|
13
|
+
import {
|
|
14
|
+
noError,
|
|
15
|
+
toolWasCalled,
|
|
16
|
+
toolCalledWith,
|
|
17
|
+
toolNotCalled,
|
|
18
|
+
toolSucceeded,
|
|
19
|
+
turnCountInRange,
|
|
20
|
+
containsText,
|
|
21
|
+
custom,
|
|
22
|
+
} from '../runner.js';
|
|
23
|
+
|
|
24
|
+
// ─── Mock Tools for Evals ────────────────────────────────────
|
|
25
|
+
|
|
26
|
+
/**
 * Mock `search` tool: echoes the query back inside a single canned result so
 * eval runs never touch the network.
 */
const searchTool = defineTool({
  name: 'search',
  description: 'Search the web for information. Returns search results as text.',
  parameters: {
    type: 'object',
    properties: {
      query: { type: 'string', description: 'The search query' },
    },
    required: ['query'],
  },
  modes: ['chat', 'work'],
  /**
   * @param {{ query?: unknown }} [input] - Tool input produced by the model.
   * @returns {Promise<string>} JSON text with one synthetic search result.
   */
  async execute(input) {
    // Model-produced input is not guaranteed to honour the schema; coerce so a
    // numeric or missing query yields a result instead of a TypeError.
    // The echoed query is capped at 200 characters.
    const q = String(input?.query ?? '').slice(0, 200);
    return JSON.stringify({
      results: [
        { title: `Result for: ${q}`, snippet: `Information about ${q}` },
      ],
    });
  },
});
|
|
46
|
+
|
|
47
|
+
// Functions the mock calculator accepts, callable as `sqrt(x)` or `Math.sqrt(x)`.
const CALCULATOR_FUNCTIONS = new Map(
  [
    'abs', 'sqrt', 'cbrt', 'pow', 'min', 'max', 'floor', 'ceil', 'round', 'trunc',
    'sign', 'exp', 'log', 'log2', 'log10', 'sin', 'cos', 'tan', 'hypot',
  ].map((name) => [name, Math[name]]),
);

// Named constants the mock calculator accepts (with or without `Math.`).
const CALCULATOR_CONSTANTS = new Map([
  ['PI', Math.PI],
  ['pi', Math.PI],
  ['E', Math.E],
  ['e', Math.E],
]);

/**
 * Splits an arithmetic expression into number / operator / name tokens.
 *
 * @param {string} text - Expression with no leading or trailing whitespace.
 * @returns {Array<{type: string, value: number|string}>} Tokens in source order.
 * @throws {SyntaxError} When a character outside the calculator grammar is found.
 */
function tokenizeMathExpression(text) {
  const tokenPattern =
    /\s*(?:((?:\d+(?:\.\d*)?|\.\d+)(?:[eE][+-]?\d+)?)|(\*\*|[-+*\/%(),])|([A-Za-z_][\w.]*))/y;
  const tokens = [];
  let position = 0;
  while (position < text.length) {
    tokenPattern.lastIndex = position;
    const match = tokenPattern.exec(text);
    if (match === null) {
      throw new SyntaxError(`unexpected character at offset ${position}`);
    }
    const [, number, operator, name] = match;
    if (number !== undefined) {
      tokens.push({ type: 'number', value: Number(number) });
    } else if (operator !== undefined) {
      tokens.push({ type: 'operator', value: operator });
    } else {
      tokens.push({ type: 'name', value: name });
    }
    position = tokenPattern.lastIndex;
  }
  return tokens;
}

/**
 * Evaluates `+ - * / % **`, unary signs, parentheses, and the whitelisted
 * functions/constants above using a small recursive-descent parser. Nothing
 * from the expression is ever executed as JavaScript.
 *
 * @param {unknown} expression - Expression text supplied by the model.
 * @returns {number} The numeric result (may be Infinity or NaN, as in JS).
 * @throws {TypeError|SyntaxError} When the input is not a valid expression.
 */
function evaluateMathExpression(expression) {
  if (typeof expression !== 'string') {
    throw new TypeError('expression must be a string');
  }
  const tokens = tokenizeMathExpression(expression.trim());
  let index = 0;

  const isOperator = (symbol) =>
    tokens[index]?.type === 'operator' && tokens[index].value === symbol;
  const expectOperator = (symbol) => {
    if (!isOperator(symbol)) throw new SyntaxError(`expected "${symbol}"`);
    index += 1;
  };

  function parseSum() {
    let total = parseProduct();
    while (isOperator('+') || isOperator('-')) {
      const symbol = tokens[index].value;
      index += 1;
      const right = parseProduct();
      total = symbol === '+' ? total + right : total - right;
    }
    return total;
  }

  function parseProduct() {
    let total = parseUnary();
    while (isOperator('*') || isOperator('/') || isOperator('%')) {
      const symbol = tokens[index].value;
      index += 1;
      const right = parseUnary();
      if (symbol === '*') total *= right;
      else if (symbol === '/') total /= right;
      else total %= right;
    }
    return total;
  }

  function parseUnary() {
    if (isOperator('-')) {
      index += 1;
      return -parseUnary();
    }
    if (isOperator('+')) {
      index += 1;
      return parseUnary();
    }
    return parsePower();
  }

  function parsePower() {
    const base = parsePrimary();
    if (isOperator('**')) {
      index += 1;
      // Right-associative; the exponent may carry its own sign (2 ** -1).
      return base ** parseUnary();
    }
    return base;
  }

  function parsePrimary() {
    const token = tokens[index];
    if (token === undefined) throw new SyntaxError('unexpected end of expression');
    if (token.type === 'number') {
      index += 1;
      return token.value;
    }
    if (isOperator('(')) {
      index += 1;
      const inner = parseSum();
      expectOperator(')');
      return inner;
    }
    if (token.type === 'name') {
      index += 1;
      const key = token.value.replace(/^Math\./, '');
      if (isOperator('(')) {
        const fn = CALCULATOR_FUNCTIONS.get(key);
        if (fn === undefined) throw new SyntaxError(`unknown function "${token.value}"`);
        index += 1;
        const args = [];
        if (!isOperator(')')) {
          args.push(parseSum());
          while (isOperator(',')) {
            index += 1;
            args.push(parseSum());
          }
        }
        expectOperator(')');
        return fn(...args);
      }
      if (!CALCULATOR_CONSTANTS.has(key)) {
        throw new SyntaxError(`unknown identifier "${token.value}"`);
      }
      return CALCULATOR_CONSTANTS.get(key);
    }
    throw new SyntaxError(`unexpected token "${token.value}"`);
  }

  const value = parseSum();
  if (index < tokens.length) throw new SyntaxError('unexpected trailing input');
  return value;
}

/**
 * Mock `calculator` tool. Evaluates the expression with the parser above
 * rather than `Function`/`eval`, so model-supplied text cannot run code.
 */
const calculatorTool = defineTool({
  name: 'calculator',
  description: 'Perform mathematical calculations. Supports basic arithmetic and common functions.',
  parameters: {
    type: 'object',
    properties: {
      expression: { type: 'string', description: 'Math expression to evaluate (e.g. "2 + 3 * 4")' },
    },
    required: ['expression'],
  },
  modes: ['chat', 'work'],
  /**
   * @param {{ expression?: unknown }} [input] - Tool input produced by the model.
   * @returns {Promise<string>} The result as text, or an `Error: ...` message.
   */
  async execute(input) {
    const expression = input?.expression;
    try {
      return String(evaluateMathExpression(expression));
    } catch {
      // Reported as tool output (not thrown) so the model can read it and retry.
      return `Error: invalid expression "${expression}"`;
    }
  },
});
|
|
68
|
+
|
|
69
|
+
/**
 * Mock `read_file` tool backed by a tiny in-memory file table.
 */
const readFileTool = defineTool({
  name: 'read_file',
  description: 'Read the contents of a file at the given path.',
  parameters: {
    type: 'object',
    properties: {
      path: { type: 'string', description: 'File path to read' },
    },
    required: ['path'],
  },
  modes: ['chat', 'work'],
  /**
   * @param {{ path?: unknown }} [input] - Tool input produced by the model.
   * @returns {Promise<string>} File contents, or an `Error: file not found` message.
   */
  async execute(input) {
    // Mock file system. A Map (not a plain object) so that paths such as
    // "constructor" or "__proto__" cannot resolve to inherited members and
    // leak a non-string value as tool output.
    const files = new Map([
      ['package.json', '{ "name": "my-app", "version": "1.0.0", "dependencies": { "express": "^4.18" } }'],
      ['README.md', '# My App\n\nA sample application built with Express.'],
      ['src/index.js', 'const express = require("express");\nconst app = express();\napp.listen(3000);'],
    ]);
    const path = input?.path;
    return files.has(path) ? files.get(path) : `Error: file not found "${path}"`;
  },
});
|
|
90
|
+
|
|
91
|
+
/**
 * Mock `write_file` tool (work mode only). Nothing is written; it only
 * reports how many bytes would have been.
 */
const writeFileTool = defineTool({
  name: 'write_file',
  description: 'Write content to a file at the given path.',
  parameters: {
    type: 'object',
    properties: {
      path: { type: 'string', description: 'File path to write' },
      content: { type: 'string', description: 'Content to write' },
    },
    required: ['path', 'content'],
  },
  modes: ['work'],
  /**
   * @param {{ path?: unknown, content?: unknown }} [input] - Tool input produced by the model.
   * @returns {Promise<string>} A success message with the UTF-8 byte count.
   */
  async execute(input) {
    // String#length counts UTF-16 code units, which is not the byte size for
    // non-ASCII text; measure the UTF-8 encoding since the message says "bytes".
    // Missing content is treated as empty rather than raising a TypeError.
    const content = String(input?.content ?? '');
    const byteCount = Buffer.byteLength(content, 'utf8');
    return `Successfully wrote ${byteCount} bytes to ${input?.path}`;
  },
});
|
|
107
|
+
|
|
108
|
+
/**
 * Mock `bash` tool (work mode only). Returns canned output for a few known
 * commands and a generic success line for everything else.
 */
const bashTool = defineTool({
  name: 'bash',
  description: 'Execute a bash command and return the output.',
  parameters: {
    type: 'object',
    properties: {
      command: { type: 'string', description: 'The bash command to execute' },
    },
    required: ['command'],
  },
  modes: ['work'],
  /**
   * @param {{ command?: unknown }} [input] - Tool input produced by the model.
   * @returns {Promise<string>} Canned output for the command.
   */
  async execute(input) {
    // Mock bash responses
    const responses = {
      'git status': 'On branch main\nnothing to commit, working tree clean',
      'ls': 'package.json\nREADME.md\nsrc/',
      'npm test': 'Tests passed: 42/42',
    };
    const command = String(input?.command ?? '').trim();
    // A known command matches only when it is followed by end-of-input,
    // whitespace, or a shell separator; a bare prefix test would let "lsof"
    // or "npm testify" pick up the "ls" / "npm test" responses.
    for (const [cmd, response] of Object.entries(responses)) {
      if (!command.startsWith(cmd)) continue;
      const rest = command.slice(cmd.length);
      if (rest === '' || /^[\s;&|]/.test(rest)) return response;
    }
    return `$ ${command}\n(command executed successfully)`;
  },
});
|
|
133
|
+
|
|
134
|
+
// Full set of mock tools defined above; each eval case below passes this list
// as its `registryTools`.
const allTools = [searchTool, calculatorTool, readFileTool, writeFileTool, bashTool];
|
|
135
|
+
|
|
136
|
+
// ─── Eval Cases ──────────────────────────────────────────────
|
|
137
|
+
|
|
138
|
+
/** Lower-cased text, or '' when the value is not a string. */
const lowerText = (value) => (typeof value === 'string' ? value.toLowerCase() : '');

/** Shapes a boolean outcome into the `{ pass, score }` result returned by custom checks. */
const verdict = (ok) => ({ pass: ok, score: ok ? 1 : 0 });

/**
 * True when some recorded `bash` call has a string command containing
 * `fragment` (case-sensitive). Calls without an `input` or `command` are
 * ignored rather than raising.
 */
const ranBashCommand = (result, fragment) =>
  (result.toolCalls ?? []).some(
    (tc) =>
      tc.name === 'bash' &&
      typeof tc.input?.command === 'string' &&
      tc.input.command.includes(fragment),
  );

/**
 * Tool-use eval cases (suite "tools").
 *
 * Each case supplies a prompt, the mock tool registry, an optional mode, and a
 * list of weighted criteria built with the helpers imported from the runner.
 * The predicates and custom checks below tolerate malformed tool calls
 * (missing `input`, non-string fields) and score them as failures.
 */
export const toolUseCases = [

  // ─── Basic Tool Selection ─────────────────────────────

  {
    id: 'tool-select-search',
    suite: 'tools',
    description: 'Model should use search tool for factual questions',
    prompt: 'What is the current population of Tokyo?',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('search', { weight: 10 }),
      toolCalledWith(
        'search',
        (input) => lowerText(input?.query).includes('tokyo'),
        { id: 'search-mentions-tokyo', weight: 6 },
      ),
      toolNotCalled('calculator', { weight: 3 }),
      toolNotCalled('read_file', { weight: 3 }),
    ],
  },

  {
    id: 'tool-select-calculator',
    suite: 'tools',
    description: 'Model should use calculator for math problems',
    prompt: 'What is 1847 * 293 + 7621?',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('calculator', { weight: 10 }),
      toolCalledWith(
        'calculator',
        (input) => {
          const expression = input?.expression;
          return Boolean(expression) && /1847/.test(expression) && /293/.test(expression);
        },
        { id: 'calc-correct-expr', weight: 8 },
      ),
      toolNotCalled('search', { weight: 3 }),
    ],
  },

  {
    id: 'tool-select-read-file',
    suite: 'tools',
    description: 'Model should use read_file to examine project files',
    prompt: 'What dependencies does the project have? Check package.json',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('read_file', { weight: 10 }),
      toolCalledWith(
        'read_file',
        (input) => input?.path === 'package.json',
        { id: 'reads-package-json', weight: 8 },
      ),
      containsText('express', { id: 'mentions-express', weight: 5 }),
    ],
  },

  // ─── Tool Avoidance ───────────────────────────────────

  {
    id: 'tool-avoid-simple-chat',
    suite: 'tools',
    description: 'Model should NOT use tools for simple conversation',
    prompt: 'Hello! How are you doing today?',
    registryTools: allTools,
    criteria: [
      noError,
      toolNotCalled('search', { weight: 8 }),
      toolNotCalled('calculator', { weight: 8 }),
      toolNotCalled('read_file', { weight: 8 }),
      turnCountInRange(1, 1, { weight: 5 }),
    ],
  },

  {
    id: 'tool-avoid-known-knowledge',
    suite: 'tools',
    description: 'Model should NOT search for common knowledge it already has',
    prompt: 'What is the capital of France?',
    registryTools: allTools,
    criteria: [
      noError,
      toolNotCalled('search', { weight: 8, id: 'no-search-for-common-knowledge' }),
      containsText('Paris', { weight: 7 }),
      turnCountInRange(1, 1, { weight: 3 }),
    ],
  },

  {
    id: 'tool-avoid-simple-math',
    suite: 'tools',
    description: 'Model should NOT use calculator for trivial math (2+2)',
    prompt: 'What is 2 + 2?',
    registryTools: allTools,
    criteria: [
      noError,
      toolNotCalled('calculator', { weight: 6, id: 'no-calc-for-trivial' }),
      containsText('4', { weight: 5 }),
    ],
  },

  // ─── Multi-Tool Orchestration ─────────────────────────

  {
    id: 'tool-multi-read-then-write',
    suite: 'tools',
    description: 'Model should read a file then modify it (sequential tools)',
    prompt: 'Read src/index.js and add a health check endpoint at /health',
    mode: 'work',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('read_file', { weight: 8, id: 'reads-first' }),
      toolWasCalled('write_file', { weight: 8, id: 'writes-after' }),
      toolCalledWith(
        'read_file',
        (input) => input?.path === 'src/index.js',
        { id: 'reads-correct-file', weight: 6 },
      ),
      toolCalledWith(
        'write_file',
        (input) =>
          input?.path === 'src/index.js' &&
          typeof input.content === 'string' &&
          input.content.includes('health'),
        { id: 'writes-health-endpoint', weight: 8 },
      ),
      custom('read-before-write', 'Read happens before write', 5, (result) => {
        // Compares the FIRST read_file call with the FIRST write_file call.
        const calls = result.toolCalls ?? [];
        const readIdx = calls.findIndex((tc) => tc.name === 'read_file');
        const writeIdx = calls.findIndex((tc) => tc.name === 'write_file');
        return verdict(readIdx >= 0 && writeIdx >= 0 && readIdx < writeIdx);
      }),
    ],
  },

  {
    id: 'tool-multi-bash-workflow',
    suite: 'tools',
    description: 'Model should run git status and npm test in work mode',
    prompt: 'Check the git status and run the tests',
    mode: 'work',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('bash', { weight: 8 }),
      custom('git-status-called', 'Ran git status', 7, (result) =>
        verdict(ranBashCommand(result, 'git status'))),
      // NOTE(review): this accepts any bash command containing "test", which is
      // looser than the "Ran npm test" label — confirm that is intended.
      custom('npm-test-called', 'Ran npm test', 7, (result) =>
        verdict(ranBashCommand(result, 'test'))),
    ],
  },

  // ─── Mode Awareness ───────────────────────────────────

  {
    id: 'tool-mode-chat-no-write',
    suite: 'tools',
    description: 'In chat mode, write_file should not be available (work-only tool)',
    prompt: 'Write "hello" to a file called greeting.txt',
    mode: 'chat',
    registryTools: allTools,
    criteria: [
      noError,
      toolNotCalled('write_file', {
        weight: 10,
        id: 'no-write-in-chat',
        description: 'write_file is work-only and should not be called in chat mode',
      }),
    ],
  },

  // ─── Error Handling ───────────────────────────────────

  {
    id: 'tool-error-recovery',
    suite: 'tools',
    description: 'Model should handle tool errors gracefully and explain to user',
    prompt: 'Read the file at /nonexistent/path/file.txt',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('read_file', { weight: 8 }),
      custom('acknowledges-error', 'Model acknowledges the file was not found', 7, (result) => {
        // Passes only when the tool really reported "not found" AND the reply
        // text uses one of the acknowledgement phrases.
        const hasError = (result.toolResults ?? []).some(
          (tr) =>
            tr.name === 'read_file' &&
            typeof tr.output === 'string' &&
            tr.output.includes('not found'),
        );
        const reply = lowerText(result.fullText);
        const acknowledges = ['not found', 'doesn\'t exist', 'does not exist', 'error', 'unable']
          .some((phrase) => reply.includes(phrase));
        return verdict(hasError && acknowledges);
      }),
    ],
  },

  // ─── Parameter Extraction ─────────────────────────────

  {
    id: 'tool-param-extraction-complex',
    suite: 'tools',
    description: 'Model should extract correct parameters from complex natural language',
    prompt: 'Search for "best practices for TypeScript error handling in 2026"',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('search', { weight: 8 }),
      toolCalledWith(
        'search',
        (input) => {
          const query = lowerText(input?.query);
          return query.includes('typescript') && query.includes('error');
        },
        { id: 'extracts-key-terms', weight: 8 },
      ),
    ],
  },
];
|
|
@@ -0,0 +1,250 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* eval/run-eval.js — CLI runner for Yeaft evals
|
|
5
|
+
*
|
|
6
|
+
* Usage:
|
|
7
|
+
* # Run all evals against default model (requires API key)
|
|
8
|
+
* node agent/unify/eval/run-eval.js
|
|
9
|
+
*
|
|
10
|
+
* # Run specific suite
|
|
11
|
+
* node agent/unify/eval/run-eval.js --suite tools
|
|
12
|
+
*
|
|
13
|
+
* # Compare multiple models
|
|
14
|
+
* node agent/unify/eval/run-eval.js --models claude-sonnet-4-20250514,gpt-5
|
|
15
|
+
*
|
|
16
|
+
* # Save baseline
|
|
17
|
+
* node agent/unify/eval/run-eval.js --save-baseline initial
|
|
18
|
+
*
|
|
19
|
+
* # Compare against baseline
|
|
20
|
+
* node agent/unify/eval/run-eval.js --compare-baseline baselines/initial.json
|
|
21
|
+
*
|
|
22
|
+
* # Dry run (MockAdapter, no API calls)
|
|
23
|
+
* node agent/unify/eval/run-eval.js --dry-run
|
|
24
|
+
*
|
|
25
|
+
* Environment:
|
|
26
|
+
* YEAFT_API_KEY — Anthropic API key
|
|
27
|
+
* YEAFT_OPENAI_API_KEY — OpenAI API key
|
|
28
|
+
*/
|
|
29
|
+
|
|
30
|
+
import { parseArgs } from 'util';
|
|
31
|
+
import { join } from 'path';
|
|
32
|
+
import { homedir } from 'os';
|
|
33
|
+
|
|
34
|
+
import { toolUseCases } from './cases/tool-use.js';
|
|
35
|
+
import { memoryCases } from './cases/memory.js';
|
|
36
|
+
import { skillsCases } from './cases/skills.js';
|
|
37
|
+
import { e2eCases } from './cases/e2e.js';
|
|
38
|
+
import {
|
|
39
|
+
runEvals,
|
|
40
|
+
printResults,
|
|
41
|
+
printComparison,
|
|
42
|
+
saveBaseline,
|
|
43
|
+
loadBaseline,
|
|
44
|
+
compareToBaseline,
|
|
45
|
+
} from './runner.js';
|
|
46
|
+
|
|
47
|
+
// ─── Parse CLI args ──────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
// Command-line flags understood by the eval runner, with their defaults.
const cliOptions = {
  suite: { type: 'string', short: 's', default: 'all' },
  models: { type: 'string', short: 'm', default: '' },
  'save-baseline': { type: 'string', default: '' },
  'compare-baseline': { type: 'string', default: '' },
  'dry-run': { type: 'boolean', default: false },
  help: { type: 'boolean', short: 'h', default: false },
};

// Parsed flag values; every option has a default, so each key is always set.
const { values: args } = parseArgs({ options: cliOptions });
|
|
59
|
+
|
|
60
|
+
if (args.help) {
|
|
61
|
+
console.log(`
|
|
62
|
+
Yeaft Eval Runner
|
|
63
|
+
|
|
64
|
+
Usage:
|
|
65
|
+
node agent/unify/eval/run-eval.js [options]
|
|
66
|
+
|
|
67
|
+
Options:
|
|
68
|
+
-s, --suite <name> Run specific suite: tools, memory, skills, e2e, all (default: all)
|
|
69
|
+
-m, --models <list> Comma-separated model IDs (default: auto-detect from API keys)
|
|
70
|
+
--save-baseline <name> Save results as named baseline
|
|
71
|
+
--compare-baseline <path> Compare results against a baseline file
|
|
72
|
+
--dry-run Run with MockAdapter (no API calls, for testing the harness)
|
|
73
|
+
-h, --help Show this help
|
|
74
|
+
|
|
75
|
+
Environment:
|
|
76
|
+
YEAFT_API_KEY Anthropic API key (enables Claude models)
|
|
77
|
+
YEAFT_OPENAI_API_KEY OpenAI API key (enables GPT models)
|
|
78
|
+
|
|
79
|
+
Examples:
|
|
80
|
+
# Quick dry run to verify harness works
|
|
81
|
+
node agent/unify/eval/run-eval.js --dry-run
|
|
82
|
+
|
|
83
|
+
# Run tool evals against Claude Sonnet
|
|
84
|
+
node agent/unify/eval/run-eval.js --suite tools --models claude-sonnet-4-20250514
|
|
85
|
+
|
|
86
|
+
# Full eval, save baseline
|
|
87
|
+
node agent/unify/eval/run-eval.js --save-baseline v0.1.411
|
|
88
|
+
|
|
89
|
+
# Check for regressions
|
|
90
|
+
node agent/unify/eval/run-eval.js --compare-baseline baselines/v0.1.411.json
|
|
91
|
+
`);
|
|
92
|
+
process.exit(0);
|
|
93
|
+
}
|
|
94
|
+
|
|
95
|
+
// ─── Collect cases ───────────────────────────────────────────
|
|
96
|
+
|
|
97
|
+
// Suite name → eval cases. This table is the single source of truth for the
// "all" selection and for the list printed on an unknown suite.
const allCases = {
  tools: toolUseCases,
  memory: memoryCases,
  skills: skillsCases,
  e2e: e2eCases,
};

let cases;
if (args.suite === 'all') {
  // Object.values keeps insertion order: tools, memory, skills, e2e.
  cases = Object.values(allCases).flat();
} else if (Object.hasOwn(allCases, args.suite)) {
  // Own-property check so names like "constructor" or "toString" are rejected
  // instead of resolving to inherited Object.prototype members.
  cases = allCases[args.suite];
} else {
  const available = [...Object.keys(allCases), 'all'].join(', ');
  console.error(`Unknown suite: ${args.suite}. Available: ${available}`);
  process.exit(1);
}

console.log(`\nYeaft Eval Runner`);
console.log(`Suite: ${args.suite} (${cases.length} cases)`);
|
|
116
|
+
|
|
117
|
+
// ─── Build adapters ──────────────────────────────────────────
|
|
118
|
+
|
|
119
|
+
const adapters = [];
|
|
120
|
+
|
|
121
|
+
if (args['dry-run']) {
|
|
122
|
+
// MockAdapter that gives simple responses
|
|
123
|
+
console.log('Mode: DRY RUN (MockAdapter)\n');
|
|
124
|
+
|
|
125
|
+
/**
 * Offline stand-in for an LLM adapter, used by --dry-run to exercise the eval
 * harness without API calls. It picks a tool from simple keyword heuristics on
 * the last message and otherwise answers with fixed text.
 */
class DryRunAdapter {
  /**
   * Yields a scripted event sequence: text, an optional tool call, usage, stop.
   *
   * @param {{ tools?: Array<{name: string}>, messages?: Array<{content?: unknown}> }} params
   * @yields {{type: string}} `text_delta`, `tool_call`, `usage` and `stop` events.
   */
  async *stream(params) {
    const tools = Array.isArray(params?.tools) ? params.tools : [];
    const hasTools = tools.length > 0;
    const offers = (name) => tools.some((t) => t.name === name);

    // Only plain-string content is inspected; anything else (for example
    // structured content blocks) is treated as an empty prompt rather than
    // crashing on .toLowerCase().
    const lastContent = params?.messages?.at(-1)?.content;
    const prompt = typeof lastContent === 'string' ? lastContent : '';
    const lp = prompt.toLowerCase();

    // Simple heuristic: if prompt mentions search/find → call search tool
    const wantsSearch = lp.includes('search') || lp.includes('population') || lp.includes('find');
    if (hasTools && wantsSearch && offers('search')) {
      yield { type: 'text_delta', text: 'Let me search for that. ' };
      yield { type: 'tool_call', id: 'mock-tc-1', name: 'search', input: { query: prompt } };
      yield { type: 'usage', inputTokens: 100, outputTokens: 20 };
      yield { type: 'stop', stopReason: 'tool_use' };
      return;
    }

    const wantsMath =
      lp.includes('calculate') || lp.includes('math') || /\d+\s*[\+\-\*\/]\s*\d+/.test(lp);
    if (hasTools && wantsMath && offers('calculator')) {
      // The expression must start AND end on a digit or parenthesis. A bare
      // character-class match would stop at the first whitespace run in the
      // prompt and always produce '0'.
      const found =
        prompt.match(/[\d(][\d\s+\-*\/().]*[\d)]/) ?? prompt.match(/\d+(?:\.\d+)?/);
      const expr = found?.[0]?.trim() || '0';
      yield { type: 'text_delta', text: 'Let me calculate. ' };
      yield { type: 'tool_call', id: 'mock-tc-1', name: 'calculator', input: { expression: expr } };
      yield { type: 'usage', inputTokens: 100, outputTokens: 20 };
      yield { type: 'stop', stopReason: 'tool_use' };
      return;
    }

    // Default: just respond with text
    yield { type: 'text_delta', text: `I understand you asked: "${prompt.slice(0, 50)}". ` };
    yield { type: 'text_delta', text: 'Here is my response.' };
    yield { type: 'usage', inputTokens: 100, outputTokens: 15 };
    yield { type: 'stop', stopReason: 'end_turn' };
  }

  /**
   * Non-streaming counterpart; always returns an empty JSON object as text.
   *
   * @returns {Promise<{text: string, usage: {inputTokens: number, outputTokens: number}}>}
   */
  async call() {
    return { text: '{}', usage: { inputTokens: 10, outputTokens: 5 } };
  }
}
|
|
167
|
+
|
|
168
|
+
adapters.push({ name: 'dry-run-mock', adapter: new DryRunAdapter() });
|
|
169
|
+
|
|
170
|
+
} else {
|
|
171
|
+
// Real adapters
|
|
172
|
+
// Loaded lazily so --dry-run never imports the real adapter module.
const { createLLMAdapter } = await import('../llm/adapter.js');

// Drop empty entries so "--models a,,b" or a trailing comma does not produce
// an adapter request for a model named ''. An empty --models value yields [].
const modelList = args.models
  .split(',')
  .map((m) => m.trim())
  .filter(Boolean);

// Auto-detect from API keys if no models specified
if (modelList.length === 0) {
  if (process.env.YEAFT_API_KEY) modelList.push('claude-sonnet-4-20250514');
  if (process.env.YEAFT_OPENAI_API_KEY) modelList.push('gpt-5');
}

if (modelList.length === 0) {
  console.error('\nNo models available. Set YEAFT_API_KEY or YEAFT_OPENAI_API_KEY, or use --dry-run.');
  process.exit(1);
}

console.log(`Models: ${modelList.join(', ')}\n`);

// One adapter per model. A model whose adapter cannot be created is reported
// and skipped so the remaining models still run.
for (const model of modelList) {
  try {
    const config = {
      model,
      apiKey: process.env.YEAFT_API_KEY,
      openaiApiKey: process.env.YEAFT_OPENAI_API_KEY,
    };
    const adapter = await createLLMAdapter(config);
    adapters.push({ name: model, adapter, config });
  } catch (err) {
    console.error(`Failed to create adapter for ${model}: ${err.message}`);
  }
}
|
|
203
|
+
}
|
|
204
|
+
|
|
205
|
+
if (adapters.length === 0) {
|
|
206
|
+
console.error('No adapters available. Exiting.');
|
|
207
|
+
process.exit(1);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
// ─── Run evals ───────────────────────────────────────────────
|
|
211
|
+
|
|
212
|
+
console.log(`Running ${cases.length} eval cases across ${adapters.length} adapter(s)...\n`);
|
|
213
|
+
|
|
214
|
+
const scores = await runEvals({ cases, adapters });
|
|
215
|
+
|
|
216
|
+
// ─── Display results ─────────────────────────────────────────
|
|
217
|
+
|
|
218
|
+
printResults(scores);
|
|
219
|
+
|
|
220
|
+
// ─── Save baseline if requested ──────────────────────────────
|
|
221
|
+
|
|
222
|
+
const baselineDir = join(homedir(), '.yeaft', 'eval', 'baselines');
|
|
223
|
+
|
|
224
|
+
if (args['save-baseline']) {
|
|
225
|
+
const path = saveBaseline(scores, baselineDir, args['save-baseline']);
|
|
226
|
+
console.log(`\nBaseline saved: ${path}`);
|
|
227
|
+
}
|
|
228
|
+
|
|
229
|
+
// ─── Compare to baseline if requested ────────────────────────
|
|
230
|
+
|
|
231
|
+
// Optional regression gate: compare this run with a saved baseline and exit
// non-zero when regressions are reported.
if (args['compare-baseline']) {
  let baseline;
  try {
    baseline = loadBaseline(args['compare-baseline']);
  } catch (err) {
    console.error(`\nFailed to load baseline: ${err.message}`);
    process.exit(1);
  }

  // Kept outside the load try/catch so a failure while comparing or printing
  // is not misreported as a baseline-loading problem.
  let comparison;
  try {
    comparison = compareToBaseline(scores, baseline);
    printComparison(comparison);
  } catch (err) {
    console.error(`\nFailed to compare against baseline: ${err.message}`);
    process.exit(1);
  }

  if (comparison.regressions.length > 0) {
    process.exit(1); // Exit with error code for CI
  }
}
|
|
245
|
+
|
|
246
|
+
// ─── Summary ─────────────────────────────────────────────────
|
|
247
|
+
|
|
248
|
+
// Final tally: an eval counts as passed when its totalScore is at least 80.
const total = scores.length;
let passed = 0;
for (const score of scores) {
  if (score.totalScore >= 80) passed += 1;
}
console.log(`\n${passed}/${total} evals passed (≥80 score)`);
|