@yeaft/webchat-agent 0.1.411 → 0.1.412

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,356 @@
1
+ /**
2
+ * eval/cases/tool-use.js — Tool use eval cases
3
+ *
4
+ * Tests whether the model correctly decides when and how to call tools.
5
+ * These are the most important evals — they catch regressions in:
6
+ * - Tool selection (right tool for the job)
7
+ * - Parameter extraction (correct input from natural language)
8
+ * - Tool avoidance (not calling tools when unnecessary)
9
+ * - Multi-tool orchestration (using multiple tools in sequence)
10
+ */
11
+
12
+ import { defineTool } from '../../tools/types.js';
13
+ import {
14
+ noError,
15
+ toolWasCalled,
16
+ toolCalledWith,
17
+ toolNotCalled,
18
+ toolSucceeded,
19
+ turnCountInRange,
20
+ containsText,
21
+ custom,
22
+ } from '../runner.js';
23
+
24
+ // ─── Mock Tools for Evals ────────────────────────────────────
25
+
26
/**
 * Mock `search` tool for the eval cases.
 *
 * Performs no lookup: it echoes the query (capped at 200 characters) back
 * inside a single canned result and returns that as a JSON string.
 */
const searchTool = defineTool({
  name: 'search',
  description: 'Search the web for information. Returns search results as text.',
  parameters: {
    type: 'object',
    properties: {
      query: { type: 'string', description: 'The search query' },
    },
    required: ['query'],
  },
  modes: ['chat', 'work'],
  async execute(input) {
    // A missing or empty query is echoed as the empty string.
    const rawQuery = input.query ? input.query : '';
    const shortQuery = rawQuery.slice(0, 200);
    const hit = {
      title: `Result for: ${shortQuery}`,
      snippet: `Information about ${shortQuery}`,
    };
    return JSON.stringify({ results: [hit] });
  },
});
46
+
47
/**
 * Mock `calculator` tool for the eval cases.
 *
 * The expression is produced by the model under test, i.e. it is untrusted
 * text. Before anything is evaluated it must tokenize completely against an
 * allow-list: numeric literals, the operators + - * / % **, parentheses,
 * commas, whitespace, and own members of `Math` written as `Math.<name>`.
 * Anything else (identifiers, strings, property access, statements) is
 * rejected with the same error string a malformed expression produces.
 */
const calculatorTool = defineTool({
  name: 'calculator',
  description: 'Perform mathematical calculations. Supports basic arithmetic and common functions.',
  parameters: {
    type: 'object',
    properties: {
      expression: { type: 'string', description: 'Math expression to evaluate (e.g. "2 + 3 * 4")' },
    },
    required: ['expression'],
  },
  modes: ['chat', 'work'],
  async execute(input) {
    const invalid = `Error: invalid expression "${input?.expression}"`;
    const expression = String(input?.expression ?? '');

    // Sticky regex created per call so `lastIndex` state never leaks between calls.
    const token = /\s+|(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?|\*\*|[-+*\/%(),]|Math\.([A-Za-z_$][\w$]*)/y;
    let pos = 0;
    while (pos < expression.length) {
      token.lastIndex = pos;
      const match = token.exec(expression);
      if (match === null) return invalid;
      // Only own members of Math are reachable; inherited ones such as
      // `constructor` would hand back the Function constructor.
      if (match[1] !== undefined && !Object.hasOwn(Math, match[1])) return invalid;
      pos = token.lastIndex;
    }

    try {
      // NOTE(security): dynamic evaluation is retained, but only for text that
      // passed the allow-list above. Syntax errors (including the empty
      // expression) land in the catch below.
      const result = Function(`"use strict"; return (${expression})`)();
      return String(result);
    } catch {
      return invalid;
    }
  },
});
68
+
69
/**
 * Mock `read_file` tool for the eval cases.
 *
 * Serves three fixed in-memory files. Any other path yields an
 * `Error: file not found "<path>"` string rather than throwing.
 */
const readFileTool = defineTool({
  name: 'read_file',
  description: 'Read the contents of a file at the given path.',
  parameters: {
    type: 'object',
    properties: {
      path: { type: 'string', description: 'File path to read' },
    },
    required: ['path'],
  },
  modes: ['chat', 'work'],
  async execute(input) {
    // Mock file system
    const files = {
      'package.json': '{ "name": "my-app", "version": "1.0.0", "dependencies": { "express": "^4.18" } }',
      'README.md': '# My App\n\nA sample application built with Express.',
      'src/index.js': 'const express = require("express");\nconst app = express();\napp.listen(3000);',
    };
    const path = input?.path;
    // Own-property lookup: a path such as "constructor" or "__proto__" must not
    // resolve to an inherited Object.prototype member.
    if (typeof path === 'string' && Object.hasOwn(files, path)) {
      return files[path];
    }
    return `Error: file not found "${path}"`;
  },
});
90
+
91
/**
 * Mock `write_file` tool for the eval cases (registered for 'work' mode only).
 *
 * Nothing is written anywhere: the tool only returns a confirmation string
 * stating how many bytes the content occupies and the target path.
 */
const writeFileTool = defineTool({
  name: 'write_file',
  description: 'Write content to a file at the given path.',
  parameters: {
    type: 'object',
    properties: {
      path: { type: 'string', description: 'File path to write' },
      content: { type: 'string', description: 'Content to write' },
    },
    required: ['path', 'content'],
  },
  modes: ['work'],
  async execute(input) {
    // String#length counts UTF-16 code units; the message says "bytes", so
    // measure the UTF-8 encoding. Still throws when `content` is missing.
    const byteCount = Buffer.byteLength(input.content, 'utf8');
    return `Successfully wrote ${byteCount} bytes to ${input.path}`;
  },
});
107
+
108
/**
 * Mock `bash` tool for the eval cases (registered for 'work' mode only).
 *
 * No command is executed. A few known commands return canned output; every
 * other command is echoed back with a generic success line.
 */
const bashTool = defineTool({
  name: 'bash',
  description: 'Execute a bash command and return the output.',
  parameters: {
    type: 'object',
    properties: {
      command: { type: 'string', description: 'The bash command to execute' },
    },
    required: ['command'],
  },
  modes: ['work'],
  async execute(input) {
    // Mock bash responses
    const responses = {
      'git status': 'On branch main\nnothing to commit, working tree clean',
      'ls': 'package.json\nREADME.md\nsrc/',
      'npm test': 'Tests passed: 42/42',
    };
    const command = input.command.trimStart();
    // A known command matches only when it is the whole command or is followed
    // by whitespace / a shell separator, so "lsof" is not mistaken for "ls"
    // and "npm testing" is not mistaken for "npm test".
    for (const [known, response] of Object.entries(responses)) {
      if (!command.startsWith(known)) continue;
      const rest = command.slice(known.length);
      if (rest === '' || /^[\s;&|]/.test(rest)) return response;
    }
    return `$ ${input.command}\n(command executed successfully)`;
  },
});
133
+
134
// Every mock tool defined above, in declaration order; each eval case passes
// this list as its `registryTools`.
const allTools = [
  searchTool,
  calculatorTool,
  readFileTool,
  writeFileTool,
  bashTool,
];
135
+
136
+ // ─── Eval Cases ──────────────────────────────────────────────
137
+
138
/** Lower-cases `value` when it is a string; any other value becomes ''. */
const lowerText = (value) => (typeof value === 'string' ? value.toLowerCase() : '');

/** Builds the `{ pass, score }` shape returned by the custom scorers below. */
const passFail = (ok) => ({ pass: ok, score: ok ? 1 : 0 });

/** True when any recorded `bash` tool call has a string command containing `needle`. */
const bashCommandRan = (result, needle) =>
  (result.toolCalls ?? []).some(
    (tc) =>
      tc?.name === 'bash' &&
      typeof tc.input?.command === 'string' &&
      tc.input.command.includes(needle),
  );

/**
 * Eval cases for the `tools` suite.
 *
 * Each case carries an id, a prompt, the tool list to register, an optional
 * `mode`, and a list of weighted criteria built with the helpers imported from
 * the runner. Predicates and custom scorers are written defensively: a tool
 * call without `input`, or a tool result whose output is not a string, counts
 * as a failed check instead of throwing while scoring.
 */
export const toolUseCases = [

  // ─── Basic Tool Selection ─────────────────────────────

  {
    id: 'tool-select-search',
    suite: 'tools',
    description: 'Model should use search tool for factual questions',
    prompt: 'What is the current population of Tokyo?',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('search', { weight: 10 }),
      toolCalledWith(
        'search',
        (input) => lowerText(input?.query).includes('tokyo'),
        { id: 'search-mentions-tokyo', weight: 6 },
      ),
      toolNotCalled('calculator', { weight: 3 }),
      toolNotCalled('read_file', { weight: 3 }),
    ],
  },

  {
    id: 'tool-select-calculator',
    suite: 'tools',
    description: 'Model should use calculator for math problems',
    prompt: 'What is 1847 * 293 + 7621?',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('calculator', { weight: 10 }),
      toolCalledWith(
        'calculator',
        (input) => {
          // Both operands must appear somewhere in the expression.
          const expr = String(input?.expression ?? '');
          return expr.includes('1847') && expr.includes('293');
        },
        { id: 'calc-correct-expr', weight: 8 },
      ),
      toolNotCalled('search', { weight: 3 }),
    ],
  },

  {
    id: 'tool-select-read-file',
    suite: 'tools',
    description: 'Model should use read_file to examine project files',
    prompt: 'What dependencies does the project have? Check package.json',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('read_file', { weight: 10 }),
      toolCalledWith(
        'read_file',
        (input) => input?.path === 'package.json',
        { id: 'reads-package-json', weight: 8 },
      ),
      containsText('express', { id: 'mentions-express', weight: 5 }),
    ],
  },

  // ─── Tool Avoidance ───────────────────────────────────

  {
    id: 'tool-avoid-simple-chat',
    suite: 'tools',
    description: 'Model should NOT use tools for simple conversation',
    prompt: 'Hello! How are you doing today?',
    registryTools: allTools,
    criteria: [
      noError,
      toolNotCalled('search', { weight: 8 }),
      toolNotCalled('calculator', { weight: 8 }),
      toolNotCalled('read_file', { weight: 8 }),
      turnCountInRange(1, 1, { weight: 5 }),
    ],
  },

  {
    id: 'tool-avoid-known-knowledge',
    suite: 'tools',
    description: 'Model should NOT search for common knowledge it already has',
    prompt: 'What is the capital of France?',
    registryTools: allTools,
    criteria: [
      noError,
      toolNotCalled('search', { weight: 8, id: 'no-search-for-common-knowledge' }),
      containsText('Paris', { weight: 7 }),
      turnCountInRange(1, 1, { weight: 3 }),
    ],
  },

  {
    id: 'tool-avoid-simple-math',
    suite: 'tools',
    description: 'Model should NOT use calculator for trivial math (2+2)',
    prompt: 'What is 2 + 2?',
    registryTools: allTools,
    criteria: [
      noError,
      toolNotCalled('calculator', { weight: 6, id: 'no-calc-for-trivial' }),
      containsText('4', { weight: 5 }),
    ],
  },

  // ─── Multi-Tool Orchestration ─────────────────────────

  {
    id: 'tool-multi-read-then-write',
    suite: 'tools',
    description: 'Model should read a file then modify it (sequential tools)',
    prompt: 'Read src/index.js and add a health check endpoint at /health',
    mode: 'work',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('read_file', { weight: 8, id: 'reads-first' }),
      toolWasCalled('write_file', { weight: 8, id: 'writes-after' }),
      toolCalledWith(
        'read_file',
        (input) => input?.path === 'src/index.js',
        { id: 'reads-correct-file', weight: 6 },
      ),
      toolCalledWith(
        'write_file',
        (input) =>
          input?.path === 'src/index.js' &&
          typeof input.content === 'string' &&
          input.content.includes('health'),
        { id: 'writes-health-endpoint', weight: 8 },
      ),
      custom('read-before-write', 'Read happens before write', 5, (result) => {
        // Compare the positions of the FIRST read_file and FIRST write_file calls.
        const calls = result.toolCalls ?? [];
        const readIdx = calls.findIndex((tc) => tc?.name === 'read_file');
        const writeIdx = calls.findIndex((tc) => tc?.name === 'write_file');
        return passFail(readIdx >= 0 && writeIdx >= 0 && readIdx < writeIdx);
      }),
    ],
  },

  {
    id: 'tool-multi-bash-workflow',
    suite: 'tools',
    description: 'Model should run git status and npm test in work mode',
    prompt: 'Check the git status and run the tests',
    mode: 'work',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('bash', { weight: 8 }),
      custom('git-status-called', 'Ran git status', 7, (result) =>
        passFail(bashCommandRan(result, 'git status'))),
      // Deliberately loose: any bash command containing "test" counts.
      custom('npm-test-called', 'Ran npm test', 7, (result) =>
        passFail(bashCommandRan(result, 'test'))),
    ],
  },

  // ─── Mode Awareness ───────────────────────────────────

  {
    id: 'tool-mode-chat-no-write',
    suite: 'tools',
    description: 'In chat mode, write_file should not be available (work-only tool)',
    prompt: 'Write "hello" to a file called greeting.txt',
    mode: 'chat',
    registryTools: allTools,
    criteria: [
      noError,
      toolNotCalled('write_file', {
        weight: 10,
        id: 'no-write-in-chat',
        description: 'write_file is work-only and should not be called in chat mode',
      }),
    ],
  },

  // ─── Error Handling ───────────────────────────────────

  {
    id: 'tool-error-recovery',
    suite: 'tools',
    description: 'Model should handle tool errors gracefully and explain to user',
    prompt: 'Read the file at /nonexistent/path/file.txt',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('read_file', { weight: 8 }),
      custom('acknowledges-error', 'Model acknowledges the file was not found', 7, (result) => {
        // The tool must have reported the miss AND the reply must mention it.
        const hasError = (result.toolResults ?? []).some(
          (tr) =>
            tr?.name === 'read_file' &&
            typeof tr.output === 'string' &&
            tr.output.includes('not found'),
        );
        const reply = lowerText(result.fullText);
        const acknowledges = ['not found', 'doesn\'t exist', 'does not exist', 'error', 'unable']
          .some((phrase) => reply.includes(phrase));
        return passFail(hasError && acknowledges);
      }),
    ],
  },

  // ─── Parameter Extraction ─────────────────────────────

  {
    id: 'tool-param-extraction-complex',
    suite: 'tools',
    description: 'Model should extract correct parameters from complex natural language',
    prompt: 'Search for "best practices for TypeScript error handling in 2026"',
    registryTools: allTools,
    criteria: [
      noError,
      toolWasCalled('search', { weight: 8 }),
      toolCalledWith(
        'search',
        (input) => {
          const query = lowerText(input?.query);
          return query.includes('typescript') && query.includes('error');
        },
        { id: 'extracts-key-terms', weight: 8 },
      ),
    ],
  },
];
@@ -0,0 +1,250 @@
1
+ #!/usr/bin/env node
2
+
3
+ /**
4
+ * eval/run-eval.js — CLI runner for Yeaft evals
5
+ *
6
+ * Usage:
7
+ * # Run all evals against default model (requires API key)
8
+ * node agent/unify/eval/run-eval.js
9
+ *
10
+ * # Run specific suite
11
+ * node agent/unify/eval/run-eval.js --suite tools
12
+ *
13
+ * # Compare multiple models
14
+ * node agent/unify/eval/run-eval.js --models claude-sonnet-4-20250514,gpt-5
15
+ *
16
+ * # Save baseline
17
+ * node agent/unify/eval/run-eval.js --save-baseline initial
18
+ *
19
+ * # Compare against baseline
20
+ * node agent/unify/eval/run-eval.js --compare-baseline baselines/initial.json
21
+ *
22
+ * # Dry run (MockAdapter, no API calls)
23
+ * node agent/unify/eval/run-eval.js --dry-run
24
+ *
25
+ * Environment:
26
+ * YEAFT_API_KEY — Anthropic API key
27
+ * YEAFT_OPENAI_API_KEY — OpenAI API key
28
+ */
29
+
30
+ import { parseArgs } from 'util';
31
+ import { join } from 'path';
32
+ import { homedir } from 'os';
33
+
34
+ import { toolUseCases } from './cases/tool-use.js';
35
+ import { memoryCases } from './cases/memory.js';
36
+ import { skillsCases } from './cases/skills.js';
37
+ import { e2eCases } from './cases/e2e.js';
38
+ import {
39
+ runEvals,
40
+ printResults,
41
+ printComparison,
42
+ saveBaseline,
43
+ loadBaseline,
44
+ compareToBaseline,
45
+ } from './runner.js';
46
+
47
+ // ─── Parse CLI args ──────────────────────────────────────────
48
+
49
/**
 * Parses the command-line flags of the eval runner.
 *
 * `parseArgs` runs in its default strict mode, so an unknown flag or a stray
 * positional argument throws. That error is reported as a one-line message
 * (exit code 1) instead of an uncaught stack trace.
 *
 * @returns {object} Parsed option values keyed by long option name.
 */
function parseCliArgs() {
  try {
    return parseArgs({
      options: {
        suite: { type: 'string', short: 's', default: 'all' },
        models: { type: 'string', short: 'm', default: '' },
        'save-baseline': { type: 'string', default: '' },
        'compare-baseline': { type: 'string', default: '' },
        'dry-run': { type: 'boolean', default: false },
        help: { type: 'boolean', short: 'h', default: false },
      },
    }).values;
  } catch (err) {
    console.error(`Invalid arguments: ${err.message}`);
    console.error('Run with --help to see the available options.');
    process.exit(1);
  }
}

const args = parseCliArgs();

// --help prints the usage text and stops before any eval work starts.
if (args.help) {
  console.log(`
Yeaft Eval Runner

Usage:
node agent/unify/eval/run-eval.js [options]

Options:
-s, --suite <name> Run specific suite: tools, memory, skills, e2e, all (default: all)
-m, --models <list> Comma-separated model IDs (default: auto-detect from API keys)
--save-baseline <name> Save results as named baseline
--compare-baseline <path> Compare results against a baseline file
--dry-run Run with MockAdapter (no API calls, for testing the harness)
-h, --help Show this help

Environment:
YEAFT_API_KEY Anthropic API key (enables Claude models)
YEAFT_OPENAI_API_KEY OpenAI API key (enables GPT models)

Examples:
# Quick dry run to verify harness works
node agent/unify/eval/run-eval.js --dry-run

# Run tool evals against Claude Sonnet
node agent/unify/eval/run-eval.js --suite tools --models claude-sonnet-4-20250514

# Full eval, save baseline
node agent/unify/eval/run-eval.js --save-baseline v0.1.411

# Check for regressions
node agent/unify/eval/run-eval.js --compare-baseline baselines/v0.1.411.json
`);
  process.exit(0);
}
94
+
95
+ // ─── Collect cases ───────────────────────────────────────────
96
+
97
// Suite name → case list. Insertion order is also the run order for `--suite all`.
const allCases = {
  tools: toolUseCases,
  memory: memoryCases,
  skills: skillsCases,
  e2e: e2eCases,
};

let cases;
if (args.suite === 'all') {
  cases = Object.values(allCases).flat();
} else if (Object.hasOwn(allCases, args.suite)) {
  // Own-property check: `--suite constructor` (or any other inherited key)
  // must be rejected rather than resolve to an Object.prototype member.
  cases = allCases[args.suite];
} else {
  const available = [...Object.keys(allCases), 'all'].join(', ');
  console.error(`Unknown suite: ${args.suite}. Available: ${available}`);
  process.exit(1);
}

console.log(`\nYeaft Eval Runner`);
console.log(`Suite: ${args.suite} (${cases.length} cases)`);
116
+
117
+ // ─── Build adapters ──────────────────────────────────────────
118
+
119
+ const adapters = [];
120
+
121
+ if (args['dry-run']) {
122
+ // MockAdapter that gives simple responses
123
+ console.log('Mode: DRY RUN (MockAdapter)\n');
124
+
125
/**
 * Offline stand-in for an LLM adapter, used by `--dry-run` to exercise the
 * harness without API calls.
 *
 * `stream()` applies keyword heuristics to the latest user prompt: it either
 * emits a single `search` / `calculator` tool call or a canned text reply.
 */
class DryRunAdapter {
  /**
   * Extracts plain text from a message `content` value.
   * Strings are returned as-is; arrays contribute their string items and the
   * string `text` field of object items; anything else yields ''.
   */
  static textOf(content) {
    if (typeof content === 'string') return content;
    if (!Array.isArray(content)) return '';
    return content
      .map((part) => {
        if (typeof part === 'string') return part;
        return typeof part?.text === 'string' ? part.text : '';
      })
      .join('');
  }

  /**
   * Pulls the first arithmetic-looking run out of `prompt`, or '0' when none exists.
   * The run must contain a digit, so leading whitespace is never mistaken for
   * the expression; trailing punctuation and dangling operators are stripped.
   */
  static extractExpression(prompt) {
    const found = prompt.match(/\(*\s*\d[\d\s.+\-*\/()]*/)?.[0] ?? '';
    return found.trim().replace(/[^\d)]+$/, '') || '0';
  }

  /**
   * Yields the mock event stream for one model turn.
   *
   * @param {object} params - Reads `params.messages` and `params.tools`.
   */
  async *stream(params) {
    const messages = Array.isArray(params.messages) ? params.messages : [];
    const tools = Array.isArray(params.tools) ? params.tools : [];
    const offers = (name) => tools.some((t) => t?.name === name);

    // Base the heuristics on the latest user message that carries text. If no
    // message has role 'user', fall back to the last message of any role.
    const promptIdx = messages.findLastIndex(
      (m) => m?.role === 'user' && DryRunAdapter.textOf(m.content) !== '',
    );
    const source = promptIdx === -1 ? messages.at(-1) : messages[promptIdx];
    const prompt = DryRunAdapter.textOf(source?.content);
    const lp = prompt.toLowerCase();

    // NOTE(review): assumes the runner appends assistant/tool-result messages
    // after the user prompt — confirm against the runner. When anything follows
    // the prompt this is treated as a follow-up turn and answered with text, so
    // the same tool call is not issued again on every turn.
    const isFollowUp = promptIdx !== -1 && promptIdx < messages.length - 1;

    if (!isFollowUp) {
      // Simple heuristic: if prompt mentions search/find → call search tool
      const wantsSearch = lp.includes('search') || lp.includes('population') || lp.includes('find');
      if (wantsSearch && offers('search')) {
        yield { type: 'text_delta', text: 'Let me search for that. ' };
        yield { type: 'tool_call', id: 'mock-tc-1', name: 'search', input: { query: prompt } };
        yield { type: 'usage', inputTokens: 100, outputTokens: 20 };
        yield { type: 'stop', stopReason: 'tool_use' };
        return;
      }

      const wantsMath =
        lp.includes('calculate') || lp.includes('math') || /\d+\s*[+\-*\/]\s*\d+/.test(lp);
      if (wantsMath && offers('calculator')) {
        const expr = DryRunAdapter.extractExpression(prompt);
        yield { type: 'text_delta', text: 'Let me calculate. ' };
        yield { type: 'tool_call', id: 'mock-tc-1', name: 'calculator', input: { expression: expr } };
        yield { type: 'usage', inputTokens: 100, outputTokens: 20 };
        yield { type: 'stop', stopReason: 'tool_use' };
        return;
      }
    }

    // Default: just respond with text
    yield { type: 'text_delta', text: `I understand you asked: "${prompt.slice(0, 50)}". ` };
    yield { type: 'text_delta', text: 'Here is my response.' };
    yield { type: 'usage', inputTokens: 100, outputTokens: 15 };
    yield { type: 'stop', stopReason: 'end_turn' };
  }

  /** Non-streaming call: always returns an empty JSON object as text plus fixed usage numbers. */
  async call() {
    return { text: '{}', usage: { inputTokens: 10, outputTokens: 5 } };
  }
}
167
+
168
+ adapters.push({ name: 'dry-run-mock', adapter: new DryRunAdapter() });
169
+
170
+ } else {
171
+ // Real adapters
172
// Real adapters. The adapter factory is imported only on this branch, so a
// dry run never loads it.
const { createLLMAdapter } = await import('../llm/adapter.js');

// Explicit --models list: trim each entry and drop blanks, so "a,,b" or a
// trailing comma does not produce an empty model id.
const modelList = (args.models ?? '')
  .split(',')
  .map((m) => m.trim())
  .filter((m) => m !== '');

// Auto-detect from API keys if no models specified
if (modelList.length === 0) {
  if (process.env.YEAFT_API_KEY) modelList.push('claude-sonnet-4-20250514');
  if (process.env.YEAFT_OPENAI_API_KEY) modelList.push('gpt-5');
}

if (modelList.length === 0) {
  console.error('\nNo models available. Set YEAFT_API_KEY or YEAFT_OPENAI_API_KEY, or use --dry-run.');
  process.exit(1);
}

console.log(`Models: ${modelList.join(', ')}\n`);

// One adapter per model. A failure for one model is reported and skipped so
// the remaining models still run.
for (const model of modelList) {
  try {
    const config = {
      model,
      apiKey: process.env.YEAFT_API_KEY,
      openaiApiKey: process.env.YEAFT_OPENAI_API_KEY,
    };
    const adapter = await createLLMAdapter(config);
    adapters.push({ name: model, adapter, config });
  } catch (err) {
    console.error(`Failed to create adapter for ${model}: ${err.message}`);
  }
}
203
+ }
204
+
205
if (adapters.length === 0) {
  console.error('No adapters available. Exiting.');
  process.exit(1);
}

// ─── Run evals ───────────────────────────────────────────────

console.log(`Running ${cases.length} eval cases across ${adapters.length} adapter(s)...\n`);

const scores = await runEvals({ cases, adapters });

// ─── Display results ─────────────────────────────────────────

printResults(scores);

// ─── Save baseline if requested ──────────────────────────────

const baselineDir = join(homedir(), '.yeaft', 'eval', 'baselines');

if (args['save-baseline']) {
  const path = saveBaseline(scores, baselineDir, args['save-baseline']);
  console.log(`\nBaseline saved: ${path}`);
}

// ─── Compare to baseline if requested ────────────────────────

// The failure status is recorded here and applied after the summary, so a
// regression (or an unreadable baseline) no longer suppresses the summary line.
let exitCode = 0;

if (args['compare-baseline']) {
  // Tracks which step is running so a failure is reported under the right label.
  let stage = 'load';
  try {
    const baseline = loadBaseline(args['compare-baseline']);
    stage = 'compare';
    const comparison = compareToBaseline(scores, baseline);
    printComparison(comparison);

    if (comparison.regressions.length > 0) {
      exitCode = 1; // Exit with error code for CI
    }
  } catch (err) {
    const label = stage === 'load' ? 'Failed to load baseline' : 'Failed to compare against baseline';
    console.error(`\n${label}: ${err.message}`);
    exitCode = 1;
  }
}

// ─── Summary ─────────────────────────────────────────────────

// Minimum totalScore for an eval to count as passed.
const PASS_THRESHOLD = 80;
const passed = scores.filter((s) => s.totalScore >= PASS_THRESHOLD).length;
const total = scores.length;
console.log(`\n${passed}/${total} evals passed (≥${PASS_THRESHOLD} score)`);

if (exitCode !== 0) {
  process.exit(exitCode);
}