@yeaft/webchat-agent 0.1.410 → 0.1.412

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,525 @@
1
+ /**
2
+ * eval/runner.js — Eval harness for Yeaft Unify
3
+ *
4
+ * Runs eval cases against one or more models and produces scored results.
5
+ * Each eval case defines:
6
+ * - A prompt (what to send)
7
+ * - Expected behavior (tool calls, output assertions, timing)
8
+ * - Scoring rubric (pass/fail/score per criterion)
9
+ *
10
+ * Results are deterministic and comparable across models for regression detection.
11
+ *
12
+ * Usage:
13
+ * import { runEvals, printResults, saveBaseline, compareToBaseline } from './runner.js';
14
+ * const results = await runEvals({ models: ['claude-sonnet-4-20250514'], suite: 'tools' });
15
+ * printResults(results);
16
+ */
17
+
18
+ import { Engine } from '../engine.js';
19
+ import { NullTrace } from '../debug-trace.js';
20
+ import { buildSystemPrompt } from '../prompts.js';
21
+ import { createEmptyRegistry } from '../tools/registry.js';
22
+ import { existsSync, readFileSync, writeFileSync, mkdirSync } from 'fs';
23
+ import { join } from 'path';
24
+
25
+ // ─── Types ───────────────────────────────────────────────────
26
+
27
+ /**
28
+ * @typedef {Object} EvalCase
29
+ * @property {string} id — unique identifier (e.g. "tool-use-search-basic")
30
+ * @property {string} suite — group name (e.g. "tools", "memory", "skills")
31
+ * @property {string} description — human-readable description
32
+ * @property {string} prompt — the user prompt to send
33
+ * @property {string} [mode] — 'chat' | 'work' (default: 'chat')
34
+ * @property {object[]} [messages] — prior conversation messages
35
+ * @property {object[]} [tools] — tool definitions to register
36
+ * @property {object} [registryTools] — ToolDef objects to register in ToolRegistry
37
+ * @property {Function} [setupEngine] — custom engine setup hook
38
+ * @property {EvalCriterion[]} criteria — scoring criteria
39
+ */
40
+
41
+ /**
42
+ * @typedef {Object} EvalCriterion
43
+ * @property {string} id — criterion identifier
44
+ * @property {string} description — what this checks
45
+ * @property {number} weight — importance weight (1-10)
46
+ * @property {(result: EvalRunResult) => CriterionScore} score — scoring function
47
+ */
48
+
49
+ /**
50
+ * @typedef {Object} CriterionScore
51
+ * @property {boolean} pass — did it pass?
52
+ * @property {number} score — 0.0 to 1.0
53
+ * @property {string} [reason] — explanation
54
+ */
55
+
56
+ /**
57
+ * @typedef {Object} EvalRunResult
58
+ * @property {string} caseId
59
+ * @property {string} model
60
+ * @property {object[]} events — all engine events
61
+ * @property {string} fullText — concatenated text_delta
62
+ * @property {object[]} toolCalls — tool_call events
63
+ * @property {object[]} toolResults — tool_end events
64
+ * @property {number} turnCount
65
+ * @property {number} inputTokens
66
+ * @property {number} outputTokens
67
+ * @property {number} latencyMs
68
+ * @property {string|null} error — error message if failed
69
+ */
70
+
71
+ /**
72
+ * @typedef {Object} EvalScore
73
+ * @property {string} caseId
74
+ * @property {string} model
75
+ * @property {number} totalScore — weighted average 0-100
76
+ * @property {Record<string, CriterionScore>} criteria
77
+ * @property {EvalRunResult} raw
78
+ */
79
+
80
+ // ─── Runner ──────────────────────────────────────────────────
81
+
82
/**
 * Run a single eval case against a model adapter.
 *
 * Builds a fresh Engine (optionally with a ToolRegistry and legacy tools),
 * streams the engine's events for the case prompt, and aggregates them into
 * an EvalRunResult. Streaming failures are captured in `error`; setup errors
 * (engine construction, setupEngine hook) still propagate to the caller.
 *
 * @param {EvalCase} evalCase
 * @param {{ adapter: object, model: string, config?: object }} options
 * @returns {Promise<EvalRunResult>}
 */
export async function runSingleEval(evalCase, { adapter, model, config = {} }) {
  const trace = new NullTrace();
  const engineConfig = { model, maxOutputTokens: 4096, ...config };

  // Build engine — optionally with ToolRegistry
  const engineOpts = { adapter, trace, config: engineConfig };

  if (evalCase.registryTools) {
    const registry = createEmptyRegistry();
    for (const tool of evalCase.registryTools) {
      registry.register(tool);
    }
    engineOpts.toolRegistry = registry;
  }

  const engine = new Engine(engineOpts);

  // Register legacy tools if provided
  if (evalCase.tools) {
    for (const tool of evalCase.tools) {
      engine.registerTool(tool);
    }
  }

  // Custom setup hook
  if (evalCase.setupEngine) {
    evalCase.setupEngine(engine);
  }

  const events = [];
  let fullText = '';
  const toolCalls = [];
  const toolResults = [];
  let turnCount = 0;
  let inputTokens = 0;
  let outputTokens = 0;
  let error = null;

  const startTime = Date.now();

  try {
    for await (const event of engine.query({
      prompt: evalCase.prompt,
      mode: evalCase.mode || 'chat',
      messages: evalCase.messages || [],
    })) {
      events.push(event);

      switch (event.type) {
        case 'text_delta':
          fullText += event.text;
          break;
        case 'tool_call':
          toolCalls.push(event);
          break;
        case 'tool_end':
          toolResults.push(event);
          break;
        case 'turn_start':
          turnCount++;
          break;
        case 'usage':
          inputTokens += event.inputTokens || 0;
          outputTokens += event.outputTokens || 0;
          break;
        case 'error':
          error = event.error?.message || 'Unknown error';
          break;
      }
    }
  } catch (err) {
    // BUG FIX: `err.message` is undefined for thrown non-Errors (and empty for
    // message-less Errors), which left `error` falsy and let the `noError`
    // criterion pass on a crashed run. Always record a non-empty string.
    error = err instanceof Error ? err.message || String(err) : String(err);
  }

  const latencyMs = Date.now() - startTime;

  return {
    caseId: evalCase.id,
    model,
    events,
    fullText,
    toolCalls,
    toolResults,
    turnCount,
    inputTokens,
    outputTokens,
    latencyMs,
    error,
  };
}
179
+
180
/**
 * Score an eval run against its case's criteria.
 *
 * Each criterion's scoring function is evaluated against the run result and
 * folded into a weighted average, reported as an integer 0-100. Scores are
 * clamped to the documented 0.0-1.0 range before weighting so a misbehaving
 * scorer cannot push the total outside 0-100.
 *
 * ROBUSTNESS FIX: scoring functions are user-supplied code; previously a
 * throwing scorer aborted the entire eval run. It is now recorded as a
 * failed criterion (pass: false, score: 0) with the error in `reason`.
 *
 * @param {EvalCase} evalCase
 * @param {EvalRunResult} runResult
 * @returns {EvalScore}
 */
export function scoreEval(evalCase, runResult) {
  const criteriaResults = {};
  let weightedSum = 0;
  let totalWeight = 0;

  for (const criterion of evalCase.criteria) {
    let result;
    try {
      result = criterion.score(runResult);
    } catch (err) {
      // User-supplied scorer threw — count it as a hard failure.
      result = {
        pass: false,
        score: 0,
        reason: `scoring function threw: ${err instanceof Error ? err.message : String(err)}`,
      };
    }
    criteriaResults[criterion.id] = result;
    // Clamp to the documented [0, 1] contract.
    const clamped = Math.min(1, Math.max(0, result.score));
    weightedSum += clamped * criterion.weight;
    totalWeight += criterion.weight;
  }

  return {
    caseId: evalCase.id,
    model: runResult.model,
    // Guard against division by zero for a case with no criteria.
    totalScore: totalWeight > 0 ? Math.round((weightedSum / totalWeight) * 100) : 0,
    criteria: criteriaResults,
    raw: runResult,
  };
}
207
+
208
/**
 * Run a set of eval cases across a set of model adapters.
 *
 * Cases run sequentially (per adapter, then per case) so results stay
 * deterministic and comparable across models.
 *
 * @param {{
 *   cases: EvalCase[],
 *   adapters: { name: string, adapter: object, config?: object }[],
 * }} params
 * @returns {Promise<EvalScore[]>}
 */
export async function runEvals({ cases, adapters }) {
  const scored = [];

  for (const entry of adapters) {
    const { name: model, adapter, config } = entry;
    for (const evalCase of cases) {
      const run = await runSingleEval(evalCase, { adapter, model, config });
      scored.push(scoreEval(evalCase, run));
    }
  }

  return scored;
}
230
+
231
+ // ─── Baseline Management ─────────────────────────────────────
232
+
233
/**
 * Persist eval scores as a named baseline JSON file.
 *
 * The raw run data (events, full text) is stripped so baseline files stay
 * small; only per-case scores and criteria breakdowns are kept.
 *
 * @param {EvalScore[]} scores
 * @param {string} baselineDir — directory to store baselines (created if missing)
 * @param {string} [name] — baseline name (default: timestamp-derived)
 * @returns {string} path of the written baseline file
 */
export function saveBaseline(scores, baselineDir, name) {
  mkdirSync(baselineDir, { recursive: true });

  // Colons/dots are stripped from the timestamp for filesystem safety.
  const fileStem = name || new Date().toISOString().replace(/[:.]/g, '-');
  const outPath = join(baselineDir, `${fileStem}.json`);

  // Keep only the comparable fields; omit `raw` to keep baselines small.
  const slimScores = scores.map(({ caseId, model, totalScore, criteria }) => ({
    caseId,
    model,
    totalScore,
    criteria,
  }));

  const payload = {
    timestamp: new Date().toISOString(),
    name: name || 'unnamed',
    scores: slimScores,
  };

  writeFileSync(outPath, JSON.stringify(payload, null, 2));
  return outPath;
}
257
+
258
/**
 * Load a previously saved baseline file.
 *
 * @param {string} path — baseline JSON file written by saveBaseline()
 * @returns {object} parsed baseline ({ timestamp, name, scores })
 */
export function loadBaseline(path) {
  const raw = readFileSync(path, 'utf8');
  return JSON.parse(raw);
}
267
+
268
/**
 * Compare current eval scores against a previously saved baseline.
 *
 * Buckets every current score as a regression (dropped more than `threshold`
 * points), an improvement (gained more than `threshold`), or unchanged.
 * Cases absent from the baseline land in `unchanged` with `status: 'new'`.
 *
 * @param {EvalScore[]} current
 * @param {object} baseline — loaded baseline object
 * @param {number} [threshold=5] — score drop threshold to flag as regression
 * @returns {{ regressions: object[], improvements: object[], unchanged: object[] }}
 */
export function compareToBaseline(current, baseline, threshold = 5) {
  // Index baseline entries by model+case for O(1) lookup.
  const indexed = new Map(baseline.scores.map((s) => [`${s.model}::${s.caseId}`, s]));

  const buckets = { regressions: [], improvements: [], unchanged: [] };

  for (const score of current) {
    const base = indexed.get(`${score.model}::${score.caseId}`);

    if (base === undefined) {
      // Case did not exist in the baseline — nothing to compare against.
      buckets.unchanged.push({ ...score, baseScore: null, delta: null, status: 'new' });
      continue;
    }

    const delta = score.totalScore - base.totalScore;
    const summary = {
      caseId: score.caseId,
      model: score.model,
      currentScore: score.totalScore,
      baseScore: base.totalScore,
      delta,
    };

    if (delta < -threshold) {
      // Regressions also carry per-criterion detail for debugging.
      buckets.regressions.push({
        ...summary,
        criteria: score.criteria,
        baseCriteria: base.criteria,
      });
    } else if (delta > threshold) {
      buckets.improvements.push(summary);
    } else {
      buckets.unchanged.push(summary);
    }
  }

  return buckets;
}
329
+
330
+ // ─── Display ─────────────────────────────────────────────────
331
+
332
/**
 * Print eval results as a formatted per-model report.
 *
 * @param {EvalScore[]} scores
 */
export function printResults(scores) {
  // Bucket scores by model name, preserving first-seen order.
  const perModel = new Map();
  scores.forEach((s) => {
    const bucket = perModel.get(s.model);
    if (bucket) {
      bucket.push(s);
    } else {
      perModel.set(s.model, [s]);
    }
  });

  const rule = '═'.repeat(60);

  for (const [model, modelScores] of perModel) {
    console.log(`\n${rule}`);
    console.log(` Model: ${model}`);
    console.log(rule);

    let total = 0;
    for (const s of modelScores) {
      total += s.totalScore;
      const icon = s.totalScore >= 80 ? '✅' : s.totalScore >= 50 ? '⚠️' : '❌';
      console.log(`\n ${icon} ${s.caseId} — ${s.totalScore}/100`);

      for (const [critId, crit] of Object.entries(s.criteria)) {
        const mark = crit.pass ? ' ✓' : ' ✗';
        const suffix = crit.reason ? ` — ${crit.reason}` : '';
        console.log(` ${mark} ${critId}: ${Math.round(crit.score * 100)}%${suffix}`);
      }
    }

    console.log(`\n Average: ${Math.round(total / modelScores.length)}/100`);
  }
}
364
+
365
/**
 * Print baseline comparison results, then a one-line pass/fail verdict.
 *
 * @param {{ regressions: object[], improvements: object[], unchanged: object[] }} comparison
 */
export function printComparison(comparison) {
  const { regressions, improvements, unchanged } = comparison;
  const rule = '═'.repeat(60);

  console.log(`\n${rule}`);
  console.log(' Baseline Comparison');
  console.log(rule);

  if (regressions.length) {
    console.log(`\n 🔴 REGRESSIONS (${regressions.length}):`);
    regressions.forEach((r) => {
      console.log(` ${r.model} :: ${r.caseId}: ${r.baseScore} → ${r.currentScore} (${r.delta})`);
    });
  }

  if (improvements.length) {
    console.log(`\n 🟢 IMPROVEMENTS (${improvements.length}):`);
    improvements.forEach((i) => {
      console.log(` ${i.model} :: ${i.caseId}: ${i.baseScore} → ${i.currentScore} (+${i.delta})`);
    });
  }

  console.log(`\n ⚪ Unchanged: ${unchanged.length} cases`);

  // Verdict line: any regression marks the eval run as failed.
  if (regressions.length) {
    console.log('\n ⛔ REGRESSION DETECTED — eval failed');
  } else {
    console.log('\n ✅ No regressions detected');
  }
}
399
+
400
+ // ─── Criterion Helpers ───────────────────────────────────────
401
+
402
/** Criterion: the run completed without recording any error. */
export const noError = {
  id: 'no-error',
  description: 'No error during execution',
  weight: 10,
  score(result) {
    const failed = Boolean(result.error);
    return {
      pass: !failed,
      score: failed ? 0 : 1,
      reason: result.error || undefined,
    };
  },
};
413
+
414
/** Criterion: the response text contains `text` (case-insensitive). */
export function containsText(text, opts = {}) {
  const needle = text.toLowerCase();
  return {
    id: opts.id || `contains-${text.slice(0, 20)}`,
    description: opts.description || `Response contains "${text}"`,
    weight: opts.weight || 5,
    score(result) {
      const hit = result.fullText.toLowerCase().includes(needle);
      return { pass: hit, score: hit ? 1 : 0 };
    },
  };
}
426
+
427
/** Criterion: tool `toolName` appears among the run's tool calls. */
export function toolWasCalled(toolName, opts = {}) {
  return {
    id: opts.id || `tool-called-${toolName}`,
    description: opts.description || `Tool "${toolName}" was called`,
    weight: opts.weight || 8,
    score(result) {
      const match = result.toolCalls.find((tc) => tc.name === toolName);
      return match ? { pass: true, score: 1 } : { pass: false, score: 0 };
    },
  };
}
439
+
440
/**
 * Criterion: the first call to `toolName` has an input accepted by
 * `inputMatcher` (a predicate over the tool_call event's `input`).
 */
export function toolCalledWith(toolName, inputMatcher, opts = {}) {
  return {
    id: opts.id || `tool-input-${toolName}`,
    description: opts.description || `Tool "${toolName}" called with correct input`,
    weight: opts.weight || 6,
    score(result) {
      const call = result.toolCalls.find((tc) => tc.name === toolName);
      if (call === undefined) {
        return { pass: false, score: 0, reason: 'Tool not called' };
      }
      const ok = inputMatcher(call.input);
      return { pass: ok, score: ok ? 1 : 0 };
    },
  };
}
454
+
455
/** Criterion: tool `toolName` must NOT appear among the run's tool calls. */
export function toolNotCalled(toolName, opts = {}) {
  return {
    id: opts.id || `tool-not-called-${toolName}`,
    description: opts.description || `Tool "${toolName}" was NOT called`,
    weight: opts.weight || 5,
    score(result) {
      const invoked = result.toolCalls.some((tc) => tc.name === toolName);
      return invoked ? { pass: false, score: 0 } : { pass: true, score: 1 };
    },
  };
}
467
+
468
/**
 * Criterion: the first tool_end result for `toolName` exists and is not an
 * error; on failure the tool's output is surfaced as the reason.
 */
export function toolSucceeded(toolName, opts = {}) {
  return {
    id: opts.id || `tool-success-${toolName}`,
    description: opts.description || `Tool "${toolName}" succeeded`,
    weight: opts.weight || 7,
    score(result) {
      const entry = result.toolResults.find((tr) => tr.name === toolName);
      if (!entry) {
        return { pass: false, score: 0, reason: 'Tool not found in results' };
      }
      if (entry.isError) {
        return { pass: false, score: 0, reason: entry.output };
      }
      return { pass: true, score: 1, reason: undefined };
    },
  };
}
481
+
482
/** Criterion: the run's turn count is within [min, max] inclusive. */
export function turnCountInRange(min, max, opts = {}) {
  return {
    id: opts.id || `turns-${min}-${max}`,
    description: opts.description || `Turn count between ${min} and ${max}`,
    weight: opts.weight || 3,
    score(result) {
      const { turnCount } = result;
      const ok = turnCount >= min && turnCount <= max;
      // The actual count is always reported, pass or fail.
      return { pass: ok, score: ok ? 1 : 0, reason: `${turnCount} turns` };
    },
  };
}
494
+
495
/** Criterion: the response text must NOT contain `text` (case-insensitive). */
export function doesNotContain(text, opts = {}) {
  const needle = text.toLowerCase();
  return {
    id: opts.id || `not-contains-${text.slice(0, 20)}`,
    description: opts.description || `Response does NOT contain "${text}"`,
    weight: opts.weight || 4,
    score(result) {
      const present = result.fullText.toLowerCase().includes(needle);
      return { pass: !present, score: present ? 0 : 1 };
    },
  };
}
507
+
508
/** Criterion: response length in characters is within [min, max] inclusive. */
export function responseLengthInRange(min, max, opts = {}) {
  return {
    id: opts.id || `length-${min}-${max}`,
    description: opts.description || `Response length between ${min} and ${max} chars`,
    weight: opts.weight || 2,
    score(result) {
      const { length } = result.fullText;
      const ok = length >= min && length <= max;
      // The measured length is always reported, pass or fail.
      return { pass: ok, score: ok ? 1 : 0, reason: `${length} chars` };
    },
  };
}
521
+
522
/** Build a criterion from an arbitrary scoring function. */
export function custom(id, description, weight, scoreFn) {
  const criterion = { id, description, weight, score: scoreFn };
  return criterion;
}
package/unify/index.js CHANGED
@@ -5,7 +5,7 @@
5
5
  */
6
6
 
7
7
  export { initYeaftDir, DEFAULT_YEAFT_DIR } from './init.js';
8
- export { loadConfig, parseFrontmatter } from './config.js';
8
+ export { loadConfig, parseFrontmatter, loadMCPConfig } from './config.js';
9
9
  export { DebugTrace, NullTrace, createTrace } from './debug-trace.js';
10
10
  export {
11
11
  LLMAdapter,
@@ -36,4 +36,5 @@ export { MCPManager, createMCPManager } from './mcp.js';
36
36
  export { SkillManager, createSkillManager, parseSkill, serializeSkill } from './skills.js';
37
37
  export { defineTool } from './tools/types.js';
38
38
  export { ToolRegistry, createEmptyRegistry } from './tools/registry.js';
39
+ export { loadSession } from './session.js';
39
40
 
package/unify/prompts.js CHANGED
@@ -61,6 +61,7 @@ export function buildSystemPrompt({
61
61
  toolNames = [],
62
62
  memory,
63
63
  compactSummary,
64
+ skillContent,
64
65
  } = {}) {
65
66
  // Fallback to English for unknown languages
66
67
  const lang = PROMPTS[language] || PROMPTS.en;
@@ -81,6 +82,11 @@ export function buildSystemPrompt({
81
82
  parts.push(lang.tools(toolNames.join(', ')));
82
83
  }
83
84
 
85
+ // ─── Skills Section ─────────────────────────────────────
86
+ if (skillContent) {
87
+ parts.push(skillContent);
88
+ }
89
+
84
90
  // ─── Memory Section ─────────────────────────────────────
85
91
  if (memory && (memory.profile || (memory.entries && memory.entries.length > 0))) {
86
92
  const memoryParts = [lang.memoryHeader];