@yeaft/webchat-agent 0.1.410 → 0.1.412
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/unify/config.js +36 -0
- package/unify/engine.js +124 -16
- package/unify/eval/cases/e2e.js +154 -0
- package/unify/eval/cases/memory.js +182 -0
- package/unify/eval/cases/skills.js +51 -0
- package/unify/eval/cases/tool-use.js +356 -0
- package/unify/eval/run-eval.js +250 -0
- package/unify/eval/runner.js +525 -0
- package/unify/index.js +2 -1
- package/unify/prompts.js +6 -0
- package/unify/session.js +191 -0
package/package.json
CHANGED
package/unify/config.js
CHANGED
|
@@ -267,3 +267,39 @@ export function loadConfig(overrides = {}) {
|
|
|
267
267
|
|
|
268
268
|
return config;
|
|
269
269
|
}
|
|
270
|
+
|
|
271
|
+
/**
 * Load MCP server configuration from `<yeaftDir>/mcp.json`.
 *
 * JSON format (frontmatter parser can't handle nested objects):
 *   {
 *     "servers": [
 *       {
 *         "name": "github",
 *         "command": "npx",
 *         "args": ["@mcp/github"],
 *         "env": { "GITHUB_TOKEN": "ghp_..." }
 *       }
 *     ]
 *   }
 *
 * Loading is best-effort: a missing file, unreadable file, invalid JSON, or a
 * document without a `servers` array all yield an empty server list. Entries
 * that are not objects, or that lack `name` or `command`, are dropped
 * individually so one malformed entry cannot discard the valid ones.
 *
 * @param {string} yeaftDir — e.g. ~/.yeaft
 * @returns {{ servers: object[] }}
 */
export function loadMCPConfig(yeaftDir) {
  const mcpPath = join(yeaftDir, 'mcp.json');
  if (!existsSync(mcpPath)) return { servers: [] };

  try {
    const parsed = JSON.parse(readFileSync(mcpPath, 'utf8'));
    if (!parsed || !Array.isArray(parsed.servers)) {
      return { servers: [] };
    }

    // Each server must be an object with at least name + command. The
    // object check matters: reading `.name` off a null entry would throw and
    // the catch below would then wipe out every valid server as well.
    const valid = parsed.servers.filter(
      (server) => server !== null && typeof server === 'object' && server.name && server.command,
    );
    return { servers: valid };
  } catch {
    // Deliberate best-effort: a broken config file means "no MCP servers".
    return { servers: [] };
  }
}
|
package/unify/engine.js
CHANGED
|
@@ -22,6 +22,7 @@ import { buildSystemPrompt } from './prompts.js';
|
|
|
22
22
|
import { LLMContextError } from './llm/adapter.js';
|
|
23
23
|
import { recall } from './memory/recall.js';
|
|
24
24
|
import { shouldConsolidate, consolidate } from './memory/consolidate.js';
|
|
25
|
+
import { runStopHooks } from './stop-hooks.js';
|
|
25
26
|
|
|
26
27
|
/** Maximum number of turns before the engine stops to prevent infinite loops. */
|
|
27
28
|
const MAX_TURNS = 25;
|
|
@@ -67,16 +68,32 @@ export class Engine {
|
|
|
67
68
|
/** @type {import('./memory/store.js').MemoryStore|null} */
|
|
68
69
|
#memoryStore;
|
|
69
70
|
|
|
71
|
+
/** @type {import('./tools/registry.js').ToolRegistry|null} */
|
|
72
|
+
#toolRegistry;
|
|
73
|
+
|
|
74
|
+
/** @type {import('./skills.js').SkillManager|null} */
|
|
75
|
+
#skillManager;
|
|
76
|
+
|
|
77
|
+
/** @type {import('./mcp.js').MCPManager|null} */
|
|
78
|
+
#mcpManager;
|
|
79
|
+
|
|
80
|
+
/** @type {string|null} */
|
|
81
|
+
#yeaftDir;
|
|
82
|
+
|
|
70
83
|
/**
|
|
71
84
|
* @param {{
|
|
72
85
|
* adapter: import('./llm/adapter.js').LLMAdapter,
|
|
73
86
|
* trace: object,
|
|
74
87
|
* config: object,
|
|
75
88
|
* conversationStore?: import('./conversation/persist.js').ConversationStore,
|
|
76
|
-
* memoryStore?: import('./memory/store.js').MemoryStore
|
|
89
|
+
* memoryStore?: import('./memory/store.js').MemoryStore,
|
|
90
|
+
* toolRegistry?: import('./tools/registry.js').ToolRegistry,
|
|
91
|
+
* skillManager?: import('./skills.js').SkillManager,
|
|
92
|
+
* mcpManager?: import('./mcp.js').MCPManager,
|
|
93
|
+
* yeaftDir?: string,
|
|
77
94
|
* }} params
|
|
78
95
|
*/
|
|
79
|
-
constructor({ adapter, trace, config, conversationStore, memoryStore }) {
|
|
96
|
+
constructor({ adapter, trace, config, conversationStore, memoryStore, toolRegistry, skillManager, mcpManager, yeaftDir }) {
|
|
80
97
|
this.#adapter = adapter;
|
|
81
98
|
this.#trace = trace;
|
|
82
99
|
this.#config = config;
|
|
@@ -84,6 +101,10 @@ export class Engine {
|
|
|
84
101
|
this.#traceId = randomUUID();
|
|
85
102
|
this.#conversationStore = conversationStore || null;
|
|
86
103
|
this.#memoryStore = memoryStore || null;
|
|
104
|
+
this.#toolRegistry = toolRegistry || null;
|
|
105
|
+
this.#skillManager = skillManager || null;
|
|
106
|
+
this.#mcpManager = mcpManager || null;
|
|
107
|
+
this.#yeaftDir = yeaftDir || null;
|
|
87
108
|
}
|
|
88
109
|
|
|
89
110
|
/**
|
|
@@ -106,10 +127,16 @@ export class Engine {
|
|
|
106
127
|
|
|
107
128
|
/**
|
|
108
129
|
* Get the list of registered tool definitions (for passing to the adapter).
|
|
130
|
+
* Prefers ToolRegistry (mode-aware) when available, falls back to legacy #tools Map.
|
|
109
131
|
*
|
|
132
|
+
* @param {string} [mode]
|
|
110
133
|
* @returns {import('./llm/adapter.js').UnifiedToolDef[]}
|
|
111
134
|
*/
|
|
112
|
-
#getToolDefs() {
|
|
135
|
+
#getToolDefs(mode) {
|
|
136
|
+
if (this.#toolRegistry) {
|
|
137
|
+
return this.#toolRegistry.getToolDefs(mode || 'chat');
|
|
138
|
+
}
|
|
139
|
+
// Legacy path: no mode filtering
|
|
113
140
|
const defs = [];
|
|
114
141
|
for (const [, tool] of this.#tools) {
|
|
115
142
|
defs.push({
|
|
@@ -122,23 +149,58 @@ export class Engine {
|
|
|
122
149
|
}
|
|
123
150
|
|
|
124
151
|
/**
|
|
125
|
-
* Build the system prompt with memory and
|
|
152
|
+
* Build the system prompt with memory, compact summary, and skill content.
|
|
126
153
|
*
|
|
127
154
|
* @param {string} mode
|
|
128
155
|
* @param {{ profile?: string, entries?: object[] }} [memory]
|
|
129
156
|
* @param {string} [compactSummary]
|
|
157
|
+
* @param {string} [prompt] — user prompt (for skill relevance matching)
|
|
130
158
|
* @returns {string}
|
|
131
159
|
*/
|
|
132
|
-
#buildSystemPrompt(mode, memory, compactSummary) {
|
|
160
|
+
#buildSystemPrompt(mode, memory, compactSummary, prompt) {
|
|
161
|
+
// Get relevant skill content if SkillManager is wired
|
|
162
|
+
let skillContent = '';
|
|
163
|
+
if (this.#skillManager && prompt) {
|
|
164
|
+
skillContent = this.#skillManager.getRelevantPromptContent(prompt, mode);
|
|
165
|
+
}
|
|
166
|
+
|
|
167
|
+
// Get tool names from the appropriate source
|
|
168
|
+
const toolNames = this.#toolRegistry
|
|
169
|
+
? this.#toolRegistry.getToolNames(mode || 'chat')
|
|
170
|
+
: Array.from(this.#tools.keys());
|
|
171
|
+
|
|
133
172
|
return buildSystemPrompt({
|
|
134
173
|
language: this.#config.language || 'en',
|
|
135
174
|
mode,
|
|
136
|
-
toolNames
|
|
175
|
+
toolNames,
|
|
137
176
|
memory,
|
|
138
177
|
compactSummary,
|
|
178
|
+
skillContent,
|
|
139
179
|
});
|
|
140
180
|
}
|
|
141
181
|
|
|
182
|
+
/**
|
|
183
|
+
* Build the full tool context for Phase 5 tools.
|
|
184
|
+
*
|
|
185
|
+
* @param {AbortSignal} [signal]
|
|
186
|
+
* @param {string} [mode]
|
|
187
|
+
* @returns {object}
|
|
188
|
+
*/
|
|
189
|
+
#buildToolContext(signal, mode) {
|
|
190
|
+
return {
|
|
191
|
+
signal,
|
|
192
|
+
yeaftDir: this.#yeaftDir,
|
|
193
|
+
cwd: process.cwd(),
|
|
194
|
+
mcpManager: this.#mcpManager,
|
|
195
|
+
skillManager: this.#skillManager,
|
|
196
|
+
memoryStore: this.#memoryStore,
|
|
197
|
+
conversationStore: this.#conversationStore,
|
|
198
|
+
adapter: this.#adapter,
|
|
199
|
+
config: this.#config,
|
|
200
|
+
mode,
|
|
201
|
+
};
|
|
202
|
+
}
|
|
203
|
+
|
|
142
204
|
/**
|
|
143
205
|
* Perform memory recall for a given prompt.
|
|
144
206
|
*
|
|
@@ -262,7 +324,7 @@ export class Engine {
|
|
|
262
324
|
}
|
|
263
325
|
|
|
264
326
|
const compactSummary = this.#getCompactSummary();
|
|
265
|
-
const systemPrompt = this.#buildSystemPrompt(mode, memory, compactSummary);
|
|
327
|
+
const systemPrompt = this.#buildSystemPrompt(mode, memory, compactSummary, prompt);
|
|
266
328
|
|
|
267
329
|
// Build conversation: existing messages + new user message
|
|
268
330
|
const conversationMessages = [
|
|
@@ -270,7 +332,7 @@ export class Engine {
|
|
|
270
332
|
{ role: 'user', content: prompt },
|
|
271
333
|
];
|
|
272
334
|
|
|
273
|
-
const toolDefs = this.#getToolDefs();
|
|
335
|
+
const toolDefs = this.#getToolDefs(mode);
|
|
274
336
|
let turnNumber = 0;
|
|
275
337
|
let continueTurns = 0; // auto-continue counter
|
|
276
338
|
let fullResponseText = '';
|
|
@@ -416,33 +478,66 @@ export class Engine {
|
|
|
416
478
|
if (stopReason !== 'tool_use' || toolCalls.length === 0) {
|
|
417
479
|
yield { type: 'turn_end', turnNumber, stopReason };
|
|
418
480
|
|
|
419
|
-
// ─── Post-query:
|
|
420
|
-
this.#
|
|
481
|
+
// ─── Post-query: StopHooks or Legacy ─────────────
|
|
482
|
+
if (this.#yeaftDir && this.#conversationStore) {
|
|
483
|
+
// Full pipeline: persist + consolidate + dream gate
|
|
484
|
+
const hookResult = await runStopHooks({
|
|
485
|
+
yeaftDir: this.#yeaftDir,
|
|
486
|
+
mode,
|
|
487
|
+
conversationStore: this.#conversationStore,
|
|
488
|
+
memoryStore: this.#memoryStore,
|
|
489
|
+
adapter: this.#adapter,
|
|
490
|
+
config: this.#config,
|
|
491
|
+
messages: conversationMessages,
|
|
492
|
+
trace: this.#trace,
|
|
493
|
+
});
|
|
494
|
+
|
|
495
|
+
if (hookResult.consolidated) {
|
|
496
|
+
yield { type: 'consolidate', archivedCount: 0, extractedCount: 0 };
|
|
497
|
+
}
|
|
498
|
+
if (hookResult.dreamTriggered) {
|
|
499
|
+
yield { type: 'dream_triggered' };
|
|
500
|
+
}
|
|
501
|
+
} else {
|
|
502
|
+
// Legacy path (no yeaftDir → use old behavior)
|
|
503
|
+
this.#persistMessages(prompt, fullResponseText, mode, assistantMsg.toolCalls);
|
|
421
504
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
505
|
+
const consolidated = await this.#maybeConsolidate();
|
|
506
|
+
if (consolidated && consolidated.archivedCount > 0) {
|
|
507
|
+
yield { type: 'consolidate', archivedCount: consolidated.archivedCount, extractedCount: consolidated.extractedCount };
|
|
508
|
+
}
|
|
425
509
|
}
|
|
426
510
|
|
|
427
511
|
break;
|
|
428
512
|
}
|
|
429
513
|
|
|
430
514
|
// Execute tool calls and feed results back
|
|
515
|
+
const toolCtx = this.#buildToolContext(signal, mode);
|
|
516
|
+
|
|
431
517
|
for (const tc of toolCalls) {
|
|
432
|
-
const tool = this.#tools.get(tc.name);
|
|
433
518
|
const toolStartTime = Date.now();
|
|
434
519
|
|
|
435
520
|
let output;
|
|
436
521
|
let isError = false;
|
|
437
522
|
|
|
438
|
-
|
|
523
|
+
// Resolve tool: prefer ToolRegistry, fallback to legacy #tools Map
|
|
524
|
+
const hasTool = this.#toolRegistry
|
|
525
|
+
? this.#toolRegistry.has(tc.name)
|
|
526
|
+
: this.#tools.has(tc.name);
|
|
527
|
+
|
|
528
|
+
if (!hasTool) {
|
|
439
529
|
output = `Error: unknown tool "${tc.name}"`;
|
|
440
530
|
isError = true;
|
|
441
531
|
yield { type: 'tool_end', id: tc.id, name: tc.name, output, isError: true };
|
|
442
532
|
} else {
|
|
443
533
|
try {
|
|
444
534
|
yield { type: 'tool_start', id: tc.id, name: tc.name, input: tc.input };
|
|
445
|
-
|
|
535
|
+
if (this.#toolRegistry) {
|
|
536
|
+
output = await this.#toolRegistry.execute(tc.name, tc.input, toolCtx);
|
|
537
|
+
} else {
|
|
538
|
+
const tool = this.#tools.get(tc.name);
|
|
539
|
+
output = await tool.execute(tc.input, { signal });
|
|
540
|
+
}
|
|
446
541
|
yield { type: 'tool_end', id: tc.id, name: tc.name, output, isError: false };
|
|
447
542
|
} catch (err) {
|
|
448
543
|
output = `Error: ${err.message}`;
|
|
@@ -490,6 +585,7 @@ export class Engine {
|
|
|
490
585
|
* @returns {string[]}
|
|
491
586
|
*/
|
|
492
587
|
get toolNames() {
|
|
588
|
+
if (this.#toolRegistry) return this.#toolRegistry.names;
|
|
493
589
|
return Array.from(this.#tools.keys());
|
|
494
590
|
}
|
|
495
591
|
|
|
@@ -508,4 +604,16 @@ export class Engine {
|
|
|
508
604
|
get memoryStore() {
|
|
509
605
|
return this.#memoryStore;
|
|
510
606
|
}
|
|
607
|
+
|
|
608
|
+
/** @returns {import('./tools/registry.js').ToolRegistry|null} */
|
|
609
|
+
get toolRegistry() { return this.#toolRegistry; }
|
|
610
|
+
|
|
611
|
+
/** @returns {import('./skills.js').SkillManager|null} */
|
|
612
|
+
get skillManager() { return this.#skillManager; }
|
|
613
|
+
|
|
614
|
+
/** @returns {import('./mcp.js').MCPManager|null} */
|
|
615
|
+
get mcpManager() { return this.#mcpManager; }
|
|
616
|
+
|
|
617
|
+
/** @returns {string|null} */
|
|
618
|
+
get yeaftDir() { return this.#yeaftDir; }
|
|
511
619
|
}
|
|
@@ -0,0 +1,154 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* eval/cases/e2e.js — End-to-end session eval cases
|
|
3
|
+
*
|
|
4
|
+
* Tests the full pipeline: prompt → recall → system prompt → LLM → tools → response.
|
|
5
|
+
* These cases verify that the integration holds together correctly.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import { defineTool } from '../../tools/types.js';
|
|
9
|
+
import {
|
|
10
|
+
noError,
|
|
11
|
+
containsText,
|
|
12
|
+
toolWasCalled,
|
|
13
|
+
toolNotCalled,
|
|
14
|
+
toolSucceeded,
|
|
15
|
+
turnCountInRange,
|
|
16
|
+
responseLengthInRange,
|
|
17
|
+
custom,
|
|
18
|
+
} from '../runner.js';
|
|
19
|
+
|
|
20
|
+
// ─── Mock Tools ──────────────────────────────────────────────
|
|
21
|
+
|
|
22
|
+
// Fixture data shared by both mock tools. A Map (rather than a plain object)
// keeps lookups from resolving inherited keys such as "constructor", which
// would otherwise serialize to `undefined` instead of the error payload.
const projectCatalog = new Map([
  ['my-app', { name: 'my-app', language: 'TypeScript', framework: 'Express', tests: 142 }],
  ['shared-lib', { name: 'shared-lib', language: 'TypeScript', framework: 'none', tests: 67 }],
  ['docs-site', { name: 'docs-site', language: 'MDX', framework: 'Next.js', tests: 23 }],
]);

/** Mock tool: returns the names of all fixture projects as a JSON string. */
const listProjectsTool = defineTool({
  name: 'list_projects',
  description: 'List all projects in the workspace.',
  parameters: { type: 'object', properties: {} },
  modes: ['chat', 'work'],
  async execute() {
    return JSON.stringify({
      projects: [...projectCatalog.keys()],
    });
  },
});

/**
 * Mock tool: returns one fixture project's details as a JSON string, or
 * `{ error: "Unknown project: <name>" }` when the name is not in the catalog.
 */
const getProjectInfoTool = defineTool({
  name: 'get_project_info',
  description: 'Get detailed information about a specific project.',
  parameters: {
    type: 'object',
    properties: {
      name: { type: 'string', description: 'Project name' },
    },
    required: ['name'],
  },
  modes: ['chat', 'work'],
  async execute(input) {
    const info = projectCatalog.get(input.name);
    return JSON.stringify(info || { error: `Unknown project: ${input.name}` });
  },
});

const e2eTools = [listProjectsTool, getProjectInfoTool];
|
|
56
|
+
|
|
57
|
+
// ─── Eval Cases ──────────────────────────────────────────────
|
|
58
|
+
|
|
59
|
+
/**
 * End-to-end eval cases. Each case supplies a prompt (optionally with prior
 * messages and registry tools) plus weighted criteria to score the result.
 */
export const e2eCases = [
  // ─── Conversation Coherence ───────────────────────────
  {
    id: 'e2e-conversation-context',
    suite: 'e2e',
    description: 'Model should use conversation history for context',
    prompt: 'What language is it written in?',
    messages: [
      { role: 'user', content: 'Tell me about the my-app project' },
      { role: 'assistant', content: 'The my-app project is a TypeScript application built with Express. It has 142 tests.' },
    ],
    registryTools: e2eTools,
    criteria: [
      noError,
      containsText('TypeScript', { weight: 8, id: 'remembers-language' }),
      turnCountInRange(1, 2, { weight: 3 }),
    ],
  },

  // ─── Tool Chain ───────────────────────────────────────
  {
    id: 'e2e-tool-chain-list-then-detail',
    suite: 'e2e',
    description: 'Model should list projects then get details about a specific one',
    prompt: 'Show me all projects and tell me about the one with the most tests',
    registryTools: e2eTools,
    criteria: [
      noError,
      toolWasCalled('list_projects', { weight: 7 }),
      toolWasCalled('get_project_info', { weight: 7 }),
      containsText('my-app', { weight: 5, id: 'identifies-most-tested' }),
      containsText('142', { weight: 5, id: 'mentions-test-count' }),
    ],
  },

  // ─── Instruction Following ────────────────────────────
  {
    id: 'e2e-format-json',
    suite: 'e2e',
    description: 'Model should follow format instructions',
    prompt: 'List three programming languages. Respond only with a JSON array of strings, nothing else.',
    criteria: [
      noError,
      custom('valid-json-array', 'Response is a valid JSON array', 10, (result) => {
        try {
          // Take the span from the first "[" to the last "]" so surrounding
          // prose or code fences do not break parsing.
          const found = /\[[\s\S]*\]/.exec(result.fullText.trim());
          if (found === null) {
            return { pass: false, score: 0, reason: 'No JSON array found' };
          }

          const parsed = JSON.parse(found[0]);
          const isThreeStrings =
            Array.isArray(parsed) &&
            parsed.length === 3 &&
            parsed.every((item) => typeof item === 'string');

          if (isThreeStrings) {
            return { pass: true, score: 1, reason: undefined };
          }
          // Parsable but wrong shape: fails, with partial credit.
          return { pass: false, score: 0.5, reason: `Got: ${JSON.stringify(parsed)}` };
        } catch {
          return { pass: false, score: 0, reason: 'Not valid JSON' };
        }
      }),
    ],
  },

  // ─── Response Quality ─────────────────────────────────
  {
    id: 'e2e-concise-answer',
    suite: 'e2e',
    description: 'Model should give a concise answer for simple question',
    prompt: 'What does the acronym HTTP stand for?',
    criteria: [
      noError,
      containsText('Hypertext Transfer Protocol', { weight: 8 }),
      responseLengthInRange(10, 500, { weight: 5, id: 'not-too-long' }),
      toolNotCalled('search', { weight: 3 }),
    ],
  },

  // ─── Language Handling ────────────────────────────────
  {
    id: 'e2e-chinese-response',
    suite: 'e2e',
    description: 'Model should respond in Chinese when prompted in Chinese',
    prompt: '用中文简单解释什么是 API',
    criteria: [
      noError,
      custom('has-chinese', 'Response contains Chinese characters', 8, (result) => {
        // Any character in U+4E00–U+9FFF counts as Chinese here.
        if (/[\u4e00-\u9fff]/.test(result.fullText)) {
          return { pass: true, score: 1 };
        }
        return { pass: false, score: 0 };
      }),
      containsText('API', { weight: 5 }),
    ],
  },
];
|
|
@@ -0,0 +1,182 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* eval/cases/memory.js — Memory recall eval cases
|
|
3
|
+
*
|
|
4
|
+
* Tests the memory recall pipeline:
|
|
5
|
+
* - Keyword extraction accuracy
|
|
6
|
+
* - Scope + tag filtering
|
|
7
|
+
* - LLM selection (when >7 candidates)
|
|
8
|
+
* - Fingerprint caching
|
|
9
|
+
* - Memory injection into system prompt
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import {
|
|
13
|
+
noError,
|
|
14
|
+
containsText,
|
|
15
|
+
custom,
|
|
16
|
+
} from '../runner.js';
|
|
17
|
+
|
|
18
|
+
// ─── Memory Recall Test Helpers ──────────────────────────────
|
|
19
|
+
|
|
20
|
+
/**
 * Build an in-memory stand-in for a MemoryStore, backed by a fixed list of
 * entries. Read methods answer from `entries`; write methods are no-ops that
 * return fixed placeholder values.
 *
 * @param {object[]} entries — entries exposing `name`, `scope`, `tags` and `content`
 * @returns {object} mock store exposing the read/write methods below
 */
function createMockMemoryStore(entries) {
  // Simple relevance score used by findByFilter: +3 for an exact scope match,
  // +1 for a global entry when a scope was requested, +1 per requested tag
  // that the entry carries.
  const scoreEntry = (entry, scope, tags) => {
    let score = 0;
    if (scope) {
      if (entry.scope === scope) score += 3;
      if (entry.scope === 'global') score += 1;
    }
    for (const tag of tags || []) {
      if (entry.tags && entry.tags.includes(tag)) score += 1;
    }
    return score;
  };

  return {
    readProfile: () => 'User is a senior TypeScript developer who prefers functional programming.',
    readEntry: (name) => entries.find((entry) => entry.name === name) || null,
    readSection: () => '',
    listEntries: () => entries,

    // The filter argument defaults to {} so a bare findByFilter() returns []
    // instead of throwing on destructuring. Results are copies of the entries
    // with an extra `_score` field, best score first, capped at `limit`.
    findByFilter: ({ scope, tags, limit = 15 } = {}) =>
      entries
        .map((entry) => ({ ...entry, _score: scoreEntry(entry, scope, tags) }))
        .filter((scored) => scored._score > 0)
        .sort((a, b) => b._score - a._score)
        .slice(0, limit),

    bumpFrequency: () => {},

    // Case-insensitive substring match against entry content or name.
    search: (keyword) =>
      entries.filter((entry) => {
        const needle = keyword.toLowerCase();
        return (
          entry.content.toLowerCase().includes(needle) ||
          entry.name.toLowerCase().includes(needle)
        );
      }),

    stats: () => ({ entryCount: entries.length, scopes: [], kinds: {} }),
    writeEntry: () => 'test-entry',
    writeEntries: () => [],
    deleteEntry: () => true,
    rebuildScopes: () => {},
    addToSection: () => {},
    writeProfile: () => {},
    clear: () => {},
  };
}
|
|
63
|
+
|
|
64
|
+
// Fixture entries for memory evals: four global-scope entries and two scoped
// to 'work/claude-web-chat', covering the preference / lesson / context /
// skill kinds.
const sampleMemoryEntries = [
  // Global preference, high importance.
  {
    name: 'typescript-strict-mode',
    kind: 'preference',
    scope: 'global',
    tags: [
      'typescript',
      'config',
      'strict',
    ],
    importance: 'high',
    frequency: 5,
    content: 'User always uses TypeScript strict mode with noImplicitAny enabled.',
    created_at: '2026-03-01T00:00:00Z',
    updated_at: '2026-04-01T00:00:00Z',
  },

  // Project-scoped preference.
  {
    name: 'prefers-vitest',
    kind: 'preference',
    scope: 'work/claude-web-chat',
    tags: [
      'testing',
      'vitest',
      'framework',
    ],
    importance: 'normal',
    frequency: 3,
    content: 'User prefers vitest over jest for testing. Uses vitest for all new projects.',
    created_at: '2026-03-15T00:00:00Z',
    updated_at: '2026-04-01T00:00:00Z',
  },

  // Global lesson, high importance.
  {
    name: 'error-handling-pattern',
    kind: 'lesson',
    scope: 'global',
    tags: [
      'error-handling',
      'typescript',
      'patterns',
    ],
    importance: 'high',
    frequency: 4,
    content: 'Always use Result<T, E> pattern instead of throwing exceptions. Wrap external API calls in try-catch and return Result.',
    created_at: '2026-02-01T00:00:00Z',
    updated_at: '2026-04-01T00:00:00Z',
  },

  // Project-scoped context.
  {
    name: 'project-structure',
    kind: 'context',
    scope: 'work/claude-web-chat',
    tags: [
      'architecture',
      'project',
      'monorepo',
    ],
    importance: 'normal',
    frequency: 2,
    content: 'Project uses monorepo with agent/, server/, web/ directories. Agent code is in agent/unify/.',
    created_at: '2026-01-01T00:00:00Z',
    updated_at: '2026-03-01T00:00:00Z',
  },

  // Global preference with the highest frequency in the set.
  {
    name: 'functional-programming',
    kind: 'preference',
    scope: 'global',
    tags: [
      'functional',
      'programming',
      'style',
    ],
    importance: 'normal',
    frequency: 6,
    content: 'User prefers functional programming: pure functions, immutable data, map/filter/reduce over loops.',
    created_at: '2026-01-15T00:00:00Z',
    updated_at: '2026-04-05T00:00:00Z',
  },

  // Global skill with the lowest frequency in the set.
  {
    name: 'api-design-rest',
    kind: 'skill',
    scope: 'global',
    tags: [
      'api',
      'rest',
      'design',
    ],
    importance: 'normal',
    frequency: 1,
    content: 'REST API conventions: use plural nouns, HTTP methods for CRUD, 2xx success, 4xx client error, 5xx server error.',
    created_at: '2026-02-15T00:00:00Z',
    updated_at: '2026-02-15T00:00:00Z',
  },
];
|
|
132
|
+
|
|
133
|
+
// ─── Eval Cases ──────────────────────────────────────────────
|
|
134
|
+
|
|
135
|
+
/**
 * Memory eval cases. Both currently act as smoke checks: they confirm the
 * run completes and report on recall only when a recall event is present.
 */
export const memoryCases = [
  // ─── Memory Injection Verification ────────────────────
  {
    id: 'memory-profile-injection',
    suite: 'memory',
    description: 'System prompt should include user profile from memory',
    prompt: 'Help me with a coding task',
    setupEngine: (engine) => {
      // Intentionally empty: Engine keeps its memoryStore in a private field,
      // so it cannot be injected from here. The intent is to verify through
      // the adapter call log that the system prompt contains memory.
    },
    criteria: [
      noError,
      custom('has-response', 'Model produces a response', 5, (result) => {
        const nonEmpty = result.fullText.length > 0;
        return { pass: nonEmpty, score: nonEmpty ? 1 : 0 };
      }),
    ],
  },

  // ─── Keyword Extraction (unit-level eval) ─────────────
  {
    id: 'memory-keyword-extraction',
    suite: 'memory',
    description: 'Keyword extraction produces relevant keywords',
    prompt: 'How should I handle TypeScript errors in my Express API?',
    criteria: [
      noError,
      // Covered at unit level; here it is observable through the recall event.
      custom('recall-event', 'Recall event emitted (if memory store provided)', 3, (result) => {
        // Informational criterion: it always passes. No recall event simply
        // earns half score.
        const recallEvent = result.events.find((event) => event.type === 'recall');
        if (recallEvent) {
          return {
            pass: true,
            score: 1,
            reason: `Recalled ${recallEvent.entryCount} entries`,
          };
        }
        return {
          pass: true,
          score: 0.5,
          reason: 'No memory store configured',
        };
      }),
    ],
  },
];
|
|
179
|
+
|
|
180
|
+
// ─── Exported for direct import in unit tests ────────────────
|
|
181
|
+
|
|
182
|
+
export { createMockMemoryStore, sampleMemoryEntries };
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* eval/cases/skills.js — Skill matching eval cases
|
|
3
|
+
*
|
|
4
|
+
* Tests whether the engine correctly:
|
|
5
|
+
* - Matches skills to relevant prompts
|
|
6
|
+
* - Injects matched skill content into system prompt
|
|
7
|
+
* - Does NOT inject irrelevant skills
|
|
8
|
+
* - Handles mode filtering correctly
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import {
|
|
12
|
+
noError,
|
|
13
|
+
containsText,
|
|
14
|
+
doesNotContain,
|
|
15
|
+
custom,
|
|
16
|
+
} from '../runner.js';
|
|
17
|
+
|
|
18
|
+
// ─── Eval Cases ──────────────────────────────────────────────
|
|
19
|
+
|
|
20
|
+
/**
 * Skill-matching eval cases. For now both only confirm that the run completes
 * and produces text; they do not yet inspect the system prompt for skills.
 */
export const skillsCases = [
  {
    id: 'skill-match-basic',
    suite: 'skills',
    description: 'Engine should inject relevant skill into system prompt',
    prompt: 'How do I set up testing for my project?',
    criteria: [
      noError,
      // Skill injection happens in the system prompt, which could be checked
      // if the adapter captured it. For now, just verify there is no crash.
      custom('produces-response', 'Model responds to the prompt', 5, (result) => {
        const longEnough = result.fullText.length > 10;
        return { pass: longEnough, score: longEnough ? 1 : 0 };
      }),
    ],
  },

  {
    id: 'skill-no-false-positive',
    suite: 'skills',
    description: 'Engine should NOT inject unrelated skills',
    prompt: 'What is the weather like?',
    criteria: [
      noError,
      custom('produces-response', 'Model responds', 5, (result) => {
        const nonEmpty = result.fullText.length > 0;
        return { pass: nonEmpty, score: nonEmpty ? 1 : 0 };
      }),
    ],
  },
];
|