@yeaft/webchat-agent 0.1.411 → 0.1.412

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@yeaft/webchat-agent",
3
- "version": "0.1.411",
3
+ "version": "0.1.412",
4
4
  "description": "Remote agent for Yeaft WebChat — connects worker machines to the central server",
5
5
  "main": "index.js",
6
6
  "type": "module",
@@ -0,0 +1,154 @@
1
+ /**
2
+ * eval/cases/e2e.js — End-to-end session eval cases
3
+ *
4
+ * Tests the full pipeline: prompt → recall → system prompt → LLM → tools → response.
5
+ * These cases verify that the integration holds together correctly.
6
+ */
7
+
8
+ import { defineTool } from '../../tools/types.js';
9
+ import {
10
+ noError,
11
+ containsText,
12
+ toolWasCalled,
13
+ toolNotCalled,
14
+ toolSucceeded,
15
+ turnCountInRange,
16
+ responseLengthInRange,
17
+ custom,
18
+ } from '../runner.js';
19
+
20
+ // ─── Mock Tools ──────────────────────────────────────────────
21
+
22
/**
 * Mock tool: reports a fixed list of three project names.
 * Takes no parameters; `execute` resolves to a JSON string of the form
 * `{"projects": [...]}`.
 */
const listProjectsTool = defineTool({
  name: 'list_projects',
  description: 'List all projects in the workspace.',
  parameters: { type: 'object', properties: {} },
  modes: ['chat', 'work'],
  execute: async () => {
    // Canned workspace contents shared by every e2e case that uses this tool.
    const projects = ['my-app', 'shared-lib', 'docs-site'];
    return JSON.stringify({ projects });
  },
});
33
+
34
/**
 * Mock tool: returns canned metadata for one of three known projects.
 *
 * `execute(input)` resolves to a JSON string: the project record when
 * `input.name` is one of the known project names, otherwise
 * `{"error": "Unknown project: <name>"}`.
 */
const getProjectInfoTool = defineTool({
  name: 'get_project_info',
  description: 'Get detailed information about a specific project.',
  parameters: {
    type: 'object',
    properties: {
      name: { type: 'string', description: 'Project name' },
    },
    required: ['name'],
  },
  modes: ['chat', 'work'],
  async execute(input) {
    const projects = {
      'my-app': { name: 'my-app', language: 'TypeScript', framework: 'Express', tests: 142 },
      'shared-lib': { name: 'shared-lib', language: 'TypeScript', framework: 'none', tests: 67 },
      'docs-site': { name: 'docs-site', language: 'MDX', framework: 'Next.js', tests: 23 },
    };
    // Own-property check: a model-supplied name such as "constructor" or
    // "toString" must not resolve to an inherited Object.prototype member.
    // JSON.stringify turns such a function into `undefined`, so the tool
    // would return no payload at all instead of the error object.
    const isKnown = Object.prototype.hasOwnProperty.call(projects, input.name);
    const payload = isKnown
      ? projects[input.name]
      : { error: `Unknown project: ${input.name}` };
    return JSON.stringify(payload);
  },
});
54
+
55
+ const e2eTools = [listProjectsTool, getProjectInfoTool];
56
+
57
+ // ─── Eval Cases ──────────────────────────────────────────────
58
+
59
/**
 * Scorer for the JSON-format case.
 *
 * Takes the span from the first "[" to the last "]" in the trimmed response,
 * parses it, and passes only when it is an array of exactly three strings.
 * A parseable array of any other shape scores 0.5; a missing or unparseable
 * span scores 0.
 */
function scoreJsonArrayOfThreeStrings(result) {
  try {
    const bracketed = result.fullText.trim().match(/\[[\s\S]*\]/);
    if (!bracketed) {
      return { pass: false, score: 0, reason: 'No JSON array found' };
    }

    const parsed = JSON.parse(bracketed[0]);
    const isThreeStrings =
      Array.isArray(parsed) &&
      parsed.length === 3 &&
      parsed.every((item) => typeof item === 'string');

    if (isThreeStrings) {
      return { pass: true, score: 1, reason: undefined };
    }
    return { pass: false, score: 0.5, reason: `Got: ${JSON.stringify(parsed)}` };
  } catch {
    // Covers both JSON.parse failures and a result without usable text.
    return { pass: false, score: 0, reason: 'Not valid JSON' };
  }
}

/** Scorer for the language case: passes when the response has a character in U+4E00–U+9FFF. */
function scoreContainsChinese(result) {
  const found = /[\u4e00-\u9fff]/.test(result.fullText);
  return found ? { pass: true, score: 1 } : { pass: false, score: 0 };
}

/** End-to-end eval cases, grouped by the behaviour each one probes. */
export const e2eCases = [
  // ─── Conversation Coherence ───────────────────────────
  {
    id: 'e2e-conversation-context',
    suite: 'e2e',
    description: 'Model should use conversation history for context',
    prompt: 'What language is it written in?',
    // Prior turns establish what "it" refers to in the prompt.
    messages: [
      { role: 'user', content: 'Tell me about the my-app project' },
      {
        role: 'assistant',
        content: 'The my-app project is a TypeScript application built with Express. It has 142 tests.',
      },
    ],
    registryTools: e2eTools,
    criteria: [
      noError,
      containsText('TypeScript', { weight: 8, id: 'remembers-language' }),
      turnCountInRange(1, 2, { weight: 3 }),
    ],
  },

  // ─── Tool Chain ───────────────────────────────────────
  {
    id: 'e2e-tool-chain-list-then-detail',
    suite: 'e2e',
    description: 'Model should list projects then get details about a specific one',
    prompt: 'Show me all projects and tell me about the one with the most tests',
    registryTools: e2eTools,
    criteria: [
      noError,
      toolWasCalled('list_projects', { weight: 7 }),
      toolWasCalled('get_project_info', { weight: 7 }),
      containsText('my-app', { weight: 5, id: 'identifies-most-tested' }),
      containsText('142', { weight: 5, id: 'mentions-test-count' }),
    ],
  },

  // ─── Instruction Following ────────────────────────────
  {
    id: 'e2e-format-json',
    suite: 'e2e',
    description: 'Model should follow format instructions',
    prompt: 'List three programming languages. Respond only with a JSON array of strings, nothing else.',
    criteria: [
      noError,
      custom('valid-json-array', 'Response is a valid JSON array', 10, scoreJsonArrayOfThreeStrings),
    ],
  },

  // ─── Response Quality ─────────────────────────────────
  {
    id: 'e2e-concise-answer',
    suite: 'e2e',
    description: 'Model should give a concise answer for simple question',
    prompt: 'What does the acronym HTTP stand for?',
    criteria: [
      noError,
      containsText('Hypertext Transfer Protocol', { weight: 8 }),
      responseLengthInRange(10, 500, { weight: 5, id: 'not-too-long' }),
      toolNotCalled('search', { weight: 3 }),
    ],
  },

  // ─── Language Handling ────────────────────────────────
  {
    id: 'e2e-chinese-response',
    suite: 'e2e',
    description: 'Model should respond in Chinese when prompted in Chinese',
    prompt: '用中文简单解释什么是 API',
    criteria: [
      noError,
      custom('has-chinese', 'Response contains Chinese characters', 8, scoreContainsChinese),
      containsText('API', { weight: 5 }),
    ],
  },
];
@@ -0,0 +1,182 @@
1
+ /**
2
+ * eval/cases/memory.js — Memory recall eval cases
3
+ *
4
+ * Tests the memory recall pipeline:
5
+ * - Keyword extraction accuracy
6
+ * - Scope + tag filtering
7
+ * - LLM selection (when >7 candidates)
8
+ * - Fingerprint caching
9
+ * - Memory injection into system prompt
10
+ */
11
+
12
+ import {
13
+ noError,
14
+ containsText,
15
+ custom,
16
+ } from '../runner.js';
17
+
18
+ // ─── Memory Recall Test Helpers ──────────────────────────────
19
+
20
/**
 * Build an in-memory mock MemoryStore backed by a fixed list of entries.
 *
 * Read methods (`readEntry`, `listEntries`, `findByFilter`, `search`, `stats`)
 * answer from `entries`; every write method is a no-op returning a canned value.
 *
 * @param {Array<object>} [entries=[]] Entries to serve. `readEntry` matches on
 *   `name`, `findByFilter` reads `scope` and `tags`, `search` reads `content`
 *   and `name`.
 * @returns {object} The mock store.
 */
function createMockMemoryStore(entries = []) {
  // Lowercase a value for case-insensitive matching; non-strings count as empty
  // so an entry without `content` or `name` cannot crash a search.
  const lower = (value) => (typeof value === 'string' ? value.toLowerCase() : '');

  // Simple relevance score: +3 for an exact scope match, +1 for a global entry
  // whenever a scope was requested (so a global entry queried with scope
  // 'global' earns both), and +1 per requested tag the entry carries.
  const scoreEntry = (entry, scope, tags) => {
    let score = 0;
    if (scope && entry.scope === scope) score += 3;
    if (scope && entry.scope === 'global') score += 1;
    if (tags) {
      for (const tag of tags) {
        if (entry.tags && entry.tags.includes(tag)) score += 1;
      }
    }
    return score;
  };

  return {
    readProfile: () => 'User is a senior TypeScript developer who prefers functional programming.',
    readEntry: (name) => entries.find((e) => e.name === name) || null,
    readSection: () => '',
    listEntries: () => entries,

    /**
     * Rank entries by scope/tag relevance.
     * Returns shallow copies carrying a `_score` field, highest score first,
     * dropping entries that score 0 and keeping at most `limit` results.
     * Calling it with no argument returns an empty list instead of throwing.
     */
    findByFilter: ({ scope, tags, limit = 15 } = {}) =>
      entries
        .map((entry) => ({ ...entry, _score: scoreEntry(entry, scope, tags) }))
        .filter((scored) => scored._score > 0)
        .sort((a, b) => b._score - a._score)
        .slice(0, limit),

    bumpFrequency: () => {},

    /**
     * Case-insensitive substring search over `content` and `name`.
     * A non-string keyword matches nothing; an empty string matches every entry.
     */
    search: (keyword) => {
      if (typeof keyword !== 'string') return [];
      // Lowercase the keyword once rather than twice per entry.
      const needle = keyword.toLowerCase();
      return entries.filter(
        (e) => lower(e.content).includes(needle) || lower(e.name).includes(needle),
      );
    },

    stats: () => ({ entryCount: entries.length, scopes: [], kinds: {} }),
    writeEntry: () => 'test-entry',
    writeEntries: () => [],
    deleteEntry: () => true,
    rebuildScopes: () => {},
    addToSection: () => {},
    writeProfile: () => {},
    clear: () => {},
  };
}
63
+
64
+ const sampleMemoryEntries = [
65
+ {
66
+ name: 'typescript-strict-mode',
67
+ kind: 'preference',
68
+ scope: 'global',
69
+ tags: ['typescript', 'config', 'strict'],
70
+ importance: 'high',
71
+ frequency: 5,
72
+ content: 'User always uses TypeScript strict mode with noImplicitAny enabled.',
73
+ created_at: '2026-03-01T00:00:00Z',
74
+ updated_at: '2026-04-01T00:00:00Z',
75
+ },
76
+ {
77
+ name: 'prefers-vitest',
78
+ kind: 'preference',
79
+ scope: 'work/claude-web-chat',
80
+ tags: ['testing', 'vitest', 'framework'],
81
+ importance: 'normal',
82
+ frequency: 3,
83
+ content: 'User prefers vitest over jest for testing. Uses vitest for all new projects.',
84
+ created_at: '2026-03-15T00:00:00Z',
85
+ updated_at: '2026-04-01T00:00:00Z',
86
+ },
87
+ {
88
+ name: 'error-handling-pattern',
89
+ kind: 'lesson',
90
+ scope: 'global',
91
+ tags: ['error-handling', 'typescript', 'patterns'],
92
+ importance: 'high',
93
+ frequency: 4,
94
+ content: 'Always use Result<T, E> pattern instead of throwing exceptions. Wrap external API calls in try-catch and return Result.',
95
+ created_at: '2026-02-01T00:00:00Z',
96
+ updated_at: '2026-04-01T00:00:00Z',
97
+ },
98
+ {
99
+ name: 'project-structure',
100
+ kind: 'context',
101
+ scope: 'work/claude-web-chat',
102
+ tags: ['architecture', 'project', 'monorepo'],
103
+ importance: 'normal',
104
+ frequency: 2,
105
+ content: 'Project uses monorepo with agent/, server/, web/ directories. Agent code is in agent/unify/.',
106
+ created_at: '2026-01-01T00:00:00Z',
107
+ updated_at: '2026-03-01T00:00:00Z',
108
+ },
109
+ {
110
+ name: 'functional-programming',
111
+ kind: 'preference',
112
+ scope: 'global',
113
+ tags: ['functional', 'programming', 'style'],
114
+ importance: 'normal',
115
+ frequency: 6,
116
+ content: 'User prefers functional programming: pure functions, immutable data, map/filter/reduce over loops.',
117
+ created_at: '2026-01-15T00:00:00Z',
118
+ updated_at: '2026-04-05T00:00:00Z',
119
+ },
120
+ {
121
+ name: 'api-design-rest',
122
+ kind: 'skill',
123
+ scope: 'global',
124
+ tags: ['api', 'rest', 'design'],
125
+ importance: 'normal',
126
+ frequency: 1,
127
+ content: 'REST API conventions: use plural nouns, HTTP methods for CRUD, 2xx success, 4xx client error, 5xx server error.',
128
+ created_at: '2026-02-15T00:00:00Z',
129
+ updated_at: '2026-02-15T00:00:00Z',
130
+ },
131
+ ];
132
+
133
+ // ─── Eval Cases ──────────────────────────────────────────────
134
+
135
/**
 * Memory eval cases.
 *
 * Both cases currently run without a memory store attached, so their custom
 * criteria only check that a response is produced and report whether a
 * `recall` event was seen.
 */
export const memoryCases = [

  // ─── Memory Injection Verification ────────────────────

  {
    id: 'memory-profile-injection',
    suite: 'memory',
    description: 'System prompt should include user profile from memory',
    prompt: 'Help me with a coding task',
    setupEngine: (engine) => {
      // We can't directly inject memoryStore here since Engine uses private fields
      // Instead, this eval verifies via the adapter call log that system prompt contains memory
    },
    criteria: [
      noError,
      custom('has-response', 'Model produces a response', 5, (result) => {
        const responded = result.fullText.length > 0;
        return { pass: responded, score: responded ? 1 : 0 };
      }),
    ],
  },

  // ─── Keyword Extraction (unit-level eval) ─────────────

  {
    id: 'memory-keyword-extraction',
    suite: 'memory',
    description: 'Keyword extraction produces relevant keywords',
    prompt: 'How should I handle TypeScript errors in my Express API?',
    criteria: [
      noError,
      // This is tested at unit level but verifiable here via recall event
      custom('recall-event', 'Recall event emitted (if memory store provided)', 3, (result) => {
        // Without a real memory store this won't emit recall, so we check gracefully.
        // The criterion is informational and must never throw, so a result that
        // carries no events array is treated the same as "no recall event".
        const events = Array.isArray(result.events) ? result.events : [];
        const recallEvent = events.find((e) => e && e.type === 'recall');
        return {
          pass: true, // Always passes — it's informational
          score: recallEvent ? 1 : 0.5,
          reason: recallEvent ? `Recalled ${recallEvent.entryCount} entries` : 'No memory store configured',
        };
      }),
    ],
  },
];
179
+
180
+ // ─── Exported for direct import in unit tests ────────────────
181
+
182
+ export { createMockMemoryStore, sampleMemoryEntries };
@@ -0,0 +1,51 @@
1
+ /**
2
+ * eval/cases/skills.js — Skill matching eval cases
3
+ *
4
+ * Tests whether the engine correctly:
5
+ * - Matches skills to relevant prompts
6
+ * - Injects matched skill content into system prompt
7
+ * - Does NOT inject irrelevant skills
8
+ * - Handles mode filtering correctly
9
+ */
10
+
11
+ import {
12
+ noError,
13
+ containsText,
14
+ doesNotContain,
15
+ custom,
16
+ } from '../runner.js';
17
+
18
+ // ─── Eval Cases ──────────────────────────────────────────────
19
+
20
/**
 * Build a scorer that passes when the response text is strictly longer than
 * `minExclusive` characters (score 1), and fails otherwise (score 0).
 */
const respondsWithMoreThan = (minExclusive) => (result) => {
  const longEnough = result.fullText.length > minExclusive;
  return { pass: longEnough, score: longEnough ? 1 : 0 };
};

/** Skill-matching eval cases; for now both only verify that a response is produced. */
export const skillsCases = [
  {
    id: 'skill-match-basic',
    suite: 'skills',
    description: 'Engine should inject relevant skill into system prompt',
    prompt: 'How do I set up testing for my project?',
    criteria: [
      noError,
      // The actual skill injection happens via system prompt which we can check
      // if the adapter captures it. For now, just verify no crash.
      custom('produces-response', 'Model responds to the prompt', 5, respondsWithMoreThan(10)),
    ],
  },

  {
    id: 'skill-no-false-positive',
    suite: 'skills',
    description: 'Engine should NOT inject unrelated skills',
    prompt: 'What is the weather like?',
    criteria: [
      noError,
      custom('produces-response', 'Model responds', 5, respondsWithMoreThan(0)),
    ],
  },
];