npm - @yeaft/webchat-agent - Versions diffs - 0.1.411 → 0.1.412 - Mend

@yeaft/webchat-agent 0.1.411 → 0.1.412

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7)

package/package.json +1 -1
package/unify/eval/cases/e2e.js +154 -0
package/unify/eval/cases/memory.js +182 -0
package/unify/eval/cases/skills.js +51 -0
package/unify/eval/cases/tool-use.js +356 -0
package/unify/eval/run-eval.js +250 -0
package/unify/eval/runner.js +525 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@yeaft/webchat-agent",
-  "version": "0.1.411",
+  "version": "0.1.412",
   "description": "Remote agent for Yeaft WebChat — connects worker machines to the central server",
   "main": "index.js",
   "type": "module",

package/unify/eval/cases/e2e.js ADDED Viewed

@@ -0,0 +1,154 @@
+/**
+ * eval/cases/e2e.js — End-to-end session eval cases
+ *
+ * Tests the full pipeline: prompt → recall → system prompt → LLM → tools → response.
+ * These cases verify that the integration holds together correctly.
+ */
+import { defineTool } from '../../tools/types.js';
+import {
+  noError,
+  containsText,
+  toolWasCalled,
+  toolNotCalled,
+  toolSucceeded,
+  turnCountInRange,
+  responseLengthInRange,
+  custom,
+} from '../runner.js';
+// ─── Mock Tools ──────────────────────────────────────────────
+const listProjectsTool = defineTool({
+  name: 'list_projects',
+  description: 'List all projects in the workspace.',
+  parameters: { type: 'object', properties: {} },
+  modes: ['chat', 'work'],
+  async execute() {
+    return JSON.stringify({
+      projects: ['my-app', 'shared-lib', 'docs-site'],
+    });
+  },
+});
+const getProjectInfoTool = defineTool({
+  name: 'get_project_info',
+  description: 'Get detailed information about a specific project.',
+  parameters: {
+    type: 'object',
+    properties: {
+      name: { type: 'string', description: 'Project name' },
+    },
+    required: ['name'],
+  },
+  modes: ['chat', 'work'],
+  async execute(input) {
+    const projects = {
+      'my-app': { name: 'my-app', language: 'TypeScript', framework: 'Express', tests: 142 },
+      'shared-lib': { name: 'shared-lib', language: 'TypeScript', framework: 'none', tests: 67 },
+      'docs-site': { name: 'docs-site', language: 'MDX', framework: 'Next.js', tests: 23 },
+    };
+    return JSON.stringify(projects[input.name] || { error: `Unknown project: ${input.name}` });
+  },
+});
+const e2eTools = [listProjectsTool, getProjectInfoTool]; // Passed as `registryTools` by the cases below that need tools
+// ─── Eval Cases ──────────────────────────────────────────────
+export const e2eCases = [ // Each case pairs a prompt with weighted criteria; some also supply prior messages and mock tools
+  // ─── Conversation Coherence ───────────────────────────
+  {
+    id: 'e2e-conversation-context',
+    suite: 'e2e',
+    description: 'Model should use conversation history for context',
+    prompt: 'What language is it written in?',
+    messages: [ // Prior turns: the prompt's "it" is only named in this history
+      { role: 'user', content: 'Tell me about the my-app project' },
+      { role: 'assistant', content: 'The my-app project is a TypeScript application built with Express. It has 142 tests.' },
+    ],
+    registryTools: e2eTools,
+    criteria: [
+      noError,
+      containsText('TypeScript', { weight: 8, id: 'remembers-language' }),
+      turnCountInRange(1, 2, { weight: 3 }),
+    ],
+  },
+  // ─── Tool Chain ───────────────────────────────────────
+  {
+    id: 'e2e-tool-chain-list-then-detail',
+    suite: 'e2e',
+    description: 'Model should list projects then get details about a specific one',
+    prompt: 'Show me all projects and tell me about the one with the most tests',
+    registryTools: e2eTools,
+    criteria: [
+      noError,
+      toolWasCalled('list_projects', { weight: 7 }),
+      toolWasCalled('get_project_info', { weight: 7 }),
+      containsText('my-app', { weight: 5, id: 'identifies-most-tested' }), // my-app has the highest fixture test count (142)
+      containsText('142', { weight: 5, id: 'mentions-test-count' }),
+    ],
+  },
+  // ─── Instruction Following ────────────────────────────
+  {
+    id: 'e2e-format-json',
+    suite: 'e2e',
+    description: 'Model should follow format instructions',
+    prompt: 'List three programming languages. Respond only with a JSON array of strings, nothing else.',
+    criteria: [
+      noError,
+      custom('valid-json-array', 'Response is a valid JSON array', 10, (result) => {
+        try {
+          // Lenient extraction: greedy match from the first '[' to the last ']', so text around the array is tolerated
+          const text = result.fullText.trim();
+          const match = text.match(/\[[\s\S]*\]/);
+          if (!match) return { pass: false, score: 0, reason: 'No JSON array found' };
+          const arr = JSON.parse(match[0]);
+          const valid = Array.isArray(arr) && arr.length === 3 && arr.every(s => typeof s === 'string');
+          return { pass: valid, score: valid ? 1 : 0.5, reason: valid ? undefined : `Got: ${JSON.stringify(arr)}` }; // Parseable but wrong shape fails with partial score 0.5
+        } catch {
+          return { pass: false, score: 0, reason: 'Not valid JSON' };
+        }
+      }),
+    ],
+  },
+  // ─── Response Quality ─────────────────────────────────
+  {
+    id: 'e2e-concise-answer',
+    suite: 'e2e',
+    description: 'Model should give a concise answer for simple question',
+    prompt: 'What does the acronym HTTP stand for?',
+    criteria: [
+      noError,
+      containsText('Hypertext Transfer Protocol', { weight: 8 }),
+      responseLengthInRange(10, 500, { weight: 5, id: 'not-too-long' }),
+      toolNotCalled('search', { weight: 3 }), // NOTE(review): this case sets no registryTools and this file defines no 'search' tool — confirm the check can ever fail
+    ],
+  },
+  // ─── Language Handling ────────────────────────────────
+  {
+    id: 'e2e-chinese-response',
+    suite: 'e2e',
+    description: 'Model should respond in Chinese when prompted in Chinese',
+    prompt: '用中文简单解释什么是 API',
+    criteria: [
+      noError,
+      custom('has-chinese', 'Response contains Chinese characters', 8, (result) => {
+        const chinesePattern = /[\u4e00-\u9fff]/; // CJK Unified Ideographs block (U+4E00–U+9FFF); one matching character is enough
+        const hasChinese = chinesePattern.test(result.fullText);
+        return { pass: hasChinese, score: hasChinese ? 1 : 0 };
+      }),
+      containsText('API', { weight: 5 }),
+    ],
+  },
+];

package/unify/eval/cases/memory.js ADDED Viewed

@@ -0,0 +1,182 @@
+/**
+ * eval/cases/memory.js — Memory recall eval cases
+ *
+ * Tests the memory recall pipeline:
+ *   - Keyword extraction accuracy
+ *   - Scope + tag filtering
+ *   - LLM selection (when >7 candidates)
+ *   - Fingerprint caching
+ *   - Memory injection into system prompt
+ */
+import {
+  noError,
+  containsText,
+  custom,
+} from '../runner.js';
+// ─── Memory Recall Test Helpers ──────────────────────────────
+/**
+ * Create an engine with pre-loaded memory entries for eval.
+ * Uses a mock MemoryStore that returns predefined entries.
+ */
+function createMockMemoryStore(entries) {
+  return {
+    readProfile: () => 'User is a senior TypeScript developer who prefers functional programming.',
+    readEntry: (name) => entries.find(e => e.name === name) || null,
+    readSection: () => '',
+    listEntries: () => entries,
+    findByFilter: ({ scope, tags, limit = 15 }) => {
+      // Simple scoring: scope match + tag overlap
+      return entries
+        .map(e => {
+          let score = 0;
+          if (scope && e.scope === scope) score += 3;
+          if (scope && e.scope === 'global') score += 1;
+          if (tags) {
+            for (const t of tags) {
+              if (e.tags && e.tags.includes(t)) score += 1;
+            }
+          }
+          return { ...e, _score: score };
+        })
+        .filter(e => e._score > 0)
+        .sort((a, b) => b._score - a._score)
+        .slice(0, limit);
+    },
+    bumpFrequency: () => {},
+    search: (keyword) => entries.filter(e =>
+      e.content.toLowerCase().includes(keyword.toLowerCase()) ||
+      e.name.toLowerCase().includes(keyword.toLowerCase()),
+    ),
+    stats: () => ({ entryCount: entries.length, scopes: [], kinds: {} }),
+    writeEntry: () => 'test-entry',
+    writeEntries: () => [],
+    deleteEntry: () => true,
+    rebuildScopes: () => {},
+    addToSection: () => {},
+    writeProfile: () => {},
+    clear: () => {},
+  };
+}
+const sampleMemoryEntries = [ // Six fixture entries: four with scope 'global', two with scope 'work/claude-web-chat'
+  {
+    name: 'typescript-strict-mode',
+    kind: 'preference',
+    scope: 'global',
+    tags: ['typescript', 'config', 'strict'],
+    importance: 'high',
+    frequency: 5,
+    content: 'User always uses TypeScript strict mode with noImplicitAny enabled.',
+    created_at: '2026-03-01T00:00:00Z',
+    updated_at: '2026-04-01T00:00:00Z',
+  },
+  {
+    name: 'prefers-vitest',
+    kind: 'preference',
+    scope: 'work/claude-web-chat',
+    tags: ['testing', 'vitest', 'framework'],
+    importance: 'normal',
+    frequency: 3,
+    content: 'User prefers vitest over jest for testing. Uses vitest for all new projects.',
+    created_at: '2026-03-15T00:00:00Z',
+    updated_at: '2026-04-01T00:00:00Z',
+  },
+  {
+    name: 'error-handling-pattern',
+    kind: 'lesson',
+    scope: 'global',
+    tags: ['error-handling', 'typescript', 'patterns'], // Shares the 'typescript' tag with typescript-strict-mode
+    importance: 'high',
+    frequency: 4,
+    content: 'Always use Result<T, E> pattern instead of throwing exceptions. Wrap external API calls in try-catch and return Result.',
+    created_at: '2026-02-01T00:00:00Z',
+    updated_at: '2026-04-01T00:00:00Z',
+  },
+  {
+    name: 'project-structure',
+    kind: 'context',
+    scope: 'work/claude-web-chat',
+    tags: ['architecture', 'project', 'monorepo'],
+    importance: 'normal',
+    frequency: 2,
+    content: 'Project uses monorepo with agent/, server/, web/ directories. Agent code is in agent/unify/.',
+    created_at: '2026-01-01T00:00:00Z',
+    updated_at: '2026-03-01T00:00:00Z',
+  },
+  {
+    name: 'functional-programming',
+    kind: 'preference',
+    scope: 'global',
+    tags: ['functional', 'programming', 'style'],
+    importance: 'normal',
+    frequency: 6,
+    content: 'User prefers functional programming: pure functions, immutable data, map/filter/reduce over loops.',
+    created_at: '2026-01-15T00:00:00Z',
+    updated_at: '2026-04-05T00:00:00Z',
+  },
+  {
+    name: 'api-design-rest',
+    kind: 'skill',
+    scope: 'global',
+    tags: ['api', 'rest', 'design'],
+    importance: 'normal',
+    frequency: 1,
+    content: 'REST API conventions: use plural nouns, HTTP methods for CRUD, 2xx success, 4xx client error, 5xx server error.',
+    created_at: '2026-02-15T00:00:00Z',
+    updated_at: '2026-02-15T00:00:00Z',
+  },
+];
+// ─── Eval Cases ──────────────────────────────────────────────
+export const memoryCases = [ // NOTE(review): neither case references createMockMemoryStore or sampleMemoryEntries — as written both run without those fixtures
+  // ─── Memory Injection Verification ────────────────────
+  {
+    id: 'memory-profile-injection',
+    suite: 'memory',
+    description: 'System prompt should include user profile from memory',
+    prompt: 'Help me with a coding task',
+    setupEngine: (engine) => {
+      // Intentionally empty: Engine keeps its memory store in private fields, so it cannot be injected from here.
+      // NOTE(review): the intent was to verify the system prompt via the adapter call log, but no criterion below inspects it.
+    },
+    criteria: [
+      noError,
+      custom('has-response', 'Model produces a response', 5, (result) => ({ // Only checks that the response text is non-empty
+        pass: result.fullText.length > 0,
+        score: result.fullText.length > 0 ? 1 : 0,
+      })),
+    ],
+  },
+  // ─── Keyword Extraction (unit-level eval) ─────────────
+  {
+    id: 'memory-keyword-extraction',
+    suite: 'memory',
+    description: 'Keyword extraction produces relevant keywords',
+    prompt: 'How should I handle TypeScript errors in my Express API?',
+    criteria: [
+      noError,
+      // Keyword extraction itself is covered by unit tests; here it is only observable through a 'recall' event
+      custom('recall-event', 'Recall event emitted (if memory store provided)', 3, (result) => {
+        // No 'recall' event is expected without a real memory store, so a missing event is tolerated
+        const recallEvent = result.events.find(e => e.type === 'recall');
+        return {
+          pass: true, // Informational only: never fails; the score (1 vs 0.5) records whether recall happened
+          score: recallEvent ? 1 : 0.5,
+          reason: recallEvent ? `Recalled ${recallEvent.entryCount} entries` : 'No memory store configured',
+        };
+      }),
+    ],
+  },
+];
+// ─── Exported for direct import in unit tests ────────────────
+export { createMockMemoryStore, sampleMemoryEntries };

package/unify/eval/cases/skills.js ADDED Viewed

@@ -0,0 +1,51 @@
+/**
+ * eval/cases/skills.js — Skill matching eval cases
+ *
+ * Tests whether the engine correctly:
+ *   - Matches skills to relevant prompts
+ *   - Injects matched skill content into system prompt
+ *   - Does NOT inject irrelevant skills
+ *   - Handles mode filtering correctly
+ */
+import {
+  noError,
+  containsText,
+  doesNotContain,
+  custom,
+} from '../runner.js';
+// ─── Eval Cases ──────────────────────────────────────────────
+export const skillsCases = [ // NOTE(review): both cases only assert that a response was produced; skill injection itself is not inspected
+  {
+    id: 'skill-match-basic',
+    suite: 'skills',
+    description: 'Engine should inject relevant skill into system prompt',
+    prompt: 'How do I set up testing for my project?',
+    criteria: [
+      noError,
+      // Skill injection happens in the system prompt, which could be checked if the adapter captured it.
+      // Until then this case is a smoke test: it only verifies that the run does not crash.
+      custom('produces-response', 'Model responds to the prompt', 5, (result) => ({ // Requires more than 10 characters of response text
+        pass: result.fullText.length > 10,
+        score: result.fullText.length > 10 ? 1 : 0,
+      })),
+    ],
+  },
+  {
+    id: 'skill-no-false-positive',
+    suite: 'skills',
+    description: 'Engine should NOT inject unrelated skills',
+    prompt: 'What is the weather like?',
+    criteria: [
+      noError,
+      custom('produces-response', 'Model responds', 5, (result) => ({ // Any non-empty response passes
+        pass: result.fullText.length > 0,
+        score: result.fullText.length > 0 ? 1 : 0,
+      })),
+    ],
+  },
+];