@mastra/longmemeval 0.0.0-add-libsql-changeset-20250910154739

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/CHANGELOG.md +919 -0
  2. package/DATA_DOWNLOAD_GUIDE.md +117 -0
  3. package/LICENSE.md +15 -0
  4. package/README.md +173 -0
  5. package/USAGE.md +105 -0
  6. package/package.json +67 -0
  7. package/scripts/download.ts +180 -0
  8. package/scripts/find-failed.ts +176 -0
  9. package/scripts/generate-embeddings.ts +56 -0
  10. package/scripts/generate-wm-templates.ts +296 -0
  11. package/scripts/setup.ts +60 -0
  12. package/src/__fixtures__/embeddings.json +2319 -0
  13. package/src/__fixtures__/test-dataset.json +82 -0
  14. package/src/cli.ts +690 -0
  15. package/src/commands/__tests__/prepare.test.ts +230 -0
  16. package/src/commands/__tests__/run.test.ts +403 -0
  17. package/src/commands/prepare.ts +793 -0
  18. package/src/commands/run.ts +553 -0
  19. package/src/config.ts +83 -0
  20. package/src/data/loader.ts +163 -0
  21. package/src/data/types.ts +61 -0
  22. package/src/embeddings/cached-openai-embedding-model.ts +227 -0
  23. package/src/embeddings/cached-openai-provider.ts +40 -0
  24. package/src/embeddings/index.ts +2 -0
  25. package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
  26. package/src/evaluation/longmemeval-metric.ts +173 -0
  27. package/src/retry-model.ts +60 -0
  28. package/src/storage/__tests__/benchmark-store.test.ts +280 -0
  29. package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
  30. package/src/storage/benchmark-store.ts +540 -0
  31. package/src/storage/benchmark-vector.ts +234 -0
  32. package/src/storage/index.ts +2 -0
  33. package/src/test-utils/mock-embeddings.ts +54 -0
  34. package/src/test-utils/mock-model.ts +49 -0
  35. package/tests/data-loader.test.ts +96 -0
  36. package/tsconfig.json +18 -0
  37. package/vitest.config.ts +9 -0
@@ -0,0 +1,173 @@
1
+ import { Metric, type MetricResult } from '@mastra/core/eval';
2
+ import { Agent } from '@mastra/core/agent';
3
+ import type { QuestionType } from '../data/types';
4
+
5
/** Configuration accepted by {@link LongMemEvalMetric}. */
export interface LongMemEvalMetricConfig {
  /** Judge agent used to grade responses; the metric throws at construction if absent. */
  agent: Agent;
  /** LongMemEval question category; selects which evaluation prompt is used. */
  questionType: QuestionType;
  /** When true, grade whether the model correctly abstained on an unanswerable question. Defaults to false. */
  isAbstention?: boolean;
}
10
+
11
+ /**
12
+ * LongMemEval Metric implementation using Mastra's eval framework
13
+ *
14
+ * This metric evaluates whether an LLM correctly recalls information
15
+ * from long conversation histories across different question types.
16
+ */
17
+ export class LongMemEvalMetric extends Metric {
18
+ private agent: Agent;
19
+ private questionType: QuestionType;
20
+ private isAbstention: boolean;
21
+
22
+ constructor(config: LongMemEvalMetricConfig) {
23
+ super();
24
+ this.agent = config.agent;
25
+ if (!this.agent) {
26
+ throw new Error('Agent instance is required for LongMemEvalMetric');
27
+ }
28
+ this.questionType = config.questionType;
29
+ this.isAbstention = config.isAbstention || false;
30
+ }
31
+
32
+ /**
33
+ * Measure the correctness of a model's response
34
+ *
35
+ * @param input - JSON string containing question and expected answer
36
+ * @param output - Model's response
37
+ * @returns MetricResult with score (0 or 1) and additional info
38
+ */
39
+ async measure(input: string, output: string): Promise<MetricResult> {
40
+ // Parse input to get question and expected answer
41
+ const { question, answer } = JSON.parse(input) as {
42
+ question: string;
43
+ answer: string;
44
+ };
45
+
46
+ const prompt = this.getEvalPrompt(this.questionType, question, answer, output, this.isAbstention);
47
+
48
+ try {
49
+ const response = await this.agent.generate(
50
+ [
51
+ {
52
+ role: 'user',
53
+ content: prompt,
54
+ },
55
+ ],
56
+ {
57
+ temperature: 0,
58
+ },
59
+ );
60
+
61
+ const responseText = response.text?.toLowerCase().trim();
62
+ const isCorrect = responseText === 'yes' || responseText?.toLowerCase()?.startsWith('yes.');
63
+
64
+ // Extract reason if answer is no
65
+ let reason: string | undefined;
66
+ if (!isCorrect && responseText?.includes(':')) {
67
+ reason = responseText.split(':')[1]?.trim();
68
+ }
69
+
70
+ return {
71
+ score: isCorrect ? 1 : 0,
72
+ info: {
73
+ questionType: this.questionType,
74
+ isAbstention: this.isAbstention,
75
+ evaluatorResponse: responseText,
76
+ reason,
77
+ },
78
+ };
79
+ } catch (error) {
80
+ throw new Error(`Failed to evaluate response: ${error}`);
81
+ }
82
+ }
83
+
84
+ /**
85
+ * Get the evaluation prompt based on question type
86
+ */
87
+ private getEvalPrompt(
88
+ taskType: QuestionType,
89
+ question: string,
90
+ answer: string,
91
+ response: string,
92
+ isAbstention: boolean,
93
+ ): string {
94
+ if (isAbstention) {
95
+ return `I will give you an unanswerable question, an explanation, and a response from a model. Please answer yes if the model correctly identifies the question as unanswerable. The model could say that the information is incomplete, or some other information is given but the asked information is not.
96
+
97
+ Question: ${question}
98
+
99
+ Explanation: ${answer}
100
+
101
+ Model Response: ${response}
102
+
103
+ Does the model correctly identify the question as unanswerable? Answer yes or no only. If you answer "no" please also include a reason why (ex "no: because x reason")`;
104
+ }
105
+
106
+ switch (taskType) {
107
+ case 'single-session-user':
108
+ case 'single-session-assistant':
109
+ case 'multi-session':
110
+ return `I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no.
111
+ If you answer "no" please also include a reason why (ex "no: because x reason")
112
+
113
+ Question: ${question}
114
+
115
+ Correct Answer: ${answer}
116
+
117
+ Model Response: ${response}
118
+
119
+ Is the model response correct? Answer yes or no only. If you answer "no" please also include a reason why (ex "no: because x reason")`;
120
+
121
+ case 'temporal-reasoning':
122
+ return `I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no. In addition, do not penalize off-by-one errors for the number of days. If the question asks for the number of days/weeks/months, etc., and the model makes off-by-one errors (e.g., predicting 19 days when the answer is 18), the model's response is still correct.
123
+
124
+ Question: ${question}
125
+
126
+ Correct Answer: ${answer}
127
+
128
+ Model Response: ${response}
129
+
130
+ Is the model response correct? Answer yes or no only. If you answer "no" please also include a reason why (ex "no: because x reason")`;
131
+
132
+ case 'knowledge-update':
133
+ return `I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response contains some previous information along with an updated answer, the response should be considered as correct as long as the updated answer is the required answer.
134
+
135
+ Question: ${question}
136
+
137
+ Correct Answer: ${answer}
138
+
139
+ Model Response: ${response}
140
+
141
+ Is the model response correct? Answer yes or no only.`;
142
+
143
+ case 'single-session-preference':
144
+ return `I will give you a question, a rubric for desired personalized response, and a response from a model. Please answer yes if the response satisfies the desired response. Otherwise, answer no and provide a reason why. The model does not need to reflect all the points in the rubric. The response is correct as long as it recalls and utilizes the user's personal information correctly.
145
+
146
+ Question: ${question}
147
+
148
+ Rubric: ${answer}
149
+
150
+ Model Response: ${response}
151
+
152
+ Is the model response correct? Answer yes or no only. If you answer "no" please also include a reason why (ex "no: because x reason")`;
153
+
154
+ default:
155
+ throw new Error(`Unknown question type: ${taskType}`);
156
+ }
157
+ }
158
+ }
159
+
160
+ /**
161
+ * Factory function to create LongMemEval metrics for different question types
162
+ */
163
+ export function createLongMemEvalMetric(
164
+ questionType: QuestionType,
165
+ agent: Agent,
166
+ options?: Partial<LongMemEvalMetricConfig>,
167
+ ): LongMemEvalMetric {
168
+ return new LongMemEvalMetric({
169
+ ...options,
170
+ agent,
171
+ questionType,
172
+ });
173
+ }
@@ -0,0 +1,60 @@
1
+ import { LanguageModel, wrapLanguageModel } from 'ai';
2
+
3
/**
 * Wraps a language model with middleware that retries generate calls on HTTP
 * 429 (rate-limit) errors, up to 10 attempts, using a single shared pause that
 * all in-flight calls through this wrapper wait on.
 *
 * Returns both the wrapped model and the mutable `state` object so callers can
 * observe `rateLimitCount` and the pause bookkeeping.
 */
export function makeRetryModel(model: LanguageModel) {
  // Shared across every call through the wrapped model: one global pause
  // gates all requests while backing off.
  const state = {
    rateLimitCount: 0, // total 429s observed (never reset here)
    pause: null as null | Promise<void>, // non-null while a backoff pause is pending
    pauseResolve: () => {}, // NOTE(review): never assigned or called below — confirm it can be removed
    pauseTime: 0, // duration (ms) of the currently scheduled pause
  };
  const wrapped = wrapLanguageModel({
    model,
    middleware: {
      wrapGenerate: async ({ doGenerate }) => {
        // If another call already triggered a backoff, wait it out first.
        if (state.pause) await state.pause;
        const maxRetries = 10;
        let retries = 0;
        while (retries < maxRetries) {
          try {
            const result = await doGenerate();
            return result;
          } catch (error: any) {
            // 429 may surface as `status` or `statusCode` depending on the client.
            if (error.status === 429 || error.statusCode === 429) {
              retries++;
              state.rateLimitCount++;
              // Linear backoff: 2s, 4s, ... per attempt of THIS call.
              const newPauseTime = 2000 * retries;
              // Wait out any pause already in progress before deciding.
              if (state.pause) {
                await state.pause;
              }
              if (retries >= maxRetries) {
                throw error;
              }
              // A pause at least this long is already scheduled (or was just
              // served) — skip scheduling a shorter one and retry immediately.
              if (newPauseTime <= state.pauseTime) {
                continue;
              }
              if (!state.pause) {
                // Install a new shared pause. The timer resolves waiters
                // BEFORE clearing the shared state, so the reset below cannot
                // strand anyone awaiting this promise.
                state.pauseTime = newPauseTime;
                state.pause = new Promise(resolve => {
                  setTimeout(() => {
                    resolve();
                    state.pause = null;
                    state.pauseTime = 0;
                  }, state.pauseTime);
                });
              }
              await state.pause;
            } else {
              // Non-rate-limit errors are not retried.
              throw error;
            }
          }
        }
        // The loop only exits via return/throw above; this guards the invariant.
        throw new Error('unhandled');
      },
    },
  });

  return {
    model: wrapped,
    state,
  };
}
@@ -0,0 +1,280 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
+ import { BenchmarkStore } from '../benchmark-store';
3
+ import { rm } from 'fs/promises';
4
+ import { existsSync } from 'fs';
5
+ import { join } from 'path';
6
+ import { tmpdir } from 'os';
7
+
8
// Unit tests for BenchmarkStore covering: capability flags, file persistence
// and hydration, cross-thread message retrieval via selectBy.include, resource
// working-memory operations, clearing, and threadId input validation.
describe('BenchmarkStore', () => {
  let store: BenchmarkStore;
  let testFilePath: string;

  beforeEach(async () => {
    store = new BenchmarkStore();
    await store.init();
    // Timestamped temp file so parallel/sequential runs are unlikely to collide.
    testFilePath = join(tmpdir(), `benchmark-store-test-${Date.now()}.json`);
  });

  afterEach(async () => {
    // Clean up test files
    if (existsSync(testFilePath)) {
      await rm(testFilePath);
    }
  });

  describe('supports', () => {
    it('should support resource scope and working memory', () => {
      expect(store.supports.selectByIncludeResourceScope).toBe(true);
      expect(store.supports.resourceWorkingMemory).toBe(true);
    });
  });

  describe('persist', () => {
    it('should save store data to a JSON file', async () => {
      // Add some test data
      await store.saveThread({
        thread: {
          id: 'test-thread-1',
          resourceId: 'test-resource-1',
          title: 'Test Thread',
          metadata: { test: true },
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      await store.saveMessages({
        messages: [
          {
            id: 'msg-1',
            threadId: 'test-thread-1',
            resourceId: 'test-resource-1',
            role: 'user' as const,
            content: 'Hello',
            createdAt: new Date(),
            type: 'text' as const,
          },
          {
            id: 'msg-2',
            threadId: 'test-thread-1',
            resourceId: 'test-resource-1',
            role: 'assistant' as const,
            content: 'Hi there!',
            createdAt: new Date(),
            type: 'text' as const,
          },
        ],
      });

      // Persist to file
      await store.persist(testFilePath);

      // Verify file exists (content round-trip is covered by the hydrate tests)
      expect(existsSync(testFilePath)).toBe(true);
    });
  });

  describe('hydrate', () => {
    it('should restore store data from a JSON file', async () => {
      // Create first store with data
      const store1 = new BenchmarkStore();
      await store1.init();

      const thread = {
        id: 'test-thread-1',
        resourceId: 'test-resource-1',
        title: 'Test Thread',
        metadata: { test: true },
        createdAt: new Date(),
        updatedAt: new Date(),
      };

      await store1.saveThread({ thread });
      await store1.saveMessages({
        messages: [
          {
            id: 'msg-1',
            threadId: 'test-thread-1',
            resourceId: 'test-resource-1',
            role: 'user' as const,
            content: 'Hello',
            createdAt: new Date(),
            type: 'text' as const,
          },
        ],
      });

      // Persist store1
      await store1.persist(testFilePath);

      // Create new store and hydrate from the file written above
      const store2 = new BenchmarkStore();
      await store2.init();
      await store2.hydrate(testFilePath);

      // Verify data was restored
      const restoredThread = await store2.getThreadById({ threadId: 'test-thread-1' });
      expect(restoredThread).toBeTruthy();
      expect(restoredThread?.title).toBe('Test Thread');

      const restoredMessages = await store2.getMessages({ threadId: 'test-thread-1' });
      expect(restoredMessages).toHaveLength(1);
      expect(restoredMessages[0].content).toBe('Hello');
    });

    it('should throw error if file does not exist', async () => {
      await expect(store.hydrate('/non/existent/file.json')).rejects.toThrow('Storage file not found');
    });
  });

  describe('cross-thread queries (resource scope)', () => {
    it('should support selectBy.include with different threadIds', async () => {
      // Create messages in different threads but same resource
      await store.saveThread({
        thread: {
          id: 'thread-1',
          resourceId: 'resource-1',
          title: 'Thread 1',
          metadata: {},
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      await store.saveThread({
        thread: {
          id: 'thread-2',
          resourceId: 'resource-1',
          title: 'Thread 2',
          metadata: {},
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      // Distinct createdAt dates fix the ordering asserted at the bottom.
      await store.saveMessages({
        messages: [
          {
            id: 'msg-1',
            threadId: 'thread-1',
            resourceId: 'resource-1',
            role: 'user' as const,
            content: 'Message in thread 1',
            createdAt: new Date('2024-01-01'),
            type: 'text' as const,
          },
          {
            id: 'msg-2',
            threadId: 'thread-2',
            resourceId: 'resource-1',
            role: 'user' as const,
            content: 'Message in thread 2',
            createdAt: new Date('2024-01-02'),
            type: 'text' as const,
          },
          {
            id: 'msg-3',
            threadId: 'thread-2',
            resourceId: 'resource-1',
            role: 'assistant' as const,
            content: 'Response in thread 2',
            createdAt: new Date('2024-01-03'),
            type: 'text' as const,
          },
        ],
      });

      // Query using selectBy.include to get messages from different threads
      const messages = await store.getMessages({
        threadId: 'thread-1',
        selectBy: {
          include: [
            {
              id: 'msg-2',
              threadId: 'thread-2', // Different thread!
              withPreviousMessages: 0,
              withNextMessages: 1,
            },
          ],
        },
      });

      // Expect msg-2 plus its one following message (msg-3), both from thread-2.
      expect(messages).toHaveLength(2);
      expect(messages[0].content).toBe('Message in thread 2');
      expect(messages[1].content).toBe('Response in thread 2');
    });
  });

  describe('resource operations', () => {
    it('should support resource working memory', async () => {
      const resource = await store.saveResource({
        resource: {
          id: 'resource-1',
          workingMemory: 'Initial working memory',
          metadata: { key: 'value' },
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      expect(resource.workingMemory).toBe('Initial working memory');

      // Update resource — metadata is asserted below as a full replacement
      // of the original object, not a shallow merge.
      const updated = await store.updateResource({
        resourceId: 'resource-1',
        workingMemory: 'Updated working memory',
        metadata: { key: 'newValue', extra: 'data' },
      });

      expect(updated.workingMemory).toBe('Updated working memory');
      expect(updated.metadata).toEqual({ key: 'newValue', extra: 'data' });

      // Get resource
      const retrieved = await store.getResourceById({ resourceId: 'resource-1' });
      expect(retrieved?.workingMemory).toBe('Updated working memory');
    });
  });

  describe('clear', () => {
    it('should clear all data', async () => {
      // Add data
      await store.saveThread({
        thread: {
          id: 'test-thread-1',
          resourceId: 'test-resource-1',
          title: 'Test Thread',
          metadata: {},
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      // Clear
      await store.clear();

      // Verify data is gone
      const thread = await store.getThreadById({ threadId: 'test-thread-1' });
      expect(thread).toBeNull();
    });
  });

  describe('getting messages', () => {
    it('should throw when threadId is an empty string or whitespace only', async () => {
      await expect(() => store.getMessages({ threadId: '' })).rejects.toThrowError(
        'threadId must be a non-empty string',
      );

      await expect(() => store.getMessagesPaginated({ threadId: '' })).rejects.toThrowError(
        'threadId must be a non-empty string',
      );

      // Whitespace-only ids must be rejected the same way as empty ones.
      await expect(() => store.getMessages({ threadId: ' ' })).rejects.toThrowError(
        'threadId must be a non-empty string',
      );

      await expect(() => store.getMessagesPaginated({ threadId: ' ' })).rejects.toThrowError(
        'threadId must be a non-empty string',
      );
    });
  });
});