@mastra/longmemeval 0.0.0-add-libsql-changeset-20250910154739
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +919 -0
- package/DATA_DOWNLOAD_GUIDE.md +117 -0
- package/LICENSE.md +15 -0
- package/README.md +173 -0
- package/USAGE.md +105 -0
- package/package.json +67 -0
- package/scripts/download.ts +180 -0
- package/scripts/find-failed.ts +176 -0
- package/scripts/generate-embeddings.ts +56 -0
- package/scripts/generate-wm-templates.ts +296 -0
- package/scripts/setup.ts +60 -0
- package/src/__fixtures__/embeddings.json +2319 -0
- package/src/__fixtures__/test-dataset.json +82 -0
- package/src/cli.ts +690 -0
- package/src/commands/__tests__/prepare.test.ts +230 -0
- package/src/commands/__tests__/run.test.ts +403 -0
- package/src/commands/prepare.ts +793 -0
- package/src/commands/run.ts +553 -0
- package/src/config.ts +83 -0
- package/src/data/loader.ts +163 -0
- package/src/data/types.ts +61 -0
- package/src/embeddings/cached-openai-embedding-model.ts +227 -0
- package/src/embeddings/cached-openai-provider.ts +40 -0
- package/src/embeddings/index.ts +2 -0
- package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
- package/src/evaluation/longmemeval-metric.ts +173 -0
- package/src/retry-model.ts +60 -0
- package/src/storage/__tests__/benchmark-store.test.ts +280 -0
- package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
- package/src/storage/benchmark-store.ts +540 -0
- package/src/storage/benchmark-vector.ts +234 -0
- package/src/storage/index.ts +2 -0
- package/src/test-utils/mock-embeddings.ts +54 -0
- package/src/test-utils/mock-model.ts +49 -0
- package/tests/data-loader.test.ts +96 -0
- package/tsconfig.json +18 -0
- package/vitest.config.ts +9 -0
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
import { readFile } from 'fs/promises';
|
|
2
|
+
import { join } from 'path';
|
|
3
|
+
import type { LongMemEvalQuestion } from './types';
|
|
4
|
+
|
|
5
|
+
export class DatasetLoader {
|
|
6
|
+
private dataDir: string;
|
|
7
|
+
|
|
8
|
+
constructor(dataDir?: string) {
|
|
9
|
+
// Default to data directory relative to where the command is run
|
|
10
|
+
this.dataDir = dataDir || join(process.cwd(), 'data');
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
/**
|
|
14
|
+
* Load a LongMemEval dataset from JSON file
|
|
15
|
+
*/
|
|
16
|
+
async loadDataset(
|
|
17
|
+
dataset: 'longmemeval_s' | 'longmemeval_m' | 'longmemeval_oracle' | 'sample_data',
|
|
18
|
+
): Promise<LongMemEvalQuestion[]> {
|
|
19
|
+
const filePath = join(this.dataDir, `${dataset}.json`);
|
|
20
|
+
|
|
21
|
+
try {
|
|
22
|
+
const fileContent = await readFile(filePath, 'utf-8');
|
|
23
|
+
const data = JSON.parse(fileContent) as LongMemEvalQuestion[];
|
|
24
|
+
|
|
25
|
+
// Validate the data structure
|
|
26
|
+
this.validateDataset(data);
|
|
27
|
+
|
|
28
|
+
return data;
|
|
29
|
+
} catch (error) {
|
|
30
|
+
if ((error as any).code === 'ENOENT') {
|
|
31
|
+
throw new Error(
|
|
32
|
+
`Dataset file not found: ${filePath}\n` +
|
|
33
|
+
`Please download the LongMemEval dataset from https://drive.google.com/file/d/1zJgtYRFhOh5zDQzzatiddfjYhFSnyQ80/view ` +
|
|
34
|
+
`and extract it to ${this.dataDir}`,
|
|
35
|
+
);
|
|
36
|
+
}
|
|
37
|
+
throw error;
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
/**
|
|
42
|
+
* Load a subset of questions for testing
|
|
43
|
+
*/
|
|
44
|
+
async loadSubset(
|
|
45
|
+
dataset: 'longmemeval_s' | 'longmemeval_m' | 'longmemeval_oracle' | 'sample_data',
|
|
46
|
+
limit: number,
|
|
47
|
+
): Promise<LongMemEvalQuestion[]> {
|
|
48
|
+
const fullDataset = await this.loadDataset(dataset);
|
|
49
|
+
return fullDataset.slice(0, limit);
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
/**
|
|
53
|
+
* Load questions of a specific type
|
|
54
|
+
*/
|
|
55
|
+
async loadByType(
|
|
56
|
+
dataset: 'longmemeval_s' | 'longmemeval_m' | 'longmemeval_oracle' | 'sample_data',
|
|
57
|
+
questionType: string,
|
|
58
|
+
): Promise<LongMemEvalQuestion[]> {
|
|
59
|
+
const fullDataset = await this.loadDataset(dataset);
|
|
60
|
+
return fullDataset.filter(q => q.question_type === questionType);
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
/**
|
|
64
|
+
* Get dataset statistics
|
|
65
|
+
*/
|
|
66
|
+
async getDatasetStats(dataset: 'longmemeval_s' | 'longmemeval_m' | 'longmemeval_oracle' | 'sample_data') {
|
|
67
|
+
const data = await this.loadDataset(dataset);
|
|
68
|
+
|
|
69
|
+
const stats = {
|
|
70
|
+
totalQuestions: data.length,
|
|
71
|
+
questionsByType: {} as Record<string, number>,
|
|
72
|
+
abstentionQuestions: 0,
|
|
73
|
+
avgSessionsPerQuestion: 0,
|
|
74
|
+
avgTurnsPerSession: 0,
|
|
75
|
+
totalTokensEstimate: 0,
|
|
76
|
+
};
|
|
77
|
+
|
|
78
|
+
// Count questions by type
|
|
79
|
+
for (const question of data) {
|
|
80
|
+
const type = question.question_type;
|
|
81
|
+
stats.questionsByType[type] = (stats.questionsByType[type] || 0) + 1;
|
|
82
|
+
|
|
83
|
+
if (question.question_id.endsWith('_abs')) {
|
|
84
|
+
stats.abstentionQuestions++;
|
|
85
|
+
}
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
// Calculate average sessions and turns
|
|
89
|
+
const totalSessions = data.reduce((sum, q) => sum + q.haystack_sessions.length, 0);
|
|
90
|
+
stats.avgSessionsPerQuestion = totalSessions / data.length;
|
|
91
|
+
|
|
92
|
+
let totalTurns = 0;
|
|
93
|
+
for (const question of data) {
|
|
94
|
+
for (const session of question.haystack_sessions) {
|
|
95
|
+
totalTurns += session.length;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
stats.avgTurnsPerSession = totalTurns / totalSessions;
|
|
99
|
+
|
|
100
|
+
// Rough token estimate (assuming ~4 chars per token)
|
|
101
|
+
for (const question of data) {
|
|
102
|
+
for (const session of question.haystack_sessions) {
|
|
103
|
+
for (const turn of session) {
|
|
104
|
+
stats.totalTokensEstimate += Math.ceil(turn.content.length / 4);
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
return stats;
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
/**
|
|
113
|
+
* Validate dataset structure
|
|
114
|
+
*/
|
|
115
|
+
private validateDataset(data: any[]): void {
|
|
116
|
+
if (!Array.isArray(data)) {
|
|
117
|
+
throw new Error('Dataset must be an array of questions');
|
|
118
|
+
}
|
|
119
|
+
|
|
120
|
+
if (data.length === 0) {
|
|
121
|
+
throw new Error('Dataset is empty');
|
|
122
|
+
}
|
|
123
|
+
|
|
124
|
+
// Validate first question structure as sample
|
|
125
|
+
const sample = data[0];
|
|
126
|
+
const requiredFields = [
|
|
127
|
+
'question_id',
|
|
128
|
+
'question_type',
|
|
129
|
+
'question',
|
|
130
|
+
'answer',
|
|
131
|
+
'question_date',
|
|
132
|
+
'haystack_session_ids',
|
|
133
|
+
'haystack_dates',
|
|
134
|
+
'haystack_sessions',
|
|
135
|
+
'answer_session_ids',
|
|
136
|
+
];
|
|
137
|
+
|
|
138
|
+
for (const field of requiredFields) {
|
|
139
|
+
if (!(field in sample)) {
|
|
140
|
+
throw new Error(`Missing required field: ${field}`);
|
|
141
|
+
}
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
// Validate haystack_sessions structure
|
|
145
|
+
if (!Array.isArray(sample.haystack_sessions)) {
|
|
146
|
+
throw new Error('haystack_sessions must be an array');
|
|
147
|
+
}
|
|
148
|
+
|
|
149
|
+
if (sample.haystack_sessions.length > 0) {
|
|
150
|
+
const firstSession = sample.haystack_sessions[0];
|
|
151
|
+
if (!Array.isArray(firstSession)) {
|
|
152
|
+
throw new Error('Each session must be an array of turns');
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
if (firstSession.length > 0) {
|
|
156
|
+
const firstTurn = firstSession[0];
|
|
157
|
+
if (!firstTurn.role || !firstTurn.content) {
|
|
158
|
+
throw new Error('Each turn must have role and content fields');
|
|
159
|
+
}
|
|
160
|
+
}
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
}
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
import { MemoryConfig } from '@mastra/core/memory';
|
|
2
|
+
|
|
3
|
+
/** The six LongMemEval question categories used to bucket benchmark results. */
export type QuestionType =
  | 'single-session-user'
  | 'single-session-assistant'
  | 'single-session-preference'
  | 'temporal-reasoning'
  | 'knowledge-update'
  | 'multi-session';

/** A single chat message within a haystack session. */
export interface Turn {
  role: 'user' | 'assistant';
  content: string;
  // Optional in the raw data; presumably flags turns containing answer evidence — confirm against dataset docs.
  has_answer?: boolean;
}

/** One benchmark question plus the conversation history ("haystack") it is asked against. */
export interface LongMemEvalQuestion {
  question_id: string;
  question_type: QuestionType;
  question: string;
  answer: string;
  question_date: string;
  // NOTE(review): ids/dates/sessions appear to be parallel arrays (index-aligned) — confirm against the dataset format.
  haystack_session_ids: string[];
  haystack_dates: string[];
  haystack_sessions: Turn[][];
  answer_session_ids: string[];
}

/** Result of grading a single model hypothesis against one question. */
export interface EvaluationResult {
  question_id: string;
  hypothesis: string;
  autoeval_label?: boolean;
  question_type?: QuestionType;
  is_correct?: boolean;
}

/** The three official LongMemEval dataset variants. */
export type DatasetType = 'longmemeval_s' | 'longmemeval_m' | 'longmemeval_oracle';

/** Named memory configurations the benchmark can be run against. */
export type MemoryConfigType =
  | 'semantic-recall'
  | 'working-memory'
  | 'working-memory-tailored'
  | 'combined'
  | 'combined-tailored';

/** A memory configuration name paired with its concrete @mastra/core options. */
export interface MemoryConfigOptions {
  type: MemoryConfigType;
  options: MemoryConfig;
}

/** Aggregated benchmark metrics across a full run. */
export interface BenchmarkMetrics {
  overall_accuracy: number;
  accuracy_by_type: Partial<Record<QuestionType, { correct: number; total: number; accuracy: number }>>;
  abstention_accuracy: number;
  session_recall_accuracy?: number;
  turn_recall_accuracy?: number;
  total_questions: number;
  correct_answers: number;
  abstention_correct?: number;
  abstention_total?: number;
}
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
import { EmbeddingModelV2, TooManyEmbeddingValuesForCallError } from '@ai-sdk/provider';
import { OpenAIEmbedding } from '@ai-sdk/openai';
import { xxh3 } from '@node-rs/xxhash';
import { Mutex } from 'async-mutex';
import { existsSync, mkdirSync, readFileSync, renameSync, unlinkSync, writeFileSync } from 'fs';
import { join } from 'path';
|
|
7
|
+
|
|
8
|
+
// Global cache statistics
|
|
9
|
+
export const embeddingCacheStats = {
|
|
10
|
+
cacheHits: 0,
|
|
11
|
+
cacheMisses: 0,
|
|
12
|
+
cacheWrites: 0,
|
|
13
|
+
reset() {
|
|
14
|
+
this.cacheHits = 0;
|
|
15
|
+
this.cacheMisses = 0;
|
|
16
|
+
this.cacheWrites = 0;
|
|
17
|
+
},
|
|
18
|
+
};
|
|
19
|
+
|
|
20
|
+
export class CachedOpenAIEmbeddingModel implements EmbeddingModelV2<string> {
|
|
21
|
+
readonly specificationVersion = 'v2';
|
|
22
|
+
readonly modelId: string;
|
|
23
|
+
readonly maxEmbeddingsPerCall = 2048;
|
|
24
|
+
readonly supportsParallelCalls = true;
|
|
25
|
+
|
|
26
|
+
private readonly cacheDir: string;
|
|
27
|
+
private readonly delegate: EmbeddingModelV2<string>;
|
|
28
|
+
private memoryCache: Map<string, number[]> = new Map();
|
|
29
|
+
private readonly fileOperationMutex = new Mutex();
|
|
30
|
+
|
|
31
|
+
get provider(): string {
|
|
32
|
+
return this.delegate.provider;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
constructor(delegate: EmbeddingModelV2<string>, cacheDir: string = './embedding-cache') {
|
|
36
|
+
this.delegate = delegate;
|
|
37
|
+
this.modelId = delegate.modelId;
|
|
38
|
+
this.cacheDir = cacheDir;
|
|
39
|
+
|
|
40
|
+
// Ensure cache directory exists
|
|
41
|
+
if (!existsSync(this.cacheDir)) {
|
|
42
|
+
mkdirSync(this.cacheDir, { recursive: true });
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// Load existing cache into memory
|
|
46
|
+
this.loadMemoryCache();
|
|
47
|
+
}
|
|
48
|
+
|
|
49
|
+
private getCacheKey(value: string): string {
|
|
50
|
+
// Use XXHash3 for ultra-fast hashing
|
|
51
|
+
const combined = `${this.modelId}:${value}`;
|
|
52
|
+
const hash = xxh3.xxh128(combined).toString(16).padStart(32, '0');
|
|
53
|
+
return hash;
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
private getCachePath(key: string): string {
|
|
57
|
+
// Split cache files into subdirectories to avoid too many files in one directory
|
|
58
|
+
const subdir = key.substring(0, 2);
|
|
59
|
+
const dir = join(this.cacheDir, subdir);
|
|
60
|
+
if (!existsSync(dir)) {
|
|
61
|
+
mkdirSync(dir, { recursive: true });
|
|
62
|
+
}
|
|
63
|
+
return join(dir, `${key}.json`);
|
|
64
|
+
}
|
|
65
|
+
|
|
66
|
+
private loadMemoryCache(): void {
|
|
67
|
+
// This could be optimized to load lazily or with a size limit
|
|
68
|
+
// console.log('Loading embedding cache into memory...');
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
private async getCachedEmbedding(value: string): Promise<number[] | null> {
|
|
72
|
+
const key = this.getCacheKey(value);
|
|
73
|
+
|
|
74
|
+
// Check memory cache first
|
|
75
|
+
if (this.memoryCache.has(key)) {
|
|
76
|
+
embeddingCacheStats.cacheHits++;
|
|
77
|
+
return this.memoryCache.get(key)!;
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// Check file cache with mutex
|
|
81
|
+
const cachePath = this.getCachePath(key);
|
|
82
|
+
const embedding = await this.fileOperationMutex.runExclusive(async () => {
|
|
83
|
+
// Double-check memory cache in case another thread loaded it
|
|
84
|
+
if (this.memoryCache.has(key)) {
|
|
85
|
+
return this.memoryCache.get(key)!;
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
if (existsSync(cachePath)) {
|
|
89
|
+
try {
|
|
90
|
+
const content = readFileSync(cachePath, 'utf-8');
|
|
91
|
+
const cached = JSON.parse(content);
|
|
92
|
+
return cached.embedding;
|
|
93
|
+
} catch (e) {
|
|
94
|
+
// If JSON is corrupted, delete the file
|
|
95
|
+
console.warn(`Corrupted cache file ${cachePath}, deleting...`);
|
|
96
|
+
try {
|
|
97
|
+
require('fs').unlinkSync(cachePath);
|
|
98
|
+
} catch (deleteError) {
|
|
99
|
+
// Ignore delete errors
|
|
100
|
+
}
|
|
101
|
+
return null;
|
|
102
|
+
}
|
|
103
|
+
}
|
|
104
|
+
return null;
|
|
105
|
+
});
|
|
106
|
+
|
|
107
|
+
if (embedding) {
|
|
108
|
+
this.memoryCache.set(key, embedding);
|
|
109
|
+
embeddingCacheStats.cacheHits++;
|
|
110
|
+
return embedding;
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
embeddingCacheStats.cacheMisses++;
|
|
114
|
+
return null;
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
private cacheEmbedding(value: string, embedding: number[]): void {
|
|
118
|
+
const key = this.getCacheKey(value);
|
|
119
|
+
|
|
120
|
+
// Store in memory cache immediately
|
|
121
|
+
this.memoryCache.set(key, embedding);
|
|
122
|
+
embeddingCacheStats.cacheWrites++;
|
|
123
|
+
|
|
124
|
+
// Store in file cache asynchronously with mutex (fire and forget)
|
|
125
|
+
this.fileOperationMutex
|
|
126
|
+
.runExclusive(async () => {
|
|
127
|
+
const cachePath = this.getCachePath(key);
|
|
128
|
+
try {
|
|
129
|
+
const data = JSON.stringify({
|
|
130
|
+
value: value,
|
|
131
|
+
embedding: embedding,
|
|
132
|
+
modelId: this.modelId,
|
|
133
|
+
timestamp: new Date().toISOString(),
|
|
134
|
+
});
|
|
135
|
+
|
|
136
|
+
// Write to temp file first, then rename (atomic operation)
|
|
137
|
+
const tempPath = `${cachePath}.tmp`;
|
|
138
|
+
writeFileSync(tempPath, data);
|
|
139
|
+
require('fs').renameSync(tempPath, cachePath);
|
|
140
|
+
} catch (e) {
|
|
141
|
+
// console.warn(`Failed to cache embedding for ${key}:`, e);
|
|
142
|
+
// Clean up temp file if it exists
|
|
143
|
+
try {
|
|
144
|
+
require('fs').unlinkSync(`${cachePath}.tmp`);
|
|
145
|
+
} catch (cleanupError) {
|
|
146
|
+
// Ignore cleanup errors
|
|
147
|
+
}
|
|
148
|
+
}
|
|
149
|
+
})
|
|
150
|
+
.catch(() => {
|
|
151
|
+
// Ignore write errors
|
|
152
|
+
});
|
|
153
|
+
}
|
|
154
|
+
|
|
155
|
+
async doEmbed({
|
|
156
|
+
values,
|
|
157
|
+
headers,
|
|
158
|
+
abortSignal,
|
|
159
|
+
providerOptions,
|
|
160
|
+
}: Parameters<EmbeddingModelV2<string>['doEmbed']>[0]): Promise<
|
|
161
|
+
Awaited<ReturnType<EmbeddingModelV2<string>['doEmbed']>>
|
|
162
|
+
> {
|
|
163
|
+
if (values.length > this.maxEmbeddingsPerCall) {
|
|
164
|
+
throw new TooManyEmbeddingValuesForCallError({
|
|
165
|
+
provider: this.provider,
|
|
166
|
+
modelId: this.modelId,
|
|
167
|
+
maxEmbeddingsPerCall: this.maxEmbeddingsPerCall,
|
|
168
|
+
values,
|
|
169
|
+
});
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
const embeddings: number[][] = [];
|
|
173
|
+
const uncachedValues: string[] = [];
|
|
174
|
+
const uncachedIndices: number[] = [];
|
|
175
|
+
|
|
176
|
+
// Check cache for each value
|
|
177
|
+
for (let i = 0; i < values.length; i++) {
|
|
178
|
+
const cached = await this.getCachedEmbedding(values[i]);
|
|
179
|
+
if (cached) {
|
|
180
|
+
embeddings[i] = cached;
|
|
181
|
+
} else {
|
|
182
|
+
uncachedValues.push(values[i]);
|
|
183
|
+
uncachedIndices.push(i);
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
|
|
187
|
+
let usage = { tokens: 0 };
|
|
188
|
+
let responseHeaders: Record<string, string> = {};
|
|
189
|
+
let rawValue: any = {};
|
|
190
|
+
|
|
191
|
+
// If we have uncached values, fetch them from the API
|
|
192
|
+
if (uncachedValues.length > 0) {
|
|
193
|
+
// console.log(`Fetching ${uncachedValues.length} uncached embeddings (${values.length - uncachedValues.length} cached)`);
|
|
194
|
+
|
|
195
|
+
const result = await this.delegate.doEmbed({
|
|
196
|
+
values: uncachedValues,
|
|
197
|
+
headers,
|
|
198
|
+
abortSignal,
|
|
199
|
+
providerOptions,
|
|
200
|
+
});
|
|
201
|
+
|
|
202
|
+
// Cache the new embeddings and add them to our results
|
|
203
|
+
for (let i = 0; i < uncachedValues.length; i++) {
|
|
204
|
+
const value = uncachedValues[i];
|
|
205
|
+
const embedding = result.embeddings[i];
|
|
206
|
+
const originalIndex = uncachedIndices[i];
|
|
207
|
+
|
|
208
|
+
this.cacheEmbedding(value, embedding);
|
|
209
|
+
embeddings[originalIndex] = embedding;
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
usage = result.usage || { tokens: 0 };
|
|
213
|
+
responseHeaders = result.response?.headers || {};
|
|
214
|
+
rawValue = result.response?.body || {};
|
|
215
|
+
} else {
|
|
216
|
+
// console.log(`All ${values.length} embeddings served from cache`);
|
|
217
|
+
// Yield to prevent blocking when everything is cached
|
|
218
|
+
await new Promise(resolve => setImmediate(resolve));
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
return {
|
|
222
|
+
embeddings,
|
|
223
|
+
usage,
|
|
224
|
+
response: { headers: responseHeaders, body: rawValue },
|
|
225
|
+
};
|
|
226
|
+
}
|
|
227
|
+
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
import { openai as originalOpenAI, createOpenAI } from '@ai-sdk/openai';
|
|
2
|
+
import { OpenAIProvider } from '@ai-sdk/openai';
|
|
3
|
+
import { CachedOpenAIEmbeddingModel } from './cached-openai-embedding-model';
|
|
4
|
+
import { join } from 'path';
|
|
5
|
+
|
|
6
|
+
/** Options for createCachedOpenAI. */
export interface CachedOpenAIOptions {
  // Passed through to createOpenAI.
  apiKey?: string;
  // On-disk embedding cache location; defaults to ./.embedding-cache/<modelId>.
  cacheDir?: string;
  // Passed through to createOpenAI (e.g. for proxies or Azure-style endpoints).
  baseURL?: string;
  // Extra request headers passed through to createOpenAI.
  headers?: Record<string, string>;
}
|
|
12
|
+
|
|
13
|
+
export function createCachedOpenAI(options: CachedOpenAIOptions = {}) {
|
|
14
|
+
// Create the original OpenAI provider
|
|
15
|
+
const provider = createOpenAI({
|
|
16
|
+
apiKey: options.apiKey,
|
|
17
|
+
baseURL: options.baseURL,
|
|
18
|
+
headers: options.headers,
|
|
19
|
+
});
|
|
20
|
+
|
|
21
|
+
// Create a proxy that intercepts embedding model creation
|
|
22
|
+
return new Proxy(provider, {
|
|
23
|
+
get(target, prop, receiver) {
|
|
24
|
+
if (prop === 'embedding') {
|
|
25
|
+
// Return a function that creates cached embedding models
|
|
26
|
+
return (modelId: string) => {
|
|
27
|
+
const originalModel = target.embedding(modelId);
|
|
28
|
+
const cacheDir = options.cacheDir || join(process.cwd(), '.embedding-cache', modelId);
|
|
29
|
+
return new CachedOpenAIEmbeddingModel(originalModel, cacheDir);
|
|
30
|
+
};
|
|
31
|
+
}
|
|
32
|
+
|
|
33
|
+
// For all other properties, use the original
|
|
34
|
+
return Reflect.get(target, prop, receiver);
|
|
35
|
+
},
|
|
36
|
+
});
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
// Export a default cached OpenAI instance
|
|
40
|
+
export const cachedOpenAI = createCachedOpenAI();
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, vi } from 'vitest';
|
|
2
|
+
import { LongMemEvalMetric, createLongMemEvalMetric } from '../longmemeval-metric';
|
|
3
|
+
import { Agent } from '@mastra/core/agent';
|
|
4
|
+
|
|
5
|
+
// Mock Agent
|
|
6
|
+
const mockAgent = {
|
|
7
|
+
generate: vi.fn().mockImplementation(async messages => {
|
|
8
|
+
const content = messages[0].content;
|
|
9
|
+
|
|
10
|
+
// Check if it's asking about correct response
|
|
11
|
+
if (content.includes('Is the model response correct?')) {
|
|
12
|
+
// If the model response contains the correct answer, return yes
|
|
13
|
+
if (content.includes('Model Response: Blue') && content.includes('Correct Answer: Blue')) {
|
|
14
|
+
return {
|
|
15
|
+
text: 'yes',
|
|
16
|
+
};
|
|
17
|
+
}
|
|
18
|
+
// If the response doesn't match, return no with reason
|
|
19
|
+
return {
|
|
20
|
+
text: 'no: the model did not provide the correct answer',
|
|
21
|
+
};
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
// For abstention questions
|
|
25
|
+
if (content.includes('Does the model correctly identify the question as unanswerable?')) {
|
|
26
|
+
if (content.includes('cannot answer') || content.includes("don't have that information")) {
|
|
27
|
+
return {
|
|
28
|
+
text: 'yes',
|
|
29
|
+
};
|
|
30
|
+
}
|
|
31
|
+
return {
|
|
32
|
+
text: 'no: the model attempted to answer an unanswerable question',
|
|
33
|
+
};
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
// Default response
|
|
37
|
+
return {
|
|
38
|
+
text: 'yes',
|
|
39
|
+
};
|
|
40
|
+
}),
|
|
41
|
+
} as unknown as Agent;
|
|
42
|
+
|
|
43
|
+
describe('LongMemEvalMetric', () => {
  describe('measure', () => {
    it('should return score 1 for correct answer', async () => {
      const metric = new LongMemEvalMetric({
        agent: mockAgent,
        questionType: 'single-session-user',
      });

      // `measure` takes the question/gold-answer pair JSON-encoded as its input string.
      const input = JSON.stringify({
        question: 'What is my favorite color?',
        answer: 'Blue',
      });
      const output = 'Blue';

      const result = await metric.measure(input, output);

      expect(result.score).toBe(1);
      expect(result.info?.questionType).toBe('single-session-user');
      expect(result.info?.evaluatorResponse).toBe('yes');
    });

    it('should return score 0 for incorrect answer', async () => {
      const metric = new LongMemEvalMetric({
        agent: mockAgent,
        questionType: 'single-session-user',
      });

      const input = JSON.stringify({
        question: 'What is my favorite color?',
        answer: 'Blue',
      });
      // 'Red' does not match the mock's expected Blue/Blue pair, so the grader says "no: <reason>".
      const output = 'Red';

      const result = await metric.measure(input, output);

      expect(result.score).toBe(0);
      // The reason is the text after "no: " in the evaluator response.
      expect(result.info?.reason).toBe('the model did not provide the correct answer');
    });

    it('should handle abstention questions correctly', async () => {
      const metric = new LongMemEvalMetric({
        agent: mockAgent,
        questionType: 'single-session-user',
        isAbstention: true,
      });

      const input = JSON.stringify({
        question: 'What is my favorite food?',
        answer: 'This question cannot be answered based on the conversation history',
      });
      // Contains "cannot answer", which the mock treats as a correct abstention.
      const output = 'I cannot answer that question based on our conversation history.';

      const result = await metric.measure(input, output);

      expect(result.score).toBe(1);
      expect(result.info?.isAbstention).toBe(true);
    });

    it('should handle different question types', async () => {
      const temporalMetric = createLongMemEvalMetric('temporal-reasoning', mockAgent);
      const knowledgeMetric = createLongMemEvalMetric('knowledge-update', mockAgent);
      const preferenceMetric = createLongMemEvalMetric('single-session-preference', mockAgent);

      // All should be instances of LongMemEvalMetric
      expect(temporalMetric).toBeInstanceOf(LongMemEvalMetric);
      expect(knowledgeMetric).toBeInstanceOf(LongMemEvalMetric);
      expect(preferenceMetric).toBeInstanceOf(LongMemEvalMetric);
    });

    it('should throw error for unknown question type', async () => {
      expect(() => {
        new LongMemEvalMetric({
          agent: mockAgent,
          questionType: 'invalid-type' as any,
        });
      }).not.toThrow(); // Constructor doesn't validate

      // The error would be thrown during measure when getting the prompt
      const metric = new LongMemEvalMetric({
        agent: mockAgent,
        questionType: 'invalid-type' as any,
      });

      const input = JSON.stringify({
        question: 'Test question',
        answer: 'Test answer',
      });

      await expect(metric.measure(input, 'Test output')).rejects.toThrow('Unknown question type: invalid-type');
    });

    it('should parse evaluator response correctly', async () => {
      const metric = new LongMemEvalMetric({
        agent: mockAgent,
        questionType: 'single-session-user',
      });

      const input = JSON.stringify({
        question: 'What is my name?',
        answer: 'John',
      });
      const output = "I don't know your name";

      const result = await metric.measure(input, output);

      // A "no: ..." evaluator response should yield score 0 and a non-empty reason.
      expect(result.score).toBe(0);
      expect(result.info?.evaluatorResponse).toContain('no');
      expect(result.info?.reason).toBeTruthy();
    });
  });

  describe('createLongMemEvalMetric', () => {
    it('should create metric with correct configuration', () => {
      const metric = createLongMemEvalMetric('multi-session', mockAgent);

      expect(metric).toBeInstanceOf(LongMemEvalMetric);
    });

    it('should throw error when agent is not provided', () => {
      expect(() => {
        new LongMemEvalMetric({
          questionType: 'single-session-user',
        } as any);
      }).toThrow('Agent instance is required for LongMemEvalMetric');
    });
  });
});
|