@mastra/longmemeval 0.0.0-add-libsql-changeset-20250910154739
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +919 -0
- package/DATA_DOWNLOAD_GUIDE.md +117 -0
- package/LICENSE.md +15 -0
- package/README.md +173 -0
- package/USAGE.md +105 -0
- package/package.json +67 -0
- package/scripts/download.ts +180 -0
- package/scripts/find-failed.ts +176 -0
- package/scripts/generate-embeddings.ts +56 -0
- package/scripts/generate-wm-templates.ts +296 -0
- package/scripts/setup.ts +60 -0
- package/src/__fixtures__/embeddings.json +2319 -0
- package/src/__fixtures__/test-dataset.json +82 -0
- package/src/cli.ts +690 -0
- package/src/commands/__tests__/prepare.test.ts +230 -0
- package/src/commands/__tests__/run.test.ts +403 -0
- package/src/commands/prepare.ts +793 -0
- package/src/commands/run.ts +553 -0
- package/src/config.ts +83 -0
- package/src/data/loader.ts +163 -0
- package/src/data/types.ts +61 -0
- package/src/embeddings/cached-openai-embedding-model.ts +227 -0
- package/src/embeddings/cached-openai-provider.ts +40 -0
- package/src/embeddings/index.ts +2 -0
- package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
- package/src/evaluation/longmemeval-metric.ts +173 -0
- package/src/retry-model.ts +60 -0
- package/src/storage/__tests__/benchmark-store.test.ts +280 -0
- package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
- package/src/storage/benchmark-store.ts +540 -0
- package/src/storage/benchmark-vector.ts +234 -0
- package/src/storage/index.ts +2 -0
- package/src/test-utils/mock-embeddings.ts +54 -0
- package/src/test-utils/mock-model.ts +49 -0
- package/tests/data-loader.test.ts +96 -0
- package/tsconfig.json +18 -0
- package/vitest.config.ts +9 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
import { readdir, readFile, unlink, rmdir } from 'fs/promises';
|
|
2
|
+
import { existsSync } from 'fs';
|
|
3
|
+
import { join } from 'path';
|
|
4
|
+
import chalk from 'chalk';
|
|
5
|
+
|
|
6
|
+
// One failed benchmark question discovered under
// prepared-data/<dataset>/<memoryConfig>/<questionId>/progress.json.
interface FailedQuestion {
  // Question directory name (the dataset question id).
  questionId: string;
  // Dataset the question belongs to (top-level directory name).
  dataset: string;
  // Memory configuration directory name (second level).
  memoryConfig: string;
  // Error message recorded in progress.json; falls back to 'Unknown error'.
  error: string;
  // Failure timestamp recorded in progress.json; falls back to 'Unknown time'.
  failedAt: string;
  // Path to the question directory, used when deleting with --delete.
  path: string;
}
|
|
14
|
+
|
|
15
|
+
async function findFailedQuestions(baseDir: string = './prepared-data'): Promise<FailedQuestion[]> {
|
|
16
|
+
const failed: FailedQuestion[] = [];
|
|
17
|
+
|
|
18
|
+
console.log(chalk.gray(`Scanning directory: ${baseDir}`));
|
|
19
|
+
|
|
20
|
+
if (!existsSync(baseDir)) {
|
|
21
|
+
console.error(chalk.red(`Base directory not found: ${baseDir}`));
|
|
22
|
+
return failed;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
try {
|
|
26
|
+
// Iterate through datasets
|
|
27
|
+
const datasets = await readdir(baseDir);
|
|
28
|
+
console.log(chalk.gray(`Found datasets: ${datasets.join(', ')}`));
|
|
29
|
+
|
|
30
|
+
for (const dataset of datasets) {
|
|
31
|
+
const datasetPath = join(baseDir, dataset);
|
|
32
|
+
const stat = await readdir(datasetPath).catch(() => null);
|
|
33
|
+
if (!stat) continue;
|
|
34
|
+
|
|
35
|
+
// Iterate through memory configs
|
|
36
|
+
const configs = await readdir(datasetPath);
|
|
37
|
+
console.log(chalk.gray(` ${dataset} configs: ${configs.join(', ')}`));
|
|
38
|
+
|
|
39
|
+
for (const config of configs) {
|
|
40
|
+
const configPath = join(datasetPath, config);
|
|
41
|
+
const configStat = await readdir(configPath).catch(() => null);
|
|
42
|
+
if (!configStat) continue;
|
|
43
|
+
|
|
44
|
+
// Iterate through questions
|
|
45
|
+
const questions = await readdir(configPath);
|
|
46
|
+
console.log(chalk.gray(` ${config}: ${questions.length} questions`));
|
|
47
|
+
|
|
48
|
+
let progressFound = 0;
|
|
49
|
+
let failedFound = 0;
|
|
50
|
+
|
|
51
|
+
for (const questionId of questions) {
|
|
52
|
+
const questionPath = join(configPath, questionId);
|
|
53
|
+
const progressPath = join(questionPath, 'progress.json');
|
|
54
|
+
|
|
55
|
+
// Check if progress.json exists and has failed status
|
|
56
|
+
if (existsSync(progressPath)) {
|
|
57
|
+
progressFound++;
|
|
58
|
+
try {
|
|
59
|
+
const progress = JSON.parse(await readFile(progressPath, 'utf-8'));
|
|
60
|
+
|
|
61
|
+
if (progress.failed === true) {
|
|
62
|
+
failedFound++;
|
|
63
|
+
failed.push({
|
|
64
|
+
questionId,
|
|
65
|
+
dataset,
|
|
66
|
+
memoryConfig: config,
|
|
67
|
+
error: progress.error || 'Unknown error',
|
|
68
|
+
failedAt: progress.failedAt || 'Unknown time',
|
|
69
|
+
path: questionPath,
|
|
70
|
+
});
|
|
71
|
+
}
|
|
72
|
+
} catch (e) {
|
|
73
|
+
console.error(chalk.red(`Error reading progress for ${questionId}:`, e));
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
if (progressFound > 0) {
|
|
79
|
+
console.log(chalk.gray(` Progress files found: ${progressFound}, Failed: ${failedFound}`));
|
|
80
|
+
}
|
|
81
|
+
}
|
|
82
|
+
}
|
|
83
|
+
} catch (error) {
|
|
84
|
+
console.error(chalk.red('Error scanning directories:'), error);
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
return failed;
|
|
88
|
+
}
|
|
89
|
+
|
|
90
|
+
async function deleteQuestionDir(path: string): Promise<void> {
|
|
91
|
+
// Recursively delete directory
|
|
92
|
+
const entries = await readdir(path, { withFileTypes: true });
|
|
93
|
+
|
|
94
|
+
for (const entry of entries) {
|
|
95
|
+
const fullPath = join(path, entry.name);
|
|
96
|
+
if (entry.isDirectory()) {
|
|
97
|
+
await deleteQuestionDir(fullPath);
|
|
98
|
+
} else {
|
|
99
|
+
await unlink(fullPath);
|
|
100
|
+
}
|
|
101
|
+
}
|
|
102
|
+
|
|
103
|
+
await rmdir(path);
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
/**
 * CLI entry point: lists failed questions found under ./prepared-data and,
 * with --delete, removes their directories so preparation can be retried.
 *
 * Flags:
 *   --delete           remove each failed question directory
 *   --dataset=<name>   only include failures from this dataset
 *   --config=<name>    only include failures from this memory config
 */
async function main() {
  const args = process.argv.slice(2);
  const shouldDelete = args.includes('--delete');
  // NOTE(review): split('=')[1] truncates values that themselves contain '='.
  const dataset = args.find(arg => arg.startsWith('--dataset='))?.split('=')[1];
  const config = args.find(arg => arg.startsWith('--config='))?.split('=')[1];

  console.log(chalk.blue('\n🔍 Finding failed questions...\n'));

  // Always scans the default base directory; no --base-dir flag is supported.
  const failed = await findFailedQuestions();

  // Filter by dataset/config if specified
  let filtered = failed;
  if (dataset) {
    filtered = filtered.filter(f => f.dataset === dataset);
  }
  if (config) {
    filtered = filtered.filter(f => f.memoryConfig === config);
  }

  if (filtered.length === 0) {
    console.log(chalk.green('✅ No failed questions found!\n'));
    return;
  }

  // Group by dataset and config for readable output.
  const grouped = filtered.reduce(
    (acc, f) => {
      const key = `${f.dataset}/${f.memoryConfig}`;
      if (!acc[key]) acc[key] = [];
      acc[key].push(f);
      return acc;
    },
    {} as Record<string, FailedQuestion[]>,
  );

  // Display results
  console.log(chalk.red(`Found ${filtered.length} failed questions:\n`));

  for (const [group, questions] of Object.entries(grouped)) {
    console.log(chalk.yellow(`\n${group}:`));

    for (const q of questions) {
      console.log(chalk.gray(`  - ${q.questionId}`));
      // Truncate long error messages to keep the listing readable.
      console.log(chalk.gray(`    Error: ${q.error.substring(0, 100)}${q.error.length > 100 ? '...' : ''}`));
      console.log(chalk.gray(`    Failed at: ${q.failedAt}`));
    }
  }

  if (shouldDelete) {
    console.log(chalk.yellow('\n⚠️ Deleting failed question directories...\n'));

    // Delete sequentially so per-question success/failure is reported in order.
    for (const q of filtered) {
      try {
        await deleteQuestionDir(q.path);
        console.log(chalk.green(`✓ Deleted ${q.questionId}`));
      } catch (error) {
        console.error(chalk.red(`✗ Failed to delete ${q.questionId}:`, error));
      }
    }

    console.log(chalk.green(`\n✅ Deleted ${filtered.length} failed question directories\n`));
  } else {
    console.log(chalk.gray('\n💡 Tip: Use --delete to remove these directories and retry preparation'));
    console.log(chalk.gray('   Example: pnpm tsx scripts/find-failed.ts --delete'));
    console.log(
      chalk.gray('   Filter: pnpm tsx scripts/find-failed.ts --dataset=longmemeval_s --config=working-memory'),
    );
  }
}

main().catch(console.error);
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
#!/usr/bin/env tsx
|
|
2
|
+
|
|
3
|
+
import { openai } from '@ai-sdk/openai';
import { writeFile, mkdir } from 'fs/promises';
import { join, dirname } from 'path';
import { fileURLToPath, pathToFileURL } from 'url';
|
|
7
|
+
|
|
8
|
+
const __dirname = dirname(fileURLToPath(import.meta.url));
|
|
9
|
+
|
|
10
|
+
// Sample texts to generate embeddings for
|
|
11
|
+
// Sample texts to generate embeddings for.
// NOTE(review): the literal texts become the keys of the generated
// src/__fixtures__/embeddings.json, so changing any entry requires
// regenerating the fixture file.
const SAMPLE_TEXTS = [
  'My favorite color is blue',
  'I understand your favorite color is blue.',
  'I have a pet',
  'What kind of pet do you have?',
  'It is a cat named Fluffy',
  'Fluffy is a lovely name for a cat!',
  'Hello',
  'Hi there!',
  'What is my favorite color?',
  'What did I say about my pet?',
  'You have a cat named Fluffy',
  'Blue',
];
|
|
25
|
+
|
|
26
|
+
async function generateEmbeddings() {
|
|
27
|
+
console.log('🔧 Generating fixture embeddings...\n');
|
|
28
|
+
|
|
29
|
+
const embedder = openai.embedding('text-embedding-3-small');
|
|
30
|
+
const embeddings: Record<string, number[]> = {};
|
|
31
|
+
|
|
32
|
+
for (const text of SAMPLE_TEXTS) {
|
|
33
|
+
console.log(`Generating embedding for: "${text}"`);
|
|
34
|
+
const result = await embedder.doEmbed({
|
|
35
|
+
values: [text],
|
|
36
|
+
});
|
|
37
|
+
embeddings[text] = result.embeddings[0];
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// Save embeddings to fixtures directory
|
|
41
|
+
const fixturesDir = join(__dirname, '..', 'src', '__fixtures__');
|
|
42
|
+
await mkdir(fixturesDir, { recursive: true });
|
|
43
|
+
|
|
44
|
+
const outputPath = join(fixturesDir, 'embeddings.json');
|
|
45
|
+
await writeFile(outputPath, JSON.stringify(embeddings, null, 2));
|
|
46
|
+
|
|
47
|
+
console.log(`\n✅ Embeddings saved to: ${outputPath}`);
|
|
48
|
+
console.log(`Generated ${Object.keys(embeddings).length} embeddings`);
|
|
49
|
+
}
|
|
50
|
+
|
|
51
|
+
// Run if called directly
|
|
52
|
+
if (import.meta.url === `file://${process.argv[1]}`) {
|
|
53
|
+
generateEmbeddings().catch(console.error);
|
|
54
|
+
}
|
|
55
|
+
|
|
56
|
+
export { generateEmbeddings };
|
|
@@ -0,0 +1,296 @@
|
|
|
1
|
+
import { Agent } from '@mastra/core/agent';
|
|
2
|
+
import { google } from '@ai-sdk/google';
|
|
3
|
+
import chalk from 'chalk';
|
|
4
|
+
import ora, { Ora } from 'ora';
|
|
5
|
+
import { existsSync } from 'fs';
|
|
6
|
+
import { readFile, writeFile, mkdir } from 'fs/promises';
|
|
7
|
+
import { join } from 'path';
|
|
8
|
+
|
|
9
|
+
import { DatasetLoader } from '../src/data/loader';
|
|
10
|
+
import type { LongMemEvalQuestion } from '../src/data/types';
|
|
11
|
+
|
|
12
|
+
// A working-memory template generated for a single benchmark question.
interface WorkingMemoryTemplate {
  // Instruction text telling the assistant what information to track.
  template: string;
  // ISO-8601 timestamp of when the template was generated.
  generated_at: string;
  // Question category from the dataset.
  question_type: string;
  // The benchmark question this template targets.
  question: string;
  // Expected answer, kept alongside the template for auditing.
  answer: string;
}

// Map of question_id -> generated template, persisted as one JSON file.
interface TemplateDatabase {
  [questionId: string]: WorkingMemoryTemplate;
}
|
|
23
|
+
|
|
24
|
+
/**
 * Ask an LLM to produce a working-memory instruction tailored to one
 * benchmark question.
 *
 * @param question Dataset question (type, text, answer, id).
 * @returns The generated instruction text.
 * @throws If the model returns an empty or suspiciously short (<50 chars) response.
 */
async function generateTemplate(question: LongMemEvalQuestion): Promise<string> {
  // Create a simple agent for template generation.
  // A new Agent is constructed per call because the instructions embed
  // question-specific text (the numeric-answer rule below).
  const agent = new Agent({
    name: 'template-generator',
    instructions: `You are an expert at designing working memory templates for AI assistants.

Given a question and answer from a conversation history benchmark, generate a working memory instruction that would help an AI assistant extract and save the specific information needed to answer the question correctly.

The instruction should:
1. Be specific about what information to track
2. Use bullet points to organize different categories
3. Focus ONLY on information directly relevant to answering this specific question
4. Be concise but comprehensive
5. Do not be overly specific, the template should be generic enough to apply generally to the topic at hand, without revealing too much about the answer directly. Overly specific templates will invalidate the usefulness of the recorded information.
${!isNaN(Number(question.answer)) ? '6. A number should be stored counting the relevant data' : '6. If the question involves keeping track of the count or number of something, make that clear in the template'}

Format your response as a clear instruction starting with "Pay close attention to the following information (current and past):"

Then list the specific categories and details to track using bullet points.`,
    model: google('gemini-2.5-flash-preview-04-17'),
  });

  // NOTE(review): the system prompt says "Given a question and answer", but
  // only the question (not the answer) is included here — confirm intended.
  const prompt = `Question Type: ${question.question_type}
Question: "${question.question}"

Generate a working memory instruction specifically tailored for capturing the information needed to answer this question.
If the question involves remembering a specific date or a specific location, make sure that's captured in the template.`;

  // temperature 0 so reruns produce stable templates for the same question.
  const result = await agent.generate(prompt, {
    temperature: 0,
  });

  const template = result.text.trim();

  // Validate that we got a non-empty response
  if (!template || template.length < 50) {
    throw new Error(`Generated template is too short or empty for question ${question.question_id}`);
  }

  return template;
}
|
|
65
|
+
|
|
66
|
+
/**
 * CLI entry point: generates working-memory templates for every question in a
 * dataset using a concurrent worker pool, saving incrementally after each
 * success so an interrupted run can resume.
 *
 * Usage: pnpm generate-wm-templates [dataset] [concurrency]
 */
async function main() {
  const args = process.argv.slice(2);
  const dataset = args[0] || 'longmemeval_s';
  const concurrency = parseInt(args[1]) || 100; // Default to 100 concurrent generations
  const outputPath = join(process.cwd(), 'prepared-data', 'wm-templates', `${dataset}.json`);

  console.log(chalk.blue('\n🧠 Generating Working Memory Templates\n'));
  console.log(chalk.gray(`Dataset: ${dataset}`));
  console.log(chalk.gray(`Concurrency: ${concurrency}`));
  console.log(chalk.gray(`Output: ${outputPath}`));

  // Set up signal handlers for graceful shutdown
  let interrupted = false;
  let currentSpinner: any = null;
  // Indirection so the signal handler can be swapped once activeGenerations
  // exists further down.
  let cleanupHandler: () => void;

  const baseCleanup = () => {
    interrupted = true;
    if (currentSpinner) {
      currentSpinner.stop();
    }
    console.log(chalk.yellow('\n\n⚠️ Interrupted! Progress has been saved.'));
    console.log(chalk.gray(`Templates saved to: ${outputPath}`));
    process.exit(0);
  };

  cleanupHandler = baseCleanup;

  process.on('SIGINT', () => cleanupHandler());
  process.on('SIGTERM', () => cleanupHandler());

  // Check for OpenAI API key
  // NOTE(review): generation uses a Google Gemini model (see generateTemplate),
  // yet OPENAI_API_KEY is what is checked here — verify which credential is
  // actually required.
  if (!process.env.OPENAI_API_KEY) {
    console.error(chalk.red('Error: OPENAI_API_KEY environment variable is required'));
    process.exit(1);
  }

  // Load dataset
  const loader = new DatasetLoader();
  const spinner = ora('Loading dataset...').start();
  currentSpinner = spinner;
  const questions = await loader.loadDataset(dataset as any);
  spinner.succeed(`Loaded ${questions.length} questions`);
  currentSpinner = null;

  // Load existing templates if they exist (enables resume after interrupt)
  let templates: TemplateDatabase = {};
  if (existsSync(outputPath)) {
    const loadSpinner = ora('Loading existing templates...').start();
    currentSpinner = loadSpinner;
    try {
      templates = JSON.parse(await readFile(outputPath, 'utf-8'));
      loadSpinner.succeed(`Loaded ${Object.keys(templates).length} existing templates`);
      currentSpinner = null;

      // Count empty templates
      const emptyTemplates = Object.entries(templates).filter(([_, t]) => !t.template || t.template.length === 0);
      if (emptyTemplates.length > 0) {
        console.log(chalk.yellow(`⚠️ Found ${emptyTemplates.length} empty templates that will be regenerated`));
        // Remove empty templates so they get regenerated
        emptyTemplates.forEach(([id]) => delete templates[id]);
      }
    } catch (e) {
      loadSpinner.warn('Could not load existing templates, starting fresh');
      currentSpinner = null;
    }
  }

  // Process only questions that don't already have a non-empty template.
  const questionsToProcess = questions.filter(q => !templates[q.question_id] || !templates[q.question_id].template);

  if (questionsToProcess.length === 0) {
    console.log(chalk.green('\n✅ All questions already have templates!'));
    return;
  }

  console.log(chalk.yellow(`\nGenerating templates for ${questionsToProcess.length} questions...\n`));

  let processed = 0;
  let errors = 0;
  let inProgress = 0;
  const questionQueue = [...questionsToProcess];
  const activeGenerations = new Map<string, Ora>();

  // Update cleanup to have access to activeGenerations
  cleanupHandler = () => {
    interrupted = true;
    if (currentSpinner) {
      currentSpinner.stop();
    }
    activeGenerations.forEach(spinner => spinner.stop());
    console.log(chalk.yellow('\n\n⚠️ Interrupted! Progress has been saved.'));
    console.log(chalk.gray(`Templates saved to: ${outputPath}`));
    process.exit(0);
  };

  // Create directory once
  await mkdir(join(process.cwd(), 'prepared-data', 'wm-templates'), { recursive: true });

  // Main progress spinner
  const mainSpinner = ora({
    text: `Processing: 0/${questionsToProcess.length} (0 in progress)`,
    spinner: 'dots',
  }).start();
  currentSpinner = mainSpinner;

  const updateProgress = () => {
    mainSpinner.text = `Processing: ${processed}/${questionsToProcess.length} (${inProgress} in progress, ${errors} failed)`;
  };

  // Worker function to process a single question: generate with up to
  // maxAttempts retries, persist the whole template DB after every success.
  const processQuestion = async (question: LongMemEvalQuestion): Promise<void> => {
    if (interrupted) return;

    const questionSpinner = ora({
      text: `${question.question_id}: Starting...`,
      prefixText: '  ',
      spinner: 'dots',
    }).start();

    activeGenerations.set(question.question_id, questionSpinner);
    inProgress++;
    updateProgress();

    let attempts = 0;
    const maxAttempts = 3;
    let lastError = null;

    while (attempts < maxAttempts && !interrupted) {
      try {
        attempts++;
        if (attempts > 1) {
          questionSpinner.text = `${question.question_id}: Retry ${attempts}/${maxAttempts}...`;
        } else {
          questionSpinner.text = `${question.question_id}: Generating...`;
        }

        const template = await generateTemplate(question);

        templates[question.question_id] = {
          template,
          generated_at: new Date().toISOString(),
          question_type: question.question_type,
          question: question.question,
          answer: question.answer,
        };

        // Save after each successful generation
        await writeFile(outputPath, JSON.stringify(templates, null, 2));

        questionSpinner.succeed(`${question.question_id} (${question.question_type})`);
        activeGenerations.delete(question.question_id);

        processed++;
        inProgress--;
        updateProgress();
        break; // Success, exit retry loop
      } catch (error) {
        lastError = error;
        if (attempts < maxAttempts && !interrupted) {
          // Add a small delay before retry
          await new Promise(resolve => setTimeout(resolve, 1000));
        }
      }
    }

    if ((attempts === maxAttempts && lastError) || interrupted) {
      if (!interrupted) {
        errors++;
        questionSpinner.fail(`${question.question_id}: ${lastError}`);
      } else {
        questionSpinner.warn(`${question.question_id}: Interrupted`);
      }
      activeGenerations.delete(question.question_id);
      inProgress--;
      updateProgress();
    }
  };

  // Process questions concurrently with a worker pool
  const workers: Promise<void>[] = [];

  while (questionQueue.length > 0 && !interrupted) {
    // Fill up to concurrency limit
    while (workers.length < concurrency && questionQueue.length > 0 && !interrupted) {
      const question = questionQueue.shift()!;
      const workerPromise = processQuestion(question).catch(err => {
        console.error(chalk.red(`Unexpected error processing ${question.question_id}:`), err);
      });
      workers.push(workerPromise);
    }

    // Wait for at least one to complete
    if (workers.length > 0) {
      await Promise.race(workers);
      // Remove completed workers. Each worker is raced against an
      // already-resolved 'pending' sentinel: a settled worker wins the race
      // (its value is undefined, not 'pending'); a still-pending worker loses,
      // so only finished promises get spliced out.
      for (let i = workers.length - 1; i >= 0; i--) {
        if ((await Promise.race([workers[i], Promise.resolve('pending')])) !== 'pending') {
          workers.splice(i, 1);
        }
      }
    }
  }

  // Wait for remaining workers
  if (!interrupted) {
    await Promise.all(workers);
  }

  // Clean up spinners
  activeGenerations.forEach(spinner => spinner.stop());
  mainSpinner.succeed(`Completed: ${processed}/${questionsToProcess.length} (${errors} failed)`);
  currentSpinner = null;

  // Final summary
  console.log(chalk.blue('\n📊 Summary'));
  console.log(chalk.green(`✓ Successfully generated: ${processed} templates`));
  if (errors > 0) {
    console.log(chalk.red(`✗ Failed: ${errors} templates`));
  }
  console.log(chalk.gray(`Total templates: ${Object.keys(templates).length}`));
  console.log(chalk.gray(`Saved to: ${outputPath}`));
}
|
|
289
|
+
|
|
290
|
+
main().catch(error => {
|
|
291
|
+
console.error(chalk.red('\nError:'), error.message);
|
|
292
|
+
console.log(chalk.gray('\nUsage: pnpm generate-wm-templates [dataset] [concurrency]'));
|
|
293
|
+
console.log(chalk.gray(' dataset: longmemeval_s (default)'));
|
|
294
|
+
console.log(chalk.gray(' concurrency: number of parallel generations (default: 5)'));
|
|
295
|
+
process.exit(1);
|
|
296
|
+
});
|
package/scripts/setup.ts
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
#!/usr/bin/env tsx
|
|
2
|
+
|
|
3
|
+
import { execSync } from 'child_process';
|
|
4
|
+
import { existsSync } from 'fs';
|
|
5
|
+
import { join } from 'path';
|
|
6
|
+
import chalk from 'chalk';
|
|
7
|
+
import ora from 'ora';
|
|
8
|
+
|
|
9
|
+
const DATA_DIR = join(process.cwd(), 'data');
|
|
10
|
+
const EXPECTED_FILES = ['longmemeval_s.json', 'longmemeval_m.json', 'longmemeval_oracle.json'];
|
|
11
|
+
|
|
12
|
+
async function setup() {
|
|
13
|
+
console.log(chalk.blue('\n🚀 LongMemEval Setup\n'));
|
|
14
|
+
|
|
15
|
+
// Check if already set up
|
|
16
|
+
const hasAllFiles = EXPECTED_FILES.every(file => existsSync(join(DATA_DIR, file)));
|
|
17
|
+
|
|
18
|
+
if (hasAllFiles) {
|
|
19
|
+
console.log(chalk.green('✓ All datasets are already downloaded'));
|
|
20
|
+
console.log(chalk.gray('\nYou can run the benchmark with:'));
|
|
21
|
+
console.log(chalk.cyan(' pnpm cli run --dataset longmemeval_s --model gpt-4o\n'));
|
|
22
|
+
return;
|
|
23
|
+
}
|
|
24
|
+
|
|
25
|
+
// Install dependencies
|
|
26
|
+
const spinner = ora('Installing dependencies...').start();
|
|
27
|
+
try {
|
|
28
|
+
execSync('pnpm install', { stdio: 'ignore' });
|
|
29
|
+
spinner.succeed('Dependencies installed');
|
|
30
|
+
} catch (error) {
|
|
31
|
+
spinner.fail('Failed to install dependencies');
|
|
32
|
+
throw error;
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
// Download datasets
|
|
36
|
+
console.log(chalk.blue('\n📥 Downloading datasets...\n'));
|
|
37
|
+
|
|
38
|
+
try {
|
|
39
|
+
execSync('pnpm download', { stdio: 'inherit' });
|
|
40
|
+
} catch (error) {
|
|
41
|
+
console.log(chalk.yellow('\n⚠️ Automatic download failed.'));
|
|
42
|
+
console.log(chalk.yellow('Please check the DOWNLOAD_GUIDE.md for manual download instructions.\n'));
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
// Verify setup
|
|
46
|
+
const filesAfterDownload = EXPECTED_FILES.filter(file => existsSync(join(DATA_DIR, file)));
|
|
47
|
+
|
|
48
|
+
if (filesAfterDownload.length === EXPECTED_FILES.length) {
|
|
49
|
+
console.log(chalk.green('\n✅ Setup complete!'));
|
|
50
|
+
console.log(chalk.gray('\nYou can now run the benchmark:'));
|
|
51
|
+
console.log(chalk.cyan(' pnpm cli run --dataset longmemeval_s --model gpt-4o'));
|
|
52
|
+
console.log(chalk.gray('\nOr view available commands:'));
|
|
53
|
+
console.log(chalk.cyan(' pnpm cli --help\n'));
|
|
54
|
+
} else {
|
|
55
|
+
console.log(chalk.yellow('\n⚠️ Setup incomplete. Please download the datasets manually.'));
|
|
56
|
+
}
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Run setup
|
|
60
|
+
setup().catch(console.error);
|