npm - @mastra/longmemeval - Versions diffs - 0.0.0-error-handler-fix-20251020202607 → 0.0.0-execa-dynamic-import-20260304221256 - Mend

@mastra/longmemeval 0.0.0-error-handler-fix-20251020202607 → 0.0.0-execa-dynamic-import-20260304221256

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (43) hide show

package/CHANGELOG.md +847 -6
package/LICENSE.md +15 -0
package/docs/INVESTIGATION_WORKFLOW.md +314 -0
package/package.json +34 -26
package/scripts/download.ts +22 -1
package/scripts/generate-wm-templates.ts +2 -1
package/src/cli.ts +967 -155
package/src/commands/__tests__/prepare.test.ts +11 -6
package/src/commands/__tests__/run.test.ts +10 -8
package/src/commands/clean.ts +147 -0
package/src/commands/deterministic-ids.ts +126 -0
package/src/commands/find-prohibited.ts +299 -0
package/src/commands/investigate.ts +2437 -0
package/src/commands/list-partial.ts +137 -0
package/src/commands/obscure-thread-ids.ts +237 -0
package/src/commands/partial-results.ts +198 -0
package/src/commands/precompute-embeddings.ts +302 -0
package/src/commands/prepare.ts +732 -105
package/src/commands/reconcile.ts +383 -0
package/src/commands/run.ts +1240 -121
package/src/commands/sessions.ts +126 -0
package/src/commands/sync.ts +152 -0
package/src/commands/test-cleaned.ts +53 -0
package/src/commands/test-incremental.ts +120 -0
package/src/commands/test-messages.ts +95 -0
package/src/commands/test-prohibited.ts +244 -0
package/src/commands/test-thread.ts +136 -0
package/src/commands/tokens.ts +625 -0
package/src/config.ts +1253 -72
package/src/data/types.ts +48 -6
package/src/embeddings/cached-openai-embedding-model.ts +0 -1
package/src/evaluation/longmemeval-metric.ts +186 -103
package/src/processors/content-sanitizer.ts +153 -0
package/src/processors/date-injector.ts +111 -0
package/src/processors/index.ts +3 -0
package/src/processors/observation-semantic-filter.ts +1142 -0
package/src/retry-model.ts +5 -3
package/src/storage/__tests__/benchmark-store.test.ts +18 -35
package/src/storage/benchmark-store.ts +140 -153
package/src/storage/benchmark-vector.ts +1 -1
package/src/storage/index.ts +1 -0
package/src/storage/persistable-inmemory.ts +133 -0
package/src/test-utils/mock-model.ts +35 -0

package/LICENSE.md CHANGED Viewed

@@ -1,3 +1,18 @@
+Portions of this software are licensed as follows:
+- All content that resides under any directory named "ee/" within this
+  repository, including but not limited to:
+  - `packages/core/src/auth/ee/`
+  - `packages/server/src/server/auth/ee/`
+    is licensed under the license defined in `ee/LICENSE`.
+- All third-party components incorporated into the Mastra Software are
+  licensed under the original license provided by the owner of the
+  applicable component.
+- Content outside of the above-mentioned directories or restrictions is
+  available under the "Apache License 2.0" as defined below.
 # Apache License 2.0
 Copyright (c) 2025 Kepler Software, Inc.

package/docs/INVESTIGATION_WORKFLOW.md ADDED Viewed

@@ -0,0 +1,314 @@
+# LongMemEval Investigation Workflow
+This document describes how to investigate failing questions in LongMemEval benchmarks to identify root causes and implement fixes.
+The goal is to find deficiencies in the LongMemEval dataset itself: many question/answer pairs appear to be broken, either because the question is misleading or because the expected answer includes incorrect information or details that the question didn't ask for.
+## Overview
+The investigation workflow has 4 stages:
+```
+pending → investigated → fix-implemented → synced
+```
+1. **pending**: Initial state for all failed questions
+2. **investigated**: Root cause identified and documented
+3. **fix-implemented**: Fix has been applied (Observer/Reflector prompt, improved Q/A, etc.)
+4. **synced**: Changes synced to `longmemeval_s.json` dataset
+## Quick Start
+```bash
+# 1. List all runs with failures
+pnpm investigate --list
+# 2. Setup investigation for a specific run
+pnpm investigate <run-id>
+# 3. Open the next uninvestigated question
+pnpm investigate --next
+# 4. After investigating, mark as done
+pnpm investigate --done <question-id>
+# 5. After implementing fix, mark as fixed
+pnpm investigate --fixed <question-id>
+# 6. Sync all fixes to dataset
+pnpm investigate --sync
+```
+## Detailed Workflow
+### Step 1: Find Failures to Investigate
+```bash
+# List all runs with failures, grouped by config
+pnpm investigate --list
+# Filter by config name
+pnpm investigate --list -c gpt5
+pnpm investigate --list -c om-gemini
+```
+### Step 2: Setup Investigation
+```bash
+# Setup investigation directory for a run
+pnpm investigate run_1768439350043
+```
+This creates:
+```
+investigations/
+└── run_1768439350043/
+    ├── progress.json           # Tracks investigation status
+    └── <question-id>/
+        ├── analysis.md         # Investigation template
+        └── data/
+            ├── original.json   # Raw dataset for this question
+            ├── result.json     # Evaluation result
+            ├── om.md           # Agent's context window
+            └── om.json         # Prepared OM data (if exists)
+```
+### Step 3: Investigate Each Question
+```bash
+# Open the next uninvestigated question in your editor
+pnpm investigate --next
+# Check current progress
+pnpm investigate --status
+```
+#### Investigation Utilities
+The `investigate` command provides several utilities to help diagnose issues:
+##### Search Observations
+```bash
+# Search what the Observer extracted
+pnpm investigate --search "keyword" -q <question-id>
+```
+##### Search Original Dataset
+```bash
+# Search the raw dataset with full context
+pnpm investigate --search-original "keyword" -q <question-id>
+```
+##### Trace Information Flow
+```bash
+# Trace a keyword through the entire pipeline
+pnpm investigate --trace "keyword" -q <question-id>
+```
+This shows where information exists at each stage:
+- Original dataset sessions
+- Stored messages (om.json)
+- Extracted observations
+- Agent context (om.md)
+##### View Sessions
+```bash
+# List all sessions for a question
+pnpm investigate --list-sessions -q <question-id>
+# View a specific session
+pnpm investigate --session 33 -q <question-id>
+```
+##### Inspect Question Data
+```bash
+# Show summary of question's data
+pnpm investigate --inspect <question-id>
+```
+##### View by Date
+```bash
+# View observations around a specific date
+pnpm investigate --date "2023/05/29" -q <question-id>
+pnpm investigate --date "May 29" -q <question-id> --context 2
+```
+### Step 4: Document Findings
+Edit the `analysis.md` file for each question:
+```markdown
+## Failure Category
+- [x] Observer missed critical information
+- [ ] Reflector lost/merged information incorrectly
+- [ ] Agent reasoning error (had info, wrong conclusion)
+- [ ] Ambiguous/poorly-worded question
+- [ ] Dataset inconsistency/error
+- [ ] RAG retrieval miss (if applicable)
+- [ ] Other: ___
+## Root Cause Analysis
+<!-- Describe what went wrong -->
+## Evidence
+<!-- Quote relevant parts of om.md, original data, etc. -->
+## Potential Improvements
+### Observer/Reflector Changes
+- **Likelihood**: High
+- **Suggested prompt change**: ...
+### Fixed Question/Answer
+- **improved_question**: ...
+- **improved_answer**: ...
+- **improvement_note**: ...
+```
+### Step 5: Mark as Investigated
+```bash
+pnpm investigate --done <question-id>
+```
+This:
+- Extracts the failure category from `analysis.md`
+- Updates `progress.json`
+- Shows remaining count
+### Step 6: Implement Fixes
+Based on your investigation, implement fixes:
+1. **Observer/Reflector prompt changes**: Edit `packages/memory/src/experiments/observational-memory/observer-agent.ts` or `reflector-agent.ts`
+2. **Improved question/answer**: Add to `analysis.md`:
+   ```markdown
+   ### Fixed Question/Answer
+   - **improved_question**: What is the current location of my old sneakers?
+   - **improved_answer**: in a shoe rack in my closet
+   - **improvement_note**: Original question was ambiguous about timeframe
+   ```
+3. **Re-prepare data**: If Observer/Reflector prompts changed:
+   ```bash
+   pnpm prep om --from-failures ./results/om/run_xxx/failures.json
+   ```
+### Step 7: Mark as Fixed
+```bash
+pnpm investigate --fixed <question-id>
+```
+### Step 8: Sync to Dataset
+```bash
+pnpm investigate --sync
+```
+This syncs `improved_question`, `improved_answer`, and `improvement_note` from `analysis.md` files to `longmemeval_s.json`.
+## Common Failure Categories
+### Observer Missed Information
+**Symptoms**: Information exists in the original dataset but not in the observations.
+**Diagnosis**:
+```bash
+pnpm investigate --trace "keyword" -q <question-id>
+# Look for: "❌ Observer missed this information"
+```
+**Common causes**:
+- Statement of intent misclassified as question
+- Information buried in long message
+- Implicit information not captured
+### Reflector Lost Information
+**Symptoms**: Information is present in the observations but is lost after reflection.
+**Diagnosis**: Compare observations before/after reflection in `om.json`.
+### Agent Reasoning Error
+**Symptoms**: Information is present in `om.md`, but the agent reached the wrong conclusion.
+**Diagnosis**: Check `om.md` - if the answer is there, it's a reasoning issue.
+### Dataset Inconsistency
+**Symptoms**: Conflicting information in the dataset itself.
+**Diagnosis**:
+```bash
+pnpm investigate --search-original "keyword" -q <question-id>
+# Look for contradictory statements
+```
+## Tips
+1. **Start with `--trace`**: It quickly shows where information was lost.
+2. **Use `--search-original`**: See the full context of what the user actually said.
+3. **Check the date**: Use `--list-sessions` to find when information was mentioned.
+4. **Look for patterns**: Similar failures often have the same root cause.
+5. **Document everything**: Good `analysis.md` files help identify systemic issues.
+## Example Investigation
+```bash
+# 1. Find a run with failures to investigate
+pnpm investigate --list -c om
+# 2. Setup
+pnpm investigate run_1768439350043
+# 3. Start investigating
+pnpm investigate --next
+# 4. Trace the issue
+pnpm investigate --trace "shoe rack" -q 07741c45
+# 5. Search original data
+pnpm investigate --search-original "shoe rack" -q 07741c45
+# 6. View the session
+pnpm investigate --session 33 -q 07741c45
+# 7. Document findings in analysis.md
+# (edit the file)
+# 8. Mark as done
+pnpm investigate --done 07741c45
+# 9. After implementing fix
+pnpm investigate --fixed 07741c45
+# 10. Sync to dataset
+pnpm investigate --sync
+```

package/package.json CHANGED Viewed

@@ -1,36 +1,42 @@
 {
   "name": "@mastra/longmemeval",
-  "version": "0.0.0-error-handler-fix-20251020202607",
+  "version": "0.0.0-execa-dynamic-import-20260304221256",
   "description": "LongMemEval benchmark implementation for Mastra Memory",
   "dependencies": {
-    "@ai-sdk/openai": "^1.3.23",
-    "@ai-sdk/provider": "^1.1.3",
+    "@ai-sdk/openai": "^2.0.69",
+    "@ai-sdk/provider": "^2.0.0",
+    "@ai-sdk/provider-utils": "^3.0.18",
+    "@ai-sdk/google": "^2.0.40",
     "@huggingface/hub": "^0.15.1",
     "@node-rs/xxhash": "^1.7.6",
-    "ai": "^4.3.17",
+    "xxhash-wasm": "^1.1.0",
+    "ai": "^5.0.97",
     "async-mutex": "^0.5.0",
     "chalk": "^5.3.0",
     "commander": "^12.1.0",
+    "dotenv": "^16.4.5",
     "fastq": "^1.19.1",
     "imvectordb": "^0.0.6",
     "openai": "^4.73.1",
     "ora": "^8.1.1",
     "zod": "^3.23.8",
-    "@mastra/core": "0.0.0-error-handler-fix-20251020202607",
-    "@mastra/libsql": "0.0.0-error-handler-fix-20251020202607",
-    "@mastra/memory": "0.0.0-error-handler-fix-20251020202607",
-    "@mastra/rag": "1.0.2",
-    "@mastra/fastembed": "0.10.5"
+    "@mastra/core": "0.0.0-execa-dynamic-import-20260304221256",
+    "@mastra/fastembed": "0.0.0-execa-dynamic-import-20260304221256",
+    "@mastra/libsql": "0.0.0-execa-dynamic-import-20260304221256",
+    "@mastra/memory": "0.0.0-execa-dynamic-import-20260304221256",
+    "@mastra/rag": "0.0.0-execa-dynamic-import-20260304221256"
   },
   "devDependencies": {
-    "@ai-sdk/google": "^1.2.19",
-    "@types/node": "^22.10.2",
+    "@ai-sdk/google": "^2.0.40",
+    "@types/node": "22.19.3",
+    "@vitest/coverage-v8": "4.0.18",
+    "@vitest/ui": "4.0.18",
     "tsx": "^4.19.2",
-    "typescript": "^5.7.2",
-    "vitest": "^2.1.8"
+    "typescript": "^5.9.3",
+    "vitest": "4.0.18"
   },
   "engines": {
-    "node": ">=20"
+    "node": ">=22.13.0"
   },
   "homepage": "https://mastra.ai",
   "repository": {
@@ -50,18 +56,20 @@
     "find-failed": "tsx scripts/find-failed.ts",
     "clean-failed": "tsx scripts/find-failed.ts --delete",
     "generate-wm-templates": "tsx scripts/generate-wm-templates.ts",
-    "prepare:s:semantic": "tsx src/cli.ts prepare -d longmemeval_s -c semantic-recall --concurrency 5",
-    "bench:s:semantic": "tsx src/cli.ts run --dataset longmemeval_s --model gpt-4o --memory-config semantic-recall --concurrency 30",
-    "prepare:s:working": "tsx src/cli.ts prepare -d longmemeval_s -c working-memory --concurrency 35",
-    "bench:s:working": "tsx src/cli.ts run --dataset longmemeval_s --model gpt-4o --memory-config working-memory --concurrency 10",
-    "prepare:s:combined": "tsx src/cli.ts prepare -d longmemeval_s -c combined --concurrency 10",
-    "bench:s:combined": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c combined --concurrency 20",
-    "prepare:s:combined-tailored": "tsx src/cli.ts prepare -d longmemeval_s -c combined-tailored --concurrency 20",
-    "bench:s:combined-tailored": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c combined-tailored --concurrency 30",
-    "prepare:s:working-tailored": "tsx src/cli.ts prepare -d longmemeval_s -c working-memory-tailored --concurrency 20",
-    "bench:s:working-tailored": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c working-memory-tailored --concurrency 30",
+    "prep": "tsx src/cli.ts prepare",
+    "bench": "tsx src/cli.ts run",
+    "deterministic-ids": "tsx src/cli.ts deterministic-ids",
     "results": "tsx src/cli.ts results",
-    "results:all": "tsx src/cli.ts results --all",
-    "results:s": "tsx src/cli.ts results -d longmemeval_s"
+    "results:latest": "tsx src/cli.ts results --latest",
+    "results:s": "tsx src/cli.ts results -d longmemeval_s",
+    "sync-improved-om-qa": "tsx src/cli.ts sync -d longmemeval_s -c observational-memory",
+    "obscure-thread-ids": "tsx src/cli.ts obscure-thread-ids -d longmemeval_s -c observational-memory",
+    "list-partial": "tsx src/cli.ts list-partial",
+    "clean-partial": "tsx src/cli.ts clean -d longmemeval_s -c observational-memory --partial",
+    "tokens": "tsx src/cli.ts tokens -d longmemeval_s",
+    "precompute-embeddings": "tsx src/cli.ts precompute-embeddings -d longmemeval_s -c observational-memory",
+    "find-prohibited": "tsx src/cli.ts find-prohibited -d longmemeval_s",
+    "partial": "tsx src/cli.ts partial",
+    "investigate": "tsx src/cli.ts investigate"
   }
 }

package/scripts/download.ts CHANGED Viewed

@@ -10,12 +10,23 @@ import { pipeline } from 'stream/promises';
 const REPO_ID = 'xiaowu0162/longmemeval';
 const DATA_DIR = join(process.cwd(), 'data');
-const FILES = [
+const ALL_FILES = [
   { filename: 'longmemeval_oracle.json', repoPath: 'longmemeval_oracle' },
   { filename: 'longmemeval_s.json', repoPath: 'longmemeval_s' },
   { filename: 'longmemeval_m.json', repoPath: 'longmemeval_m' },
 ];
+// Parse command line arguments to get specific dataset
+const args = process.argv.slice(2);
+const datasetArg = args.find(arg => arg.startsWith('--dataset=') || arg.startsWith('-d='));
+const datasetIndex = args.findIndex(arg => arg === '--dataset' || arg === '-d');
+const specificDataset = datasetArg ? datasetArg.split('=')[1] : datasetIndex !== -1 ? args[datasetIndex + 1] : null;
+// Filter to specific dataset if provided
+const FILES = specificDataset
+  ? ALL_FILES.filter(f => f.filename === `${specificDataset}.json` || f.filename === specificDataset)
+  : ALL_FILES;
 function formatFileSize(bytes: number): string {
   if (bytes < 1024) return bytes + ' B';
   const kb = bytes / 1024;
@@ -55,6 +66,16 @@ async function downloadWithFetch(url: string, outputPath: string, token: string)
 async function main() {
   console.log(chalk.blue('\n📥 LongMemEval Dataset Downloader\n'));
+  if (specificDataset) {
+    console.log(chalk.gray(`Downloading specific dataset: ${specificDataset}\n`));
+  }
+  if (FILES.length === 0) {
+    console.log(chalk.red(`Dataset not found: ${specificDataset}`));
+    console.log(chalk.gray('Available datasets: longmemeval_oracle, longmemeval_s, longmemeval_m'));
+    process.exit(1);
+  }
   // Create data directory if it doesn't exist
   if (!existsSync(DATA_DIR)) {
     mkdirSync(DATA_DIR, { recursive: true });

package/scripts/generate-wm-templates.ts CHANGED Viewed

@@ -24,7 +24,8 @@ interface TemplateDatabase {
 async function generateTemplate(question: LongMemEvalQuestion): Promise<string> {
   // Create a simple agent for template generation
   const agent = new Agent({
-    name: 'template-generator',
+    id: 'template-generator',
+    name: 'Template Generator',
     instructions: `You are an expert at designing working memory templates for AI assistants.
 Given a question and answer from a conversation history benchmark, generate a working memory instruction that would help an AI assistant extract and save the specific information needed to answer the question correctly.