@mastra/longmemeval 0.0.0-error-handler-fix-20251020202607 → 0.0.0-execa-dynamic-import-20260304221256

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/CHANGELOG.md +847 -6
  2. package/LICENSE.md +15 -0
  3. package/docs/INVESTIGATION_WORKFLOW.md +314 -0
  4. package/package.json +34 -26
  5. package/scripts/download.ts +22 -1
  6. package/scripts/generate-wm-templates.ts +2 -1
  7. package/src/cli.ts +967 -155
  8. package/src/commands/__tests__/prepare.test.ts +11 -6
  9. package/src/commands/__tests__/run.test.ts +10 -8
  10. package/src/commands/clean.ts +147 -0
  11. package/src/commands/deterministic-ids.ts +126 -0
  12. package/src/commands/find-prohibited.ts +299 -0
  13. package/src/commands/investigate.ts +2437 -0
  14. package/src/commands/list-partial.ts +137 -0
  15. package/src/commands/obscure-thread-ids.ts +237 -0
  16. package/src/commands/partial-results.ts +198 -0
  17. package/src/commands/precompute-embeddings.ts +302 -0
  18. package/src/commands/prepare.ts +732 -105
  19. package/src/commands/reconcile.ts +383 -0
  20. package/src/commands/run.ts +1240 -121
  21. package/src/commands/sessions.ts +126 -0
  22. package/src/commands/sync.ts +152 -0
  23. package/src/commands/test-cleaned.ts +53 -0
  24. package/src/commands/test-incremental.ts +120 -0
  25. package/src/commands/test-messages.ts +95 -0
  26. package/src/commands/test-prohibited.ts +244 -0
  27. package/src/commands/test-thread.ts +136 -0
  28. package/src/commands/tokens.ts +625 -0
  29. package/src/config.ts +1253 -72
  30. package/src/data/types.ts +48 -6
  31. package/src/embeddings/cached-openai-embedding-model.ts +0 -1
  32. package/src/evaluation/longmemeval-metric.ts +186 -103
  33. package/src/processors/content-sanitizer.ts +153 -0
  34. package/src/processors/date-injector.ts +111 -0
  35. package/src/processors/index.ts +3 -0
  36. package/src/processors/observation-semantic-filter.ts +1142 -0
  37. package/src/retry-model.ts +5 -3
  38. package/src/storage/__tests__/benchmark-store.test.ts +18 -35
  39. package/src/storage/benchmark-store.ts +140 -153
  40. package/src/storage/benchmark-vector.ts +1 -1
  41. package/src/storage/index.ts +1 -0
  42. package/src/storage/persistable-inmemory.ts +133 -0
  43. package/src/test-utils/mock-model.ts +35 -0
package/LICENSE.md CHANGED
@@ -1,3 +1,18 @@
1
+ Portions of this software are licensed as follows:
2
+
3
+ - All content that resides under any directory named "ee/" within this
4
+ repository, including but not limited to:
5
+ - `packages/core/src/auth/ee/`
6
+ - `packages/server/src/server/auth/ee/`
7
+ is licensed under the license defined in `ee/LICENSE`.
8
+
9
+ - All third-party components incorporated into the Mastra Software are
10
+ licensed under the original license provided by the owner of the
11
+ applicable component.
12
+
13
+ - Content outside of the above-mentioned directories or restrictions is
14
+ available under the "Apache License 2.0" as defined below.
15
+
1
16
  # Apache License 2.0
2
17
 
3
18
  Copyright (c) 2025 Kepler Software, Inc.
@@ -0,0 +1,314 @@
1
+ # LongMemEval Investigation Workflow
2
+
3
+ This document describes how to investigate failing questions in LongMemEval benchmarks to identify root causes and implement fixes.
4
+
5
+ The goal is to find deficiencies in the LongMemEval dataset itself: there appear to be many broken question/answer pairs, where the question is misleading or the expected answer contains incorrect information or details that the question never asked for.
6
+
7
+ ## Overview
8
+
9
+ The investigation workflow has 4 stages:
10
+
11
+ ```
12
+ pending → investigated → fix-implemented → synced
13
+ ```
14
+
15
+ 1. **pending**: Initial state for all failed questions
16
+ 2. **investigated**: Root cause identified and documented
17
+ 3. **fix-implemented**: Fix has been applied (Observer/Reflector prompt, improved Q/A, etc.)
18
+ 4. **synced**: Changes synced to `longmemeval_s.json` dataset
19
+
20
+ ## Quick Start
21
+
22
+ ```bash
23
+ # 1. List all runs with failures
24
+ pnpm investigate --list
25
+
26
+ # 2. Setup investigation for a specific run
27
+ pnpm investigate <run-id>
28
+
29
+ # 3. Open the next uninvestigated question
30
+ pnpm investigate --next
31
+
32
+ # 4. After investigating, mark as done
33
+ pnpm investigate --done <question-id>
34
+
35
+ # 5. After implementing fix, mark as fixed
36
+ pnpm investigate --fixed <question-id>
37
+
38
+ # 6. Sync all fixes to dataset
39
+ pnpm investigate --sync
40
+ ```
41
+
42
+ ## Detailed Workflow
43
+
44
+ ### Step 1: Find Failures to Investigate
45
+
46
+ ```bash
47
+ # List all runs with failures, grouped by config
48
+ pnpm investigate --list
49
+
50
+ # Filter by config name
51
+ pnpm investigate --list -c gpt5
52
+ pnpm investigate --list -c om-gemini
53
+ ```
54
+
55
+ ### Step 2: Setup Investigation
56
+
57
+ ```bash
58
+ # Setup investigation directory for a run
59
+ pnpm investigate run_1768439350043
60
+ ```
61
+
62
+ This creates:
63
+
64
+ ```
65
+ investigations/
66
+ └── run_1768439350043/
67
+ ├── progress.json # Tracks investigation status
68
+ └── <question-id>/
69
+ ├── analysis.md # Investigation template
70
+ └── data/
71
+ ├── original.json # Raw dataset for this question
72
+ ├── result.json # Evaluation result
73
+ ├── om.md # Agent's context window
74
+ └── om.json # Prepared OM data (if exists)
75
+ ```
76
+
77
+ ### Step 3: Investigate Each Question
78
+
79
+ ```bash
80
+ # Open the next uninvestigated question in your editor
81
+ pnpm investigate --next
82
+
83
+ # Check current progress
84
+ pnpm investigate --status
85
+ ```
86
+
87
+ #### Investigation Utilities
88
+
89
+ The `investigate` command provides several utilities to help diagnose issues:
90
+
91
+ ##### Search Observations
92
+
93
+ ```bash
94
+ # Search what the Observer extracted
95
+ pnpm investigate --search "keyword" -q <question-id>
96
+ ```
97
+
98
+ ##### Search Original Dataset
99
+
100
+ ```bash
101
+ # Search the raw dataset with full context
102
+ pnpm investigate --search-original "keyword" -q <question-id>
103
+ ```
104
+
105
+ ##### Trace Information Flow
106
+
107
+ ```bash
108
+ # Trace a keyword through the entire pipeline
109
+ pnpm investigate --trace "keyword" -q <question-id>
110
+ ```
111
+
112
+ This shows where information exists at each stage:
113
+
114
+ - Original dataset sessions
115
+ - Stored messages (om.json)
116
+ - Extracted observations
117
+ - Agent context (om.md)
118
+
119
+ ##### View Sessions
120
+
121
+ ```bash
122
+ # List all sessions for a question
123
+ pnpm investigate --list-sessions -q <question-id>
124
+
125
+ # View a specific session
126
+ pnpm investigate --session 33 -q <question-id>
127
+ ```
128
+
129
+ ##### Inspect Question Data
130
+
131
+ ```bash
132
+ # Show summary of question's data
133
+ pnpm investigate --inspect <question-id>
134
+ ```
135
+
136
+ ##### View by Date
137
+
138
+ ```bash
139
+ # View observations around a specific date
140
+ pnpm investigate --date "2023/05/29" -q <question-id>
141
+ pnpm investigate --date "May 29" -q <question-id> --context 2
142
+ ```
143
+
144
+ ### Step 4: Document Findings
145
+
146
+ Edit the `analysis.md` file for each question:
147
+
148
+ ```markdown
149
+ ## Failure Category
150
+
151
+ - [x] Observer missed critical information
152
+ - [ ] Reflector lost/merged information incorrectly
153
+ - [ ] Agent reasoning error (had info, wrong conclusion)
154
+ - [ ] Ambiguous/poorly-worded question
155
+ - [ ] Dataset inconsistency/error
156
+ - [ ] RAG retrieval miss (if applicable)
157
+ - [ ] Other: \_\_\_
158
+
159
+ ## Root Cause Analysis
160
+
161
+ <!-- Describe what went wrong -->
162
+
163
+ ## Evidence
164
+
165
+ <!-- Quote relevant parts of om.md, original data, etc. -->
166
+
167
+ ## Potential Improvements
168
+
169
+ ### Observer/Reflector Changes
170
+
171
+ - **Likelihood**: High
172
+ - **Suggested prompt change**: ...
173
+
174
+ ### Fixed Question/Answer
175
+
176
+ - **improved_question**: ...
177
+ - **improved_answer**: ...
178
+ - **improvement_note**: ...
179
+ ```
180
+
181
+ ### Step 5: Mark as Investigated
182
+
183
+ ```bash
184
+ pnpm investigate --done <question-id>
185
+ ```
186
+
187
+ This:
188
+
189
+ - Extracts the failure category from `analysis.md`
190
+ - Updates `progress.json`
191
+ - Shows remaining count
192
+
193
+ ### Step 6: Implement Fixes
194
+
195
+ Based on your investigation, implement fixes:
196
+
197
+ 1. **Observer/Reflector prompt changes**: Edit `packages/memory/src/experiments/observational-memory/observer-agent.ts` or `reflector-agent.ts`
198
+
199
+ 2. **Improved question/answer**: Add to `analysis.md`:
200
+
201
+ ```markdown
202
+ ### Fixed Question/Answer
203
+
204
+ - **improved_question**: What is the current location of my old sneakers?
205
+ - **improved_answer**: in a shoe rack in my closet
206
+ - **improvement_note**: Original question was ambiguous about timeframe
207
+ ```
208
+
209
+ 3. **Re-prepare data**: If the Observer/Reflector prompts changed, re-run preparation for the failed questions:
210
+ ```bash
211
+ pnpm prepare om --from-failures ./results/om/run_xxx/failures.json
212
+ ```
213
+
214
+ ### Step 7: Mark as Fixed
215
+
216
+ ```bash
217
+ pnpm investigate --fixed <question-id>
218
+ ```
219
+
220
+ ### Step 8: Sync to Dataset
221
+
222
+ ```bash
223
+ pnpm investigate --sync
224
+ ```
225
+
226
+ This syncs `improved_question`, `improved_answer`, and `improvement_note` from `analysis.md` files to `longmemeval_s.json`.
227
+
228
+ ## Common Failure Categories
229
+
230
+ ### Observer Missed Information
231
+
232
+ **Symptoms**: Information exists in the original dataset but is missing from the observations.
233
+
234
+ **Diagnosis**:
235
+
236
+ ```bash
237
+ pnpm investigate --trace "keyword" -q <question-id>
238
+ # Look for: "❌ Observer missed this information"
239
+ ```
240
+
241
+ **Common causes**:
242
+
243
+ - Statement of intent misclassified as question
244
+ - Information buried in long message
245
+ - Implicit information not captured
246
+
247
+ ### Reflector Lost Information
248
+
249
+ **Symptoms**: Information is present in the observations but is lost after reflection.
250
+
251
+ **Diagnosis**: Compare observations before/after reflection in `om.json`.
252
+
253
+ ### Agent Reasoning Error
254
+
255
+ **Symptoms**: Information is present in `om.md`, but the agent reached the wrong conclusion.
256
+
257
+ **Diagnosis**: Check `om.md` - if the answer is there, it's a reasoning issue.
258
+
259
+ ### Dataset Inconsistency
260
+
261
+ **Symptoms**: Conflicting information in the dataset itself.
262
+
263
+ **Diagnosis**:
264
+
265
+ ```bash
266
+ pnpm investigate --search-original "keyword" -q <question-id>
267
+ # Look for contradictory statements
268
+ ```
269
+
270
+ ## Tips
271
+
272
+ 1. **Start with `--trace`**: It quickly shows where information was lost.
273
+
274
+ 2. **Use `--search-original`**: See the full context of what the user actually said.
275
+
276
+ 3. **Check the date**: Use `--list-sessions` to find when information was mentioned.
277
+
278
+ 4. **Look for patterns**: Similar failures often have the same root cause.
279
+
280
+ 5. **Document everything**: Good `analysis.md` files help identify systemic issues.
281
+
282
+ ## Example Investigation
283
+
284
+ ```bash
285
+ # 1. Find the question
286
+ pnpm investigate --list -c om
287
+
288
+ # 2. Setup
289
+ pnpm investigate run_1768439350043
290
+
291
+ # 3. Start investigating
292
+ pnpm investigate --next
293
+
294
+ # 4. Trace the issue
295
+ pnpm investigate --trace "shoe rack" -q 07741c45
296
+
297
+ # 5. Search original data
298
+ pnpm investigate --search-original "shoe rack" -q 07741c45
299
+
300
+ # 6. View the session
301
+ pnpm investigate --session 33 -q 07741c45
302
+
303
+ # 7. Document findings in analysis.md
304
+ # (edit the file)
305
+
306
+ # 8. Mark as done
307
+ pnpm investigate --done 07741c45
308
+
309
+ # 9. After implementing fix
310
+ pnpm investigate --fixed 07741c45
311
+
312
+ # 10. Sync to dataset
313
+ pnpm investigate --sync
314
+ ```
package/package.json CHANGED
@@ -1,36 +1,42 @@
1
1
  {
2
2
  "name": "@mastra/longmemeval",
3
- "version": "0.0.0-error-handler-fix-20251020202607",
3
+ "version": "0.0.0-execa-dynamic-import-20260304221256",
4
4
  "description": "LongMemEval benchmark implementation for Mastra Memory",
5
5
  "dependencies": {
6
- "@ai-sdk/openai": "^1.3.23",
7
- "@ai-sdk/provider": "^1.1.3",
6
+ "@ai-sdk/openai": "^2.0.69",
7
+ "@ai-sdk/provider": "^2.0.0",
8
+ "@ai-sdk/provider-utils": "^3.0.18",
9
+ "@ai-sdk/google": "^2.0.40",
8
10
  "@huggingface/hub": "^0.15.1",
9
11
  "@node-rs/xxhash": "^1.7.6",
10
- "ai": "^4.3.17",
12
+ "xxhash-wasm": "^1.1.0",
13
+ "ai": "^5.0.97",
11
14
  "async-mutex": "^0.5.0",
12
15
  "chalk": "^5.3.0",
13
16
  "commander": "^12.1.0",
17
+ "dotenv": "^16.4.5",
14
18
  "fastq": "^1.19.1",
15
19
  "imvectordb": "^0.0.6",
16
20
  "openai": "^4.73.1",
17
21
  "ora": "^8.1.1",
18
22
  "zod": "^3.23.8",
19
- "@mastra/core": "0.0.0-error-handler-fix-20251020202607",
20
- "@mastra/libsql": "0.0.0-error-handler-fix-20251020202607",
21
- "@mastra/memory": "0.0.0-error-handler-fix-20251020202607",
22
- "@mastra/rag": "1.0.2",
23
- "@mastra/fastembed": "0.10.5"
23
+ "@mastra/core": "0.0.0-execa-dynamic-import-20260304221256",
24
+ "@mastra/fastembed": "0.0.0-execa-dynamic-import-20260304221256",
25
+ "@mastra/libsql": "0.0.0-execa-dynamic-import-20260304221256",
26
+ "@mastra/memory": "0.0.0-execa-dynamic-import-20260304221256",
27
+ "@mastra/rag": "0.0.0-execa-dynamic-import-20260304221256"
24
28
  },
25
29
  "devDependencies": {
26
- "@ai-sdk/google": "^1.2.19",
27
- "@types/node": "^22.10.2",
30
+ "@ai-sdk/google": "^2.0.40",
31
+ "@types/node": "22.19.3",
32
+ "@vitest/coverage-v8": "4.0.18",
33
+ "@vitest/ui": "4.0.18",
28
34
  "tsx": "^4.19.2",
29
- "typescript": "^5.7.2",
30
- "vitest": "^2.1.8"
35
+ "typescript": "^5.9.3",
36
+ "vitest": "4.0.18"
31
37
  },
32
38
  "engines": {
33
- "node": ">=20"
39
+ "node": ">=22.13.0"
34
40
  },
35
41
  "homepage": "https://mastra.ai",
36
42
  "repository": {
@@ -50,18 +56,20 @@
50
56
  "find-failed": "tsx scripts/find-failed.ts",
51
57
  "clean-failed": "tsx scripts/find-failed.ts --delete",
52
58
  "generate-wm-templates": "tsx scripts/generate-wm-templates.ts",
53
- "prepare:s:semantic": "tsx src/cli.ts prepare -d longmemeval_s -c semantic-recall --concurrency 5",
54
- "bench:s:semantic": "tsx src/cli.ts run --dataset longmemeval_s --model gpt-4o --memory-config semantic-recall --concurrency 30",
55
- "prepare:s:working": "tsx src/cli.ts prepare -d longmemeval_s -c working-memory --concurrency 35",
56
- "bench:s:working": "tsx src/cli.ts run --dataset longmemeval_s --model gpt-4o --memory-config working-memory --concurrency 10",
57
- "prepare:s:combined": "tsx src/cli.ts prepare -d longmemeval_s -c combined --concurrency 10",
58
- "bench:s:combined": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c combined --concurrency 20",
59
- "prepare:s:combined-tailored": "tsx src/cli.ts prepare -d longmemeval_s -c combined-tailored --concurrency 20",
60
- "bench:s:combined-tailored": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c combined-tailored --concurrency 30",
61
- "prepare:s:working-tailored": "tsx src/cli.ts prepare -d longmemeval_s -c working-memory-tailored --concurrency 20",
62
- "bench:s:working-tailored": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c working-memory-tailored --concurrency 30",
59
+ "prep": "tsx src/cli.ts prepare",
60
+ "bench": "tsx src/cli.ts run",
61
+ "deterministic-ids": "tsx src/cli.ts deterministic-ids",
63
62
  "results": "tsx src/cli.ts results",
64
- "results:all": "tsx src/cli.ts results --all",
65
- "results:s": "tsx src/cli.ts results -d longmemeval_s"
63
+ "results:latest": "tsx src/cli.ts results --latest",
64
+ "results:s": "tsx src/cli.ts results -d longmemeval_s",
65
+ "sync-improved-om-qa": "tsx src/cli.ts sync -d longmemeval_s -c observational-memory",
66
+ "obscure-thread-ids": "tsx src/cli.ts obscure-thread-ids -d longmemeval_s -c observational-memory",
67
+ "list-partial": "tsx src/cli.ts list-partial",
68
+ "clean-partial": "tsx src/cli.ts clean -d longmemeval_s -c observational-memory --partial",
69
+ "tokens": "tsx src/cli.ts tokens -d longmemeval_s",
70
+ "precompute-embeddings": "tsx src/cli.ts precompute-embeddings -d longmemeval_s -c observational-memory",
71
+ "find-prohibited": "tsx src/cli.ts find-prohibited -d longmemeval_s",
72
+ "partial": "tsx src/cli.ts partial",
73
+ "investigate": "tsx src/cli.ts investigate"
66
74
  }
67
75
  }
@@ -10,12 +10,23 @@ import { pipeline } from 'stream/promises';
10
10
  const REPO_ID = 'xiaowu0162/longmemeval';
11
11
  const DATA_DIR = join(process.cwd(), 'data');
12
12
 
13
- const FILES = [
13
+ const ALL_FILES = [
14
14
  { filename: 'longmemeval_oracle.json', repoPath: 'longmemeval_oracle' },
15
15
  { filename: 'longmemeval_s.json', repoPath: 'longmemeval_s' },
16
16
  { filename: 'longmemeval_m.json', repoPath: 'longmemeval_m' },
17
17
  ];
18
18
 
19
+ // Parse command line arguments to get specific dataset
20
+ const args = process.argv.slice(2);
21
+ const datasetArg = args.find(arg => arg.startsWith('--dataset=') || arg.startsWith('-d='));
22
+ const datasetIndex = args.findIndex(arg => arg === '--dataset' || arg === '-d');
23
+ const specificDataset = datasetArg ? datasetArg.split('=')[1] : datasetIndex !== -1 ? args[datasetIndex + 1] : null;
24
+
25
+ // Filter to specific dataset if provided
26
+ const FILES = specificDataset
27
+ ? ALL_FILES.filter(f => f.filename === `${specificDataset}.json` || f.filename === specificDataset)
28
+ : ALL_FILES;
29
+
19
30
  function formatFileSize(bytes: number): string {
20
31
  if (bytes < 1024) return bytes + ' B';
21
32
  const kb = bytes / 1024;
@@ -55,6 +66,16 @@ async function downloadWithFetch(url: string, outputPath: string, token: string)
55
66
  async function main() {
56
67
  console.log(chalk.blue('\n📥 LongMemEval Dataset Downloader\n'));
57
68
 
69
+ if (specificDataset) {
70
+ console.log(chalk.gray(`Downloading specific dataset: ${specificDataset}\n`));
71
+ }
72
+
73
+ if (FILES.length === 0) {
74
+ console.log(chalk.red(`Dataset not found: ${specificDataset}`));
75
+ console.log(chalk.gray('Available datasets: longmemeval_oracle, longmemeval_s, longmemeval_m'));
76
+ process.exit(1);
77
+ }
78
+
58
79
  // Create data directory if it doesn't exist
59
80
  if (!existsSync(DATA_DIR)) {
60
81
  mkdirSync(DATA_DIR, { recursive: true });
@@ -24,7 +24,8 @@ interface TemplateDatabase {
24
24
  async function generateTemplate(question: LongMemEvalQuestion): Promise<string> {
25
25
  // Create a simple agent for template generation
26
26
  const agent = new Agent({
27
- name: 'template-generator',
27
+ id: 'template-generator',
28
+ name: 'Template Generator',
28
29
  instructions: `You are an expert at designing working memory templates for AI assistants.
29
30
 
30
31
  Given a question and answer from a conversation history benchmark, generate a working memory instruction that would help an AI assistant extract and save the specific information needed to answer the question correctly.