@mastra/longmemeval 0.1.1-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +13 -0
- package/DATA_DOWNLOAD_GUIDE.md +117 -0
- package/LICENSE.md +15 -0
- package/README.md +173 -0
- package/USAGE.md +105 -0
- package/package.json +58 -0
- package/scripts/download.ts +180 -0
- package/scripts/find-failed.ts +176 -0
- package/scripts/generate-embeddings.ts +56 -0
- package/scripts/generate-wm-templates.ts +296 -0
- package/scripts/setup.ts +60 -0
- package/src/__fixtures__/embeddings.json +2319 -0
- package/src/__fixtures__/test-dataset.json +82 -0
- package/src/cli.ts +690 -0
- package/src/commands/__tests__/prepare.test.ts +230 -0
- package/src/commands/__tests__/run.test.ts +403 -0
- package/src/commands/prepare.ts +793 -0
- package/src/commands/run.ts +553 -0
- package/src/config.ts +83 -0
- package/src/data/loader.ts +163 -0
- package/src/data/types.ts +61 -0
- package/src/embeddings/cached-openai-embedding-model.ts +227 -0
- package/src/embeddings/cached-openai-provider.ts +40 -0
- package/src/embeddings/index.ts +2 -0
- package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
- package/src/evaluation/longmemeval-metric.ts +173 -0
- package/src/retry-model.ts +60 -0
- package/src/storage/__tests__/benchmark-store.test.ts +260 -0
- package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
- package/src/storage/benchmark-store.ts +536 -0
- package/src/storage/benchmark-vector.ts +234 -0
- package/src/storage/index.ts +2 -0
- package/src/test-utils/mock-embeddings.ts +54 -0
- package/src/test-utils/mock-model.ts +49 -0
- package/tests/data-loader.test.ts +96 -0
- package/tsconfig.json +18 -0
- package/vitest.config.ts +9 -0
package/CHANGELOG.md
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
# @mastra/longmemeval
|
|
2
|
+
|
|
3
|
+
## 0.1.1-alpha.0
|
|
4
|
+
|
|
5
|
+
### Patch Changes
|
|
6
|
+
|
|
7
|
+
- Updated dependencies [0b56518]
|
|
8
|
+
- Updated dependencies [2ba5b76]
|
|
9
|
+
- Updated dependencies [c3a30de]
|
|
10
|
+
- Updated dependencies [cf3a184]
|
|
11
|
+
- Updated dependencies [d6bfd60]
|
|
12
|
+
- @mastra/core@0.10.15-alpha.1
|
|
13
|
+
- @mastra/memory@0.11.3-alpha.1
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
# LongMemEval Dataset Download Guide
|
|
2
|
+
|
|
3
|
+
The LongMemEval datasets are large files (several GB) hosted on HuggingFace with Git LFS. Here are all the ways to download them:
|
|
4
|
+
|
|
5
|
+
## Option 1: JavaScript/Node.js Download
|
|
6
|
+
|
|
7
|
+
### 1. Get your HuggingFace token
|
|
8
|
+
|
|
9
|
+
- Go to https://huggingface.co/settings/tokens
|
|
10
|
+
- Create a new token with read permissions
|
|
11
|
+
- Copy the token
|
|
12
|
+
|
|
13
|
+
### 2. Set the token as environment variable
|
|
14
|
+
|
|
15
|
+
```bash
|
|
16
|
+
export HF_TOKEN=your_token_here
|
|
17
|
+
# or
|
|
18
|
+
export HUGGINGFACE_TOKEN=your_token_here
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### 3. Install dependencies and download
|
|
22
|
+
|
|
23
|
+
```bash
|
|
24
|
+
pnpm install
|
|
25
|
+
pnpm download
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Option 2: Git LFS
|
|
29
|
+
|
|
30
|
+
### 1. Install Git LFS
|
|
31
|
+
|
|
32
|
+
```bash
|
|
33
|
+
# macOS
|
|
34
|
+
brew install git-lfs
|
|
35
|
+
|
|
36
|
+
# Ubuntu/Debian
|
|
37
|
+
sudo apt-get install git-lfs
|
|
38
|
+
|
|
39
|
+
# Initialize Git LFS
|
|
40
|
+
git lfs install
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
### 2. Clone with Git LFS
|
|
44
|
+
|
|
45
|
+
```bash
|
|
46
|
+
git clone https://huggingface.co/datasets/xiaowu0162/longmemeval
|
|
47
|
+
cd longmemeval
|
|
48
|
+
cp *.json ../data/
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
## Option 3: Manual Download from Google Drive
|
|
52
|
+
|
|
53
|
+
### 1. Download the archive
|
|
54
|
+
|
|
55
|
+
Go to: https://drive.google.com/file/d/1zJgtYRFhOh5zDQzzatiddfjYhFSnyQ80/view
|
|
56
|
+
|
|
57
|
+
### 2. Extract the files
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
cd packages/longmemeval/data
|
|
61
|
+
tar -xzvf ~/Downloads/longmemeval_data.tar.gz
|
|
62
|
+
```
|
|
63
|
+
|
|
64
|
+
### 3. Verify the files
|
|
65
|
+
|
|
66
|
+
```bash
|
|
67
|
+
ls -lh *.json
|
|
68
|
+
# You should see:
|
|
69
|
+
# - longmemeval_s.json (~40MB)
|
|
70
|
+
# - longmemeval_m.json (~200MB)
|
|
71
|
+
# - longmemeval_oracle.json (~2MB)
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
## Option 4: Direct Browser Download
|
|
75
|
+
|
|
76
|
+
If you have a HuggingFace account:
|
|
77
|
+
|
|
78
|
+
1. Go to https://huggingface.co/datasets/xiaowu0162/longmemeval
|
|
79
|
+
2. Click on "Files and versions"
|
|
80
|
+
3. Download each JSON file directly
|
|
81
|
+
4. Move them to `packages/longmemeval/data/`
|
|
82
|
+
|
|
83
|
+
## Troubleshooting
|
|
84
|
+
|
|
85
|
+
### "Entry not found" or small files (15 bytes)
|
|
86
|
+
|
|
87
|
+
This means the download failed due to authentication. Use one of the authenticated methods above.
|
|
88
|
+
|
|
89
|
+
### Git LFS bandwidth exceeded
|
|
90
|
+
|
|
91
|
+
HuggingFace has bandwidth limits. Try:
|
|
92
|
+
|
|
93
|
+
- Using the Google Drive link instead
|
|
94
|
+
- Waiting until the next day when bandwidth resets
|
|
95
|
+
- Using a different download method
|
|
96
|
+
|
|
97
|
+
### Permission denied
|
|
98
|
+
|
|
99
|
+
Make sure you're logged in to HuggingFace and have accepted any dataset terms of use.
|
|
100
|
+
|
|
101
|
+
## Verification
|
|
102
|
+
|
|
103
|
+
After downloading, verify the files:
|
|
104
|
+
|
|
105
|
+
```bash
|
|
106
|
+
# Check file sizes
|
|
107
|
+
ls -lh data/*.json
|
|
108
|
+
|
|
109
|
+
# Check file content (should be valid JSON)
|
|
110
|
+
head -n 5 data/longmemeval_s.json
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Expected sizes:
|
|
114
|
+
|
|
115
|
+
- `longmemeval_oracle.json`: ~2MB
|
|
116
|
+
- `longmemeval_s.json`: ~40MB
|
|
117
|
+
- `longmemeval_m.json`: ~200MB
|
package/LICENSE.md
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
# Apache License 2.0
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2025 Kepler Software, Inc.
|
|
4
|
+
|
|
5
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
|
6
|
+
you may not use this file except in compliance with the License.
|
|
7
|
+
You may obtain a copy of the License at
|
|
8
|
+
|
|
9
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
|
10
|
+
|
|
11
|
+
Unless required by applicable law or agreed to in writing, software
|
|
12
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
|
13
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
14
|
+
See the License for the specific language governing permissions and
|
|
15
|
+
limitations under the License.
|
package/README.md
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
# LongMemEval Benchmark for Mastra
|
|
2
|
+
|
|
3
|
+
This package implements the [LongMemEval benchmark](https://arxiv.org/abs/2410.10813) ([GitHub](https://github.com/xiaowu0162/LongMemEval)) for testing Mastra's long-term memory capabilities.
|
|
4
|
+
|
|
5
|
+
## About LongMemEval
|
|
6
|
+
|
|
7
|
+
LongMemEval is a comprehensive benchmark designed by researchers to evaluate the long-term memory capabilities of chat assistants. It was introduced in the paper:
|
|
8
|
+
|
|
9
|
+
**"LongMemEval: Benchmarking Chat Assistants on Long-Term Interactive Memory"**
|
|
10
|
+
_Di Wu, Hongwei Wang, Wenhao Yu, Yuwei Zhang, Kai-Wei Chang, Dong Yu (ICLR 2025)_
|
|
11
|
+
📄 [Paper](https://arxiv.org/abs/2410.10813) | 🌐 [Website](https://xiaowu0162.github.io/long-mem-eval/) | 🤗 [Dataset](https://huggingface.co/datasets/xiaowu0162/longmemeval)
|
|
12
|
+
|
|
13
|
+
### What LongMemEval Tests
|
|
14
|
+
|
|
15
|
+
The benchmark evaluates five core long-term memory abilities through 500 meticulously curated questions:
|
|
16
|
+
|
|
17
|
+
1. **Information Extraction** - Recalling specific information from extensive interactive histories
|
|
18
|
+
2. **Multi-Session Reasoning** - Synthesizing information across multiple history sessions
|
|
19
|
+
3. **Knowledge Updates** - Handling information that changes over time
|
|
20
|
+
4. **Temporal Reasoning** - Understanding time-based relationships in conversations
|
|
21
|
+
5. **Abstention** - Recognizing when information is not available
|
|
22
|
+
|
|
23
|
+
### Why This Matters
|
|
24
|
+
|
|
25
|
+
Current LLMs show a 30-60% performance drop when tested on LongMemEval, revealing significant challenges in maintaining coherent long-term memory. This benchmark helps identify and improve these limitations.
|
|
26
|
+
|
|
27
|
+
## Quick Start
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
# From packages/longmemeval directory
|
|
31
|
+
|
|
32
|
+
# 1. Set your API keys
|
|
33
|
+
export OPENAI_API_KEY=your_openai_key_here
|
|
34
|
+
export HF_TOKEN=your_huggingface_token_here # For automatic dataset download
|
|
35
|
+
|
|
36
|
+
# 2. Run a benchmark (downloads datasets automatically if needed)
|
|
37
|
+
pnpm bench:s # Run small dataset (10 parallel requests)
|
|
38
|
+
pnpm bench:m # Run medium dataset (10 parallel requests)
|
|
39
|
+
pnpm bench:oracle # Run oracle dataset (10 parallel requests)
|
|
40
|
+
|
|
41
|
+
# Or run quick 10-question tests
|
|
42
|
+
pnpm bench:s:quick # Test with 10 questions from small dataset
|
|
43
|
+
pnpm bench:m:quick # Test with 10 questions from medium dataset
|
|
44
|
+
pnpm bench:oracle:quick # Test with 10 questions from oracle dataset
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
**Note:** The benchmark will automatically download datasets on first run. Get your HuggingFace token from https://huggingface.co/settings/tokens
|
|
48
|
+
|
|
49
|
+
## Manual Setup
|
|
50
|
+
|
|
51
|
+
### 1. Install Dependencies
|
|
52
|
+
|
|
53
|
+
```bash
|
|
54
|
+
# From the monorepo root
|
|
55
|
+
pnpm install
|
|
56
|
+
pnpm build
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
### 2. Download Dataset
|
|
60
|
+
|
|
61
|
+
```bash
|
|
62
|
+
# Set your HuggingFace token
|
|
63
|
+
export HF_TOKEN=your_token_here
|
|
64
|
+
|
|
65
|
+
# Download datasets (no Python or Git LFS required)
|
|
66
|
+
pnpm download
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
If automatic download fails, see [DATA_DOWNLOAD_GUIDE.md](./DATA_DOWNLOAD_GUIDE.md) for manual download instructions.
|
|
70
|
+
|
|
71
|
+
## Usage
|
|
72
|
+
|
|
73
|
+
### Run Benchmark
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# From packages/longmemeval directory
|
|
77
|
+
|
|
78
|
+
# Quick commands for each dataset (10 parallel requests)
|
|
79
|
+
pnpm bench:s # Small dataset (full run)
|
|
80
|
+
pnpm bench:m # Medium dataset (full run)
|
|
81
|
+
pnpm bench:oracle # Oracle dataset (full run)
|
|
82
|
+
|
|
83
|
+
# Quick test runs (10 questions only, 5 parallel)
|
|
84
|
+
pnpm bench:s:quick # Small dataset (quick test)
|
|
85
|
+
pnpm bench:m:quick # Medium dataset (quick test)
|
|
86
|
+
pnpm bench:oracle:quick # Oracle dataset (quick test)
|
|
87
|
+
|
|
88
|
+
# Advanced: Use full CLI with custom options
|
|
89
|
+
pnpm cli run --dataset longmemeval_s --model gpt-4o
|
|
90
|
+
|
|
91
|
+
# Adjust parallelization (default: 5)
|
|
92
|
+
pnpm cli run --dataset longmemeval_s --model gpt-4o --concurrency 20
|
|
93
|
+
|
|
94
|
+
# Graceful shutdown: Press Ctrl+C to stop and save progress
|
|
95
|
+
|
|
96
|
+
# Run with specific memory configuration
|
|
97
|
+
pnpm cli run --dataset longmemeval_s --memory-config last-k --model gpt-4o
|
|
98
|
+
pnpm cli run --dataset longmemeval_s --memory-config semantic-recall --model gpt-4o
|
|
99
|
+
pnpm cli run --dataset longmemeval_s --memory-config working-memory --model gpt-4o
|
|
100
|
+
|
|
101
|
+
# Custom subset size
|
|
102
|
+
pnpm cli run --dataset longmemeval_oracle --model gpt-4o --subset 25
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
### View Dataset Statistics
|
|
106
|
+
|
|
107
|
+
```bash
|
|
108
|
+
pnpm cli stats --dataset longmemeval_s
|
|
109
|
+
```
|
|
110
|
+
|
|
111
|
+
### Evaluate Existing Results
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
pnpm cli evaluate --results ./results/run_12345/results.jsonl --dataset longmemeval_s
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
### Generate Report
|
|
118
|
+
|
|
119
|
+
```bash
|
|
120
|
+
pnpm cli report --results ./results/
|
|
121
|
+
```
|
|
122
|
+
|
|
123
|
+
## Memory Configurations
|
|
124
|
+
|
|
125
|
+
- **full-history**: Provide complete chat history (baseline)
|
|
126
|
+
- **last-k**: Use Mastra's lastMessages configuration (last 20 messages)
|
|
127
|
+
- **semantic-recall**: Use Mastra's semantic recall feature (requires vector store)
|
|
128
|
+
- **working-memory**: Use Mastra's working memory with template
|
|
129
|
+
- **combined**: Combination of last-k and semantic recall
|
|
130
|
+
|
|
131
|
+
## Output
|
|
132
|
+
|
|
133
|
+
Results are saved in the `results/` directory with:
|
|
134
|
+
|
|
135
|
+
- `results.jsonl`: Individual question results
|
|
136
|
+
- `hypotheses.json`: Model responses
|
|
137
|
+
- `questions.json`: Questions for reference
|
|
138
|
+
- `metrics.json`: Aggregated metrics and configuration
|
|
139
|
+
|
|
140
|
+
## Benchmark Datasets
|
|
141
|
+
|
|
142
|
+
LongMemEval provides three dataset variants:
|
|
143
|
+
|
|
144
|
+
- **longmemeval_s (Small)**: ~115k tokens per question (30-40 sessions)
|
|
145
|
+
- Designed to fit within 128k context windows
|
|
146
|
+
- Tests memory across dozens of conversation sessions
|
|
147
|
+
- **longmemeval_m (Medium)**: ~1.5M tokens per question (500 sessions)
|
|
148
|
+
- Challenges even the largest context windows
|
|
149
|
+
- Tests memory across hundreds of sessions
|
|
150
|
+
- **longmemeval_oracle**: Only evidence sessions included
|
|
151
|
+
- Used as a control to verify models can answer when given only relevant context
|
|
152
|
+
- Helps isolate memory retrieval issues from comprehension issues
|
|
153
|
+
|
|
154
|
+
## Citation
|
|
155
|
+
|
|
156
|
+
If you use this benchmark in your research, please cite the original paper:
|
|
157
|
+
|
|
158
|
+
```bibtex
|
|
159
|
+
@article{wu2024longmemeval,
|
|
160
|
+
title={LongMemEval: Benchmarking Chat Assistants on Long-Term Interactive Memory},
|
|
161
|
+
author={Wu, Di and Wang, Hongwei and Yu, Wenhao and Zhang, Yuwei and Chang, Kai-Wei and Yu, Dong},
|
|
162
|
+
journal={arXiv preprint arXiv:2410.10813},
|
|
163
|
+
year={2024}
|
|
164
|
+
}
|
|
165
|
+
```
|
|
166
|
+
|
|
167
|
+
## Extending the Benchmark
|
|
168
|
+
|
|
169
|
+
To add custom memory configurations:
|
|
170
|
+
|
|
171
|
+
1. Edit `src/benchmark/runner.ts` and add your configuration to `getMemoryConfig()`
|
|
172
|
+
2. Update the `MemoryConfigType` in `src/data/types.ts`
|
|
173
|
+
3. Implement the configuration logic in `src/memory-adapters/mastra-adapter.ts`
|
package/USAGE.md
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
# LongMemEval Usage Guide
|
|
2
|
+
|
|
3
|
+
## Quick Start
|
|
4
|
+
|
|
5
|
+
### 1. Prepare Data (Required First)
|
|
6
|
+
|
|
7
|
+
The prepare step processes the dataset through mock agents to populate the storage:
|
|
8
|
+
|
|
9
|
+
```bash
|
|
10
|
+
# Quick test with 5 questions
|
|
11
|
+
pnpm prepare:quick
|
|
12
|
+
|
|
13
|
+
# Full dataset with different memory configs
|
|
14
|
+
pnpm prepare:s # Small dataset, semantic-recall (default)
|
|
15
|
+
pnpm prepare:s:lastk # Small dataset, last-k messages
|
|
16
|
+
pnpm prepare:s:working # Small dataset, working-memory
|
|
17
|
+
pnpm prepare:s:combined # Small dataset, combined (semantic + working memory)
|
|
18
|
+
pnpm prepare:m # Medium dataset, semantic-recall
|
|
19
|
+
```
|
|
20
|
+
|
|
21
|
+
### 2. Run Benchmark
|
|
22
|
+
|
|
23
|
+
After preparing data, run the benchmark:
|
|
24
|
+
|
|
25
|
+
```bash
|
|
26
|
+
# Quick test
|
|
27
|
+
pnpm run:quick
|
|
28
|
+
|
|
29
|
+
# Full runs
|
|
30
|
+
pnpm run:s # Small dataset with semantic-recall (default)
|
|
31
|
+
pnpm run:s:lastk # Small dataset with last-k
|
|
32
|
+
pnpm run:s:working # Small dataset with working-memory
|
|
33
|
+
pnpm run:s:combined # Small dataset with combined
|
|
34
|
+
```
|
|
35
|
+
|
|
36
|
+
## Full CLI Options
|
|
37
|
+
|
|
38
|
+
### Prepare Command
|
|
39
|
+
|
|
40
|
+
```bash
|
|
41
|
+
pnpm cli prepare \
|
|
42
|
+
-d <dataset> # longmemeval_s, longmemeval_m, longmemeval_oracle
|
|
43
|
+
-c <memory-config> # full-history, last-k, semantic-recall, working-memory, combined
|
|
44
|
+
[--subset <n>] # Process only n questions
|
|
45
|
+
[--output <dir>] # Output directory (default: ./prepared-data)
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
### Run Command
|
|
49
|
+
|
|
50
|
+
```bash
|
|
51
|
+
pnpm cli run \
|
|
52
|
+
-d <dataset> # longmemeval_s, longmemeval_m, longmemeval_oracle
|
|
53
|
+
-m <model> # Model name (e.g., gpt-4o)
|
|
54
|
+
-c <memory-config> # full-history, last-k, semantic-recall, working-memory, combined
|
|
55
|
+
[--subset <n>] # Run only n questions
|
|
56
|
+
[--concurrency <n>] # Parallel requests (default: 5)
|
|
57
|
+
[--prepared-data <dir>] # Prepared data directory
|
|
58
|
+
[--output <dir>] # Results directory
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Memory Configurations
|
|
62
|
+
|
|
63
|
+
- **semantic-recall** (default): Uses embeddings to find relevant messages (requires OPENAI_API_KEY)
|
|
64
|
+
- **last-k**: Loads last 50 messages only
|
|
65
|
+
- **working-memory**: Maintains a summary of user context
|
|
66
|
+
- **combined**: Semantic recall + working memory
|
|
67
|
+
|
|
68
|
+
Note: `full-history` is available but not recommended for testing memory systems as it defeats the purpose by loading everything into context.
|
|
69
|
+
|
|
70
|
+
## Environment Variables
|
|
71
|
+
|
|
72
|
+
```bash
|
|
73
|
+
# Required for running benchmarks
|
|
74
|
+
export OPENAI_API_KEY=your-key-here
|
|
75
|
+
|
|
76
|
+
# Optional for downloading datasets
|
|
77
|
+
export HF_TOKEN=your-huggingface-token
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
## Example Workflow
|
|
81
|
+
|
|
82
|
+
```bash
|
|
83
|
+
# 1. Test with small subset first
|
|
84
|
+
pnpm prepare:quick
|
|
85
|
+
pnpm run:quick
|
|
86
|
+
|
|
87
|
+
# 2. Run full benchmark with semantic recall
|
|
88
|
+
pnpm prepare:s
|
|
89
|
+
pnpm run:s
|
|
90
|
+
```
|
|
91
|
+
|
|
92
|
+
## Viewing Results
|
|
93
|
+
|
|
94
|
+
Results are saved in `./results/run_<timestamp>/`:
|
|
95
|
+
|
|
96
|
+
- `results.jsonl`: Raw evaluation results
|
|
97
|
+
- `metrics.json`: Aggregated metrics
|
|
98
|
+
|
|
99
|
+
```bash
|
|
100
|
+
# View all runs
|
|
101
|
+
pnpm cli report -r ./results
|
|
102
|
+
|
|
103
|
+
# Check specific metrics
|
|
104
|
+
cat results/run_*/metrics.json | jq '.overall_accuracy'
|
|
105
|
+
```
|
package/package.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "@mastra/longmemeval",
|
|
3
|
+
"version": "0.1.1-alpha.0",
|
|
4
|
+
"description": "LongMemEval benchmark implementation for Mastra Memory",
|
|
5
|
+
"dependencies": {
|
|
6
|
+
"@ai-sdk/openai": "^1.3.23",
|
|
7
|
+
"@ai-sdk/provider": "^1.1.3",
|
|
8
|
+
"@huggingface/hub": "^0.15.1",
|
|
9
|
+
"@mastra/rag": "^1.0.2",
|
|
10
|
+
"@node-rs/xxhash": "^1.7.6",
|
|
11
|
+
"ai": "^4.3.17",
|
|
12
|
+
"async-mutex": "^0.5.0",
|
|
13
|
+
"chalk": "^5.3.0",
|
|
14
|
+
"commander": "^12.1.0",
|
|
15
|
+
"fastq": "^1.19.1",
|
|
16
|
+
"imvectordb": "^0.0.6",
|
|
17
|
+
"openai": "^4.73.1",
|
|
18
|
+
"ora": "^8.1.1",
|
|
19
|
+
"zod": "^3.23.8",
|
|
20
|
+
"@mastra/core": "0.10.15-alpha.1",
|
|
21
|
+
"@mastra/fastembed": "0.10.1",
|
|
22
|
+
"@mastra/libsql": "0.11.0",
|
|
23
|
+
"@mastra/memory": "0.11.3-alpha.1"
|
|
24
|
+
},
|
|
25
|
+
"devDependencies": {
|
|
26
|
+
"@ai-sdk/google": "^1.2.19",
|
|
27
|
+
"@types/node": "^22.10.2",
|
|
28
|
+
"tsx": "^4.19.2",
|
|
29
|
+
"typescript": "^5.7.2",
|
|
30
|
+
"vitest": "^2.1.8"
|
|
31
|
+
},
|
|
32
|
+
"engines": {
|
|
33
|
+
"node": ">=20"
|
|
34
|
+
},
|
|
35
|
+
"scripts": {
|
|
36
|
+
"test": "vitest",
|
|
37
|
+
"typecheck": "tsc --noEmit",
|
|
38
|
+
"generate-embeddings": "tsx scripts/generate-embeddings.ts",
|
|
39
|
+
"setup": "tsx scripts/setup.ts",
|
|
40
|
+
"download": "tsx scripts/download.ts",
|
|
41
|
+
"find-failed": "tsx scripts/find-failed.ts",
|
|
42
|
+
"clean-failed": "tsx scripts/find-failed.ts --delete",
|
|
43
|
+
"generate-wm-templates": "tsx scripts/generate-wm-templates.ts",
|
|
44
|
+
"prepare:s:semantic": "tsx src/cli.ts prepare -d longmemeval_s -c semantic-recall --concurrency 5",
|
|
45
|
+
"bench:s:semantic": "tsx src/cli.ts run --dataset longmemeval_s --model gpt-4o --memory-config semantic-recall --concurrency 30",
|
|
46
|
+
"prepare:s:working": "tsx src/cli.ts prepare -d longmemeval_s -c working-memory --concurrency 35",
|
|
47
|
+
"bench:s:working": "tsx src/cli.ts run --dataset longmemeval_s --model gpt-4o --memory-config working-memory --concurrency 10",
|
|
48
|
+
"prepare:s:combined": "tsx src/cli.ts prepare -d longmemeval_s -c combined --concurrency 10",
|
|
49
|
+
"bench:s:combined": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c combined --concurrency 20",
|
|
50
|
+
"prepare:s:combined-tailored": "tsx src/cli.ts prepare -d longmemeval_s -c combined-tailored --concurrency 20",
|
|
51
|
+
"bench:s:combined-tailored": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c combined-tailored --concurrency 30",
|
|
52
|
+
"prepare:s:working-tailored": "tsx src/cli.ts prepare -d longmemeval_s -c working-memory-tailored --concurrency 20",
|
|
53
|
+
"bench:s:working-tailored": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c working-memory-tailored --concurrency 30",
|
|
54
|
+
"results": "tsx src/cli.ts results",
|
|
55
|
+
"results:all": "tsx src/cli.ts results --all",
|
|
56
|
+
"results:s": "tsx src/cli.ts results -d longmemeval_s"
|
|
57
|
+
}
|
|
58
|
+
}
|
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
#!/usr/bin/env tsx
|
|
2
|
+
|
|
3
|
+
import { downloadFile } from '@huggingface/hub';
|
|
4
|
+
import { createWriteStream, existsSync, mkdirSync, statSync } from 'fs';
|
|
5
|
+
import { join } from 'path';
|
|
6
|
+
import ora from 'ora';
|
|
7
|
+
import chalk from 'chalk';
|
|
8
|
+
import { pipeline } from 'stream/promises';
|
|
9
|
+
|
|
10
|
+
/** HuggingFace repository identifier that hosts the LongMemEval files. */
const REPO_ID = 'xiaowu0162/longmemeval';

/** Directory the datasets are stored in, resolved against the current working directory. */
const DATA_DIR = join(process.cwd(), 'data');

/**
 * Datasets handled by this script. `repoPath` is the path of the file inside the
 * repository; `filename` is the name it is saved under locally (the same name plus `.json`).
 */
const FILES = ['longmemeval_oracle', 'longmemeval_s', 'longmemeval_m'].map(repoPath => ({
  filename: `${repoPath}.json`,
  repoPath,
}));
|
|
18
|
+
|
|
19
|
+
/**
 * Formats a byte count as a short human-readable string.
 *
 * Values below 1024 are printed as whole bytes (`"512 B"`); larger values are printed with one
 * decimal in KB, MB or GB. GB is the largest unit, so bigger values stay in GB.
 *
 * The unit is chosen from the value as it will be displayed: a size that would round up to
 * "1024.0" in one unit is promoted to the next one (1,048,575 bytes is "1.0 MB", not "1024.0 KB").
 *
 * @param bytes - Size in bytes.
 * @returns The formatted size, e.g. `"1.5 KB"`.
 */
function formatFileSize(bytes: number): string {
  if (bytes < 1024) return bytes + ' B';

  const units = ['KB', 'MB', 'GB'];
  let value = bytes / 1024;
  let unitIndex = 0;

  // Compare the rounded value so the rendered number never reaches 1024.0
  // while a larger unit is still available.
  while (unitIndex < units.length - 1 && Number(value.toFixed(1)) >= 1024) {
    value /= 1024;
    unitIndex++;
  }

  return value.toFixed(1) + ' ' + units[unitIndex];
}
|
|
28
|
+
|
|
29
|
+
/**
 * Reports the size in bytes of the file at `filePath`.
 *
 * Any error raised by `statSync` (for example because the path does not exist) is
 * swallowed and reported as a size of 0.
 *
 * @param filePath - Path to inspect.
 * @returns The size in bytes, or 0 when the path cannot be stat'ed.
 */
async function getFileSize(filePath: string): Promise<number> {
  let size = 0;
  try {
    size = statSync(filePath).size;
  } catch {
    // Unreadable or missing path: keep the default of 0.
  }
  return size;
}
|
|
37
|
+
|
|
38
|
+
/**
 * Downloads `url` to `outputPath`, authenticating with a bearer token.
 *
 * The response body is streamed to a temporary `<outputPath>.partial` file and renamed into
 * place only after the transfer has finished. This keeps large datasets out of memory and
 * guarantees an interrupted transfer never leaves a truncated file at `outputPath`.
 *
 * @param url - URL of the file to fetch.
 * @param outputPath - Destination path on disk.
 * @param token - Access token sent in the `Authorization` header.
 * @throws Error with message `HTTP <status>: <statusText>` when the response is not OK,
 *   `Empty response` when the response has no body, or the underlying error when writing fails.
 */
async function downloadWithFetch(url: string, outputPath: string, token: string): Promise<void> {
  const response = await fetch(url, {
    headers: {
      Authorization: `Bearer ${token}`,
      'User-Agent': 'longmemeval-downloader/1.0',
    },
  });

  if (!response.ok) {
    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
  }

  if (!response.body) {
    throw new Error('Empty response');
  }

  const { rename, unlink } = await import('fs/promises');
  const partialPath = `${outputPath}.partial`;

  try {
    // The fetch body is a web ReadableStream whose declared type does not line up with
    // pipeline()'s Node stream typings, hence the cast.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    await pipeline(response.body as any, createWriteStream(partialPath));
    await rename(partialPath, outputPath);
  } catch (error) {
    // Best-effort cleanup; the original failure is what the caller needs to see.
    await unlink(partialPath).catch(() => {});
    throw error;
  }
}
|
|
54
|
+
|
|
55
|
+
/**
 * Entry point of the dataset downloader.
 *
 * Steps:
 * 1. Ensures `DATA_DIR` exists.
 * 2. Counts the datasets in `FILES` that are already present (larger than 1,000,000 bytes) and
 *    returns early when none are missing.
 * 3. Requires a token in `HF_TOKEN` or `HUGGINGFACE_TOKEN`; without one it prints setup
 *    instructions and exits with status 1.
 * 4. Downloads every missing dataset, first through the HuggingFace Hub client and, if that
 *    fails, through a direct authenticated HTTPS request. A download that ends up at or below
 *    the size threshold is treated as invalid and removed.
 * 5. Prints a summary and sets a non-zero exit code when not every dataset is available.
 */
async function main() {
  console.log(chalk.blue('\n📥 LongMemEval Dataset Downloader\n'));

  // Create data directory if it doesn't exist
  if (!existsSync(DATA_DIR)) {
    mkdirSync(DATA_DIR, { recursive: true });
  }

  // Check if all files already exist
  let existingCount = 0;
  for (const fileInfo of FILES) {
    const outputPath = join(DATA_DIR, fileInfo.filename);
    const size = await getFileSize(outputPath);
    if (size > 1000000) {
      // > 1MB
      console.log(chalk.green(`✓ ${fileInfo.filename} already exists (${formatFileSize(size)})`));
      existingCount++;
    }
  }

  if (existingCount === FILES.length) {
    console.log(chalk.green('\n✅ All datasets already downloaded!\n'));
    console.log(chalk.gray('You can now run the benchmark:'));
    console.log(chalk.cyan(' pnpm cli run --dataset longmemeval_s --model gpt-4o'));
    return;
  }

  // Check for HuggingFace token.
  // `||` is intentional: an empty string must count as "no token".
  const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;

  if (!token) {
    console.log(chalk.yellow('⚠️ No HuggingFace token found!\n'));
    console.log(chalk.blue('The LongMemEval datasets require authentication to download.\n'));

    console.log(chalk.gray('1. Get your token from:'));
    console.log(chalk.cyan(' https://huggingface.co/settings/tokens\n'));

    console.log(chalk.gray('2. Set it as an environment variable:'));
    console.log(chalk.cyan(' export HF_TOKEN=your_token_here\n'));

    console.log(chalk.gray('3. Run this script again:'));
    console.log(chalk.cyan(' pnpm download\n'));

    console.log(chalk.blue('Alternative: Download manually from Google Drive'));
    console.log(chalk.gray('See DATA_DOWNLOAD_GUIDE.md for instructions'));

    process.exit(1);
  }

  // Download missing files
  console.log(chalk.blue('Downloading missing datasets...\n'));
  let successCount = existingCount;
  const { unlink } = await import('fs/promises');

  for (const fileInfo of FILES) {
    const { filename, repoPath } = fileInfo;
    const outputPath = join(DATA_DIR, filename);

    // Skip if already exists
    const existingSize = await getFileSize(outputPath);
    if (existingSize > 1000000) {
      continue;
    }

    const spinner = ora(`Downloading ${filename}...`).start();

    try {
      // Try HuggingFace Hub API first
      try {
        const response = await downloadFile({
          // The files live in a dataset repository. A bare "user/name" string is resolved
          // as a model repository by the hub client, so the repo type is given explicitly.
          repo: { type: 'dataset', name: REPO_ID },
          path: repoPath,
          credentials: { accessToken: token },
        });

        if (response && response.body) {
          const fileStream = createWriteStream(outputPath);
          await pipeline(response.body as any, fileStream);
        } else {
          throw new Error('Empty response');
        }
      } catch {
        // Fallback to direct HTTPS download
        const directUrl = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${repoPath}?download=true`;
        await downloadWithFetch(directUrl, outputPath, token);
      }

      // Verify file size
      const downloadedSize = await getFileSize(outputPath);
      if (downloadedSize > 1000000) {
        spinner.succeed(`Downloaded ${filename} (${formatFileSize(downloadedSize)})`);
        successCount++;
      } else {
        spinner.fail(`Downloaded ${filename} but file seems too small (${formatFileSize(downloadedSize)})`);
        // Remove invalid file
        await unlink(outputPath).catch(() => {});
      }
    } catch (error: unknown) {
      // Anything can be thrown; only Error instances carry a message.
      const message = error instanceof Error ? error.message : String(error);

      spinner.fail(`Failed to download ${filename}`);
      console.error(chalk.red(` Error: ${message}`));

      // A transfer that died midway may have left a partial file behind. It could exceed the
      // size threshold and be mistaken for a finished download on the next run, so drop it.
      await unlink(outputPath).catch(() => {});

      if (message.includes('401') || message.includes('403')) {
        console.log(chalk.yellow('\n Authentication issue. Please check:'));
        console.log(chalk.gray(' - Your token is valid'));
        console.log(chalk.gray(' - You have accepted the dataset terms of use'));
        console.log(chalk.cyan(` - Visit: https://huggingface.co/datasets/${REPO_ID}`));
      }
    }
  }

  // Final summary
  console.log('');
  if (successCount === FILES.length) {
    console.log(chalk.green('✅ All datasets downloaded successfully!\n'));
    console.log(chalk.gray('You can now run the benchmark:'));
    console.log(chalk.cyan(' pnpm cli run --dataset longmemeval_s --model gpt-4o'));
  } else {
    console.log(chalk.yellow(`⚠️ Downloaded ${successCount}/${FILES.length} files\n`));
    console.log(chalk.blue('If downloads failed, please check:'));
    console.log(chalk.gray('- Your HuggingFace token is valid'));
    console.log(chalk.gray('- You have accepted the dataset terms (if any)'));
    console.log(chalk.gray('\nAlternatively, see DATA_DOWNLOAD_GUIDE.md for manual download instructions'));

    // Let callers (shell scripts, CI) detect that some datasets are still missing.
    process.exitCode = 1;
  }
}
|
|
179
|
+
|
|
180
|
+
// Run the downloader. An unexpected rejection is logged and reflected in the exit status so
// that callers can tell the script failed.
main().catch(error => {
  console.error(error);
  process.exitCode = 1;
});
|