@mastra/longmemeval 0.1.1-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/CHANGELOG.md +13 -0
  2. package/DATA_DOWNLOAD_GUIDE.md +117 -0
  3. package/LICENSE.md +15 -0
  4. package/README.md +173 -0
  5. package/USAGE.md +105 -0
  6. package/package.json +58 -0
  7. package/scripts/download.ts +180 -0
  8. package/scripts/find-failed.ts +176 -0
  9. package/scripts/generate-embeddings.ts +56 -0
  10. package/scripts/generate-wm-templates.ts +296 -0
  11. package/scripts/setup.ts +60 -0
  12. package/src/__fixtures__/embeddings.json +2319 -0
  13. package/src/__fixtures__/test-dataset.json +82 -0
  14. package/src/cli.ts +690 -0
  15. package/src/commands/__tests__/prepare.test.ts +230 -0
  16. package/src/commands/__tests__/run.test.ts +403 -0
  17. package/src/commands/prepare.ts +793 -0
  18. package/src/commands/run.ts +553 -0
  19. package/src/config.ts +83 -0
  20. package/src/data/loader.ts +163 -0
  21. package/src/data/types.ts +61 -0
  22. package/src/embeddings/cached-openai-embedding-model.ts +227 -0
  23. package/src/embeddings/cached-openai-provider.ts +40 -0
  24. package/src/embeddings/index.ts +2 -0
  25. package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
  26. package/src/evaluation/longmemeval-metric.ts +173 -0
  27. package/src/retry-model.ts +60 -0
  28. package/src/storage/__tests__/benchmark-store.test.ts +260 -0
  29. package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
  30. package/src/storage/benchmark-store.ts +536 -0
  31. package/src/storage/benchmark-vector.ts +234 -0
  32. package/src/storage/index.ts +2 -0
  33. package/src/test-utils/mock-embeddings.ts +54 -0
  34. package/src/test-utils/mock-model.ts +49 -0
  35. package/tests/data-loader.test.ts +96 -0
  36. package/tsconfig.json +18 -0
  37. package/vitest.config.ts +9 -0
package/CHANGELOG.md ADDED
@@ -0,0 +1,13 @@
1
+ # @mastra/longmemeval
2
+
3
+ ## 0.1.1-alpha.0
4
+
5
+ ### Patch Changes
6
+
7
+ - Updated dependencies [0b56518]
8
+ - Updated dependencies [2ba5b76]
9
+ - Updated dependencies [c3a30de]
10
+ - Updated dependencies [cf3a184]
11
+ - Updated dependencies [d6bfd60]
12
+ - @mastra/core@0.10.15-alpha.1
13
+ - @mastra/memory@0.11.3-alpha.1
package/DATA_DOWNLOAD_GUIDE.md ADDED
@@ -0,0 +1,117 @@
1
+ # LongMemEval Dataset Download Guide
2
+
3
+ The LongMemEval datasets are large files (several GB) hosted on HuggingFace with Git LFS. Here are all the ways to download them:
4
+
5
+ ## Option 1: JavaScript/Node.js Download
6
+
7
+ ### 1. Get your HuggingFace token
8
+
9
+ - Go to https://huggingface.co/settings/tokens
10
+ - Create a new token with read permissions
11
+ - Copy the token
12
+
13
+ ### 2. Set the token as environment variable
14
+
15
+ ```bash
16
+ export HF_TOKEN=your_token_here
17
+ # or
18
+ export HUGGINGFACE_TOKEN=your_token_here
19
+ ```
20
+
21
+ ### 3. Install dependencies and download
22
+
23
+ ```bash
24
+ pnpm install
25
+ pnpm download
26
+ ```
27
+
28
+ ## Option 2: Git LFS
29
+
30
+ ### 1. Install Git LFS
31
+
32
+ ```bash
33
+ # macOS
34
+ brew install git-lfs
35
+
36
+ # Ubuntu/Debian
37
+ sudo apt-get install git-lfs
38
+
39
+ # Initialize Git LFS
40
+ git lfs install
41
+ ```
42
+
43
+ ### 2. Clone with Git LFS
44
+
45
+ ```bash
46
+ git clone https://huggingface.co/datasets/xiaowu0162/longmemeval
47
+ cd longmemeval
48
+ cp *.json ../data/
49
+ ```
50
+
51
+ ## Option 3: Manual Download from Google Drive
52
+
53
+ ### 1. Download the archive
54
+
55
+ Go to: https://drive.google.com/file/d/1zJgtYRFhOh5zDQzzatiddfjYhFSnyQ80/view
56
+
57
+ ### 2. Extract the files
58
+
59
+ ```bash
60
+ cd packages/longmemeval/data
61
+ tar -xzvf ~/Downloads/longmemeval_data.tar.gz
62
+ ```
63
+
64
+ ### 3. Verify the files
65
+
66
+ ```bash
67
+ ls -lh *.json
68
+ # You should see:
69
+ # - longmemeval_s.json (~40MB)
70
+ # - longmemeval_m.json (~200MB)
71
+ # - longmemeval_oracle.json (~2MB)
72
+ ```
73
+
74
+ ## Option 4: Direct Browser Download
75
+
76
+ If you have a HuggingFace account:
77
+
78
+ 1. Go to https://huggingface.co/datasets/xiaowu0162/longmemeval
79
+ 2. Click on "Files and versions"
80
+ 3. Download each JSON file directly
81
+ 4. Move them to `packages/longmemeval/data/`
82
+
83
+ ## Troubleshooting
84
+
85
+ ### "Entry not found" or small files (15 bytes)
86
+
87
+ This means the download failed due to authentication. Use one of the authenticated methods above.
88
+
89
+ ### Git LFS bandwidth exceeded
90
+
91
+ HuggingFace has bandwidth limits. Try:
92
+
93
+ - Using the Google Drive link instead
94
+ - Waiting until the next day when bandwidth resets
95
+ - Using a different download method
96
+
97
+ ### Permission denied
98
+
99
+ Make sure you're logged in to HuggingFace and have accepted any dataset terms of use.
100
+
101
+ ## Verification
102
+
103
+ After downloading, verify the files:
104
+
105
+ ```bash
106
+ # Check file sizes
107
+ ls -lh data/*.json
108
+
109
+ # Check file content (should be valid JSON)
110
+ head -n 5 data/longmemeval_s.json
111
+ ```
112
+
113
+ Expected sizes:
114
+
115
+ - `longmemeval_oracle.json`: ~2MB
116
+ - `longmemeval_s.json`: ~40MB
117
+ - `longmemeval_m.json`: ~200MB
package/LICENSE.md ADDED
@@ -0,0 +1,15 @@
1
+ # Apache License 2.0
2
+
3
+ Copyright (c) 2025 Kepler Software, Inc.
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
package/README.md ADDED
@@ -0,0 +1,173 @@
1
+ # LongMemEval Benchmark for Mastra
2
+
3
+ This package implements the [LongMemEval benchmark](https://arxiv.org/abs/2410.10813) ([GitHub](https://github.com/xiaowu0162/LongMemEval)) for testing Mastra's long-term memory capabilities.
4
+
5
+ ## About LongMemEval
6
+
7
+ LongMemEval is a comprehensive benchmark designed by researchers to evaluate the long-term memory capabilities of chat assistants. It was introduced in the paper:
8
+
9
+ **"LongMemEval: Benchmarking Chat Assistants on Long-Term Interactive Memory"**
10
+ _Di Wu, Hongwei Wang, Wenhao Yu, Yuwei Zhang, Kai-Wei Chang, Dong Yu (ICLR 2025)_
11
+ 📄 [Paper](https://arxiv.org/abs/2410.10813) | 🌐 [Website](https://xiaowu0162.github.io/long-mem-eval/) | 🤗 [Dataset](https://huggingface.co/datasets/xiaowu0162/longmemeval)
12
+
13
+ ### What LongMemEval Tests
14
+
15
+ The benchmark evaluates five core long-term memory abilities through 500 meticulously curated questions:
16
+
17
+ 1. **Information Extraction** - Recalling specific information from extensive interactive histories
18
+ 2. **Multi-Session Reasoning** - Synthesizing information across multiple history sessions
19
+ 3. **Knowledge Updates** - Handling information that changes over time
20
+ 4. **Temporal Reasoning** - Understanding time-based relationships in conversations
21
+ 5. **Abstention** - Recognizing when information is not available
22
+
23
+ ### Why This Matters
24
+
25
+ Current LLMs show a 30-60% performance drop when tested on LongMemEval, revealing significant challenges in maintaining coherent long-term memory. This benchmark helps identify and improve these limitations.
26
+
27
+ ## Quick Start
28
+
29
+ ```bash
30
+ # From packages/longmemeval directory
31
+
32
+ # 1. Set your API keys
33
+ export OPENAI_API_KEY=your_openai_key_here
34
+ export HF_TOKEN=your_huggingface_token_here # For automatic dataset download
35
+
36
+ # 2. Run a benchmark (downloads datasets automatically if needed)
37
+ pnpm bench:s # Run small dataset (10 parallel requests)
38
+ pnpm bench:m # Run medium dataset (10 parallel requests)
39
+ pnpm bench:oracle # Run oracle dataset (10 parallel requests)
40
+
41
+ # Or run quick 10-question tests
42
+ pnpm bench:s:quick # Test with 10 questions from small dataset
43
+ pnpm bench:m:quick # Test with 10 questions from medium dataset
44
+ pnpm bench:oracle:quick # Test with 10 questions from oracle dataset
45
+ ```
46
+
47
+ **Note:** The benchmark will automatically download datasets on first run. Get your HuggingFace token from https://huggingface.co/settings/tokens
48
+
49
+ ## Manual Setup
50
+
51
+ ### 1. Install Dependencies
52
+
53
+ ```bash
54
+ # From the monorepo root
55
+ pnpm install
56
+ pnpm build
57
+ ```
58
+
59
+ ### 2. Download Dataset
60
+
61
+ ```bash
62
+ # Set your HuggingFace token
63
+ export HF_TOKEN=your_token_here
64
+
65
+ # Download datasets (no Python or Git LFS required)
66
+ pnpm download
67
+ ```
68
+
69
+ If automatic download fails, see [DATA_DOWNLOAD_GUIDE.md](./DATA_DOWNLOAD_GUIDE.md) for manual download instructions.
70
+
71
+ ## Usage
72
+
73
+ ### Run Benchmark
74
+
75
+ ```bash
76
+ # From packages/longmemeval directory
77
+
78
+ # Quick commands for each dataset (10 parallel requests)
79
+ pnpm bench:s # Small dataset (full run)
80
+ pnpm bench:m # Medium dataset (full run)
81
+ pnpm bench:oracle # Oracle dataset (full run)
82
+
83
+ # Quick test runs (10 questions only, 5 parallel)
84
+ pnpm bench:s:quick # Small dataset (quick test)
85
+ pnpm bench:m:quick # Medium dataset (quick test)
86
+ pnpm bench:oracle:quick # Oracle dataset (quick test)
87
+
88
+ # Advanced: Use full CLI with custom options
89
+ pnpm cli run --dataset longmemeval_s --model gpt-4o
90
+
91
+ # Adjust parallelization (default: 5)
92
+ pnpm cli run --dataset longmemeval_s --model gpt-4o --concurrency 20
93
+
94
+ # Graceful shutdown: Press Ctrl+C to stop and save progress
95
+
96
+ # Run with specific memory configuration
97
+ pnpm cli run --dataset longmemeval_s --memory-config last-k --model gpt-4o
98
+ pnpm cli run --dataset longmemeval_s --memory-config semantic-recall --model gpt-4o
99
+ pnpm cli run --dataset longmemeval_s --memory-config working-memory --model gpt-4o
100
+
101
+ # Custom subset size
102
+ pnpm cli run --dataset longmemeval_oracle --model gpt-4o --subset 25
103
+ ```
104
+
105
+ ### View Dataset Statistics
106
+
107
+ ```bash
108
+ pnpm cli stats --dataset longmemeval_s
109
+ ```
110
+
111
+ ### Evaluate Existing Results
112
+
113
+ ```bash
114
+ pnpm cli evaluate --results ./results/run_12345/results.jsonl --dataset longmemeval_s
115
+ ```
116
+
117
+ ### Generate Report
118
+
119
+ ```bash
120
+ pnpm cli report --results ./results/
121
+ ```
122
+
123
+ ## Memory Configurations
124
+
125
+ - **full-history**: Provide complete chat history (baseline)
126
+ - **last-k**: Use Mastra's lastMessages configuration (last 20 messages)
127
+ - **semantic-recall**: Use Mastra's semantic recall feature (requires vector store)
128
+ - **working-memory**: Use Mastra's working memory with template
129
+ - **combined**: Combination of last-k and semantic recall
130
+
131
+ ## Output
132
+
133
+ Results are saved in the `results/` directory with:
134
+
135
+ - `results.jsonl`: Individual question results
136
+ - `hypotheses.json`: Model responses
137
+ - `questions.json`: Questions for reference
138
+ - `metrics.json`: Aggregated metrics and configuration
139
+
140
+ ## Benchmark Datasets
141
+
142
+ LongMemEval provides three dataset variants:
143
+
144
+ - **longmemeval_s (Small)**: ~115k tokens per question (30-40 sessions)
145
+ - Designed to fit within 128k context windows
146
+ - Tests memory across dozens of conversation sessions
147
+ - **longmemeval_m (Medium)**: ~1.5M tokens per question (500 sessions)
148
+ - Challenges even the largest context windows
149
+ - Tests memory across hundreds of sessions
150
+ - **longmemeval_oracle**: Only evidence sessions included
151
+ - Used as a control to verify models can answer when given only relevant context
152
+ - Helps isolate memory retrieval issues from comprehension issues
153
+
154
+ ## Citation
155
+
156
+ If you use this benchmark in your research, please cite the original paper:
157
+
158
+ ```bibtex
159
+ @article{wu2024longmemeval,
160
+ title={LongMemEval: Benchmarking Chat Assistants on Long-Term Interactive Memory},
161
+ author={Wu, Di and Wang, Hongwei and Yu, Wenhao and Zhang, Yuwei and Chang, Kai-Wei and Yu, Dong},
162
+ journal={arXiv preprint arXiv:2410.10813},
163
+ year={2024}
164
+ }
165
+ ```
166
+
167
+ ## Extending the Benchmark
168
+
169
+ To add custom memory configurations:
170
+
171
+ 1. Edit `src/benchmark/runner.ts` and add your configuration to `getMemoryConfig()`
172
+ 2. Update the `MemoryConfigType` in `src/data/types.ts`
173
+ 3. Implement the configuration logic in `src/memory-adapters/mastra-adapter.ts`
package/USAGE.md ADDED
@@ -0,0 +1,105 @@
1
+ # LongMemEval Usage Guide
2
+
3
+ ## Quick Start
4
+
5
+ ### 1. Prepare Data (Required First)
6
+
7
+ The prepare step processes the dataset through mock agents to populate the storage:
8
+
9
+ ```bash
10
+ # Quick test with 5 questions
11
+ pnpm prepare:quick
12
+
13
+ # Full dataset with different memory configs
14
+ pnpm prepare:s # Small dataset, semantic-recall (default)
15
+ pnpm prepare:s:lastk # Small dataset, last-k messages
16
+ pnpm prepare:s:working # Small dataset, working-memory
17
+ pnpm prepare:s:combined # Small dataset, combined (semantic + working memory)
18
+ pnpm prepare:m # Medium dataset, semantic-recall
19
+ ```
20
+
21
+ ### 2. Run Benchmark
22
+
23
+ After preparing data, run the benchmark:
24
+
25
+ ```bash
26
+ # Quick test
27
+ pnpm run:quick
28
+
29
+ # Full runs
30
+ pnpm run:s # Small dataset with semantic-recall (default)
31
+ pnpm run:s:lastk # Small dataset with last-k
32
+ pnpm run:s:working # Small dataset with working-memory
33
+ pnpm run:s:combined # Small dataset with combined
34
+ ```
35
+
36
+ ## Full CLI Options
37
+
38
+ ### Prepare Command
39
+
40
+ ```bash
41
+ pnpm cli prepare \
42
+ -d <dataset> # longmemeval_s, longmemeval_m, longmemeval_oracle
43
+ -c <memory-config> # full-history, last-k, semantic-recall, working-memory, combined
44
+ [--subset <n>] # Process only n questions
45
+ [--output <dir>] # Output directory (default: ./prepared-data)
46
+ ```
47
+
48
+ ### Run Command
49
+
50
+ ```bash
51
+ pnpm cli run \
52
+ -d <dataset> # longmemeval_s, longmemeval_m, longmemeval_oracle
53
+ -m <model> # Model name (e.g., gpt-4o)
54
+ -c <memory-config> # full-history, last-k, semantic-recall, working-memory, combined
55
+ [--subset <n>] # Run only n questions
56
+ [--concurrency <n>] # Parallel requests (default: 5)
57
+ [--prepared-data <dir>] # Prepared data directory
58
+ [--output <dir>] # Results directory
59
+ ```
60
+
61
+ ## Memory Configurations
62
+
63
+ - **semantic-recall** (default): Uses embeddings to find relevant messages (requires OPENAI_API_KEY)
64
+ - **last-k**: Loads last 50 messages only
65
+ - **working-memory**: Maintains a summary of user context
66
+ - **combined**: Semantic recall + working memory
67
+
68
+ Note: `full-history` is available but not recommended for testing memory systems as it defeats the purpose by loading everything into context.
69
+
70
+ ## Environment Variables
71
+
72
+ ```bash
73
+ # Required for running benchmarks
74
+ export OPENAI_API_KEY=your-key-here
75
+
76
+ # Optional for downloading datasets
77
+ export HF_TOKEN=your-huggingface-token
78
+ ```
79
+
80
+ ## Example Workflow
81
+
82
+ ```bash
83
+ # 1. Test with small subset first
84
+ pnpm prepare:quick
85
+ pnpm run:quick
86
+
87
+ # 2. Run full benchmark with semantic recall
88
+ pnpm prepare:s
89
+ pnpm run:s
90
+ ```
91
+
92
+ ## Viewing Results
93
+
94
+ Results are saved in `./results/run_<timestamp>/`:
95
+
96
+ - `results.jsonl`: Raw evaluation results
97
+ - `metrics.json`: Aggregated metrics
98
+
99
+ ```bash
100
+ # View all runs
101
+ pnpm cli report -r ./results
102
+
103
+ # Check specific metrics
104
+ cat results/run_*/metrics.json | jq '.overall_accuracy'
105
+ ```
package/package.json ADDED
@@ -0,0 +1,58 @@
1
+ {
2
+ "name": "@mastra/longmemeval",
3
+ "version": "0.1.1-alpha.0",
4
+ "description": "LongMemEval benchmark implementation for Mastra Memory",
5
+ "dependencies": {
6
+ "@ai-sdk/openai": "^1.3.23",
7
+ "@ai-sdk/provider": "^1.1.3",
8
+ "@huggingface/hub": "^0.15.1",
9
+ "@mastra/rag": "^1.0.2",
10
+ "@node-rs/xxhash": "^1.7.6",
11
+ "ai": "^4.3.17",
12
+ "async-mutex": "^0.5.0",
13
+ "chalk": "^5.3.0",
14
+ "commander": "^12.1.0",
15
+ "fastq": "^1.19.1",
16
+ "imvectordb": "^0.0.6",
17
+ "openai": "^4.73.1",
18
+ "ora": "^8.1.1",
19
+ "zod": "^3.23.8",
20
+ "@mastra/core": "0.10.15-alpha.1",
21
+ "@mastra/fastembed": "0.10.1",
22
+ "@mastra/libsql": "0.11.0",
23
+ "@mastra/memory": "0.11.3-alpha.1"
24
+ },
25
+ "devDependencies": {
26
+ "@ai-sdk/google": "^1.2.19",
27
+ "@types/node": "^22.10.2",
28
+ "tsx": "^4.19.2",
29
+ "typescript": "^5.7.2",
30
+ "vitest": "^2.1.8"
31
+ },
32
+ "engines": {
33
+ "node": ">=20"
34
+ },
35
+ "scripts": {
36
+ "test": "vitest",
37
+ "typecheck": "tsc --noEmit",
38
+ "generate-embeddings": "tsx scripts/generate-embeddings.ts",
39
+ "setup": "tsx scripts/setup.ts",
40
+ "download": "tsx scripts/download.ts",
41
+ "find-failed": "tsx scripts/find-failed.ts",
42
+ "clean-failed": "tsx scripts/find-failed.ts --delete",
43
+ "generate-wm-templates": "tsx scripts/generate-wm-templates.ts",
44
+ "prepare:s:semantic": "tsx src/cli.ts prepare -d longmemeval_s -c semantic-recall --concurrency 5",
45
+ "bench:s:semantic": "tsx src/cli.ts run --dataset longmemeval_s --model gpt-4o --memory-config semantic-recall --concurrency 30",
46
+ "prepare:s:working": "tsx src/cli.ts prepare -d longmemeval_s -c working-memory --concurrency 35",
47
+ "bench:s:working": "tsx src/cli.ts run --dataset longmemeval_s --model gpt-4o --memory-config working-memory --concurrency 10",
48
+ "prepare:s:combined": "tsx src/cli.ts prepare -d longmemeval_s -c combined --concurrency 10",
49
+ "bench:s:combined": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c combined --concurrency 20",
50
+ "prepare:s:combined-tailored": "tsx src/cli.ts prepare -d longmemeval_s -c combined-tailored --concurrency 20",
51
+ "bench:s:combined-tailored": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c combined-tailored --concurrency 30",
52
+ "prepare:s:working-tailored": "tsx src/cli.ts prepare -d longmemeval_s -c working-memory-tailored --concurrency 20",
53
+ "bench:s:working-tailored": "tsx src/cli.ts run -d longmemeval_s -m gpt-4o -c working-memory-tailored --concurrency 30",
54
+ "results": "tsx src/cli.ts results",
55
+ "results:all": "tsx src/cli.ts results --all",
56
+ "results:s": "tsx src/cli.ts results -d longmemeval_s"
57
+ }
58
+ }
package/scripts/download.ts ADDED
@@ -0,0 +1,180 @@
1
+ #!/usr/bin/env tsx
2
+
3
+ import { downloadFile } from '@huggingface/hub';
4
+ import { createWriteStream, existsSync, mkdirSync, statSync } from 'fs';
5
+ import { join } from 'path';
6
+ import ora from 'ora';
7
+ import chalk from 'chalk';
8
+ import { pipeline } from 'stream/promises';
9
+
10
/** HuggingFace dataset repository that hosts the LongMemEval files. */
const REPO_ID = 'xiaowu0162/longmemeval';

/** Local directory (relative to the current working directory) that receives the datasets. */
const DATA_DIR = join(process.cwd(), 'data');

/**
 * Datasets to fetch. `repoPath` is the file's path inside the repository and
 * `filename` is the name it is saved under locally (the repo path plus `.json`).
 */
const FILES = ['longmemeval_oracle', 'longmemeval_s', 'longmemeval_m'].map(repoPath => ({
  filename: `${repoPath}.json`,
  repoPath,
}));
18
+
19
/**
 * Formats a byte count as a short human-readable string.
 *
 * Values below 1024 are printed as whole bytes (`"512 B"`); larger values are
 * printed with one decimal in KB, MB or GB. GB is the largest unit, so bigger
 * values keep growing in GB (e.g. `"2048.0 GB"`).
 *
 * @param bytes - Size in bytes.
 * @returns The formatted size, e.g. `"1.5 KB"` or `"40.2 MB"`.
 */
function formatFileSize(bytes: number): string {
  if (bytes < 1024) return bytes + ' B';

  let value = bytes / 1024;
  for (const unit of ['KB', 'MB']) {
    // Decide on the *rounded* value: something like 1023.96 KB would otherwise
    // be printed as "1024.0 KB" instead of being promoted to "1.0 MB".
    if (Number(value.toFixed(1)) < 1024) {
      return `${value.toFixed(1)} ${unit}`;
    }
    value /= 1024;
  }
  return `${value.toFixed(1)} GB`;
}
28
+
29
/**
 * Returns the size of the file at `filePath` in bytes.
 *
 * Best-effort: any failure to stat the path (for example, the file does not
 * exist) is reported as a size of 0 rather than as an error.
 *
 * @param filePath - Path of the file to inspect.
 * @returns The file size in bytes, or 0 when the path cannot be stat-ed.
 */
async function getFileSize(filePath: string): Promise<number> {
  let size = 0;
  try {
    ({ size } = statSync(filePath));
  } catch {
    // Leave the size at 0 when the path cannot be stat-ed.
  }
  return size;
}
37
+
38
/**
 * Downloads `url` to `outputPath` using a bearer token.
 *
 * The response body is streamed to disk instead of being buffered in memory,
 * because the dataset files can be very large. If the transfer fails part-way,
 * the partially written file is removed so it cannot be mistaken for a
 * finished download later.
 *
 * @param url - Absolute URL of the file to download.
 * @param outputPath - Destination path on the local file system.
 * @param token - Access token sent as `Authorization: Bearer <token>`.
 * @throws Error with message `HTTP <status>: <statusText>` on a non-2xx response.
 * @throws Error with message `Empty response` when the response has no body.
 */
async function downloadWithFetch(url: string, outputPath: string, token: string): Promise<void> {
  const response = await fetch(url, {
    headers: {
      Authorization: `Bearer ${token}`,
      'User-Agent': 'longmemeval-downloader/1.0',
    },
  });

  if (!response.ok) {
    throw new Error(`HTTP ${response.status}: ${response.statusText}`);
  }

  if (!response.body) {
    throw new Error('Empty response');
  }

  const { Readable } = await import('stream');

  try {
    // The fetch body is a web ReadableStream; its DOM and Node typings do not
    // always line up, hence the cast before handing it to Node's stream API.
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    await pipeline(Readable.fromWeb(response.body as any), createWriteStream(outputPath));
  } catch (error) {
    // Do not leave a truncated file behind.
    const { unlink } = await import('fs/promises');
    await unlink(outputPath).catch(() => {});
    throw error;
  }
}
54
+
55
/**
 * Ensures every dataset listed in `FILES` is present in `DATA_DIR`.
 *
 * Steps:
 * 1. Create the data directory and report files that are already present.
 * 2. If anything is missing, require a HuggingFace token from `HF_TOKEN` or
 *    `HUGGINGFACE_TOKEN`; without one, print instructions and exit with code 1.
 * 3. Download each missing file through the HuggingFace Hub client, falling
 *    back to a direct HTTPS download if that fails.
 * 4. Treat a downloaded file as valid only if it is larger than 1 MB; smaller
 *    or partial files are deleted.
 * 5. Print a summary, and set a non-zero exit code if any file is still missing.
 */
async function main(): Promise<void> {
  // Files at or below this size are treated as failed or placeholder downloads.
  const MIN_VALID_BYTES = 1000000;

  console.log(chalk.blue('\n📥 LongMemEval Dataset Downloader\n'));

  // `recursive: true` makes this a no-op when the directory already exists.
  mkdirSync(DATA_DIR, { recursive: true });

  // Stat each file once and remember which ones still need to be fetched.
  const missing: Array<(typeof FILES)[number]> = [];
  for (const fileInfo of FILES) {
    const size = await getFileSize(join(DATA_DIR, fileInfo.filename));
    if (size > MIN_VALID_BYTES) {
      console.log(chalk.green(`✓ ${fileInfo.filename} already exists (${formatFileSize(size)})`));
    } else {
      missing.push(fileInfo);
    }
  }

  if (missing.length === 0) {
    console.log(chalk.green('\n✅ All datasets already downloaded!\n'));
    console.log(chalk.gray('You can now run the benchmark:'));
    console.log(chalk.cyan(' pnpm cli run --dataset longmemeval_s --model gpt-4o'));
    return;
  }

  // Check for HuggingFace token
  const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;

  if (!token) {
    console.log(chalk.yellow('⚠️ No HuggingFace token found!\n'));
    console.log(chalk.blue('The LongMemEval datasets require authentication to download.\n'));

    console.log(chalk.gray('1. Get your token from:'));
    console.log(chalk.cyan(' https://huggingface.co/settings/tokens\n'));

    console.log(chalk.gray('2. Set it as an environment variable:'));
    console.log(chalk.cyan(' export HF_TOKEN=your_token_here\n'));

    console.log(chalk.gray('3. Run this script again:'));
    console.log(chalk.cyan(' pnpm download\n'));

    console.log(chalk.blue('Alternative: Download manually from Google Drive'));
    console.log(chalk.gray('See DATA_DOWNLOAD_GUIDE.md for instructions'));

    process.exit(1);
  }

  // Download missing files
  console.log(chalk.blue('Downloading missing datasets...\n'));
  const { unlink } = await import('fs/promises');
  let successCount = FILES.length - missing.length;

  for (const { filename, repoPath } of missing) {
    const outputPath = join(DATA_DIR, filename);
    const spinner = ora(`Downloading ${filename}...`).start();

    try {
      // Try HuggingFace Hub API first
      try {
        const response = await downloadFile({
          // A bare "owner/name" string is interpreted as a model repository by
          // @huggingface/hub, so the dataset type has to be stated explicitly.
          repo: { type: 'dataset', name: REPO_ID },
          path: repoPath,
          credentials: { accessToken: token },
        });

        if (!response?.body) {
          throw new Error('Empty response');
        }
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- web stream vs. Node stream typings
        await pipeline(response.body as any, createWriteStream(outputPath));
      } catch {
        // Fallback to direct HTTPS download; its error (if any) is the one reported below.
        const directUrl = `https://huggingface.co/datasets/${REPO_ID}/resolve/main/${repoPath}?download=true`;
        await downloadWithFetch(directUrl, outputPath, token);
      }

      // Verify file size
      const downloadedSize = await getFileSize(outputPath);
      if (downloadedSize > MIN_VALID_BYTES) {
        spinner.succeed(`Downloaded ${filename} (${formatFileSize(downloadedSize)})`);
        successCount++;
      } else {
        spinner.fail(`Downloaded ${filename} but file seems too small (${formatFileSize(downloadedSize)})`);
        // Remove invalid file
        await unlink(outputPath).catch(() => {});
      }
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);

      spinner.fail(`Failed to download ${filename}`);
      console.error(chalk.red(` Error: ${message}`));

      // A partially written file could otherwise pass the size check on the next run.
      await unlink(outputPath).catch(() => {});

      if (message.includes('401') || message.includes('403')) {
        console.log(chalk.yellow('\n Authentication issue. Please check:'));
        console.log(chalk.gray(' - Your token is valid'));
        console.log(chalk.gray(' - You have accepted the dataset terms of use'));
        console.log(chalk.cyan(` - Visit: https://huggingface.co/datasets/${REPO_ID}`));
      }
    }
  }

  // Final summary
  console.log('');
  if (successCount === FILES.length) {
    console.log(chalk.green('✅ All datasets downloaded successfully!\n'));
    console.log(chalk.gray('You can now run the benchmark:'));
    console.log(chalk.cyan(' pnpm cli run --dataset longmemeval_s --model gpt-4o'));
  } else {
    console.log(chalk.yellow(`⚠️ Downloaded ${successCount}/${FILES.length} files\n`));
    console.log(chalk.blue('If downloads failed, please check:'));
    console.log(chalk.gray('- Your HuggingFace token is valid'));
    console.log(chalk.gray('- You have accepted the dataset terms (if any)'));
    console.log(chalk.gray('\nAlternatively, see DATA_DOWNLOAD_GUIDE.md for manual download instructions'));
    // Signal the incomplete download to callers, consistent with the missing-token path.
    process.exitCode = 1;
  }
}
179
+
180
// Run the downloader. Unexpected failures are logged and reported through a
// non-zero exit code so that calling scripts can detect them.
main().catch((error: unknown) => {
  console.error(error);
  process.exitCode = 1;
});