@mastra/longmemeval 0.0.0-add-libsql-changeset-20250910154739
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +919 -0
- package/DATA_DOWNLOAD_GUIDE.md +117 -0
- package/LICENSE.md +15 -0
- package/README.md +173 -0
- package/USAGE.md +105 -0
- package/package.json +67 -0
- package/scripts/download.ts +180 -0
- package/scripts/find-failed.ts +176 -0
- package/scripts/generate-embeddings.ts +56 -0
- package/scripts/generate-wm-templates.ts +296 -0
- package/scripts/setup.ts +60 -0
- package/src/__fixtures__/embeddings.json +2319 -0
- package/src/__fixtures__/test-dataset.json +82 -0
- package/src/cli.ts +690 -0
- package/src/commands/__tests__/prepare.test.ts +230 -0
- package/src/commands/__tests__/run.test.ts +403 -0
- package/src/commands/prepare.ts +793 -0
- package/src/commands/run.ts +553 -0
- package/src/config.ts +83 -0
- package/src/data/loader.ts +163 -0
- package/src/data/types.ts +61 -0
- package/src/embeddings/cached-openai-embedding-model.ts +227 -0
- package/src/embeddings/cached-openai-provider.ts +40 -0
- package/src/embeddings/index.ts +2 -0
- package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
- package/src/evaluation/longmemeval-metric.ts +173 -0
- package/src/retry-model.ts +60 -0
- package/src/storage/__tests__/benchmark-store.test.ts +280 -0
- package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
- package/src/storage/benchmark-store.ts +540 -0
- package/src/storage/benchmark-vector.ts +234 -0
- package/src/storage/index.ts +2 -0
- package/src/test-utils/mock-embeddings.ts +54 -0
- package/src/test-utils/mock-model.ts +49 -0
- package/tests/data-loader.test.ts +96 -0
- package/tsconfig.json +18 -0
- package/vitest.config.ts +9 -0
package/src/cli.ts
ADDED
|
@@ -0,0 +1,690 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
import { Command } from 'commander';
|
|
4
|
+
import chalk from 'chalk';
|
|
5
|
+
import { readFile, readdir } from 'fs/promises';
|
|
6
|
+
import { join } from 'path';
|
|
7
|
+
import { existsSync, statSync } from 'fs';
|
|
8
|
+
import { execSync } from 'child_process';
|
|
9
|
+
|
|
10
|
+
import { DatasetLoader } from './data/loader';
|
|
11
|
+
import type { EvaluationResult, BenchmarkMetrics, QuestionType } from './data/types';
|
|
12
|
+
import { PrepareCommand } from './commands/prepare';
|
|
13
|
+
import { RunCommand } from './commands/run';
|
|
14
|
+
|
|
15
|
+
const program = new Command();

// Force immediate exit on Ctrl+C
// NOTE(review): a plain exit is used because long benchmark runs can leave
// pending handles (API clients, timers) that would otherwise keep node alive —
// presumably intentional; confirm nothing needs graceful flushing here.
process.on('SIGINT', () => {
  console.log('\n\nForce exiting...');
  process.exit(130); // Standard exit code for SIGINT
});

// Also handle SIGTERM
process.on('SIGTERM', () => {
  process.exit(143); // Standard exit code for SIGTERM
});
|
|
27
|
+
|
|
28
|
+
// Helper function to calculate metrics
|
|
29
|
+
function calculateMetrics(results: EvaluationResult[]): BenchmarkMetrics {
|
|
30
|
+
const metrics: BenchmarkMetrics = {
|
|
31
|
+
overall_accuracy: 0,
|
|
32
|
+
accuracy_by_type: {},
|
|
33
|
+
abstention_accuracy: 0,
|
|
34
|
+
total_questions: results.length,
|
|
35
|
+
correct_answers: 0,
|
|
36
|
+
abstention_correct: 0,
|
|
37
|
+
abstention_total: 0,
|
|
38
|
+
} as const;
|
|
39
|
+
|
|
40
|
+
// Calculate overall metrics
|
|
41
|
+
for (const result of results) {
|
|
42
|
+
if (result.is_correct) {
|
|
43
|
+
metrics.correct_answers++;
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
// Track by question type
|
|
47
|
+
const type = result.question_type;
|
|
48
|
+
if (type && !metrics.accuracy_by_type[type]) {
|
|
49
|
+
metrics.accuracy_by_type[type] = { correct: 0, total: 0, accuracy: 0 };
|
|
50
|
+
}
|
|
51
|
+
const accuracyByType = type ? metrics.accuracy_by_type[type] : null;
|
|
52
|
+
if (accuracyByType) {
|
|
53
|
+
accuracyByType.total++;
|
|
54
|
+
}
|
|
55
|
+
if (accuracyByType && result.is_correct) {
|
|
56
|
+
accuracyByType.correct++;
|
|
57
|
+
}
|
|
58
|
+
|
|
59
|
+
// Track abstention separately
|
|
60
|
+
if (result.question_id.endsWith('_abs')) {
|
|
61
|
+
metrics.abstention_total!++;
|
|
62
|
+
if (result.is_correct) {
|
|
63
|
+
metrics.abstention_correct!++;
|
|
64
|
+
}
|
|
65
|
+
}
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
// Calculate per-type accuracies first
|
|
69
|
+
for (const type in metrics.accuracy_by_type) {
|
|
70
|
+
const typeMetrics = metrics.accuracy_by_type[type as QuestionType];
|
|
71
|
+
if (typeMetrics) {
|
|
72
|
+
typeMetrics.accuracy = typeMetrics.total > 0 ? typeMetrics.correct / typeMetrics.total : 0;
|
|
73
|
+
}
|
|
74
|
+
}
|
|
75
|
+
|
|
76
|
+
if (metrics && (metrics.abstention_total || 0) > 0) {
|
|
77
|
+
metrics.abstention_accuracy = (metrics.abstention_correct || 0) / (metrics.abstention_total || 0);
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
// Calculate overall accuracy as average of all question type accuracies (excluding abstention)
|
|
81
|
+
const allTypeAccuracies = Object.values(metrics.accuracy_by_type).map(t => t.accuracy);
|
|
82
|
+
|
|
83
|
+
metrics.overall_accuracy =
|
|
84
|
+
allTypeAccuracies.length > 0 ? allTypeAccuracies.reduce((sum, acc) => sum + acc, 0) / allTypeAccuracies.length : 0;
|
|
85
|
+
|
|
86
|
+
return metrics;
|
|
87
|
+
}
|
|
88
|
+
|
|
89
|
+
// CLI identity shown in --help / --version output.
program.name('longmemeval').description('LongMemEval benchmark for Mastra Memory').version('0.1.0');
|
|
90
|
+
|
|
91
|
+
// Prepare command
// Processes the raw dataset through mock agents so benchmark runs can reuse
// the prepared memory state instead of re-ingesting every conversation.
program
  .command('prepare')
  .description('Prepare LongMemEval data by processing through mock agents')
  .option('-d, --dataset <dataset>', 'Dataset to use', 'longmemeval_s')
  .option(
    '-c, --memory-config <config>',
    'Memory configuration (last-k, semantic-recall, semantic-recall-reranked, working-memory, working-memory-tailored, combined, combined-tailored)',
    'semantic-recall',
  )
  .option('-o, --output <dir>', 'Output directory for prepared data', './prepared-data')
  .option('--subset <n>', 'Prepare only a subset of n questions', parseInt)
  .option('--concurrency <n>', 'Number of questions to process in parallel', parseInt)
  .option('--question-id <id>', 'Prepare a specific question by ID')
  .option('--resume-from-message-id <id>', 'Resume processing from a specific message ID')
  .option('--session-limit <n>', 'Limit processing to n sessions after resume point', parseInt)
  .option('--session-offset <n>', 'Start processing from the nth session (1-based)', parseInt)
  .action(async options => {
    try {
      // Echo the effective configuration so long runs are auditable from logs.
      console.log(chalk.blue('\nš LongMemEval Data Preparation\n'));
      console.log(chalk.gray(`Dataset: ${options.dataset}`));
      console.log(chalk.gray(`Memory Config: ${options.memoryConfig}`));
      if (options.subset) {
        console.log(chalk.gray(`Subset: ${options.subset} questions`));
      }
      if (options.questionId) {
        console.log(chalk.gray(`Question ID: ${options.questionId}`));
      }
      if (options.resumeFromMessageId) {
        console.log(chalk.gray(`Resume from message ID: ${options.resumeFromMessageId}`));
      }
      if (options.sessionLimit) {
        console.log(chalk.gray(`Session limit: ${options.sessionLimit} sessions`));
      }
      if (options.sessionOffset) {
        console.log(chalk.gray(`Session offset: Start from session ${options.sessionOffset}`));
      }
      console.log();

      // Check for OpenAI API key (needed for embeddings in semantic-recall)
      // NOTE(review): the reranked/tailored variants are not checked here —
      // presumably they validate the key elsewhere; confirm.
      if (
        (options.memoryConfig === 'semantic-recall' || options.memoryConfig === 'combined') &&
        !process.env.OPENAI_API_KEY
      ) {
        console.error(chalk.red('Error: OPENAI_API_KEY environment variable is required for semantic recall'));
        console.error(chalk.gray('Please set it in your environment or .env file'));
        process.exit(1);
      }

      // Validate dataset option
      const validDatasets = ['longmemeval_s', 'longmemeval_m', 'longmemeval_oracle'];
      if (!validDatasets.includes(options.dataset)) {
        console.error(chalk.red(`Invalid dataset: ${options.dataset}`));
        console.error(chalk.gray(`Valid options: ${validDatasets.join(', ')}`));
        process.exit(1);
      }

      // Check if dataset exists and download if needed
      await ensureDatasetExists(options.dataset);

      // Show warning and ask for confirmation — preparation can consume
      // significant API credits, so require an explicit "y"/"yes".
      console.log(chalk.yellow('\nā ļø WARNING'));
      console.log(chalk.yellow('ā'.repeat(50)));
      console.log(chalk.bold('\nPreparing this data can be very expensive!\n'));
      console.log('This process will:');
      console.log('  ⢠Process many conversations through AI models');
      console.log('  ⢠Generate embeddings for semantic recall');
      console.log('  ⢠Potentially use significant API credits\n');
      console.log(chalk.gray('Memory configs like "working-memory" and "combined" are especially costly.\n'));

      const readline = await import('readline');
      const rl = readline.createInterface({
        input: process.stdin,
        output: process.stdout,
      });

      const answer = await new Promise<string>(resolve => {
        rl.question(chalk.bold('Are you sure you want to continue? (y/N): '), resolve);
      });
      rl.close();

      // Default answer is "no": anything other than y/yes aborts.
      if (answer.toLowerCase() !== 'y' && answer.toLowerCase() !== 'yes') {
        console.log(chalk.gray('\nCancelled by user.'));
        process.exit(0);
      }

      console.log(); // Add spacing before continuing

      // Run prepare command
      const prepareCommand = new PrepareCommand();
      await prepareCommand.run({
        dataset: options.dataset,
        memoryConfig: options.memoryConfig,
        outputDir: options.output,
        subset: options.subset,
        concurrency: options.concurrency,
        questionId: options.questionId,
        resumeFromMessageId: options.resumeFromMessageId,
        sessionLimit: options.sessionLimit,
        sessionOffset: options.sessionOffset,
      });

      // Force exit after completion
      // NOTE(review): assumes any remaining work flushes within 100ms — this
      // forcibly kills lingering handles; confirm nothing important is cut off.
      setTimeout(() => {
        process.exit(0);
      }, 100); // Give a tiny bit of time for any cleanup
    } catch (error) {
      console.error(chalk.red('\nError:'), error);
      process.exit(1);
    }
  });
|
|
202
|
+
|
|
203
|
+
// Run benchmark command
|
|
204
|
+
program
|
|
205
|
+
.command('run')
|
|
206
|
+
.description('Run LongMemEval benchmark using prepared data')
|
|
207
|
+
.requiredOption('-d, --dataset <dataset>', 'Dataset to use (longmemeval_s, longmemeval_m, longmemeval_oracle)')
|
|
208
|
+
.requiredOption('-m, --model <model>', 'Model to use (e.g., gpt-4o, claude-3-opus)')
|
|
209
|
+
.option(
|
|
210
|
+
'-c, --memory-config <config>',
|
|
211
|
+
'Memory configuration (last-k, semantic-recall, semantic-recall-reranked, working-memory, working-memory-tailored, combined, combined-tailored)',
|
|
212
|
+
'semantic-recall',
|
|
213
|
+
)
|
|
214
|
+
.option('-o, --output <dir>', 'Output directory for results', './results')
|
|
215
|
+
.option('--prepared-data <dir>', 'Directory containing prepared data', './prepared-data')
|
|
216
|
+
.option('--subset <n>', 'Run on subset of n questions', parseInt)
|
|
217
|
+
.option('--concurrency <n>', 'Number of parallel requests (default: 5)', parseInt)
|
|
218
|
+
.option('--question-id <id>', 'Focus on a specific question by ID')
|
|
219
|
+
.action(async options => {
|
|
220
|
+
try {
|
|
221
|
+
console.log(chalk.blue('\nš LongMemEval Benchmark Runner\n'));
|
|
222
|
+
|
|
223
|
+
// Check for OpenAI API key
|
|
224
|
+
if (!process.env.OPENAI_API_KEY) {
|
|
225
|
+
console.error(chalk.red('Error: OPENAI_API_KEY environment variable is not set'));
|
|
226
|
+
console.error(chalk.gray('Please set it in your environment or .env file'));
|
|
227
|
+
process.exit(1);
|
|
228
|
+
}
|
|
229
|
+
|
|
230
|
+
// Validate dataset option
|
|
231
|
+
const validDatasets = ['longmemeval_s', 'longmemeval_m', 'longmemeval_oracle'];
|
|
232
|
+
if (!validDatasets.includes(options.dataset)) {
|
|
233
|
+
console.error(chalk.red(`Invalid dataset: ${options.dataset}`));
|
|
234
|
+
console.error(chalk.gray(`Valid options: ${validDatasets.join(', ')}`));
|
|
235
|
+
process.exit(1);
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
// Run benchmark using prepared data
|
|
239
|
+
const runCommand = new RunCommand();
|
|
240
|
+
await runCommand.run({
|
|
241
|
+
dataset: options.dataset,
|
|
242
|
+
memoryConfig: options.memoryConfig,
|
|
243
|
+
model: options.model,
|
|
244
|
+
preparedDataDir: options.preparedData,
|
|
245
|
+
outputDir: options.output,
|
|
246
|
+
subset: options.subset,
|
|
247
|
+
concurrency: options.concurrency,
|
|
248
|
+
questionId: options.questionId,
|
|
249
|
+
});
|
|
250
|
+
|
|
251
|
+
// Force exit after completion
|
|
252
|
+
setTimeout(() => {
|
|
253
|
+
process.exit(0);
|
|
254
|
+
}, 100); // Give a tiny bit of time for any cleanup
|
|
255
|
+
} catch (error) {
|
|
256
|
+
console.error(chalk.red('\nError:'), error);
|
|
257
|
+
process.exit(1);
|
|
258
|
+
}
|
|
259
|
+
});
|
|
260
|
+
|
|
261
|
+
// Evaluate command
|
|
262
|
+
program
|
|
263
|
+
.command('evaluate')
|
|
264
|
+
.description('Evaluate existing results')
|
|
265
|
+
.requiredOption('-r, --results <file>', 'Results file (JSONL format)')
|
|
266
|
+
.requiredOption('-d, --dataset <dataset>', 'Dataset used for questions')
|
|
267
|
+
.action(async options => {
|
|
268
|
+
try {
|
|
269
|
+
console.log(chalk.blue('\nš Evaluating Results\n'));
|
|
270
|
+
|
|
271
|
+
// const loader = new DatasetLoader();
|
|
272
|
+
// const questions = await loader.loadDataset(options.dataset);
|
|
273
|
+
|
|
274
|
+
// Load results
|
|
275
|
+
const resultsContent = await readFile(options.results, 'utf-8');
|
|
276
|
+
const results: EvaluationResult[] = resultsContent
|
|
277
|
+
.split('\n')
|
|
278
|
+
.filter(line => line.trim())
|
|
279
|
+
.map(line => JSON.parse(line));
|
|
280
|
+
|
|
281
|
+
// Calculate metrics
|
|
282
|
+
const metrics = calculateMetrics(results);
|
|
283
|
+
|
|
284
|
+
// Print metrics
|
|
285
|
+
console.log(chalk.bold('Overall Accuracy:'), chalk.yellow(`${(metrics.overall_accuracy * 100).toFixed(2)}%`));
|
|
286
|
+
console.log(chalk.bold('Total Questions:'), metrics.total_questions);
|
|
287
|
+
console.log(chalk.bold('Correct Answers:'), metrics.correct_answers);
|
|
288
|
+
|
|
289
|
+
console.log(chalk.bold('\nAccuracy by Question Type:'));
|
|
290
|
+
for (const [type, typeMetrics] of Object.entries(metrics.accuracy_by_type)) {
|
|
291
|
+
const { correct, total, accuracy } = typeMetrics;
|
|
292
|
+
console.log(
|
|
293
|
+
chalk.gray(` ${type}:`),
|
|
294
|
+
chalk.yellow(`${(accuracy * 100).toFixed(2)}%`),
|
|
295
|
+
chalk.gray(`(${correct}/${total})`),
|
|
296
|
+
);
|
|
297
|
+
}
|
|
298
|
+
} catch (error) {
|
|
299
|
+
console.error(chalk.red('\nError:'), error);
|
|
300
|
+
process.exit(1);
|
|
301
|
+
}
|
|
302
|
+
});
|
|
303
|
+
|
|
304
|
+
// Stats command
|
|
305
|
+
program
|
|
306
|
+
.command('stats')
|
|
307
|
+
.description('Show dataset statistics')
|
|
308
|
+
.requiredOption('-d, --dataset <dataset>', 'Dataset to analyze')
|
|
309
|
+
.action(async options => {
|
|
310
|
+
try {
|
|
311
|
+
console.log(chalk.blue('\nš Dataset Statistics\n'));
|
|
312
|
+
|
|
313
|
+
const loader = new DatasetLoader();
|
|
314
|
+
const stats = await loader.getDatasetStats(options.dataset);
|
|
315
|
+
|
|
316
|
+
console.log(chalk.bold('Total Questions:'), stats.totalQuestions);
|
|
317
|
+
console.log(chalk.bold('Abstention Questions:'), stats.abstentionQuestions);
|
|
318
|
+
console.log(chalk.bold('Avg Sessions per Question:'), stats.avgSessionsPerQuestion.toFixed(2));
|
|
319
|
+
console.log(chalk.bold('Avg Turns per Session:'), stats.avgTurnsPerSession.toFixed(2));
|
|
320
|
+
console.log(chalk.bold('Total Tokens (estimate):'), stats.totalTokensEstimate.toLocaleString());
|
|
321
|
+
|
|
322
|
+
console.log(chalk.bold('\nQuestions by Type:'));
|
|
323
|
+
for (const [type, count] of Object.entries(stats.questionsByType)) {
|
|
324
|
+
console.log(chalk.gray(` ${type}:`), count);
|
|
325
|
+
}
|
|
326
|
+
} catch (error) {
|
|
327
|
+
console.error(chalk.red('\nError:'), error);
|
|
328
|
+
process.exit(1);
|
|
329
|
+
}
|
|
330
|
+
});
|
|
331
|
+
|
|
332
|
+
// List command to show available questions
|
|
333
|
+
program
|
|
334
|
+
.command('list')
|
|
335
|
+
.description('List prepared questions with their IDs')
|
|
336
|
+
.requiredOption('-d, --dataset <dataset>', 'Dataset to list from')
|
|
337
|
+
.option('-c, --memory-config <config>', 'Memory configuration', 'semantic-recall')
|
|
338
|
+
.option('--prepared-data <dir>', 'Directory containing prepared data', './prepared-data')
|
|
339
|
+
.action(async options => {
|
|
340
|
+
try {
|
|
341
|
+
console.log(chalk.blue('\nš Listing Prepared Questions\n'));
|
|
342
|
+
|
|
343
|
+
const preparedDir = join(options.preparedData, options.dataset, options.memoryConfig);
|
|
344
|
+
|
|
345
|
+
if (!existsSync(preparedDir)) {
|
|
346
|
+
console.error(chalk.red(`No prepared data found for ${options.dataset} with ${options.memoryConfig} config`));
|
|
347
|
+
console.error(chalk.gray(`Run 'longmemeval prepare' first`));
|
|
348
|
+
process.exit(1);
|
|
349
|
+
}
|
|
350
|
+
|
|
351
|
+
const questionDirs = await readdir(preparedDir);
|
|
352
|
+
const questions: any[] = [];
|
|
353
|
+
|
|
354
|
+
for (const questionDir of questionDirs) {
|
|
355
|
+
const metaPath = join(preparedDir, questionDir, 'meta.json');
|
|
356
|
+
if (existsSync(metaPath)) {
|
|
357
|
+
const meta = JSON.parse(await readFile(metaPath, 'utf-8'));
|
|
358
|
+
questions.push(meta);
|
|
359
|
+
}
|
|
360
|
+
}
|
|
361
|
+
|
|
362
|
+
// Sort by question ID
|
|
363
|
+
questions.sort((a, b) => a.questionId.localeCompare(b.questionId));
|
|
364
|
+
|
|
365
|
+
console.log(chalk.gray(`Found ${questions.length} prepared questions:\n`));
|
|
366
|
+
|
|
367
|
+
for (const q of questions) {
|
|
368
|
+
const typeColor = q.questionType.includes('single')
|
|
369
|
+
? 'blue'
|
|
370
|
+
: q.questionType.includes('multi')
|
|
371
|
+
? 'green'
|
|
372
|
+
: q.questionType.includes('temporal')
|
|
373
|
+
? 'yellow'
|
|
374
|
+
: 'cyan';
|
|
375
|
+
|
|
376
|
+
console.log(
|
|
377
|
+
chalk.bold(q.questionId),
|
|
378
|
+
chalk[typeColor](`[${q.questionType}]`),
|
|
379
|
+
chalk.gray(`- "${q.question.substring(0, 60)}${q.question.length > 60 ? '...' : ''}"`),
|
|
380
|
+
);
|
|
381
|
+
}
|
|
382
|
+
|
|
383
|
+
console.log(chalk.gray(`\nTo run a specific question: longmemeval run --question-id <id> ...`));
|
|
384
|
+
} catch (error) {
|
|
385
|
+
console.error(chalk.red('\nError:'), error);
|
|
386
|
+
process.exit(1);
|
|
387
|
+
}
|
|
388
|
+
});
|
|
389
|
+
|
|
390
|
+
// Results command - shows latest results for each memory configuration
|
|
391
|
+
program
|
|
392
|
+
.command('results')
|
|
393
|
+
.description('Show latest benchmark results for each memory configuration')
|
|
394
|
+
.option('-r, --results <dir>', 'Results directory', './results')
|
|
395
|
+
.option('-d, --dataset <dataset>', 'Filter by dataset')
|
|
396
|
+
.option('-a, --all', 'Show all results, not just latest')
|
|
397
|
+
.action(async options => {
|
|
398
|
+
try {
|
|
399
|
+
console.log(chalk.blue('\nš Benchmark Results Summary\n'));
|
|
400
|
+
|
|
401
|
+
// Check if results directory exists
|
|
402
|
+
if (!existsSync(options.results)) {
|
|
403
|
+
console.log(chalk.yellow('No results found. Run a benchmark first with:'));
|
|
404
|
+
console.log(chalk.gray(' longmemeval run -d <dataset> -m <model> -c <memory-config>'));
|
|
405
|
+
return;
|
|
406
|
+
}
|
|
407
|
+
|
|
408
|
+
// List all memory config directories
|
|
409
|
+
const memoryConfigs = await readdir(options.results).catch(() => []);
|
|
410
|
+
|
|
411
|
+
// Load all metrics from new structure (results/memory-config/run_xxx)
|
|
412
|
+
const allRuns: Array<{
|
|
413
|
+
runId: string;
|
|
414
|
+
metrics: any;
|
|
415
|
+
config: any;
|
|
416
|
+
timestamp: string;
|
|
417
|
+
}> = [];
|
|
418
|
+
|
|
419
|
+
// First, try new structure
|
|
420
|
+
for (const memConfig of memoryConfigs) {
|
|
421
|
+
const memConfigPath = join(options.results, memConfig);
|
|
422
|
+
try {
|
|
423
|
+
const stat = await require('fs/promises').stat(memConfigPath);
|
|
424
|
+
if (!stat.isDirectory()) continue;
|
|
425
|
+
|
|
426
|
+
const runs = await readdir(memConfigPath);
|
|
427
|
+
const runDirs = runs.filter(r => r.startsWith('run_')).sort();
|
|
428
|
+
|
|
429
|
+
for (const runDir of runDirs) {
|
|
430
|
+
const metricsPath = join(memConfigPath, runDir, 'metrics.json');
|
|
431
|
+
try {
|
|
432
|
+
const metricsContent = await readFile(metricsPath, 'utf-8');
|
|
433
|
+
const data = JSON.parse(metricsContent);
|
|
434
|
+
|
|
435
|
+
// Filter by dataset if specified
|
|
436
|
+
if (options.dataset && data.config.dataset !== options.dataset) {
|
|
437
|
+
continue;
|
|
438
|
+
}
|
|
439
|
+
|
|
440
|
+
allRuns.push({
|
|
441
|
+
runId: runDir,
|
|
442
|
+
metrics: data,
|
|
443
|
+
config: data.config,
|
|
444
|
+
timestamp: data.timestamp,
|
|
445
|
+
});
|
|
446
|
+
} catch (error) {
|
|
447
|
+
// Skip runs with missing or invalid metrics
|
|
448
|
+
}
|
|
449
|
+
}
|
|
450
|
+
} catch (error) {
|
|
451
|
+
// Not a directory, skip
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
// Also check old structure for backwards compatibility
|
|
456
|
+
const oldRuns = memoryConfigs.filter(r => r.startsWith('run_')).sort();
|
|
457
|
+
for (const runDir of oldRuns) {
|
|
458
|
+
const metricsPath = join(options.results, runDir, 'metrics.json');
|
|
459
|
+
try {
|
|
460
|
+
const metricsContent = await readFile(metricsPath, 'utf-8');
|
|
461
|
+
const data = JSON.parse(metricsContent);
|
|
462
|
+
|
|
463
|
+
// Filter by dataset if specified
|
|
464
|
+
if (options.dataset && data.config.dataset !== options.dataset) {
|
|
465
|
+
continue;
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
allRuns.push({
|
|
469
|
+
runId: runDir,
|
|
470
|
+
metrics: data,
|
|
471
|
+
config: data.config,
|
|
472
|
+
timestamp: data.timestamp,
|
|
473
|
+
});
|
|
474
|
+
} catch (error) {
|
|
475
|
+
// Skip runs with missing or invalid metrics
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
|
|
479
|
+
if (allRuns.length === 0) {
|
|
480
|
+
console.log(chalk.yellow('No results found matching criteria.'));
|
|
481
|
+
return;
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
// Group by memory configuration
|
|
485
|
+
const byMemoryConfig = new Map<string, typeof allRuns>();
|
|
486
|
+
for (const run of allRuns) {
|
|
487
|
+
const key = `${run.config.dataset}_${run.config.memoryConfig}`;
|
|
488
|
+
if (!byMemoryConfig.has(key)) {
|
|
489
|
+
byMemoryConfig.set(key, []);
|
|
490
|
+
}
|
|
491
|
+
byMemoryConfig.get(key)!.push(run);
|
|
492
|
+
}
|
|
493
|
+
|
|
494
|
+
// Sort groups by worst performing to best performing (based on latest run)
|
|
495
|
+
const sortedConfigs = Array.from(byMemoryConfig.entries()).sort(([_aKey, aRuns], [_bKey, bRuns]) => {
|
|
496
|
+
// Get latest run for each config (already sorted by timestamp)
|
|
497
|
+
const aLatest = aRuns.sort((a, b) => b.timestamp.localeCompare(a.timestamp))[0];
|
|
498
|
+
const bLatest = bRuns.sort((a, b) => b.timestamp.localeCompare(a.timestamp))[0];
|
|
499
|
+
|
|
500
|
+
// Sort by overall accuracy (worst first)
|
|
501
|
+
return aLatest.metrics.overall_accuracy - bLatest.metrics.overall_accuracy;
|
|
502
|
+
});
|
|
503
|
+
|
|
504
|
+
for (const [_configKey, runs] of sortedConfigs) {
|
|
505
|
+
// Sort runs by timestamp (newest first)
|
|
506
|
+
runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
|
|
507
|
+
|
|
508
|
+
// Show latest or all
|
|
509
|
+
const runsToShow = options.all ? runs : [runs[0]];
|
|
510
|
+
|
|
511
|
+
for (const run of runsToShow) {
|
|
512
|
+
// Get terminal width, default to 80 if not available
|
|
513
|
+
const terminalWidth = process.stdout.columns || 80;
|
|
514
|
+
const lineWidth = Math.min(terminalWidth - 1, 80); // Cap at 80 for readability
|
|
515
|
+
|
|
516
|
+
console.log(chalk.bold('\n' + 'ā'.repeat(lineWidth) + '\n'));
|
|
517
|
+
|
|
518
|
+
// Configuration header
|
|
519
|
+
console.log(chalk.bold('Configuration:\n'));
|
|
520
|
+
console.log(chalk.gray('Dataset:'), chalk.cyan(run.config.dataset));
|
|
521
|
+
console.log(chalk.gray('Model:'), chalk.cyan(run.config.model));
|
|
522
|
+
console.log(chalk.gray('Memory Config:'), chalk.cyan(run.config.memoryConfig));
|
|
523
|
+
if (run.config.subset) {
|
|
524
|
+
console.log(chalk.gray('Subset:'), chalk.cyan(`${run.config.subset} questions`));
|
|
525
|
+
}
|
|
526
|
+
console.log(chalk.gray('Run ID:'), chalk.dim(run.runId));
|
|
527
|
+
console.log(chalk.gray('Timestamp:'), chalk.dim(new Date(run.timestamp).toLocaleString()));
|
|
528
|
+
console.log(chalk.gray('ā'.repeat(Math.min(lineWidth, 60))));
|
|
529
|
+
|
|
530
|
+
// Display metrics using same format as regular runs
|
|
531
|
+
const metrics = run.metrics;
|
|
532
|
+
|
|
533
|
+
// Recalculate overall accuracy using the new formula (average of type averages)
|
|
534
|
+
const typeAccuracies = Object.values(metrics.accuracy_by_type).map((t: any) => t.accuracy);
|
|
535
|
+
const recalculatedOverall =
|
|
536
|
+
typeAccuracies.length > 0 ? typeAccuracies.reduce((sum, acc) => sum + acc, 0) / typeAccuracies.length : 0;
|
|
537
|
+
metrics.overall_accuracy = recalculatedOverall;
|
|
538
|
+
|
|
539
|
+
// Question type breakdown
|
|
540
|
+
console.log(chalk.bold('\nAccuracy by Question Type:'));
|
|
541
|
+
|
|
542
|
+
// Sort question types alphabetically
|
|
543
|
+
const sortedTypes = Object.entries(metrics.accuracy_by_type).sort(([a], [b]) => a.localeCompare(b));
|
|
544
|
+
|
|
545
|
+
for (const [type, typeMetrics] of sortedTypes) {
|
|
546
|
+
const { correct, total, accuracy } = typeMetrics as any;
|
|
547
|
+
const typeColor = accuracy >= 0.8 ? 'green' : accuracy >= 0.6 ? 'yellow' : 'red';
|
|
548
|
+
|
|
549
|
+
// Create a simple progress bar
|
|
550
|
+
const barLength = 20;
|
|
551
|
+
const filledLength = Math.round(accuracy * barLength);
|
|
552
|
+
const bar = 'ā'.repeat(filledLength) + 'ā'.repeat(barLength - filledLength);
|
|
553
|
+
|
|
554
|
+
console.log(
|
|
555
|
+
chalk.gray(` ${type.padEnd(25)}:`),
|
|
556
|
+
chalk[typeColor](`${(accuracy * 100).toFixed(1).padStart(5)}%`),
|
|
557
|
+
chalk.gray(`[${bar}]`),
|
|
558
|
+
chalk.gray(`(${correct}/${total})`),
|
|
559
|
+
);
|
|
560
|
+
}
|
|
561
|
+
|
|
562
|
+
// Abstention is hidden - it tests LLM reasoning ability rather than memory system performance
|
|
563
|
+
|
|
564
|
+
// Overall summary at the bottom
|
|
565
|
+
console.log();
|
|
566
|
+
const accuracyColor =
|
|
567
|
+
metrics.overall_accuracy >= 0.8 ? 'green' : metrics.overall_accuracy >= 0.6 ? 'yellow' : 'red';
|
|
568
|
+
console.log(
|
|
569
|
+
chalk.bold('Overall Accuracy:'),
|
|
570
|
+
chalk[accuracyColor](`${(metrics.overall_accuracy * 100).toFixed(2)}%`),
|
|
571
|
+
chalk.gray(`(average of ${Object.keys(metrics.accuracy_by_type).length} question types)`),
|
|
572
|
+
);
|
|
573
|
+
}
|
|
574
|
+
}
|
|
575
|
+
|
|
576
|
+
// Get terminal width for final separator
|
|
577
|
+
const terminalWidth = process.stdout.columns || 80;
|
|
578
|
+
const lineWidth = Math.min(terminalWidth - 1, 80);
|
|
579
|
+
|
|
580
|
+
console.log(chalk.bold('\n' + 'ā'.repeat(lineWidth)));
|
|
581
|
+
console.log(chalk.gray(`\nFound ${allRuns.length} total runs across ${byMemoryConfig.size} configurations`));
|
|
582
|
+
if (!options.all && byMemoryConfig.size > 0) {
|
|
583
|
+
console.log(chalk.gray('Use --all to see all runs, not just the latest'));
|
|
584
|
+
}
|
|
585
|
+
} catch (error) {
|
|
586
|
+
console.error(chalk.red('\nError:'), error);
|
|
587
|
+
process.exit(1);
|
|
588
|
+
}
|
|
589
|
+
});
|
|
590
|
+
|
|
591
|
+
// Report command
|
|
592
|
+
program
|
|
593
|
+
.command('report')
|
|
594
|
+
.description('Generate report from benchmark results')
|
|
595
|
+
.requiredOption('-r, --results <dir>', 'Results directory')
|
|
596
|
+
.action(async options => {
|
|
597
|
+
try {
|
|
598
|
+
console.log(chalk.blue('\nš Generating Report\n'));
|
|
599
|
+
|
|
600
|
+
// List all runs in the results directory
|
|
601
|
+
const runs = await readdir(options.results);
|
|
602
|
+
const runDirs = runs.filter(r => r.startsWith('run_'));
|
|
603
|
+
|
|
604
|
+
if (runDirs.length === 0) {
|
|
605
|
+
console.log(chalk.yellow('No benchmark runs found in the results directory'));
|
|
606
|
+
return;
|
|
607
|
+
}
|
|
608
|
+
|
|
609
|
+
console.log(chalk.bold(`Found ${runDirs.length} benchmark runs:\n`));
|
|
610
|
+
|
|
611
|
+
// Load and display metrics for each run
|
|
612
|
+
for (const runDir of runDirs) {
|
|
613
|
+
const metricsPath = join(options.results, runDir, 'metrics.json');
|
|
614
|
+
|
|
615
|
+
try {
|
|
616
|
+
const metricsContent = await readFile(metricsPath, 'utf-8');
|
|
617
|
+
const metrics = JSON.parse(metricsContent);
|
|
618
|
+
|
|
619
|
+
console.log(chalk.bold(`Run: ${runDir}`));
|
|
620
|
+
console.log(chalk.gray(` Timestamp: ${metrics.timestamp}`));
|
|
621
|
+
console.log(chalk.gray(` Dataset: ${metrics.config.dataset}`));
|
|
622
|
+
console.log(chalk.gray(` Model: ${metrics.config.model}`));
|
|
623
|
+
console.log(chalk.gray(` Memory Config: ${metrics.config.memoryConfig}`));
|
|
624
|
+
console.log(chalk.yellow(` Overall Accuracy: ${(metrics.overall_accuracy * 100).toFixed(2)}%`));
|
|
625
|
+
console.log();
|
|
626
|
+
} catch (error) {
|
|
627
|
+
console.log(chalk.red(` Error loading metrics: ${error}`));
|
|
628
|
+
}
|
|
629
|
+
}
|
|
630
|
+
} catch (error) {
|
|
631
|
+
console.error(chalk.red('\nError:'), error);
|
|
632
|
+
process.exit(1);
|
|
633
|
+
}
|
|
634
|
+
});
|
|
635
|
+
|
|
636
|
+
// Helper function to ensure dataset exists
|
|
637
|
+
async function ensureDatasetExists(dataset: string) {
|
|
638
|
+
const dataDir = join(process.cwd(), 'data');
|
|
639
|
+
const datasetPath = join(dataDir, `${dataset}.json`);
|
|
640
|
+
|
|
641
|
+
// Check if dataset exists and is valid (> 1MB)
|
|
642
|
+
if (existsSync(datasetPath)) {
|
|
643
|
+
try {
|
|
644
|
+
const stats = statSync(datasetPath);
|
|
645
|
+
if (stats.size > 1000000) {
|
|
646
|
+
return; // Dataset exists and is valid
|
|
647
|
+
}
|
|
648
|
+
} catch (error) {
|
|
649
|
+
// File exists but can't get stats, continue to download
|
|
650
|
+
}
|
|
651
|
+
}
|
|
652
|
+
|
|
653
|
+
// Dataset missing or invalid, need to download
|
|
654
|
+
console.log(chalk.yellow(`Dataset ${dataset} not found or invalid.\n`));
|
|
655
|
+
|
|
656
|
+
// Check for HuggingFace token
|
|
657
|
+
const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
|
|
658
|
+
if (!token) {
|
|
659
|
+
console.log(chalk.red('Error: HuggingFace token required to download datasets.\n'));
|
|
660
|
+
console.log(chalk.gray('1. Get your token from:'));
|
|
661
|
+
console.log(chalk.cyan(' https://huggingface.co/settings/tokens\n'));
|
|
662
|
+
console.log(chalk.gray('2. Set it as an environment variable:'));
|
|
663
|
+
console.log(chalk.cyan(' export HF_TOKEN=your_token_here\n'));
|
|
664
|
+
console.log(chalk.gray('3. Run the benchmark again\n'));
|
|
665
|
+
console.log(chalk.blue('Alternative: Download manually from Google Drive'));
|
|
666
|
+
console.log(chalk.gray('See DOWNLOAD_GUIDE.md for instructions'));
|
|
667
|
+
process.exit(1);
|
|
668
|
+
}
|
|
669
|
+
|
|
670
|
+
console.log(chalk.blue('Downloading dataset...\n'));
|
|
671
|
+
|
|
672
|
+
try {
|
|
673
|
+
// Run the download script
|
|
674
|
+
execSync('pnpm download', { stdio: 'inherit' });
|
|
675
|
+
|
|
676
|
+
// Verify download succeeded
|
|
677
|
+
if (!existsSync(datasetPath) || statSync(datasetPath).size < 1000000) {
|
|
678
|
+
throw new Error('Dataset download failed or file is invalid');
|
|
679
|
+
}
|
|
680
|
+
|
|
681
|
+
console.log(chalk.green('\nā
Dataset downloaded successfully!\n'));
|
|
682
|
+
} catch (error) {
|
|
683
|
+
console.error(chalk.red('\nError downloading dataset:'), error);
|
|
684
|
+
console.log(chalk.yellow('\nPlease download the dataset manually.'));
|
|
685
|
+
console.log(chalk.gray('See DOWNLOAD_GUIDE.md for instructions'));
|
|
686
|
+
process.exit(1);
|
|
687
|
+
}
|
|
688
|
+
}
|
|
689
|
+
|
|
690
|
+
// Parse process.argv and dispatch to the matching command's action.
program.parse();
|