@mastra/longmemeval 0.0.0-add-libsql-changeset-20250910154739

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/CHANGELOG.md +919 -0
  2. package/DATA_DOWNLOAD_GUIDE.md +117 -0
  3. package/LICENSE.md +15 -0
  4. package/README.md +173 -0
  5. package/USAGE.md +105 -0
  6. package/package.json +67 -0
  7. package/scripts/download.ts +180 -0
  8. package/scripts/find-failed.ts +176 -0
  9. package/scripts/generate-embeddings.ts +56 -0
  10. package/scripts/generate-wm-templates.ts +296 -0
  11. package/scripts/setup.ts +60 -0
  12. package/src/__fixtures__/embeddings.json +2319 -0
  13. package/src/__fixtures__/test-dataset.json +82 -0
  14. package/src/cli.ts +690 -0
  15. package/src/commands/__tests__/prepare.test.ts +230 -0
  16. package/src/commands/__tests__/run.test.ts +403 -0
  17. package/src/commands/prepare.ts +793 -0
  18. package/src/commands/run.ts +553 -0
  19. package/src/config.ts +83 -0
  20. package/src/data/loader.ts +163 -0
  21. package/src/data/types.ts +61 -0
  22. package/src/embeddings/cached-openai-embedding-model.ts +227 -0
  23. package/src/embeddings/cached-openai-provider.ts +40 -0
  24. package/src/embeddings/index.ts +2 -0
  25. package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
  26. package/src/evaluation/longmemeval-metric.ts +173 -0
  27. package/src/retry-model.ts +60 -0
  28. package/src/storage/__tests__/benchmark-store.test.ts +280 -0
  29. package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
  30. package/src/storage/benchmark-store.ts +540 -0
  31. package/src/storage/benchmark-vector.ts +234 -0
  32. package/src/storage/index.ts +2 -0
  33. package/src/test-utils/mock-embeddings.ts +54 -0
  34. package/src/test-utils/mock-model.ts +49 -0
  35. package/tests/data-loader.test.ts +96 -0
  36. package/tsconfig.json +18 -0
  37. package/vitest.config.ts +9 -0
package/src/cli.ts ADDED
@@ -0,0 +1,690 @@
1
+ #!/usr/bin/env node
2
+
3
+ import { Command } from 'commander';
4
+ import chalk from 'chalk';
5
+ import { readFile, readdir } from 'fs/promises';
6
+ import { join } from 'path';
7
+ import { existsSync, statSync } from 'fs';
8
+ import { execSync } from 'child_process';
9
+
10
+ import { DatasetLoader } from './data/loader';
11
+ import type { EvaluationResult, BenchmarkMetrics, QuestionType } from './data/types';
12
+ import { PrepareCommand } from './commands/prepare';
13
+ import { RunCommand } from './commands/run';
14
+
15
+ const program = new Command();
16
+
17
+ // Force immediate exit on Ctrl+C
18
+ process.on('SIGINT', () => {
19
+ console.log('\n\nForce exiting...');
20
+ process.exit(130); // Standard exit code for SIGINT
21
+ });
22
+
23
+ // Also handle SIGTERM
24
+ process.on('SIGTERM', () => {
25
+ process.exit(143); // Standard exit code for SIGTERM
26
+ });
27
+
28
+ // Helper function to calculate metrics
29
+ function calculateMetrics(results: EvaluationResult[]): BenchmarkMetrics {
30
+ const metrics: BenchmarkMetrics = {
31
+ overall_accuracy: 0,
32
+ accuracy_by_type: {},
33
+ abstention_accuracy: 0,
34
+ total_questions: results.length,
35
+ correct_answers: 0,
36
+ abstention_correct: 0,
37
+ abstention_total: 0,
38
+ } as const;
39
+
40
+ // Calculate overall metrics
41
+ for (const result of results) {
42
+ if (result.is_correct) {
43
+ metrics.correct_answers++;
44
+ }
45
+
46
+ // Track by question type
47
+ const type = result.question_type;
48
+ if (type && !metrics.accuracy_by_type[type]) {
49
+ metrics.accuracy_by_type[type] = { correct: 0, total: 0, accuracy: 0 };
50
+ }
51
+ const accuracyByType = type ? metrics.accuracy_by_type[type] : null;
52
+ if (accuracyByType) {
53
+ accuracyByType.total++;
54
+ }
55
+ if (accuracyByType && result.is_correct) {
56
+ accuracyByType.correct++;
57
+ }
58
+
59
+ // Track abstention separately
60
+ if (result.question_id.endsWith('_abs')) {
61
+ metrics.abstention_total!++;
62
+ if (result.is_correct) {
63
+ metrics.abstention_correct!++;
64
+ }
65
+ }
66
+ }
67
+
68
+ // Calculate per-type accuracies first
69
+ for (const type in metrics.accuracy_by_type) {
70
+ const typeMetrics = metrics.accuracy_by_type[type as QuestionType];
71
+ if (typeMetrics) {
72
+ typeMetrics.accuracy = typeMetrics.total > 0 ? typeMetrics.correct / typeMetrics.total : 0;
73
+ }
74
+ }
75
+
76
+ if (metrics && (metrics.abstention_total || 0) > 0) {
77
+ metrics.abstention_accuracy = (metrics.abstention_correct || 0) / (metrics.abstention_total || 0);
78
+ }
79
+
80
+ // Calculate overall accuracy as average of all question type accuracies (excluding abstention)
81
+ const allTypeAccuracies = Object.values(metrics.accuracy_by_type).map(t => t.accuracy);
82
+
83
+ metrics.overall_accuracy =
84
+ allTypeAccuracies.length > 0 ? allTypeAccuracies.reduce((sum, acc) => sum + acc, 0) / allTypeAccuracies.length : 0;
85
+
86
+ return metrics;
87
+ }
88
+
89
// Top-level CLI identity: name, help blurb, and version string.
program
  .name('longmemeval')
  .description('LongMemEval benchmark for Mastra Memory')
  .version('0.1.0');
90
+
91
// Prepare command: replays dataset conversations through mock agents so the
// chosen memory configuration (recall indexes, working memory, …) is
// populated before a benchmark run.
program
  .command('prepare')
  .description('Prepare LongMemEval data by processing through mock agents')
  .option('-d, --dataset <dataset>', 'Dataset to use', 'longmemeval_s')
  .option(
    '-c, --memory-config <config>',
    'Memory configuration (last-k, semantic-recall, semantic-recall-reranked, working-memory, working-memory-tailored, combined, combined-tailored)',
    'semantic-recall',
  )
  .option('-o, --output <dir>', 'Output directory for prepared data', './prepared-data')
  .option('--subset <n>', 'Prepare only a subset of n questions', parseInt)
  .option('--concurrency <n>', 'Number of questions to process in parallel', parseInt)
  .option('--question-id <id>', 'Prepare a specific question by ID')
  .option('--resume-from-message-id <id>', 'Resume processing from a specific message ID')
  .option('--session-limit <n>', 'Limit processing to n sessions after resume point', parseInt)
  .option('--session-offset <n>', 'Start processing from the nth session (1-based)', parseInt)
  .action(async options => {
    try {
      // Echo the effective configuration so the user can sanity-check it
      // before confirming the (potentially expensive) preparation below.
      console.log(chalk.blue('\nšŸš€ LongMemEval Data Preparation\n'));
      console.log(chalk.gray(`Dataset: ${options.dataset}`));
      console.log(chalk.gray(`Memory Config: ${options.memoryConfig}`));
      if (options.subset) {
        console.log(chalk.gray(`Subset: ${options.subset} questions`));
      }
      if (options.questionId) {
        console.log(chalk.gray(`Question ID: ${options.questionId}`));
      }
      if (options.resumeFromMessageId) {
        console.log(chalk.gray(`Resume from message ID: ${options.resumeFromMessageId}`));
      }
      if (options.sessionLimit) {
        console.log(chalk.gray(`Session limit: ${options.sessionLimit} sessions`));
      }
      if (options.sessionOffset) {
        console.log(chalk.gray(`Session offset: Start from session ${options.sessionOffset}`));
      }
      console.log();

      // Check for OpenAI API key (needed for embeddings in semantic-recall)
      if (
        (options.memoryConfig === 'semantic-recall' || options.memoryConfig === 'combined') &&
        !process.env.OPENAI_API_KEY
      ) {
        console.error(chalk.red('Error: OPENAI_API_KEY environment variable is required for semantic recall'));
        console.error(chalk.gray('Please set it in your environment or .env file'));
        process.exit(1);
      }

      // Validate dataset option
      const validDatasets = ['longmemeval_s', 'longmemeval_m', 'longmemeval_oracle'];
      if (!validDatasets.includes(options.dataset)) {
        console.error(chalk.red(`Invalid dataset: ${options.dataset}`));
        console.error(chalk.gray(`Valid options: ${validDatasets.join(', ')}`));
        process.exit(1);
      }

      // Check if dataset exists and download if needed
      await ensureDatasetExists(options.dataset);

      // Show warning and ask for confirmation: preparation can burn
      // significant API credits, so require an explicit interactive yes.
      console.log(chalk.yellow('\nāš ļø WARNING'));
      console.log(chalk.yellow('━'.repeat(50)));
      console.log(chalk.bold('\nPreparing this data can be very expensive!\n'));
      console.log('This process will:');
      console.log(' • Process many conversations through AI models');
      console.log(' • Generate embeddings for semantic recall');
      console.log(' • Potentially use significant API credits\n');
      console.log(chalk.gray('Memory configs like "working-memory" and "combined" are especially costly.\n'));

      // Prompt on stdin; anything other than y/yes (case-insensitive) aborts.
      const readline = await import('readline');
      const rl = readline.createInterface({
        input: process.stdin,
        output: process.stdout,
      });

      const answer = await new Promise<string>(resolve => {
        rl.question(chalk.bold('Are you sure you want to continue? (y/N): '), resolve);
      });
      rl.close();

      if (answer.toLowerCase() !== 'y' && answer.toLowerCase() !== 'yes') {
        console.log(chalk.gray('\nCancelled by user.'));
        process.exit(0);
      }

      console.log(); // Add spacing before continuing

      // Run prepare command (the heavy lifting lives in PrepareCommand).
      const prepareCommand = new PrepareCommand();
      await prepareCommand.run({
        dataset: options.dataset,
        memoryConfig: options.memoryConfig,
        outputDir: options.output,
        subset: options.subset,
        concurrency: options.concurrency,
        questionId: options.questionId,
        resumeFromMessageId: options.resumeFromMessageId,
        sessionLimit: options.sessionLimit,
        sessionOffset: options.sessionOffset,
      });

      // Force exit after completion — lingering handles (DB connections,
      // timers) could otherwise keep the event loop alive.
      setTimeout(() => {
        process.exit(0);
      }, 100); // Give a tiny bit of time for any cleanup
    } catch (error) {
      console.error(chalk.red('\nError:'), error);
      process.exit(1);
    }
  });
202
+
203
// Run benchmark command: answers each question with the chosen model against
// previously prepared memory state, then writes scored results.
program
  .command('run')
  .description('Run LongMemEval benchmark using prepared data')
  .requiredOption('-d, --dataset <dataset>', 'Dataset to use (longmemeval_s, longmemeval_m, longmemeval_oracle)')
  .requiredOption('-m, --model <model>', 'Model to use (e.g., gpt-4o, claude-3-opus)')
  .option(
    '-c, --memory-config <config>',
    'Memory configuration (last-k, semantic-recall, semantic-recall-reranked, working-memory, working-memory-tailored, combined, combined-tailored)',
    'semantic-recall',
  )
  .option('-o, --output <dir>', 'Output directory for results', './results')
  .option('--prepared-data <dir>', 'Directory containing prepared data', './prepared-data')
  .option('--subset <n>', 'Run on subset of n questions', parseInt)
  .option('--concurrency <n>', 'Number of parallel requests (default: 5)', parseInt)
  .option('--question-id <id>', 'Focus on a specific question by ID')
  .action(async options => {
    try {
      console.log(chalk.blue('\nšŸš€ LongMemEval Benchmark Runner\n'));

      // Check for OpenAI API key — required unconditionally here (embeddings
      // and/or the judge model go through OpenAI).
      if (!process.env.OPENAI_API_KEY) {
        console.error(chalk.red('Error: OPENAI_API_KEY environment variable is not set'));
        console.error(chalk.gray('Please set it in your environment or .env file'));
        process.exit(1);
      }

      // Validate dataset option
      const validDatasets = ['longmemeval_s', 'longmemeval_m', 'longmemeval_oracle'];
      if (!validDatasets.includes(options.dataset)) {
        console.error(chalk.red(`Invalid dataset: ${options.dataset}`));
        console.error(chalk.gray(`Valid options: ${validDatasets.join(', ')}`));
        process.exit(1);
      }

      // Run benchmark using prepared data (heavy lifting in RunCommand).
      const runCommand = new RunCommand();
      await runCommand.run({
        dataset: options.dataset,
        memoryConfig: options.memoryConfig,
        model: options.model,
        preparedDataDir: options.preparedData,
        outputDir: options.output,
        subset: options.subset,
        concurrency: options.concurrency,
        questionId: options.questionId,
      });

      // Force exit after completion — lingering handles could otherwise keep
      // the event loop alive.
      setTimeout(() => {
        process.exit(0);
      }, 100); // Give a tiny bit of time for any cleanup
    } catch (error) {
      console.error(chalk.red('\nError:'), error);
      process.exit(1);
    }
  });
260
+
261
// Evaluate command: recompute metrics offline from a previously written
// JSONL results file, without re-running any model.
program
  .command('evaluate')
  .description('Evaluate existing results')
  .requiredOption('-r, --results <file>', 'Results file (JSONL format)')
  .requiredOption('-d, --dataset <dataset>', 'Dataset used for questions')
  .action(async options => {
    try {
      console.log(chalk.blue('\nšŸ“Š Evaluating Results\n'));

      // NOTE(review): --dataset is required but currently unused — the loader
      // call below is commented out. Confirm whether the option should remain
      // required or the loader should be re-enabled.
      // const loader = new DatasetLoader();
      // const questions = await loader.loadDataset(options.dataset);

      // Load results: JSONL, i.e. one JSON object per non-empty line.
      const resultsContent = await readFile(options.results, 'utf-8');
      const results: EvaluationResult[] = resultsContent
        .split('\n')
        .filter(line => line.trim())
        .map(line => JSON.parse(line));

      // Calculate metrics
      const metrics = calculateMetrics(results);

      // Print metrics
      console.log(chalk.bold('Overall Accuracy:'), chalk.yellow(`${(metrics.overall_accuracy * 100).toFixed(2)}%`));
      console.log(chalk.bold('Total Questions:'), metrics.total_questions);
      console.log(chalk.bold('Correct Answers:'), metrics.correct_answers);

      console.log(chalk.bold('\nAccuracy by Question Type:'));
      for (const [type, typeMetrics] of Object.entries(metrics.accuracy_by_type)) {
        const { correct, total, accuracy } = typeMetrics;
        console.log(
          chalk.gray(` ${type}:`),
          chalk.yellow(`${(accuracy * 100).toFixed(2)}%`),
          chalk.gray(`(${correct}/${total})`),
        );
      }
    } catch (error) {
      console.error(chalk.red('\nError:'), error);
      process.exit(1);
    }
  });
303
+
304
+ // Stats command
305
+ program
306
+ .command('stats')
307
+ .description('Show dataset statistics')
308
+ .requiredOption('-d, --dataset <dataset>', 'Dataset to analyze')
309
+ .action(async options => {
310
+ try {
311
+ console.log(chalk.blue('\nšŸ“ˆ Dataset Statistics\n'));
312
+
313
+ const loader = new DatasetLoader();
314
+ const stats = await loader.getDatasetStats(options.dataset);
315
+
316
+ console.log(chalk.bold('Total Questions:'), stats.totalQuestions);
317
+ console.log(chalk.bold('Abstention Questions:'), stats.abstentionQuestions);
318
+ console.log(chalk.bold('Avg Sessions per Question:'), stats.avgSessionsPerQuestion.toFixed(2));
319
+ console.log(chalk.bold('Avg Turns per Session:'), stats.avgTurnsPerSession.toFixed(2));
320
+ console.log(chalk.bold('Total Tokens (estimate):'), stats.totalTokensEstimate.toLocaleString());
321
+
322
+ console.log(chalk.bold('\nQuestions by Type:'));
323
+ for (const [type, count] of Object.entries(stats.questionsByType)) {
324
+ console.log(chalk.gray(` ${type}:`), count);
325
+ }
326
+ } catch (error) {
327
+ console.error(chalk.red('\nError:'), error);
328
+ process.exit(1);
329
+ }
330
+ });
331
+
332
// List command to show available questions: scans the prepared-data tree and
// prints each question's id, type, and a truncated prompt.
program
  .command('list')
  .description('List prepared questions with their IDs')
  .requiredOption('-d, --dataset <dataset>', 'Dataset to list from')
  .option('-c, --memory-config <config>', 'Memory configuration', 'semantic-recall')
  .option('--prepared-data <dir>', 'Directory containing prepared data', './prepared-data')
  .action(async options => {
    try {
      console.log(chalk.blue('\nšŸ“‹ Listing Prepared Questions\n'));

      // Layout: <prepared-data>/<dataset>/<memory-config>/<question-dir>/meta.json
      const preparedDir = join(options.preparedData, options.dataset, options.memoryConfig);

      if (!existsSync(preparedDir)) {
        console.error(chalk.red(`No prepared data found for ${options.dataset} with ${options.memoryConfig} config`));
        console.error(chalk.gray(`Run 'longmemeval prepare' first`));
        process.exit(1);
      }

      const questionDirs = await readdir(preparedDir);
      // Collected meta.json payloads. Untyped (any) — usage below assumes at
      // least questionId, questionType, and question fields written by prepare.
      const questions: any[] = [];

      for (const questionDir of questionDirs) {
        const metaPath = join(preparedDir, questionDir, 'meta.json');
        if (existsSync(metaPath)) {
          const meta = JSON.parse(await readFile(metaPath, 'utf-8'));
          questions.push(meta);
        }
      }

      // Sort by question ID
      questions.sort((a, b) => a.questionId.localeCompare(b.questionId));

      console.log(chalk.gray(`Found ${questions.length} prepared questions:\n`));

      for (const q of questions) {
        // Color-code by question type family for quick visual scanning.
        const typeColor = q.questionType.includes('single')
          ? 'blue'
          : q.questionType.includes('multi')
            ? 'green'
            : q.questionType.includes('temporal')
              ? 'yellow'
              : 'cyan';

        // Truncate long question text to keep one line per question.
        console.log(
          chalk.bold(q.questionId),
          chalk[typeColor](`[${q.questionType}]`),
          chalk.gray(`- "${q.question.substring(0, 60)}${q.question.length > 60 ? '...' : ''}"`),
        );
      }

      console.log(chalk.gray(`\nTo run a specific question: longmemeval run --question-id <id> ...`));
    } catch (error) {
      console.error(chalk.red('\nError:'), error);
      process.exit(1);
    }
  });
389
+
390
+ // Results command - shows latest results for each memory configuration
391
+ program
392
+ .command('results')
393
+ .description('Show latest benchmark results for each memory configuration')
394
+ .option('-r, --results <dir>', 'Results directory', './results')
395
+ .option('-d, --dataset <dataset>', 'Filter by dataset')
396
+ .option('-a, --all', 'Show all results, not just latest')
397
+ .action(async options => {
398
+ try {
399
+ console.log(chalk.blue('\nšŸ“Š Benchmark Results Summary\n'));
400
+
401
+ // Check if results directory exists
402
+ if (!existsSync(options.results)) {
403
+ console.log(chalk.yellow('No results found. Run a benchmark first with:'));
404
+ console.log(chalk.gray(' longmemeval run -d <dataset> -m <model> -c <memory-config>'));
405
+ return;
406
+ }
407
+
408
+ // List all memory config directories
409
+ const memoryConfigs = await readdir(options.results).catch(() => []);
410
+
411
+ // Load all metrics from new structure (results/memory-config/run_xxx)
412
+ const allRuns: Array<{
413
+ runId: string;
414
+ metrics: any;
415
+ config: any;
416
+ timestamp: string;
417
+ }> = [];
418
+
419
+ // First, try new structure
420
+ for (const memConfig of memoryConfigs) {
421
+ const memConfigPath = join(options.results, memConfig);
422
+ try {
423
+ const stat = await require('fs/promises').stat(memConfigPath);
424
+ if (!stat.isDirectory()) continue;
425
+
426
+ const runs = await readdir(memConfigPath);
427
+ const runDirs = runs.filter(r => r.startsWith('run_')).sort();
428
+
429
+ for (const runDir of runDirs) {
430
+ const metricsPath = join(memConfigPath, runDir, 'metrics.json');
431
+ try {
432
+ const metricsContent = await readFile(metricsPath, 'utf-8');
433
+ const data = JSON.parse(metricsContent);
434
+
435
+ // Filter by dataset if specified
436
+ if (options.dataset && data.config.dataset !== options.dataset) {
437
+ continue;
438
+ }
439
+
440
+ allRuns.push({
441
+ runId: runDir,
442
+ metrics: data,
443
+ config: data.config,
444
+ timestamp: data.timestamp,
445
+ });
446
+ } catch (error) {
447
+ // Skip runs with missing or invalid metrics
448
+ }
449
+ }
450
+ } catch (error) {
451
+ // Not a directory, skip
452
+ }
453
+ }
454
+
455
+ // Also check old structure for backwards compatibility
456
+ const oldRuns = memoryConfigs.filter(r => r.startsWith('run_')).sort();
457
+ for (const runDir of oldRuns) {
458
+ const metricsPath = join(options.results, runDir, 'metrics.json');
459
+ try {
460
+ const metricsContent = await readFile(metricsPath, 'utf-8');
461
+ const data = JSON.parse(metricsContent);
462
+
463
+ // Filter by dataset if specified
464
+ if (options.dataset && data.config.dataset !== options.dataset) {
465
+ continue;
466
+ }
467
+
468
+ allRuns.push({
469
+ runId: runDir,
470
+ metrics: data,
471
+ config: data.config,
472
+ timestamp: data.timestamp,
473
+ });
474
+ } catch (error) {
475
+ // Skip runs with missing or invalid metrics
476
+ }
477
+ }
478
+
479
+ if (allRuns.length === 0) {
480
+ console.log(chalk.yellow('No results found matching criteria.'));
481
+ return;
482
+ }
483
+
484
+ // Group by memory configuration
485
+ const byMemoryConfig = new Map<string, typeof allRuns>();
486
+ for (const run of allRuns) {
487
+ const key = `${run.config.dataset}_${run.config.memoryConfig}`;
488
+ if (!byMemoryConfig.has(key)) {
489
+ byMemoryConfig.set(key, []);
490
+ }
491
+ byMemoryConfig.get(key)!.push(run);
492
+ }
493
+
494
+ // Sort groups by worst performing to best performing (based on latest run)
495
+ const sortedConfigs = Array.from(byMemoryConfig.entries()).sort(([_aKey, aRuns], [_bKey, bRuns]) => {
496
+ // Get latest run for each config (already sorted by timestamp)
497
+ const aLatest = aRuns.sort((a, b) => b.timestamp.localeCompare(a.timestamp))[0];
498
+ const bLatest = bRuns.sort((a, b) => b.timestamp.localeCompare(a.timestamp))[0];
499
+
500
+ // Sort by overall accuracy (worst first)
501
+ return aLatest.metrics.overall_accuracy - bLatest.metrics.overall_accuracy;
502
+ });
503
+
504
+ for (const [_configKey, runs] of sortedConfigs) {
505
+ // Sort runs by timestamp (newest first)
506
+ runs.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
507
+
508
+ // Show latest or all
509
+ const runsToShow = options.all ? runs : [runs[0]];
510
+
511
+ for (const run of runsToShow) {
512
+ // Get terminal width, default to 80 if not available
513
+ const terminalWidth = process.stdout.columns || 80;
514
+ const lineWidth = Math.min(terminalWidth - 1, 80); // Cap at 80 for readability
515
+
516
+ console.log(chalk.bold('\n' + '═'.repeat(lineWidth) + '\n'));
517
+
518
+ // Configuration header
519
+ console.log(chalk.bold('Configuration:\n'));
520
+ console.log(chalk.gray('Dataset:'), chalk.cyan(run.config.dataset));
521
+ console.log(chalk.gray('Model:'), chalk.cyan(run.config.model));
522
+ console.log(chalk.gray('Memory Config:'), chalk.cyan(run.config.memoryConfig));
523
+ if (run.config.subset) {
524
+ console.log(chalk.gray('Subset:'), chalk.cyan(`${run.config.subset} questions`));
525
+ }
526
+ console.log(chalk.gray('Run ID:'), chalk.dim(run.runId));
527
+ console.log(chalk.gray('Timestamp:'), chalk.dim(new Date(run.timestamp).toLocaleString()));
528
+ console.log(chalk.gray('─'.repeat(Math.min(lineWidth, 60))));
529
+
530
+ // Display metrics using same format as regular runs
531
+ const metrics = run.metrics;
532
+
533
+ // Recalculate overall accuracy using the new formula (average of type averages)
534
+ const typeAccuracies = Object.values(metrics.accuracy_by_type).map((t: any) => t.accuracy);
535
+ const recalculatedOverall =
536
+ typeAccuracies.length > 0 ? typeAccuracies.reduce((sum, acc) => sum + acc, 0) / typeAccuracies.length : 0;
537
+ metrics.overall_accuracy = recalculatedOverall;
538
+
539
+ // Question type breakdown
540
+ console.log(chalk.bold('\nAccuracy by Question Type:'));
541
+
542
+ // Sort question types alphabetically
543
+ const sortedTypes = Object.entries(metrics.accuracy_by_type).sort(([a], [b]) => a.localeCompare(b));
544
+
545
+ for (const [type, typeMetrics] of sortedTypes) {
546
+ const { correct, total, accuracy } = typeMetrics as any;
547
+ const typeColor = accuracy >= 0.8 ? 'green' : accuracy >= 0.6 ? 'yellow' : 'red';
548
+
549
+ // Create a simple progress bar
550
+ const barLength = 20;
551
+ const filledLength = Math.round(accuracy * barLength);
552
+ const bar = 'ā–ˆ'.repeat(filledLength) + 'ā–‘'.repeat(barLength - filledLength);
553
+
554
+ console.log(
555
+ chalk.gray(` ${type.padEnd(25)}:`),
556
+ chalk[typeColor](`${(accuracy * 100).toFixed(1).padStart(5)}%`),
557
+ chalk.gray(`[${bar}]`),
558
+ chalk.gray(`(${correct}/${total})`),
559
+ );
560
+ }
561
+
562
+ // Abstention is hidden - it tests LLM reasoning ability rather than memory system performance
563
+
564
+ // Overall summary at the bottom
565
+ console.log();
566
+ const accuracyColor =
567
+ metrics.overall_accuracy >= 0.8 ? 'green' : metrics.overall_accuracy >= 0.6 ? 'yellow' : 'red';
568
+ console.log(
569
+ chalk.bold('Overall Accuracy:'),
570
+ chalk[accuracyColor](`${(metrics.overall_accuracy * 100).toFixed(2)}%`),
571
+ chalk.gray(`(average of ${Object.keys(metrics.accuracy_by_type).length} question types)`),
572
+ );
573
+ }
574
+ }
575
+
576
+ // Get terminal width for final separator
577
+ const terminalWidth = process.stdout.columns || 80;
578
+ const lineWidth = Math.min(terminalWidth - 1, 80);
579
+
580
+ console.log(chalk.bold('\n' + '═'.repeat(lineWidth)));
581
+ console.log(chalk.gray(`\nFound ${allRuns.length} total runs across ${byMemoryConfig.size} configurations`));
582
+ if (!options.all && byMemoryConfig.size > 0) {
583
+ console.log(chalk.gray('Use --all to see all runs, not just the latest'));
584
+ }
585
+ } catch (error) {
586
+ console.error(chalk.red('\nError:'), error);
587
+ process.exit(1);
588
+ }
589
+ });
590
+
591
// Report command: print a per-run summary for every run_* directory found
// directly under the given results directory (legacy flat layout).
program
  .command('report')
  .description('Generate report from benchmark results')
  .requiredOption('-r, --results <dir>', 'Results directory')
  .action(async options => {
    try {
      console.log(chalk.blue('\nšŸ“„ Generating Report\n'));

      // List all runs in the results directory
      const runs = await readdir(options.results);
      const runDirs = runs.filter(r => r.startsWith('run_'));

      if (runDirs.length === 0) {
        console.log(chalk.yellow('No benchmark runs found in the results directory'));
        return;
      }

      console.log(chalk.bold(`Found ${runDirs.length} benchmark runs:\n`));

      // Load and display metrics for each run; a bad metrics.json only fails
      // that one run, not the whole report.
      for (const runDir of runDirs) {
        const metricsPath = join(options.results, runDir, 'metrics.json');

        try {
          const metricsContent = await readFile(metricsPath, 'utf-8');
          const metrics = JSON.parse(metricsContent);

          console.log(chalk.bold(`Run: ${runDir}`));
          console.log(chalk.gray(` Timestamp: ${metrics.timestamp}`));
          console.log(chalk.gray(` Dataset: ${metrics.config.dataset}`));
          console.log(chalk.gray(` Model: ${metrics.config.model}`));
          console.log(chalk.gray(` Memory Config: ${metrics.config.memoryConfig}`));
          console.log(chalk.yellow(` Overall Accuracy: ${(metrics.overall_accuracy * 100).toFixed(2)}%`));
          console.log();
        } catch (error) {
          console.log(chalk.red(` Error loading metrics: ${error}`));
        }
      }
    } catch (error) {
      console.error(chalk.red('\nError:'), error);
      process.exit(1);
    }
  });
635
+
636
+ // Helper function to ensure dataset exists
637
+ async function ensureDatasetExists(dataset: string) {
638
+ const dataDir = join(process.cwd(), 'data');
639
+ const datasetPath = join(dataDir, `${dataset}.json`);
640
+
641
+ // Check if dataset exists and is valid (> 1MB)
642
+ if (existsSync(datasetPath)) {
643
+ try {
644
+ const stats = statSync(datasetPath);
645
+ if (stats.size > 1000000) {
646
+ return; // Dataset exists and is valid
647
+ }
648
+ } catch (error) {
649
+ // File exists but can't get stats, continue to download
650
+ }
651
+ }
652
+
653
+ // Dataset missing or invalid, need to download
654
+ console.log(chalk.yellow(`Dataset ${dataset} not found or invalid.\n`));
655
+
656
+ // Check for HuggingFace token
657
+ const token = process.env.HF_TOKEN || process.env.HUGGINGFACE_TOKEN;
658
+ if (!token) {
659
+ console.log(chalk.red('Error: HuggingFace token required to download datasets.\n'));
660
+ console.log(chalk.gray('1. Get your token from:'));
661
+ console.log(chalk.cyan(' https://huggingface.co/settings/tokens\n'));
662
+ console.log(chalk.gray('2. Set it as an environment variable:'));
663
+ console.log(chalk.cyan(' export HF_TOKEN=your_token_here\n'));
664
+ console.log(chalk.gray('3. Run the benchmark again\n'));
665
+ console.log(chalk.blue('Alternative: Download manually from Google Drive'));
666
+ console.log(chalk.gray('See DOWNLOAD_GUIDE.md for instructions'));
667
+ process.exit(1);
668
+ }
669
+
670
+ console.log(chalk.blue('Downloading dataset...\n'));
671
+
672
+ try {
673
+ // Run the download script
674
+ execSync('pnpm download', { stdio: 'inherit' });
675
+
676
+ // Verify download succeeded
677
+ if (!existsSync(datasetPath) || statSync(datasetPath).size < 1000000) {
678
+ throw new Error('Dataset download failed or file is invalid');
679
+ }
680
+
681
+ console.log(chalk.green('\nāœ… Dataset downloaded successfully!\n'));
682
+ } catch (error) {
683
+ console.error(chalk.red('\nError downloading dataset:'), error);
684
+ console.log(chalk.yellow('\nPlease download the dataset manually.'));
685
+ console.log(chalk.gray('See DOWNLOAD_GUIDE.md for instructions'));
686
+ process.exit(1);
687
+ }
688
+ }
689
+
690
// Hand process.argv to commander, which dispatches to the matching
// subcommand's .action() handler.
program.parse();