@mastra/longmemeval 0.0.0-add-libsql-changeset-20250910154739
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +919 -0
- package/DATA_DOWNLOAD_GUIDE.md +117 -0
- package/LICENSE.md +15 -0
- package/README.md +173 -0
- package/USAGE.md +105 -0
- package/package.json +67 -0
- package/scripts/download.ts +180 -0
- package/scripts/find-failed.ts +176 -0
- package/scripts/generate-embeddings.ts +56 -0
- package/scripts/generate-wm-templates.ts +296 -0
- package/scripts/setup.ts +60 -0
- package/src/__fixtures__/embeddings.json +2319 -0
- package/src/__fixtures__/test-dataset.json +82 -0
- package/src/cli.ts +690 -0
- package/src/commands/__tests__/prepare.test.ts +230 -0
- package/src/commands/__tests__/run.test.ts +403 -0
- package/src/commands/prepare.ts +793 -0
- package/src/commands/run.ts +553 -0
- package/src/config.ts +83 -0
- package/src/data/loader.ts +163 -0
- package/src/data/types.ts +61 -0
- package/src/embeddings/cached-openai-embedding-model.ts +227 -0
- package/src/embeddings/cached-openai-provider.ts +40 -0
- package/src/embeddings/index.ts +2 -0
- package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
- package/src/evaluation/longmemeval-metric.ts +173 -0
- package/src/retry-model.ts +60 -0
- package/src/storage/__tests__/benchmark-store.test.ts +280 -0
- package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
- package/src/storage/benchmark-store.ts +540 -0
- package/src/storage/benchmark-vector.ts +234 -0
- package/src/storage/index.ts +2 -0
- package/src/test-utils/mock-embeddings.ts +54 -0
- package/src/test-utils/mock-model.ts +49 -0
- package/tests/data-loader.test.ts +96 -0
- package/tsconfig.json +18 -0
- package/vitest.config.ts +9 -0
|
@@ -0,0 +1,793 @@
|
|
|
1
|
+
import { Agent } from '@mastra/core/agent';
|
|
2
|
+
import { Memory } from '@mastra/memory';
|
|
3
|
+
import { MockLanguageModelV1 } from '../test-utils/mock-model';
|
|
4
|
+
import { openai } from '@ai-sdk/openai';
|
|
5
|
+
import { cachedOpenAI } from '../embeddings/cached-openai-provider';
|
|
6
|
+
import { embeddingCacheStats } from '../embeddings';
|
|
7
|
+
import chalk from 'chalk';
|
|
8
|
+
import ora from 'ora';
|
|
9
|
+
import { join } from 'path';
|
|
10
|
+
import { mkdir, writeFile, readFile, unlink } from 'fs/promises';
|
|
11
|
+
import { existsSync } from 'fs';
|
|
12
|
+
|
|
13
|
+
import { DatasetLoader } from '../data/loader';
|
|
14
|
+
import { BenchmarkStore, BenchmarkVectorStore } from '../storage';
|
|
15
|
+
import type { LongMemEvalQuestion, MemoryConfigOptions, MemoryConfigType } from '../data/types';
|
|
16
|
+
import type { CoreMessage } from 'ai';
|
|
17
|
+
|
|
18
|
+
import { getMemoryOptions } from '../config';
|
|
19
|
+
import { makeRetryModel } from '../retry-model';
|
|
20
|
+
|
|
21
|
+
// Shared gpt-4o model wrapped with retry handling; its state (rateLimitCount,
// pauseTime, pause) is read by the progress display in PrepareCommand.run.
const retry4o = makeRetryModel(openai('gpt-4o'));
|
|
22
|
+
|
|
23
|
+
/**
 * Options accepted by {@link PrepareCommand.run}.
 */
export interface PrepareOptions {
  /** LongMemEval dataset variant to prepare. */
  dataset: 'longmemeval_s' | 'longmemeval_m' | 'longmemeval_oracle';
  /** Memory configuration to prepare data for (working memory, semantic recall, combined, ...). */
  memoryConfig: MemoryConfigType;
  /** Base output directory; defaults to './prepared-data' when omitted. */
  outputDir?: string;
  /** Process only the first N questions of the dataset (ignored when questionId is set). */
  subset?: number;
  /** Number of questions prepared concurrently; defaults to 10 when omitted. */
  concurrency?: number;
  /** Prepare only the question with this ID; errors if it is not in the dataset. */
  questionId?: string;
  /** When set, the meta.json cache check is skipped so a partial run can be resumed. */
  resumeFromMessageId?: string;
  /** Maximum number of additional sessions to process per question in this run. */
  sessionLimit?: number;
  /** 1-based session index to start from; earlier sessions are marked as already processed. */
  sessionOffset?: number;
}
|
|
34
|
+
|
|
35
|
+
export class PrepareCommand {
|
|
36
|
+
// Loads LongMemEval question datasets from disk.
private loader: DatasetLoader;
// Base directory for prepared output and working-memory templates.
private baseDir: string;
|
|
38
|
+
|
|
39
|
+
/** Sets up the default output location and the dataset loader. */
constructor() {
  this.baseDir = './prepared-data';
  this.loader = new DatasetLoader();
}
|
|
43
|
+
|
|
44
|
+
/**
 * Entry point: prepares LongMemEval data for the chosen dataset and memory
 * configuration. Loads the dataset, optionally loads tailored working-memory
 * templates, then drains a question queue through a pool of concurrent
 * workers while an ora spinner displays overall progress, embedding-cache
 * stats, and rate-limit state. Rate-limited questions are re-queued for a
 * later retry; other failures are recorded in the question's progress.json.
 *
 * @param options CLI options controlling dataset, memory config, output
 *                location, concurrency, subsetting, and resume behavior.
 * @throws Error when a requested questionId is not found, or when a config
 *               needing a real model is used without OPENAI_API_KEY.
 */
async run(options: PrepareOptions): Promise<void> {
  console.log(chalk.blue('\n🔧 Preparing LongMemEval Data\n'));

  // Reset embedding cache statistics for this run
  embeddingCacheStats.reset();

  // Load dataset
  const spinner = ora('Loading dataset...').start();
  const questions = await this.loader.loadDataset(options.dataset);
  spinner.succeed(`Loaded ${questions.length} questions`);

  // Load working memory templates if using tailored working memory
  let wmTemplates: Record<string, any> = {};
  const usesTailoredWorkingMemory =
    options.memoryConfig === 'working-memory-tailored' || options.memoryConfig === 'combined-tailored';
  if (usesTailoredWorkingMemory) {
    const templatePath = join(this.baseDir, 'wm-templates', `${options.dataset}.json`);
    if (existsSync(templatePath)) {
      try {
        wmTemplates = JSON.parse(await readFile(templatePath, 'utf-8'));
        console.log(chalk.green(`✓ Loaded ${Object.keys(wmTemplates).length} working memory templates`));
      } catch (e) {
        // Malformed template file: fall back to the default template rather than aborting.
        console.log(chalk.yellow('⚠️ Could not load working memory templates, using default'));
      }
    } else {
      console.log(chalk.yellow('⚠️ No working memory templates found, using default'));
      console.log(chalk.gray('Run "pnpm generate-wm-templates" to generate them'));
    }
  }

  // Filter by questionId if specified
  let questionsToProcess = questions;
  if (options.questionId) {
    questionsToProcess = questions.filter(q => q.question_id === options.questionId);
    if (questionsToProcess.length === 0) {
      throw new Error(`Question with ID "${options.questionId}" not found in dataset`);
    }
    console.log(chalk.yellow(`\nFocusing on question: ${options.questionId}\n`));
  } else if (options.subset) {
    // Apply subset if requested
    questionsToProcess = questions.slice(0, options.subset);
  }

  console.log(
    chalk.yellow(`\nProcessing ${questionsToProcess.length} question${questionsToProcess.length !== 1 ? 's' : ''}\n`),
  );

  // Get memory configuration
  const memoryOptions = getMemoryOptions(options.memoryConfig);

  // Use real model for working memory, mock for others
  const needsRealModel =
    options.memoryConfig === 'working-memory' ||
    options.memoryConfig === 'working-memory-tailored' ||
    options.memoryConfig === 'combined' ||
    options.memoryConfig === 'combined-tailored';

  if (needsRealModel && !process.env.OPENAI_API_KEY) {
    throw new Error('OPENAI_API_KEY is required for working memory preparation');
  }

  // Mock model returns fixed usage/finishReason — message-history configs only
  // need messages stored, not real completions.
  const model = needsRealModel
    ? retry4o.model
    : new MockLanguageModelV1({
        doGenerate: async () => ({
          rawCall: { rawPrompt: null, rawSettings: {} },
          finishReason: 'stop',
          usage: { promptTokens: 10, completionTokens: 20 },
        }),
      });

  // Track active questions progress (keyed by worker slot index)
  const activeQuestions = new Map<
    number,
    { questionId: string; status: string; totalSessions?: number; processedSessions?: number; questionType?: string }
  >();

  // Create main progress spinner
  const mainSpinner = ora('Starting data preparation...').start();

  let processedCount = 0;
  let cachedCount = 0;
  let completedCount = 0;
  let inProgressCount = 0;
  const startTime = Date.now();

  // Determine question batch size based on config
  // NOTE(review): '||' treats an explicit concurrency of 0 as unset — presumably intended.
  const questionConcurrency = options.concurrency || 10; // Allow concurrency for all configs

  console.log(chalk.gray(`Question concurrency: ${questionConcurrency}`));

  // Warn about working memory concurrency
  if ((options.memoryConfig === 'working-memory' || options.memoryConfig === 'combined') && questionConcurrency > 1) {
    console.log(
      chalk.yellow(
        `⚠️ Note: Running working memory questions concurrently. Each question has its own resource scope.`,
      ),
    );
  }

  let lastText = ``;
  // Function to update progress display; only touches the spinner when the
  // rendered text actually changed, to avoid redundant redraws.
  const updateProgress = () => {
    const elapsed = Math.round((Date.now() - startTime) / 1000);
    const rate = elapsed > 0 ? completedCount / elapsed : 0;
    const remaining = rate > 0 ? Math.round((questionsToProcess.length - completedCount) / rate) : 0;

    // Build progress text with active questions
    let progressText = `Overall: ${completedCount}/${questionsToProcess.length} (${inProgressCount} in progress, ${cachedCount} cached, ~${remaining}s remaining)`;

    // Add embedding cache stats if available
    const totalEmbeddingOps = embeddingCacheStats.cacheHits + embeddingCacheStats.cacheMisses;
    if (totalEmbeddingOps > 0) {
      const hitRate = embeddingCacheStats.cacheHits / totalEmbeddingOps;
      progressText += `\nEmbedding cache: ${embeddingCacheStats.cacheHits} hits, ${embeddingCacheStats.cacheMisses} misses (${(hitRate * 100).toFixed(1)}% hit rate)`;
    }

    progressText += `\nRate limit count: ${retry4o.state.rateLimitCount}`;
    if (retry4o.state.pauseTime > 0 && retry4o.state.pause)
      progressText += ` (paused, waiting for ${retry4o.state.pauseTime}ms)`;

    if (activeQuestions.size > 0) {
      progressText += '\n\nActive questions:';

      // Sort active questions by completion percentage
      const sortedQuestions = Array.from(activeQuestions.entries())
        .map(([index, info]) => {
          const progress =
            info.processedSessions && info.totalSessions ? info.processedSessions / info.totalSessions : 0;
          return { index, info, progress };
        })
        .sort((a, b) => b.progress - a.progress); // Sort by most complete first

      sortedQuestions.forEach(({ info, progress }) => {
        const percentage = (progress * 100).toFixed(0);
        progressText += `\n ${info.status} (${percentage}%) ${chalk.grey(info.questionType || '')}`;
      });
    }

    if (lastText !== progressText) {
      lastText = progressText;
      mainSpinner.text = progressText;
    }
  };

  // Create a queue of questions to process
  const questionQueue = [...questionsToProcess];
  let questionIndex = 0;

  // Function to process next question from queue; each worker slot loops
  // until the shared queue is empty.
  const processNextQuestion = async (slotIndex: number): Promise<void> => {
    while (questionQueue.length > 0) {
      const question = questionQueue.shift();
      if (!question) break;

      // NOTE(review): currentIndex is never read afterwards — candidate for removal.
      const currentIndex = questionIndex++;

      // Check if already prepared
      const questionDir = join(
        options.outputDir || this.baseDir,
        options.dataset,
        options.memoryConfig,
        question.question_id,
      );

      // Check if question has failed previously
      const progressPath = join(questionDir, 'progress.json');
      if (existsSync(progressPath)) {
        try {
          const progress = JSON.parse(await readFile(progressPath, 'utf-8'));
          if (progress.failed) {
            // Retry failed questions
            mainSpinner.clear();
            console.log(
              chalk.yellow(`↻`),
              chalk.blue(`${question.question_id}`),
              chalk.gray(`(${question.question_type})`),
              chalk.yellow(`[retrying previously failed]`),
            );
            mainSpinner.render();

            // Delete the failed progress file to start fresh
            await unlink(progressPath);

            // Continue processing this question normally (don't skip)
          }
        } catch (e) {
          // If we can't read progress, continue with normal processing
        }
      }

      // Skip cache check if we're resuming from a specific message
      if (!options.resumeFromMessageId && existsSync(join(questionDir, 'meta.json'))) {
        cachedCount++;
        completedCount++;

        mainSpinner.clear();
        console.log(
          chalk.green(`✓`),
          chalk.blue(`${question.question_id}`),
          chalk.gray(`(${question.question_type})`),
          chalk.yellow(`[cached]`),
          chalk.gray(`- ${completedCount}/${questionsToProcess.length}`),
        );
        mainSpinner.render();

        // Update progress
        updateProgress();

        // Continue to next question
        continue;
      }

      // Mark as in progress
      inProgressCount++;
      activeQuestions.set(slotIndex, { questionId: question.question_id, status: 'Starting...' });
      updateProgress();

      try {
        await this.processQuestion(
          question,
          options,
          model,
          memoryOptions,
          true,
          slotIndex,
          activeQuestions,
          wmTemplates,
        );

        // Mark as completed
        inProgressCount--;
        processedCount++;
        completedCount++;

        // Remove from active questions
        activeQuestions.delete(slotIndex);

        mainSpinner.clear();
        console.log(
          chalk.green(`✓`),
          chalk.blue(`${question.question_id}`),
          chalk.gray(`(${question.question_type})`),
          chalk.gray(`${question.haystack_sessions.length} sessions`),
          chalk.gray(`- ${completedCount}/${questionsToProcess.length}`),
        );
        mainSpinner.render();
      } catch (error) {
        // Check if this is a rate limit error (string sniffing — the provider
        // error is not guaranteed to be a typed error here).
        const errorMessage = error instanceof Error ? error.message : String(error);
        const isRateLimitError =
          errorMessage.includes('Rate limit') ||
          errorMessage.includes('rate limit') ||
          errorMessage.includes('RPM') ||
          errorMessage.includes('TPM') ||
          errorMessage.includes('429');

        if (isRateLimitError) {
          // Don't mark as failed for rate limits - just skip this run
          inProgressCount--;

          // Remove from active questions
          activeQuestions.delete(slotIndex);

          mainSpinner.clear();
          console.log(
            chalk.yellow(`⏸`),
            chalk.blue(`${question.question_id}`),
            chalk.gray(`(${question.question_type})`),
            chalk.yellow(`Rate limited - will retry later`),
            chalk.gray(`- ${completedCount}/${questionsToProcess.length}`),
          );
          mainSpinner.render();

          // Re-add to the end of the queue to retry later
          questionQueue.push(question);

          // Add a small delay to help with rate limiting
          await new Promise(resolve => setTimeout(resolve, 1000)); // Wait 1 second
        } else {
          // Mark as completed but failed for non-rate-limit errors
          inProgressCount--;
          completedCount++;

          // Remove from active questions
          activeQuestions.delete(slotIndex);

          mainSpinner.clear();
          console.log(
            chalk.red(`✗`),
            chalk.blue(`${question.question_id}`),
            chalk.gray(`(${question.question_type})`),
            chalk.red(`Failed: ${errorMessage}`),
            chalk.gray(`- ${completedCount}/${questionsToProcess.length}`),
          );
          mainSpinner.render();

          // Save error state to progress file so find-failed / retries can see it
          const questionDir = join(
            options.outputDir || this.baseDir,
            options.dataset,
            options.memoryConfig,
            question.question_id,
          );
          const progressFile = join(questionDir, 'progress.json');

          try {
            await mkdir(questionDir, { recursive: true });

            // Try to load existing progress if available
            let existingProgress = { processedSessionIds: [] };
            if (existsSync(progressFile)) {
              existingProgress = JSON.parse(await readFile(progressFile, 'utf-8'));
            }

            await writeFile(
              progressFile,
              JSON.stringify(
                {
                  processedSessionIds: existingProgress.processedSessionIds || [],
                  completed: true,
                  failed: true,
                  error: errorMessage,
                  failedAt: new Date().toISOString(),
                },
                null,
                2,
              ),
            );
          } catch (saveError) {
            console.error(chalk.red(`Failed to save error state: ${saveError}`));
          }
        }
      }

      updateProgress();
    }
  };

  // Drive the worker pool; a timer keeps the spinner fresh even while
  // workers are blocked on long awaits.
  const progressInterval = setInterval(updateProgress, 500);
  const workers = Array.from({ length: questionConcurrency }, (_, i) => processNextQuestion(i));
  await Promise.all(workers);
  clearInterval(progressInterval);
  updateProgress();

  mainSpinner.succeed(`Prepared ${processedCount} questions (${cachedCount} from cache)`);
  const totalTime = Math.round((Date.now() - startTime) / 1000);
  // NOTE(review): totalTime can round to 0 on very fast runs, making q/min Infinity — confirm acceptable.
  console.log(chalk.gray(`Total time: ${totalTime}s (${Math.round((processedCount / totalTime) * 60)} q/min)`));

  // Display embedding cache statistics if any embeddings were processed
  const totalEmbeddingOps = embeddingCacheStats.cacheHits + embeddingCacheStats.cacheMisses;
  if (totalEmbeddingOps > 0) {
    const hitRate = embeddingCacheStats.cacheHits / totalEmbeddingOps;
    console.log(
      chalk.gray(
        `Embedding cache: ${embeddingCacheStats.cacheHits} hits, ${embeddingCacheStats.cacheMisses} misses, ${embeddingCacheStats.cacheWrites} writes (${(hitRate * 100).toFixed(1)}% hit rate)`,
      ),
    );
  }

  console.log(chalk.green('\n✅ Data preparation complete!\n'));
  console.log(chalk.gray(`Prepared data saved to: ${this.baseDir}/${options.dataset}/${options.memoryConfig}/`));
}
|
|
407
|
+
|
|
408
|
+
/**
 * Prepares a single question: replays its haystack sessions (in chronological
 * order) through a memory-enabled agent, then persists the resulting storage
 * snapshot (db.json), optional vector snapshot (vector.json), and metadata
 * (meta.json) under the question's output directory. Supports resuming from a
 * previously saved progress.json.
 *
 * @param question        LongMemEval question whose haystack sessions are ingested.
 * @param options         CLI options (dataset, memory config, output dir, limits).
 * @param model           Model for the prep agent (real model for working-memory configs, mock otherwise).
 * @param memoryOptions   Memory configuration passed to Memory and agent.generate.
 * @param isConcurrent    Whether this question runs alongside others; forced to false for working memory.
 * @param slotIndex       Worker slot used to report progress into activeQuestions.
 * @param activeQuestions Shared progress map updated for the spinner display.
 * @param wmTemplates     Optional per-question working-memory templates for tailored configs.
 */
private async processQuestion(
  question: LongMemEvalQuestion,
  options: PrepareOptions,
  model: any,
  memoryOptions: MemoryConfigOptions,
  isConcurrent: boolean = false,
  slotIndex?: number,
  activeQuestions?: Map<
    number,
    { questionId: string; status: string; totalSessions?: number; processedSessions?: number; questionType?: string }
  >,
  wmTemplates?: Record<string, any>,
): Promise<void> {
  // Create fresh storage instances for this question
  const benchmarkStore = new BenchmarkStore();
  const benchmarkVectorStore = new BenchmarkVectorStore();

  // Initialize stores
  await benchmarkStore.init();

  // Create vector index if using semantic recall
  if (options.memoryConfig === 'semantic-recall' || options.memoryConfig.includes('combined')) {
    await benchmarkVectorStore.createIndex({
      indexName: 'memory_messages',
      dimension: 1536, // text-embedding-3-small dimension
      metric: 'cosine',
    });
  }

  const usesWorkingMemory =
    options.memoryConfig === 'working-memory' ||
    options.memoryConfig === 'working-memory-tailored' ||
    options.memoryConfig === 'combined' ||
    options.memoryConfig === 'combined-tailored';
  const usesTailoredTemplate =
    options.memoryConfig === 'working-memory-tailored' || options.memoryConfig === 'combined-tailored';

  // Working memory must run one session (thread) at a time, in order
  // otherwise the data will not be accurate as working memory is meant
  // to build up over time, using the previous working memory state to create the next.
  if (usesWorkingMemory) isConcurrent = false;

  // Use custom template if available for tailored configs
  // NOTE(review): this mutates the shared memoryOptions object — with concurrent
  // questions the last writer wins; confirm each worker gets its own copy.
  if (usesTailoredTemplate && wmTemplates && wmTemplates[question.question_id]) {
    memoryOptions.options.workingMemory = {
      enabled: true,
      template: wmTemplates[question.question_id].template,
      scope: 'resource',
    };
    // if (!isConcurrent) {
    //   console.log(chalk.cyan(' Using tailored working memory template'));
    // }
  }

  // Create memory with appropriate configuration
  const memory = new Memory({
    storage: benchmarkStore,
    vector:
      options.memoryConfig === 'semantic-recall' || options.memoryConfig.includes('combined')
        ? benchmarkVectorStore
        : undefined,
    embedder:
      options.memoryConfig === 'semantic-recall' || options.memoryConfig.includes('combined')
        ? cachedOpenAI.embedding('text-embedding-3-small')
        : undefined,
    options: memoryOptions.options,
  });

  // Create agent with appropriate model
  const agent = new Agent({
    name: 'prep-agent',
    instructions:
      "You are a helpful assistant. Process and store conversation history. Only store working memory information if it's in the template. Other information is not relevant",
    model: model,
    memory: memory,
  });

  // Process all haystack sessions under one resource scope per question
  const resourceId = `resource_${question.question_id}`;

  // Sort sessions by date for chronological processing (important for working memory)
  const sessionsWithDates = question.haystack_sessions.map((session, index) => ({
    session,
    sessionId: question.haystack_session_ids[index],
    date: question.haystack_dates[index],
  }));

  // Sort by date (oldest first)
  sessionsWithDates.sort((a, b) => new Date(a.date).getTime() - new Date(b.date).getTime());

  // Debug: Log first and last dates to confirm sorting
  if (sessionsWithDates.length > 0 && !isConcurrent) {
    // const firstDate = new Date(sessionsWithDates[0].date).toISOString().split('T')[0];
    // const lastDate = new Date(sessionsWithDates[sessionsWithDates.length - 1].date).toISOString().split('T')[0];
    // console.log(chalk.gray(` Sessions sorted: ${firstDate} (oldest) → ${lastDate} (newest)`));
  }

  // Create output directory early to save progress
  const questionDir = join(
    options.outputDir || this.baseDir,
    options.dataset,
    options.memoryConfig,
    question.question_id,
  );
  await mkdir(questionDir, { recursive: true });

  // Check if this question has partial progress saved
  const progressFile = join(questionDir, 'progress.json');
  let processedSessionIds: Set<string> = new Set();

  // Always try to load existing db.json if it exists (for resume scenarios)
  const dbPath = join(questionDir, 'db.json');
  const vectorPath = join(questionDir, 'vector.json');

  if (existsSync(dbPath)) {
    // console.log(chalk.gray('Loading existing database...'));
    await benchmarkStore.hydrate(dbPath);
  }

  if (
    existsSync(vectorPath) &&
    (options.memoryConfig === 'semantic-recall' || options.memoryConfig.includes('combined'))
  ) {
    // console.log(chalk.gray('Loading existing vector store...'));
    await benchmarkVectorStore.hydrate(vectorPath);
  }

  if (existsSync(progressFile)) {
    try {
      const progress = JSON.parse(await readFile(progressFile, 'utf-8'));
      processedSessionIds = new Set(progress.processedSessionIds || []);

      if (slotIndex !== undefined && activeQuestions) {
        activeQuestions.set(slotIndex, {
          questionId: question.question_id,
          status: `Resuming from session ${processedSessionIds.size}/${sessionsWithDates.length}`,
        });
      }
    } catch (e) {
      console.log(chalk.red(`Failed to load progress for ${question.question_id}:`));
      console.error(e);
      if (options.resumeFromMessageId) {
        // A resume without valid progress would reprocess from scratch — abort instead.
        console.log(chalk.red(`Cannot resume without valid progress data. Exiting.`));
        process.exit(1);
      }
      processedSessionIds = new Set();
    }
  }

  // Process sessions in batches to avoid overwhelming the system
  const BATCH_SIZE = usesWorkingMemory ? 1 : 50; // Process x sessions at a time. working memory must run one at a time since each conversation will use resource working memory from the last conversation and build on it.
  let processedSessions = processedSessionIds.size;

  // Apply session offset if specified
  if (options.sessionOffset && !options.resumeFromMessageId) {
    const offsetIndex = options.sessionOffset - 1; // Convert to 0-based index
    if (offsetIndex >= 0 && offsetIndex < sessionsWithDates.length) {
      console.log(
        chalk.yellow(`\n⏭️ Starting from session ${options.sessionOffset} (skipping first ${offsetIndex} sessions)`),
      );

      // Mark all sessions before the offset as processed
      for (let i = 0; i < offsetIndex; i++) {
        processedSessionIds.add(sessionsWithDates[i].sessionId);
      }
      processedSessions = processedSessionIds.size;
    } else {
      console.log(
        chalk.red(`✗ Session offset ${options.sessionOffset} is out of range (1-${sessionsWithDates.length})`),
      );
      process.exit(1);
    }
  }

  // Apply session limit if specified (limit counts NEW sessions beyond what
  // was already processed; earlier ones are skipped inside the batch loop)
  let sessionsToProcess = sessionsWithDates;
  if (options.sessionLimit) {
    const startIndex = processedSessionIds.size;
    const endIndex = Math.min(startIndex + options.sessionLimit, sessionsWithDates.length);
    sessionsToProcess = sessionsWithDates.slice(0, endIndex);
    console.log(
      chalk.yellow(`\n📊 Processing limited to ${options.sessionLimit} sessions (${startIndex + 1} to ${endIndex})`),
    );
  }

  for (let i = 0; i < sessionsToProcess.length; i += BATCH_SIZE) {
    const sessionBatch = sessionsToProcess.slice(i, i + BATCH_SIZE);

    // Update progress
    if (slotIndex !== undefined && activeQuestions) {
      // Calculate current session index (1-based)
      const currentSessionIndex = processedSessions + 1;
      // Update active questions status
      activeQuestions.set(slotIndex, {
        questionId: question.question_id,
        status: `${chalk.green('->')} preparing ${chalk.blue(question.question_id)}[${chalk.green(currentSessionIndex)}] ${chalk.white(`${processedSessions}/${sessionsToProcess.length} `)}`,
        totalSessions: sessionsToProcess.length,
        processedSessions,
        questionType: question.question_type,
      });
    }

    // Process batch in parallel (batch size is 1 for working memory, so that
    // path is effectively sequential)
    const batchPromises = sessionBatch.map(async ({ session, sessionId }) => {
      // Skip if already processed
      if (processedSessionIds.has(sessionId)) {
        return;
      }

      // Convert session to messages; unknown roles default to 'user'
      const messages: CoreMessage[] = [];
      for (const turn of session) {
        if (!turn.content) continue;

        const role = turn.role === 'user' || turn.role === 'assistant' ? turn.role : 'user';
        messages.push({
          role,
          content: turn.content,
        });
      }

      if (messages.length > 0) {
        // Process through agent to save to memory
        try {
          await agent.generate(messages, {
            threadId: sessionId, // Use haystack session ID as thread ID
            resourceId,
            memoryOptions: memoryOptions.options,
            temperature: 0.3,
            frequencyPenalty: 0.3,
          });
        } catch (error) {
          console.error(`Error in agent.generate for ${question.question_id}, session ${sessionId}:`, error);
          throw error;
        }
      }

      // Mark as processed
      processedSessionIds.add(sessionId);

      // Save progress after each session if using working memory
      if (usesWorkingMemory) {
        await writeFile(
          progressFile,
          JSON.stringify({
            processedSessionIds: Array.from(processedSessionIds),
            lastSavedDb: 'db.json',
            lastSavedVector: 'vector.json',
          }),
        );

        // Persist current state
        await benchmarkStore.persist(join(questionDir, 'db.json'));
        if (options.memoryConfig === 'semantic-recall' || options.memoryConfig.includes('combined')) {
          await benchmarkVectorStore.persist(join(questionDir, 'vector.json'));
        }
      }
    });

    await Promise.all(batchPromises);

    // Fix dates for newly processed sessions
    const newlyProcessedSessions = sessionBatch.filter(s => processedSessionIds.has(s.sessionId));
    if (newlyProcessedSessions.length > 0) {
      await this.fixSessionDates(questionDir, newlyProcessedSessions, benchmarkStore);
    }

    // Update processed count based on actual processed sessions
    processedSessions = processedSessionIds.size;

    // Update progress after batch completes
    if (slotIndex !== undefined && activeQuestions) {
      // Calculate current session index (1-based)
      const currentSessionIndex = processedSessions + 1;
      activeQuestions.set(slotIndex, {
        questionId: question.question_id,
        status: `session ${currentSessionIndex} (${processedSessions}/${sessionsToProcess.length} total)`,
      });
    }
  }

  // Update status to saving
  if (slotIndex !== undefined && activeQuestions) {
    activeQuestions.set(slotIndex, {
      questionId: question.question_id,
      status: 'Saving data...',
    });
  }

  // Persist storage
  await benchmarkStore.persist(join(questionDir, 'db.json'));

  // Persist vector store if used
  if (options.memoryConfig === 'semantic-recall' || options.memoryConfig.includes('combined')) {
    await benchmarkVectorStore.persist(join(questionDir, 'vector.json'));
  }

  // Save metadata (meta.json also acts as the "already prepared" cache marker
  // checked by run())
  const metadata = {
    questionId: question.question_id,
    questionType: question.question_type,
    question: question.question,
    answer: question.answer,
    questionDate: question.question_date,
    resourceId,
    threadIds: question.haystack_session_ids,
    preparedAt: new Date().toISOString(),
    memoryConfig: options.memoryConfig,
    sessionCount: sessionsWithDates.length,
    evidenceSessionIds: question.answer_session_ids,
    note: 'Sessions were processed in chronological order (oldest first) for working memory',
  };

  await writeFile(join(questionDir, 'meta.json'), JSON.stringify(metadata, null, 2));

  // Clean up progress file after successful completion
  if (existsSync(progressFile)) {
    await writeFile(
      progressFile,
      JSON.stringify({
        processedSessionIds: Array.from(processedSessionIds),
        completed: true,
        completedAt: new Date().toISOString(),
      }),
    );
  }
}
|
|
735
|
+
|
|
736
|
+
private async fixSessionDates(
|
|
737
|
+
questionDir: string,
|
|
738
|
+
sessionBatch: Array<{ session: any; sessionId: string; date: string }>,
|
|
739
|
+
benchmarkStore: BenchmarkStore,
|
|
740
|
+
): Promise<void> {
|
|
741
|
+
// Save current state to temp file
|
|
742
|
+
const tempPath = join(questionDir, 'temp_db.json');
|
|
743
|
+
await benchmarkStore.persist(tempPath);
|
|
744
|
+
|
|
745
|
+
// Read and modify the data
|
|
746
|
+
const data = JSON.parse(await readFile(tempPath, 'utf-8'));
|
|
747
|
+
|
|
748
|
+
// Fix dates for each session in the batch
|
|
749
|
+
for (const { sessionId, date } of sessionBatch) {
|
|
750
|
+
const sessionDate = new Date(date);
|
|
751
|
+
|
|
752
|
+
// Get messages for this session
|
|
753
|
+
const sessionMessages: Array<[string, any]> = [];
|
|
754
|
+
if (data.mastra_messages) {
|
|
755
|
+
for (const [key, message] of data.mastra_messages) {
|
|
756
|
+
if (message.threadId === sessionId) {
|
|
757
|
+
sessionMessages.push([key, message]);
|
|
758
|
+
}
|
|
759
|
+
}
|
|
760
|
+
}
|
|
761
|
+
|
|
762
|
+
// Sort messages by their current createdAt to maintain order
|
|
763
|
+
sessionMessages.sort((a, b) => new Date(a[1].createdAt).getTime() - new Date(b[1].createdAt).getTime());
|
|
764
|
+
|
|
765
|
+
// Update each message's date
|
|
766
|
+
sessionMessages.forEach(([_key, message], idx) => {
|
|
767
|
+
// Add 5 minutes for each message in the conversation
|
|
768
|
+
const messageDate = new Date(sessionDate.getTime() + idx * 5 * 60 * 1000);
|
|
769
|
+
message.createdAt = messageDate.toISOString();
|
|
770
|
+
message.updatedAt = messageDate.toISOString();
|
|
771
|
+
});
|
|
772
|
+
|
|
773
|
+
// Update thread dates
|
|
774
|
+
if (data.mastra_threads) {
|
|
775
|
+
for (const [threadId, thread] of data.mastra_threads) {
|
|
776
|
+
if (threadId === sessionId) {
|
|
777
|
+
thread.createdAt = sessionDate.toISOString();
|
|
778
|
+
thread.updatedAt = sessionDate.toISOString();
|
|
779
|
+
}
|
|
780
|
+
}
|
|
781
|
+
}
|
|
782
|
+
}
|
|
783
|
+
|
|
784
|
+
// Write back the modified data
|
|
785
|
+
await writeFile(tempPath, JSON.stringify(data, null, 2));
|
|
786
|
+
|
|
787
|
+
// Reload the modified data into the store
|
|
788
|
+
await benchmarkStore.hydrate(tempPath);
|
|
789
|
+
|
|
790
|
+
// Clean up temp file
|
|
791
|
+
await unlink(tempPath);
|
|
792
|
+
}
|
|
793
|
+
}
|