@mastra/longmemeval 0.0.0-add-libsql-changeset-20250910154739
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +919 -0
- package/DATA_DOWNLOAD_GUIDE.md +117 -0
- package/LICENSE.md +15 -0
- package/README.md +173 -0
- package/USAGE.md +105 -0
- package/package.json +67 -0
- package/scripts/download.ts +180 -0
- package/scripts/find-failed.ts +176 -0
- package/scripts/generate-embeddings.ts +56 -0
- package/scripts/generate-wm-templates.ts +296 -0
- package/scripts/setup.ts +60 -0
- package/src/__fixtures__/embeddings.json +2319 -0
- package/src/__fixtures__/test-dataset.json +82 -0
- package/src/cli.ts +690 -0
- package/src/commands/__tests__/prepare.test.ts +230 -0
- package/src/commands/__tests__/run.test.ts +403 -0
- package/src/commands/prepare.ts +793 -0
- package/src/commands/run.ts +553 -0
- package/src/config.ts +83 -0
- package/src/data/loader.ts +163 -0
- package/src/data/types.ts +61 -0
- package/src/embeddings/cached-openai-embedding-model.ts +227 -0
- package/src/embeddings/cached-openai-provider.ts +40 -0
- package/src/embeddings/index.ts +2 -0
- package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
- package/src/evaluation/longmemeval-metric.ts +173 -0
- package/src/retry-model.ts +60 -0
- package/src/storage/__tests__/benchmark-store.test.ts +280 -0
- package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
- package/src/storage/benchmark-store.ts +540 -0
- package/src/storage/benchmark-vector.ts +234 -0
- package/src/storage/index.ts +2 -0
- package/src/test-utils/mock-embeddings.ts +54 -0
- package/src/test-utils/mock-model.ts +49 -0
- package/tests/data-loader.test.ts +96 -0
- package/tsconfig.json +18 -0
- package/vitest.config.ts +9 -0
|
@@ -0,0 +1,230 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
|
|
2
|
+
import { PrepareCommand } from '../prepare';
|
|
3
|
+
import { rm, readFile } from 'fs/promises';
|
|
4
|
+
import { existsSync } from 'fs';
|
|
5
|
+
import { join } from 'path';
|
|
6
|
+
import { tmpdir } from 'os';
|
|
7
|
+
import type { LongMemEvalQuestion } from '../../data/types';
|
|
8
|
+
import { createMockEmbedding } from '../../test-utils/mock-embeddings';
|
|
9
|
+
|
|
10
|
+
// Swap the OpenAI provider for a stub whose `embedding` factory hands back the mock embedding model
vi.mock('@ai-sdk/openai', () => {
  const embedding = vi.fn(() => createMockEmbedding());
  return { openai: { embedding } };
});
|
|
16
|
+
|
|
17
|
+
// Stub DatasetLoader: every instance resolves the same two hand-written questions
// (one single-session, one multi-session) instead of reading a dataset.
vi.mock('../../data/loader', () => {
  // Built fresh for each loader instance so no instance shares the array with another
  const buildQuestions = () =>
    [
      {
        question_id: 'test-q1',
        question_type: 'single-session-user',
        question: 'What is my favorite color?',
        answer: 'Blue',
        question_date: '2024-01-01',
        haystack_session_ids: ['session-1'],
        haystack_dates: ['2024-01-01'],
        haystack_sessions: [
          [
            { role: 'user', content: 'My favorite color is blue', has_answer: true },
            { role: 'assistant', content: 'I understand your favorite color is blue.' },
          ],
        ],
        answer_session_ids: ['session-1'],
      },
      {
        question_id: 'test-q2',
        question_type: 'multi-session',
        question: 'What did I say about my pet?',
        answer: 'You have a cat named Fluffy',
        question_date: '2024-01-02',
        haystack_session_ids: ['session-2', 'session-3'],
        haystack_dates: ['2024-01-01', '2024-01-02'],
        haystack_sessions: [
          [
            { role: 'user', content: 'I have a pet', has_answer: false },
            { role: 'assistant', content: 'What kind of pet do you have?' },
          ],
          [
            { role: 'user', content: 'It is a cat named Fluffy', has_answer: true },
            { role: 'assistant', content: 'Fluffy is a lovely name for a cat!' },
          ],
        ],
        answer_session_ids: ['session-3'],
      },
    ] as LongMemEvalQuestion[];

  const DatasetLoader = vi.fn().mockImplementation(() => {
    const loadDataset = vi.fn().mockResolvedValue(buildQuestions());
    return { loadDataset };
  });

  return { DatasetLoader };
});
|
|
60
|
+
|
|
61
|
+
// Neutralise chalk and ora so the command under test produces no styled console output
vi.mock('chalk', () => {
  // Each colour helper simply echoes its input
  const passthrough = (str: string) => str;
  return {
    default: {
      blue: passthrough,
      yellow: passthrough,
      green: passthrough,
      gray: passthrough,
    },
  };
});

vi.mock('ora', () => {
  // A spinner whose chainable methods are spies returning the spinner itself
  const createSpinner = () => ({
    start: vi.fn().mockReturnThis(),
    succeed: vi.fn().mockReturnThis(),
    fail: vi.fn().mockReturnThis(),
  });
  return { default: createSpinner };
});
|
|
78
|
+
|
|
79
|
+
describe('PrepareCommand', () => {
|
|
80
|
+
let command: PrepareCommand;
|
|
81
|
+
let testDir: string;
|
|
82
|
+
|
|
83
|
+
beforeEach(() => {
  // Timestamped scratch directory under the OS temp dir for this test's output
  testDir = join(tmpdir(), `prepare-test-${Date.now()}`);

  command = new PrepareCommand();
  // Point the command's base directory at the scratch directory
  Object.assign(command, { baseDir: testDir });
});
|
|
89
|
+
|
|
90
|
+
afterEach(async () => {
  // Clean up test directory. `force: true` makes removal a no-op when the
  // directory was never created, so no separate existence check is needed.
  await rm(testDir, { recursive: true, force: true });
});
|
|
96
|
+
|
|
97
|
+
describe('run', () => {
|
|
98
|
+
it('should process questions and save prepared data', async () => {
  await command.run({ dataset: 'longmemeval_s', memoryConfig: 'full-history' });

  const questionIds = ['test-q1', 'test-q2'];
  const dirFor = (questionId: string) => join(testDir, 'longmemeval_s', 'full-history', questionId);
  const loadMeta = async (questionId: string) =>
    JSON.parse(await readFile(join(dirFor(questionId), 'meta.json'), 'utf-8'));

  // One directory per question
  for (const questionId of questionIds) {
    expect(existsSync(dirFor(questionId))).toBe(true);
  }

  // Each directory holds db.json and meta.json
  for (const questionId of questionIds) {
    for (const fileName of ['db.json', 'meta.json']) {
      expect(existsSync(join(dirFor(questionId), fileName))).toBe(true);
    }
  }

  // Metadata written for the single-session question
  const firstMeta = await loadMeta('test-q1');
  expect(firstMeta.questionId).toBe('test-q1');
  expect(firstMeta.questionType).toBe('single-session-user');
  expect(firstMeta.question).toBe('What is my favorite color?');
  expect(firstMeta.answer).toBe('Blue');
  expect(firstMeta.resourceId).toBe('resource_test-q1');
  expect(firstMeta.threadIds).toEqual(['session-1']);
  expect(firstMeta.memoryConfig).toBe('full-history');

  // Metadata written for the multi-session question
  const secondMeta = await loadMeta('test-q2');
  expect(secondMeta.questionId).toBe('test-q2');
  expect(secondMeta.question).toBe('What did I say about my pet?');
  expect(secondMeta.answer).toBe('You have a cat named Fluffy');
  expect(secondMeta.threadIds).toEqual(['session-2', 'session-3']);
});
|
|
133
|
+
|
|
134
|
+
it('should create vector store files for semantic-recall config', async () => {
  await command.run({ dataset: 'longmemeval_s', memoryConfig: 'semantic-recall' });

  const questionDir = join(testDir, 'longmemeval_s', 'semantic-recall', 'test-q1');

  // semantic-recall output includes vector.json next to db.json and meta.json
  for (const fileName of ['db.json', 'vector.json', 'meta.json']) {
    expect(existsSync(join(questionDir, fileName))).toBe(true);
  }
});
|
|
147
|
+
|
|
148
|
+
it('should process subset of questions when specified', async () => {
  await command.run({ dataset: 'longmemeval_s', memoryConfig: 'last-k', subset: 1 });

  const configDir = join(testDir, 'longmemeval_s', 'last-k');
  const firstProcessed = existsSync(join(configDir, 'test-q1'));
  const secondProcessed = existsSync(join(configDir, 'test-q2'));

  // With subset: 1 only the first question may produce output
  expect(firstProcessed).toBe(true);
  expect(secondProcessed).toBe(false);
});
|
|
162
|
+
|
|
163
|
+
it('should use custom output directory when specified', async () => {
  const customDir = join(tmpdir(), `custom-prepare-${Date.now()}`);

  try {
    await command.run({
      dataset: 'longmemeval_s',
      memoryConfig: 'working-memory',
      outputDir: customDir,
      subset: 1,
    });

    const questionDir = join(customDir, 'longmemeval_s', 'working-memory', 'test-q1');
    expect(existsSync(questionDir)).toBe(true);
  } finally {
    // Clean up even when the run or the assertion above fails, so a failing
    // test does not leave the custom directory behind. `force` tolerates the
    // case where the directory was never created.
    await rm(customDir, { recursive: true, force: true });
  }
});
|
|
179
|
+
|
|
180
|
+
it('should handle combined memory config with vector store', async () => {
  await command.run({ dataset: 'longmemeval_s', memoryConfig: 'combined', subset: 1 });

  const questionDir = join(testDir, 'longmemeval_s', 'combined', 'test-q1');

  // The combined config is expected to write a vector store as well
  for (const fileName of ['db.json', 'vector.json', 'meta.json']) {
    expect(existsSync(join(questionDir, fileName))).toBe(true);
  }
});
|
|
194
|
+
});
|
|
195
|
+
|
|
196
|
+
describe('getMemoryOptions', () => {
|
|
197
|
+
it('should return correct options for each memory config', () => {
  // getMemoryOptions is reached through an `any` cast, like the other internals touched by this suite
  const optionsFor = (configName: string) => (command as any).getMemoryOptions(configName);

  const fullHistoryConfig = optionsFor('full-history');
  expect(fullHistoryConfig.type).toBe('full-history');
  expect(fullHistoryConfig.options.lastMessages).toBe(999999);
  expect(fullHistoryConfig.options.semanticRecall).toBe(false);

  const lastKConfig = optionsFor('last-k');
  expect(lastKConfig.type).toBe('last-k');
  expect(lastKConfig.options.lastMessages).toBe(50);

  const semanticRecallConfig = optionsFor('semantic-recall');
  expect(semanticRecallConfig.type).toBe('semantic-recall');
  expect(semanticRecallConfig.options.semanticRecall).toEqual({
    topK: 10,
    messageRange: 2,
    scope: 'resource',
  });

  const workingMemoryConfig = optionsFor('working-memory');
  expect(workingMemoryConfig.type).toBe('working-memory');
  expect(workingMemoryConfig.options.workingMemory.enabled).toBe(true);
  expect(workingMemoryConfig.options.workingMemory.template).toContain('User Context');

  const combinedConfig = optionsFor('combined');
  expect(combinedConfig.type).toBe('combined');
  expect(combinedConfig.options.semanticRecall).toBeTruthy();
  expect(combinedConfig.options.workingMemory.enabled).toBe(true);
});
|
|
225
|
+
|
|
226
|
+
it('should throw error for unknown memory config', () => {
  // Deferred call so expect() can observe the throw
  const requestInvalidConfig = () => (command as any).getMemoryOptions('invalid');

  expect(requestInvalidConfig).toThrow('Unknown memory config: invalid');
});
|
|
229
|
+
});
|
|
230
|
+
});
|
|
@@ -0,0 +1,403 @@
|
|
|
1
|
+
import { describe, it, expect, beforeEach, afterEach, vi } from 'vitest';
|
|
2
|
+
import { RunCommand } from '../run';
|
|
3
|
+
import { rm, mkdir, readdir, writeFile } from 'fs/promises';
|
|
4
|
+
import { existsSync } from 'fs';
|
|
5
|
+
import { join } from 'path';
|
|
6
|
+
import { tmpdir } from 'os';
|
|
7
|
+
import { createMockEmbedding } from '../../test-utils/mock-embeddings';
|
|
8
|
+
|
|
9
|
+
// Mock OpenAI using vi.hoisted to avoid initialization issues: the mock has to
// exist before the hoisted vi.mock factory below reads it.
const { openaiModel } = vi.hoisted(() => {
  // Callable provider stub: openai('<model>') returns a language model whose
  // doGenerate always resolves with the text 'Blue'.
  const createLanguageModel = vi.fn((_modelName: string) => ({
    doGenerate: vi.fn().mockResolvedValue({
      rawCall: { rawPrompt: null, rawSettings: {} },
      finishReason: 'stop',
      usage: { promptTokens: 10, completionTokens: 20 },
      text: 'Blue',
    }),
  }));

  // Attach `embedding` with Object.assign so the property is part of the
  // resulting type instead of being patched onto the mock after the fact.
  // The factory is lazy, so createMockEmbedding is only looked up when called.
  const openaiModel = Object.assign(createLanguageModel, {
    embedding: vi.fn(() => createMockEmbedding()),
  });

  return { openaiModel };
});

vi.mock('@ai-sdk/openai', () => ({
  openai: openaiModel,
}));
|
|
29
|
+
|
|
30
|
+
// Stub LongMemEvalMetric: every instance's measure() resolves to a score of 1,
// so each evaluated answer counts as correct.
vi.mock('../../evaluation/longmemeval-metric', () => {
  const LongMemEvalMetric = vi.fn().mockImplementation(() => {
    const measure = vi.fn().mockResolvedValue({ score: 1 });
    return { measure };
  });

  return { LongMemEvalMetric };
});
|
|
36
|
+
|
|
37
|
+
// Neutralise chalk and ora for the run command tests
vi.mock('chalk', () => {
  // Each style helper simply echoes its input
  const passthrough = (str: string) => str;
  return {
    default: {
      blue: passthrough,
      yellow: passthrough,
      green: passthrough,
      gray: passthrough,
      red: passthrough,
      bold: passthrough,
    },
  };
});

vi.mock('ora', () => {
  // A spinner with chainable spy methods and a writable `text` field
  const createSpinner = () => ({
    start: vi.fn().mockReturnThis(),
    succeed: vi.fn().mockReturnThis(),
    fail: vi.fn().mockReturnThis(),
    text: '',
  });
  return { default: createSpinner };
});
|
|
57
|
+
|
|
58
|
+
describe('RunCommand', () => {
|
|
59
|
+
let command: RunCommand;
|
|
60
|
+
let testDir: string;
|
|
61
|
+
let preparedDataDir: string;
|
|
62
|
+
let outputDir: string;
|
|
63
|
+
|
|
64
|
+
beforeEach(async () => {
  // Timestamped scratch tree: <tmp>/run-test-<ms>/{prepared-data,results}
  testDir = join(tmpdir(), `run-test-${Date.now()}`);
  preparedDataDir = join(testDir, 'prepared-data');
  outputDir = join(testDir, 'results');

  command = new RunCommand();
  // Point the command's directories at the scratch tree
  Object.assign(command, { preparedDataDir, outputDir });

  // Seed the prepared-data fixture the command reads from
  await createPreparedData(preparedDataDir);
});
|
|
77
|
+
|
|
78
|
+
afterEach(async () => {
  // Clean up test directory. `force: true` makes removal a no-op when the
  // directory does not exist, so no separate existence check is needed.
  await rm(testDir, { recursive: true, force: true });
});
|
|
84
|
+
|
|
85
|
+
/**
 * Writes the prepared-data fixture for the `longmemeval_s` / `full-history`
 * configuration beneath `baseDir`.
 *
 * Two question directories are created (`test-q1`, `test-q2`), each holding a
 * `meta.json` describing the question and a `db.json` storage snapshot with
 * message and thread entries stored as `[id, record]` pairs.
 *
 * @param baseDir Root directory under which `longmemeval_s/full-history/...` is created.
 */
async function createPreparedData(baseDir: string) {
  const dataDir = join(baseDir, 'longmemeval_s', 'full-history');
  await mkdir(dataDir, { recursive: true });

  const timestamp = () => new Date().toISOString();

  // [id, record] pair for a text message
  const messageEntry = (id: string, threadId: string, resourceId: string, role: string, content: string) => [
    id,
    { id, threadId, resourceId, role, content, createdAt: timestamp(), type: 'text' },
  ];

  // [id, record] pair for a thread
  const threadEntry = (id: string, resourceId: string, title: string) => [
    id,
    { id, resourceId, title, metadata: {}, createdAt: timestamp(), updatedAt: timestamp() },
  ];

  // Full snapshot layout; only messages and threads carry data in this fixture
  const snapshot = (messages: unknown[], threads: unknown[]) => ({
    mastra_messages: messages,
    mastra_threads: threads,
    mastra_resources: [],
    mastra_workflow_snapshot: [],
    mastra_evals: [],
    mastra_traces: [],
  });

  // Creates the question directory, then writes meta.json followed by db.json.
  // The snapshot is built lazily so its timestamps are taken right before the db write.
  const writeQuestion = async (meta: { questionId: string }, buildDb: () => unknown) => {
    const questionDir = join(dataDir, meta.questionId);
    await mkdir(questionDir, { recursive: true });
    await writeFile(join(questionDir, 'meta.json'), JSON.stringify(meta));
    await writeFile(join(questionDir, 'db.json'), JSON.stringify(buildDb()));
  };

  // Question 1: single session
  await writeQuestion(
    {
      questionId: 'test-q1',
      questionType: 'single-session-user',
      resourceId: 'resource_test-q1',
      threadIds: ['session-1'],
      memoryConfig: 'full-history',
      question: 'What is my favorite color?',
      answer: 'Blue',
    },
    () =>
      snapshot(
        [
          messageEntry('msg-1', 'session-1', 'resource_test-q1', 'user', 'My favorite color is blue'),
          messageEntry(
            'msg-2',
            'session-1',
            'resource_test-q1',
            'assistant',
            'I understand your favorite color is blue.',
          ),
        ],
        [threadEntry('session-1', 'resource_test-q1', 'Session 1')],
      ),
  );

  // Question 2: two sessions
  await writeQuestion(
    {
      questionId: 'test-q2',
      questionType: 'multi-session',
      resourceId: 'resource_test-q2',
      threadIds: ['session-2', 'session-3'],
      memoryConfig: 'full-history',
      question: 'What did I say about my pet?',
      answer: 'You have a cat named Fluffy',
    },
    () =>
      snapshot(
        [
          messageEntry('msg-1', 'session-2', 'resource_test-q2', 'user', 'I have a pet'),
          messageEntry('msg-2', 'session-3', 'resource_test-q2', 'user', 'It is a cat named Fluffy'),
        ],
        [
          threadEntry('session-2', 'resource_test-q2', 'Session 2'),
          threadEntry('session-3', 'resource_test-q2', 'Session 3'),
        ],
      ),
  );
}
|
|
233
|
+
|
|
234
|
+
describe('run', () => {
|
|
235
|
+
it('should evaluate questions from prepared data', async () => {
  const metrics = await command.run({
    dataset: 'longmemeval_s',
    memoryConfig: 'full-history',
    model: 'gpt-4o',
    preparedDataDir,
    outputDir,
  });

  // Both fixture questions are evaluated, and the mocked metric scores each as correct
  expect(metrics.total_questions).toBe(2);
  expect(metrics.correct_answers).toBe(2);
  expect(metrics.overall_accuracy).toBe(1.0);

  // Check output files: the run writes into a sub-directory of outputDir.
  // A missing outputDir yields an empty list so the length assertion reports it.
  const runDirs: string[] = existsSync(outputDir) ? await readdir(outputDir) : [];
  expect(runDirs.length).toBeGreaterThan(0);

  const runDir = join(outputDir, runDirs[0]);
  expect(existsSync(join(runDir, 'results.jsonl'))).toBe(true);
  expect(existsSync(join(runDir, 'metrics.json'))).toBe(true);
});
|
|
258
|
+
|
|
259
|
+
it('should process subset of questions when specified', async () => {
  const runOptions = {
    dataset: 'longmemeval_s',
    memoryConfig: 'full-history',
    model: 'gpt-4o',
    preparedDataDir,
    outputDir,
    subset: 1,
  };

  // Two questions are prepared, but subset: 1 limits the evaluation to one
  const { total_questions: evaluated } = await command.run(runOptions);

  expect(evaluated).toBe(1);
});
|
|
271
|
+
|
|
272
|
+
it('should handle semantic-recall memory config with vector store', async () => {
  // Create semantic-recall prepared data
  const semanticDir = join(preparedDataDir, 'longmemeval_s', 'semantic-recall');
  await mkdir(semanticDir, { recursive: true });

  const q1Dir = join(semanticDir, 'test-q1');
  await mkdir(q1Dir, { recursive: true });

  await writeFile(
    join(q1Dir, 'meta.json'),
    JSON.stringify({
      questionId: 'test-q1',
      questionType: 'single-session-user',
      resourceId: 'resource_test-q1',
      threadIds: ['session-1'],
      memoryConfig: 'semantic-recall',
      question: 'What is my favorite color?',
      answer: 'Blue',
    }),
  );

  await writeFile(
    join(q1Dir, 'db.json'),
    JSON.stringify({
      mastra_messages: [
        [
          'msg-1',
          {
            id: 'msg-1',
            threadId: 'session-1',
            resourceId: 'resource_test-q1',
            role: 'user',
            content: 'My favorite color is blue',
            createdAt: new Date().toISOString(),
            type: 'text',
          },
        ],
      ],
      mastra_threads: [
        [
          'session-1',
          {
            id: 'session-1',
            resourceId: 'resource_test-q1',
            title: 'Session 1',
            metadata: {},
            createdAt: new Date().toISOString(),
            updatedAt: new Date().toISOString(),
          },
        ],
      ],
      mastra_resources: [],
      mastra_workflow_snapshot: [],
      mastra_evals: [],
      mastra_traces: [],
    }),
  );

  // Deterministic, non-zero stand-in for an embedding so the fixture is
  // identical on every run (the values themselves are arbitrary).
  const dimension = 1536;
  const fixtureVector = Array.from({ length: dimension }, (_, index) => (index + 1) / dimension);

  // Add vector store data in the correct format
  await writeFile(
    join(q1Dir, 'vector.json'),
    JSON.stringify({
      messages: {
        config: {
          dimension,
          metric: 'cosine',
        },
        documents: [
          {
            id: 'msg-1',
            vector: fixtureVector,
            metadata: {
              threadId: 'session-1',
              resourceId: 'resource_test-q1',
              content: 'My favorite color is blue',
            },
          },
        ],
      },
    }),
  );

  const metrics = await command.run({
    dataset: 'longmemeval_s',
    memoryConfig: 'semantic-recall',
    model: 'gpt-4o',
    preparedDataDir,
    outputDir,
    subset: 1,
  });

  expect(metrics.total_questions).toBe(1);
});
|
|
365
|
+
|
|
366
|
+
it('should throw error if prepared data does not exist', async () => {
  // This test writes no prepared data for 'working-memory' before running
  const pendingRun = command.run({
    dataset: 'longmemeval_s',
    memoryConfig: 'working-memory',
    model: 'gpt-4o',
    preparedDataDir,
    outputDir,
  });

  await expect(pendingRun).rejects.toThrow(/Prepared data not found/);
});
|
|
377
|
+
});
|
|
378
|
+
|
|
379
|
+
describe('getMemoryOptions', () => {
|
|
380
|
+
it('should return correct options for each memory config', () => {
  // getMemoryOptions is reached through an `any` cast, like the other internals touched by this suite
  const optionsFor = (configName: string) => (command as any).getMemoryOptions(configName);

  const fullHistoryConfig = optionsFor('full-history');
  expect(fullHistoryConfig.type).toBe('full-history');
  expect(fullHistoryConfig.options.lastMessages).toBe(999999);

  const semanticRecallConfig = optionsFor('semantic-recall');
  expect(semanticRecallConfig.type).toBe('semantic-recall');
  expect(semanticRecallConfig.options.semanticRecall.scope).toBe('resource');

  const workingMemoryConfig = optionsFor('working-memory');
  expect(workingMemoryConfig.type).toBe('working-memory');
  expect(workingMemoryConfig.options.workingMemory.enabled).toBe(true);

  const combinedConfig = optionsFor('combined');
  expect(combinedConfig.type).toBe('combined');
  expect(combinedConfig.options.semanticRecall).toBeTruthy();
  expect(combinedConfig.options.workingMemory.enabled).toBe(true);
});
|
|
398
|
+
|
|
399
|
+
it('should throw error for unknown memory config', () => {
  // Deferred call so expect() can observe the throw
  const requestInvalidConfig = () => (command as any).getMemoryOptions('invalid');

  expect(requestInvalidConfig).toThrow('Unknown memory config: invalid');
});
|
|
402
|
+
});
|
|
403
|
+
});
|