@mastra/longmemeval 0.0.0-add-libsql-changeset-20250910154739

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/CHANGELOG.md +919 -0
  2. package/DATA_DOWNLOAD_GUIDE.md +117 -0
  3. package/LICENSE.md +15 -0
  4. package/README.md +173 -0
  5. package/USAGE.md +105 -0
  6. package/package.json +67 -0
  7. package/scripts/download.ts +180 -0
  8. package/scripts/find-failed.ts +176 -0
  9. package/scripts/generate-embeddings.ts +56 -0
  10. package/scripts/generate-wm-templates.ts +296 -0
  11. package/scripts/setup.ts +60 -0
  12. package/src/__fixtures__/embeddings.json +2319 -0
  13. package/src/__fixtures__/test-dataset.json +82 -0
  14. package/src/cli.ts +690 -0
  15. package/src/commands/__tests__/prepare.test.ts +230 -0
  16. package/src/commands/__tests__/run.test.ts +403 -0
  17. package/src/commands/prepare.ts +793 -0
  18. package/src/commands/run.ts +553 -0
  19. package/src/config.ts +83 -0
  20. package/src/data/loader.ts +163 -0
  21. package/src/data/types.ts +61 -0
  22. package/src/embeddings/cached-openai-embedding-model.ts +227 -0
  23. package/src/embeddings/cached-openai-provider.ts +40 -0
  24. package/src/embeddings/index.ts +2 -0
  25. package/src/evaluation/__tests__/longmemeval-metric.test.ts +169 -0
  26. package/src/evaluation/longmemeval-metric.ts +173 -0
  27. package/src/retry-model.ts +60 -0
  28. package/src/storage/__tests__/benchmark-store.test.ts +280 -0
  29. package/src/storage/__tests__/benchmark-vector.test.ts +214 -0
  30. package/src/storage/benchmark-store.ts +540 -0
  31. package/src/storage/benchmark-vector.ts +234 -0
  32. package/src/storage/index.ts +2 -0
  33. package/src/test-utils/mock-embeddings.ts +54 -0
  34. package/src/test-utils/mock-model.ts +49 -0
  35. package/tests/data-loader.test.ts +96 -0
  36. package/tsconfig.json +18 -0
  37. package/vitest.config.ts +9 -0
@@ -0,0 +1,173 @@
1
+ import { Metric, type MetricResult } from '@mastra/core/eval';
2
+ import { Agent } from '@mastra/core/agent';
3
+ import type { QuestionType } from '../data/types';
4
+
5
/** Configuration accepted by {@link LongMemEvalMetric}. */
export interface LongMemEvalMetricConfig {
  /** Judge agent used to grade responses; the metric throws at construction if absent. */
  agent: Agent;
  /** LongMemEval question category; selects which evaluation prompt is used. */
  questionType: QuestionType;
  /** When true, grade whether the model correctly abstained on an unanswerable question. Defaults to false. */
  isAbstention?: boolean;
}
10
+
11
+ /**
12
+ * LongMemEval Metric implementation using Mastra's eval framework
13
+ *
14
+ * This metric evaluates whether an LLM correctly recalls information
15
+ * from long conversation histories across different question types.
16
+ */
17
+ export class LongMemEvalMetric extends Metric {
18
+ private agent: Agent;
19
+ private questionType: QuestionType;
20
+ private isAbstention: boolean;
21
+
22
+ constructor(config: LongMemEvalMetricConfig) {
23
+ super();
24
+ this.agent = config.agent;
25
+ if (!this.agent) {
26
+ throw new Error('Agent instance is required for LongMemEvalMetric');
27
+ }
28
+ this.questionType = config.questionType;
29
+ this.isAbstention = config.isAbstention || false;
30
+ }
31
+
32
+ /**
33
+ * Measure the correctness of a model's response
34
+ *
35
+ * @param input - JSON string containing question and expected answer
36
+ * @param output - Model's response
37
+ * @returns MetricResult with score (0 or 1) and additional info
38
+ */
39
+ async measure(input: string, output: string): Promise<MetricResult> {
40
+ // Parse input to get question and expected answer
41
+ const { question, answer } = JSON.parse(input) as {
42
+ question: string;
43
+ answer: string;
44
+ };
45
+
46
+ const prompt = this.getEvalPrompt(this.questionType, question, answer, output, this.isAbstention);
47
+
48
+ try {
49
+ const response = await this.agent.generate(
50
+ [
51
+ {
52
+ role: 'user',
53
+ content: prompt,
54
+ },
55
+ ],
56
+ {
57
+ temperature: 0,
58
+ },
59
+ );
60
+
61
+ const responseText = response.text?.toLowerCase().trim();
62
+ const isCorrect = responseText === 'yes' || responseText?.toLowerCase()?.startsWith('yes.');
63
+
64
+ // Extract reason if answer is no
65
+ let reason: string | undefined;
66
+ if (!isCorrect && responseText?.includes(':')) {
67
+ reason = responseText.split(':')[1]?.trim();
68
+ }
69
+
70
+ return {
71
+ score: isCorrect ? 1 : 0,
72
+ info: {
73
+ questionType: this.questionType,
74
+ isAbstention: this.isAbstention,
75
+ evaluatorResponse: responseText,
76
+ reason,
77
+ },
78
+ };
79
+ } catch (error) {
80
+ throw new Error(`Failed to evaluate response: ${error}`);
81
+ }
82
+ }
83
+
84
+ /**
85
+ * Get the evaluation prompt based on question type
86
+ */
87
+ private getEvalPrompt(
88
+ taskType: QuestionType,
89
+ question: string,
90
+ answer: string,
91
+ response: string,
92
+ isAbstention: boolean,
93
+ ): string {
94
+ if (isAbstention) {
95
+ return `I will give you an unanswerable question, an explanation, and a response from a model. Please answer yes if the model correctly identifies the question as unanswerable. The model could say that the information is incomplete, or some other information is given but the asked information is not.
96
+
97
+ Question: ${question}
98
+
99
+ Explanation: ${answer}
100
+
101
+ Model Response: ${response}
102
+
103
+ Does the model correctly identify the question as unanswerable? Answer yes or no only. If you answer "no" please also include a reason why (ex "no: because x reason")`;
104
+ }
105
+
106
+ switch (taskType) {
107
+ case 'single-session-user':
108
+ case 'single-session-assistant':
109
+ case 'multi-session':
110
+ return `I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no.
111
+ If you answer "no" please also include a reason why (ex "no: because x reason")
112
+
113
+ Question: ${question}
114
+
115
+ Correct Answer: ${answer}
116
+
117
+ Model Response: ${response}
118
+
119
+ Is the model response correct? Answer yes or no only. If you answer "no" please also include a reason why (ex "no: because x reason")`;
120
+
121
+ case 'temporal-reasoning':
122
+ return `I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response is equivalent to the correct answer or contains all the intermediate steps to get the correct answer, you should also answer yes. If the response only contains a subset of the information required by the answer, answer no. In addition, do not penalize off-by-one errors for the number of days. If the question asks for the number of days/weeks/months, etc., and the model makes off-by-one errors (e.g., predicting 19 days when the answer is 18), the model's response is still correct.
123
+
124
+ Question: ${question}
125
+
126
+ Correct Answer: ${answer}
127
+
128
+ Model Response: ${response}
129
+
130
+ Is the model response correct? Answer yes or no only. If you answer "no" please also include a reason why (ex "no: because x reason")`;
131
+
132
+ case 'knowledge-update':
133
+ return `I will give you a question, a correct answer, and a response from a model. Please answer yes if the response contains the correct answer. Otherwise, answer no. If the response contains some previous information along with an updated answer, the response should be considered as correct as long as the updated answer is the required answer.
134
+
135
+ Question: ${question}
136
+
137
+ Correct Answer: ${answer}
138
+
139
+ Model Response: ${response}
140
+
141
+ Is the model response correct? Answer yes or no only.`;
142
+
143
+ case 'single-session-preference':
144
+ return `I will give you a question, a rubric for desired personalized response, and a response from a model. Please answer yes if the response satisfies the desired response. Otherwise, answer no and provide a reason why. The model does not need to reflect all the points in the rubric. The response is correct as long as it recalls and utilizes the user's personal information correctly.
145
+
146
+ Question: ${question}
147
+
148
+ Rubric: ${answer}
149
+
150
+ Model Response: ${response}
151
+
152
+ Is the model response correct? Answer yes or no only. If you answer "no" please also include a reason why (ex "no: because x reason")`;
153
+
154
+ default:
155
+ throw new Error(`Unknown question type: ${taskType}`);
156
+ }
157
+ }
158
+ }
159
+
160
+ /**
161
+ * Factory function to create LongMemEval metrics for different question types
162
+ */
163
+ export function createLongMemEvalMetric(
164
+ questionType: QuestionType,
165
+ agent: Agent,
166
+ options?: Partial<LongMemEvalMetricConfig>,
167
+ ): LongMemEvalMetric {
168
+ return new LongMemEvalMetric({
169
+ ...options,
170
+ agent,
171
+ questionType,
172
+ });
173
+ }
@@ -0,0 +1,60 @@
1
+ import { LanguageModel, wrapLanguageModel } from 'ai';
2
+
3
/**
 * Wraps a language model with middleware that retries generate calls on HTTP
 * 429 (rate-limit) errors, up to 10 attempts, using a single shared pause that
 * all in-flight calls through this wrapper wait on.
 *
 * Returns both the wrapped model and the mutable `state` object so callers can
 * observe `rateLimitCount` and the pause bookkeeping.
 */
export function makeRetryModel(model: LanguageModel) {
  // Shared across every call through the wrapped model: one global pause
  // gates all requests while backing off.
  const state = {
    rateLimitCount: 0, // total 429s observed (never reset here)
    pause: null as null | Promise<void>, // non-null while a backoff pause is pending
    pauseResolve: () => {}, // NOTE(review): never assigned or called below — confirm it can be removed
    pauseTime: 0, // duration (ms) of the currently scheduled pause
  };
  const wrapped = wrapLanguageModel({
    model,
    middleware: {
      wrapGenerate: async ({ doGenerate }) => {
        // If another call already triggered a backoff, wait it out first.
        if (state.pause) await state.pause;
        const maxRetries = 10;
        let retries = 0;
        while (retries < maxRetries) {
          try {
            const result = await doGenerate();
            return result;
          } catch (error: any) {
            // 429 may surface as `status` or `statusCode` depending on the client.
            if (error.status === 429 || error.statusCode === 429) {
              retries++;
              state.rateLimitCount++;
              // Linear backoff: 2s, 4s, ... per attempt of THIS call.
              const newPauseTime = 2000 * retries;
              // Wait out any pause already in progress before deciding.
              if (state.pause) {
                await state.pause;
              }
              if (retries >= maxRetries) {
                throw error;
              }
              // A pause at least this long is already scheduled (or was just
              // served) — skip scheduling a shorter one and retry immediately.
              if (newPauseTime <= state.pauseTime) {
                continue;
              }
              if (!state.pause) {
                // Install a new shared pause. The timer resolves waiters
                // BEFORE clearing the shared state, so the reset below cannot
                // strand anyone awaiting this promise.
                state.pauseTime = newPauseTime;
                state.pause = new Promise(resolve => {
                  setTimeout(() => {
                    resolve();
                    state.pause = null;
                    state.pauseTime = 0;
                  }, state.pauseTime);
                });
              }
              await state.pause;
            } else {
              // Non-rate-limit errors are not retried.
              throw error;
            }
          }
        }
        // The loop only exits via return/throw above; this guards the invariant.
        throw new Error('unhandled');
      },
    },
  });

  return {
    model: wrapped,
    state,
  };
}
@@ -0,0 +1,280 @@
1
+ import { describe, it, expect, beforeEach, afterEach } from 'vitest';
2
+ import { BenchmarkStore } from '../benchmark-store';
3
+ import { rm } from 'fs/promises';
4
+ import { existsSync } from 'fs';
5
+ import { join } from 'path';
6
+ import { tmpdir } from 'os';
7
+
8
// Unit tests for BenchmarkStore covering: capability flags, file persistence
// and hydration, cross-thread message retrieval via selectBy.include, resource
// working-memory operations, clearing, and threadId input validation.
describe('BenchmarkStore', () => {
  let store: BenchmarkStore;
  let testFilePath: string;

  beforeEach(async () => {
    store = new BenchmarkStore();
    await store.init();
    // Timestamped temp file so parallel/sequential runs are unlikely to collide.
    testFilePath = join(tmpdir(), `benchmark-store-test-${Date.now()}.json`);
  });

  afterEach(async () => {
    // Clean up test files
    if (existsSync(testFilePath)) {
      await rm(testFilePath);
    }
  });

  describe('supports', () => {
    it('should support resource scope and working memory', () => {
      expect(store.supports.selectByIncludeResourceScope).toBe(true);
      expect(store.supports.resourceWorkingMemory).toBe(true);
    });
  });

  describe('persist', () => {
    it('should save store data to a JSON file', async () => {
      // Add some test data
      await store.saveThread({
        thread: {
          id: 'test-thread-1',
          resourceId: 'test-resource-1',
          title: 'Test Thread',
          metadata: { test: true },
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      await store.saveMessages({
        messages: [
          {
            id: 'msg-1',
            threadId: 'test-thread-1',
            resourceId: 'test-resource-1',
            role: 'user' as const,
            content: 'Hello',
            createdAt: new Date(),
            type: 'text' as const,
          },
          {
            id: 'msg-2',
            threadId: 'test-thread-1',
            resourceId: 'test-resource-1',
            role: 'assistant' as const,
            content: 'Hi there!',
            createdAt: new Date(),
            type: 'text' as const,
          },
        ],
      });

      // Persist to file
      await store.persist(testFilePath);

      // Verify file exists (content round-trip is covered by the hydrate tests)
      expect(existsSync(testFilePath)).toBe(true);
    });
  });

  describe('hydrate', () => {
    it('should restore store data from a JSON file', async () => {
      // Create first store with data
      const store1 = new BenchmarkStore();
      await store1.init();

      const thread = {
        id: 'test-thread-1',
        resourceId: 'test-resource-1',
        title: 'Test Thread',
        metadata: { test: true },
        createdAt: new Date(),
        updatedAt: new Date(),
      };

      await store1.saveThread({ thread });
      await store1.saveMessages({
        messages: [
          {
            id: 'msg-1',
            threadId: 'test-thread-1',
            resourceId: 'test-resource-1',
            role: 'user' as const,
            content: 'Hello',
            createdAt: new Date(),
            type: 'text' as const,
          },
        ],
      });

      // Persist store1
      await store1.persist(testFilePath);

      // Create new store and hydrate from the file written above
      const store2 = new BenchmarkStore();
      await store2.init();
      await store2.hydrate(testFilePath);

      // Verify data was restored
      const restoredThread = await store2.getThreadById({ threadId: 'test-thread-1' });
      expect(restoredThread).toBeTruthy();
      expect(restoredThread?.title).toBe('Test Thread');

      const restoredMessages = await store2.getMessages({ threadId: 'test-thread-1' });
      expect(restoredMessages).toHaveLength(1);
      expect(restoredMessages[0].content).toBe('Hello');
    });

    it('should throw error if file does not exist', async () => {
      await expect(store.hydrate('/non/existent/file.json')).rejects.toThrow('Storage file not found');
    });
  });

  describe('cross-thread queries (resource scope)', () => {
    it('should support selectBy.include with different threadIds', async () => {
      // Create messages in different threads but same resource
      await store.saveThread({
        thread: {
          id: 'thread-1',
          resourceId: 'resource-1',
          title: 'Thread 1',
          metadata: {},
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      await store.saveThread({
        thread: {
          id: 'thread-2',
          resourceId: 'resource-1',
          title: 'Thread 2',
          metadata: {},
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      // Distinct createdAt dates fix the ordering asserted at the bottom.
      await store.saveMessages({
        messages: [
          {
            id: 'msg-1',
            threadId: 'thread-1',
            resourceId: 'resource-1',
            role: 'user' as const,
            content: 'Message in thread 1',
            createdAt: new Date('2024-01-01'),
            type: 'text' as const,
          },
          {
            id: 'msg-2',
            threadId: 'thread-2',
            resourceId: 'resource-1',
            role: 'user' as const,
            content: 'Message in thread 2',
            createdAt: new Date('2024-01-02'),
            type: 'text' as const,
          },
          {
            id: 'msg-3',
            threadId: 'thread-2',
            resourceId: 'resource-1',
            role: 'assistant' as const,
            content: 'Response in thread 2',
            createdAt: new Date('2024-01-03'),
            type: 'text' as const,
          },
        ],
      });

      // Query using selectBy.include to get messages from different threads
      const messages = await store.getMessages({
        threadId: 'thread-1',
        selectBy: {
          include: [
            {
              id: 'msg-2',
              threadId: 'thread-2', // Different thread!
              withPreviousMessages: 0,
              withNextMessages: 1,
            },
          ],
        },
      });

      // Expect msg-2 plus its one following message (msg-3), both from thread-2.
      expect(messages).toHaveLength(2);
      expect(messages[0].content).toBe('Message in thread 2');
      expect(messages[1].content).toBe('Response in thread 2');
    });
  });

  describe('resource operations', () => {
    it('should support resource working memory', async () => {
      const resource = await store.saveResource({
        resource: {
          id: 'resource-1',
          workingMemory: 'Initial working memory',
          metadata: { key: 'value' },
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      expect(resource.workingMemory).toBe('Initial working memory');

      // Update resource — metadata is asserted below as a full replacement
      // of the original object, not a shallow merge.
      const updated = await store.updateResource({
        resourceId: 'resource-1',
        workingMemory: 'Updated working memory',
        metadata: { key: 'newValue', extra: 'data' },
      });

      expect(updated.workingMemory).toBe('Updated working memory');
      expect(updated.metadata).toEqual({ key: 'newValue', extra: 'data' });

      // Get resource
      const retrieved = await store.getResourceById({ resourceId: 'resource-1' });
      expect(retrieved?.workingMemory).toBe('Updated working memory');
    });
  });

  describe('clear', () => {
    it('should clear all data', async () => {
      // Add data
      await store.saveThread({
        thread: {
          id: 'test-thread-1',
          resourceId: 'test-resource-1',
          title: 'Test Thread',
          metadata: {},
          createdAt: new Date(),
          updatedAt: new Date(),
        },
      });

      // Clear
      await store.clear();

      // Verify data is gone
      const thread = await store.getThreadById({ threadId: 'test-thread-1' });
      expect(thread).toBeNull();
    });
  });

  describe('getting messages', () => {
    it('should throw when threadId is an empty string or whitespace only', async () => {
      await expect(() => store.getMessages({ threadId: '' })).rejects.toThrowError(
        'threadId must be a non-empty string',
      );

      await expect(() => store.getMessagesPaginated({ threadId: '' })).rejects.toThrowError(
        'threadId must be a non-empty string',
      );

      // Whitespace-only ids must be rejected the same way as empty ones.
      await expect(() => store.getMessages({ threadId: ' ' })).rejects.toThrowError(
        'threadId must be a non-empty string',
      );

      await expect(() => store.getMessagesPaginated({ threadId: ' ' })).rejects.toThrowError(
        'threadId must be a non-empty string',
      );
    });
  });
});