@mastra/evals 0.1.0-alpha.17 → 0.1.0-alpha.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
- import { describe, it, expect, jest } from '@jest/globals';
2
1
  import { type ModelConfig } from '@mastra/core';
2
+ import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCaseWithContext } from '../utils';
5
5
 
@@ -150,7 +150,6 @@ const testCases: TestCaseWithContext[] = [
150
150
  ];
151
151
 
152
152
  const SECONDS = 10000;
153
- jest.setTimeout(15 * SECONDS);
154
153
 
155
154
  const modelConfig: ModelConfig = {
156
155
  provider: 'OPEN_AI',
@@ -159,88 +158,94 @@ const modelConfig: ModelConfig = {
159
158
  apiKey: process.env.OPENAI_API_KEY,
160
159
  };
161
160
 
162
- describe('ContextPositionMetric', () => {
163
- it('should handle perfect ordering with all relevant pieces', async () => {
164
- const testCase = testCases[0]!;
165
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
166
- const result = await metric.measure(testCase.input, testCase.output);
167
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
168
- });
169
-
170
- it('should handle mixed relevance case', async () => {
171
- const testCase = testCases[1]!;
172
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
173
- const result = await metric.measure(testCase.input, testCase.output);
174
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
175
- });
176
-
177
- it('should handle domain knowledge relevance', async () => {
178
- const testCase = testCases[2]!;
179
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
180
- const result = await metric.measure(testCase.input, testCase.output);
181
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
182
- });
183
-
184
- it('should handle mixed relevance with good ordering', async () => {
185
- const testCase = testCases[3]!;
186
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
187
- const result = await metric.measure(testCase.input, testCase.output);
188
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
189
- });
190
-
191
- it('should handle single relevant piece at start', async () => {
192
- const testCase = testCases[4]!;
193
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
194
- const result = await metric.measure(testCase.input, testCase.output);
195
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
196
- });
197
-
198
- it('should handle single relevant piece in middle', async () => {
199
- const testCase = testCases[5]!;
200
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
201
- const result = await metric.measure(testCase.input, testCase.output);
202
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
203
- });
204
-
205
- it('should handle single relevant piece at end', async () => {
206
- const testCase = testCases[6]!;
207
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
208
- const result = await metric.measure(testCase.input, testCase.output);
209
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
210
- });
211
-
212
- it('should handle empty context', async () => {
213
- const testCase = testCases[7]!;
214
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
215
- const result = await metric.measure(testCase.input, testCase.output);
216
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
217
- });
218
-
219
- it('should handle all irrelevant context', async () => {
220
- const testCase = testCases[8]!;
221
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
222
- const result = await metric.measure(testCase.input, testCase.output);
223
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
224
- });
225
-
226
- it('should handle complex interdependent context', async () => {
227
- const testCase = testCases[9]!;
228
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
229
- const result = await metric.measure(testCase.input, testCase.output);
230
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
231
- });
232
-
233
- it('should handle single piece context', async () => {
234
- const testCase = testCases[10]!;
235
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
236
- const result = await metric.measure(testCase.input, testCase.output);
237
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
238
- });
239
-
240
- it('should handle two relevant pieces at end', async () => {
241
- const testCase = testCases[11]!;
242
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
243
- const result = await metric.measure(testCase.input, testCase.output);
244
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
245
- });
246
- });
161
+ describe(
162
+ 'ContextPositionMetric',
163
+ () => {
164
+ it('should handle perfect ordering with all relevant pieces', async () => {
165
+ const testCase = testCases[0]!;
166
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
167
+ const result = await metric.measure(testCase.input, testCase.output);
168
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
169
+ });
170
+
171
+ it('should handle mixed relevance case', async () => {
172
+ const testCase = testCases[1]!;
173
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
174
+ const result = await metric.measure(testCase.input, testCase.output);
175
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
176
+ });
177
+
178
+ it('should handle domain knowledge relevance', async () => {
179
+ const testCase = testCases[2]!;
180
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
181
+ const result = await metric.measure(testCase.input, testCase.output);
182
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
183
+ });
184
+
185
+ it('should handle mixed relevance with good ordering', async () => {
186
+ const testCase = testCases[3]!;
187
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
188
+ const result = await metric.measure(testCase.input, testCase.output);
189
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
190
+ });
191
+
192
+ it('should handle single relevant piece at start', async () => {
193
+ const testCase = testCases[4]!;
194
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
195
+ const result = await metric.measure(testCase.input, testCase.output);
196
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
197
+ });
198
+
199
+ it('should handle single relevant piece in middle', async () => {
200
+ const testCase = testCases[5]!;
201
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
202
+ const result = await metric.measure(testCase.input, testCase.output);
203
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
204
+ });
205
+
206
+ it('should handle single relevant piece at end', async () => {
207
+ const testCase = testCases[6]!;
208
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
209
+ const result = await metric.measure(testCase.input, testCase.output);
210
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
211
+ });
212
+
213
+ it('should handle empty context', async () => {
214
+ const testCase = testCases[7]!;
215
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
216
+ const result = await metric.measure(testCase.input, testCase.output);
217
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
218
+ });
219
+
220
+ it('should handle all irrelevant context', async () => {
221
+ const testCase = testCases[8]!;
222
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
223
+ const result = await metric.measure(testCase.input, testCase.output);
224
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
225
+ });
226
+
227
+ it('should handle complex interdependent context', async () => {
228
+ const testCase = testCases[9]!;
229
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
230
+ const result = await metric.measure(testCase.input, testCase.output);
231
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
232
+ });
233
+
234
+ it('should handle single piece context', async () => {
235
+ const testCase = testCases[10]!;
236
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
237
+ const result = await metric.measure(testCase.input, testCase.output);
238
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
239
+ });
240
+
241
+ it('should handle two relevant pieces at end', async () => {
242
+ const testCase = testCases[11]!;
243
+ const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
244
+ const result = await metric.measure(testCase.input, testCase.output);
245
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
246
+ });
247
+ },
248
+ {
249
+ timeout: 15 * SECONDS,
250
+ },
251
+ );
@@ -1,5 +1,5 @@
1
- import { describe, it, expect, jest } from '@jest/globals';
2
1
  import { type ModelConfig } from '@mastra/core';
2
+ import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCaseWithContext } from '../utils';
5
5
 
@@ -127,7 +127,6 @@ const testCases: TestCaseWithContext[] = [
127
127
  ];
128
128
 
129
129
  const SECONDS = 10000;
130
- jest.setTimeout(15 * SECONDS);
131
130
 
132
131
  const modelConfig: ModelConfig = {
133
132
  provider: 'OPEN_AI',
@@ -136,74 +135,80 @@ const modelConfig: ModelConfig = {
136
135
  apiKey: process.env.OPENAI_API_KEY,
137
136
  };
138
137
 
139
- describe('ContextPrecisionMetric', () => {
140
- it('should measure perfect context precision with all relevant items', async () => {
141
- const testCase = testCases[0]!;
142
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
143
- const result = await metric.measure(testCase.input, testCase.output);
144
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
145
- });
138
+ describe(
139
+ 'ContextPrecisionMetric',
140
+ () => {
141
+ it('should measure perfect context precision with all relevant items', async () => {
142
+ const testCase = testCases[0]!;
143
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
144
+ const result = await metric.measure(testCase.input, testCase.output);
145
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
146
+ });
146
147
 
147
- it('should measure high precision with irrelevant item at end', async () => {
148
- const testCase = testCases[1]!;
149
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
150
- const result = await metric.measure(testCase.input, testCase.output);
151
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
152
- });
148
+ it('should measure high precision with irrelevant item at end', async () => {
149
+ const testCase = testCases[1]!;
150
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
151
+ const result = await metric.measure(testCase.input, testCase.output);
152
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
153
+ });
153
154
 
154
- it('should measure precision with two relevant items after irrelevant start', async () => {
155
- const testCase = testCases[2]!;
156
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
157
- const result = await metric.measure(testCase.input, testCase.output);
158
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
159
- });
155
+ it('should measure precision with two relevant items after irrelevant start', async () => {
156
+ const testCase = testCases[2]!;
157
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
158
+ const result = await metric.measure(testCase.input, testCase.output);
159
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
160
+ });
160
161
 
161
- it('should measure precision with alternating relevant items', async () => {
162
- const testCase = testCases[3]!;
163
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
164
- const result = await metric.measure(testCase.input, testCase.output);
165
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
166
- });
162
+ it('should measure precision with alternating relevant items', async () => {
163
+ const testCase = testCases[3]!;
164
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
165
+ const result = await metric.measure(testCase.input, testCase.output);
166
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
167
+ });
167
168
 
168
- it('should measure precision with single relevant item at start', async () => {
169
- const testCase = testCases[4]!;
170
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
171
- const result = await metric.measure(testCase.input, testCase.output);
172
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
173
- });
169
+ it('should measure precision with single relevant item at start', async () => {
170
+ const testCase = testCases[4]!;
171
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
172
+ const result = await metric.measure(testCase.input, testCase.output);
173
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
174
+ });
174
175
 
175
- it('should handle completely irrelevant context', async () => {
176
- const testCase = testCases[5]!;
177
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
178
- const result = await metric.measure(testCase.input, testCase.output);
179
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
180
- });
176
+ it('should handle completely irrelevant context', async () => {
177
+ const testCase = testCases[5]!;
178
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
179
+ const result = await metric.measure(testCase.input, testCase.output);
180
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
181
+ });
181
182
 
182
- it('should handle single relevant context perfectly', async () => {
183
- const testCase = testCases[6]!;
184
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
185
- const result = await metric.measure(testCase.input, testCase.output);
186
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
187
- });
183
+ it('should handle single relevant context perfectly', async () => {
184
+ const testCase = testCases[6]!;
185
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
186
+ const result = await metric.measure(testCase.input, testCase.output);
187
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
188
+ });
188
189
 
189
- it('should measure precision with single relevant item at end', async () => {
190
- const testCase = testCases[7]!;
191
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
192
- const result = await metric.measure(testCase.input, testCase.output);
193
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
194
- });
190
+ it('should measure precision with single relevant item at end', async () => {
191
+ const testCase = testCases[7]!;
192
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
193
+ const result = await metric.measure(testCase.input, testCase.output);
194
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
195
+ });
195
196
 
196
- it('should handle empty context', async () => {
197
- const testCase = testCases[8]!;
198
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
199
- const result = await metric.measure(testCase.input, testCase.output);
200
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
201
- });
197
+ it('should handle empty context', async () => {
198
+ const testCase = testCases[8]!;
199
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
200
+ const result = await metric.measure(testCase.input, testCase.output);
201
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
202
+ });
202
203
 
203
- it('should handle single irrelevant context', async () => {
204
- const testCase = testCases[9]!;
205
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
206
- const result = await metric.measure(testCase.input, testCase.output);
207
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
208
- });
209
- });
204
+ it('should handle single irrelevant context', async () => {
205
+ const testCase = testCases[9]!;
206
+ const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
207
+ const result = await metric.measure(testCase.input, testCase.output);
208
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
209
+ });
210
+ },
211
+ {
212
+ timeout: 15 * SECONDS,
213
+ },
214
+ );
@@ -1,5 +1,5 @@
1
- import { describe, it, expect, jest } from '@jest/globals';
2
1
  import { type ModelConfig } from '@mastra/core';
2
+ import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { isCloserTo } from '../utils';
5
5
  import { TestCaseWithContext } from '../utils';
@@ -55,7 +55,6 @@ const testCases: TestCaseWithContext[] = [
55
55
  ];
56
56
 
57
57
  const SECONDS = 10000;
58
- jest.setTimeout(15 * SECONDS);
59
58
 
60
59
  const modelConfig: ModelConfig = {
61
60
  provider: 'OPEN_AI',
@@ -64,25 +63,31 @@ const modelConfig: ModelConfig = {
64
63
  apiKey: process.env.OPENAI_API_KEY,
65
64
  };
66
65
 
67
- describe('ContextPrecisionMetric', () => {
68
- it('should measure perfect context relevancy with all relevant items', async () => {
69
- const testCase = testCases[0]!;
70
- const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
71
- const result = await metric.measure(testCase.input, testCase.output);
72
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
73
- });
66
+ describe(
67
+ 'ContextPrecisionMetric',
68
+ () => {
69
+ it('should measure perfect context relevancy with all relevant items', async () => {
70
+ const testCase = testCases[0]!;
71
+ const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
72
+ const result = await metric.measure(testCase.input, testCase.output);
73
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
74
+ });
74
75
 
75
- it('should measure mixed relevancy where only some contexts are relevant', async () => {
76
- const testCase = testCases[1]!;
77
- const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
78
- const result = await metric.measure(testCase.input, testCase.output);
79
- expect(isCloserTo(result.score, testCase.expectedResult.score, 0)).toBe(true);
80
- });
76
+ it('should measure mixed relevancy where only some contexts are relevant', async () => {
77
+ const testCase = testCases[1]!;
78
+ const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
79
+ const result = await metric.measure(testCase.input, testCase.output);
80
+ expect(isCloserTo(result.score, testCase.expectedResult.score, 0)).toBe(true);
81
+ });
81
82
 
82
- it('should measure no relevancy where contexts are completely unrelated', async () => {
83
- const testCase = testCases[2]!;
84
- const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
85
- const result = await metric.measure(testCase.input, testCase.output);
86
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
87
- });
88
- });
83
+ it('should measure no relevancy where contexts are completely unrelated', async () => {
84
+ const testCase = testCases[2]!;
85
+ const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
86
+ const result = await metric.measure(testCase.input, testCase.output);
87
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
88
+ });
89
+ },
90
+ {
91
+ timeout: 15 * SECONDS,
92
+ },
93
+ );
@@ -1,5 +1,5 @@
1
- import { describe, it, expect, jest } from '@jest/globals';
2
1
  import { type ModelConfig } from '@mastra/core';
2
+ import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { isCloserTo } from '../utils';
5
5
  import { TestCaseWithContext } from '../utils';
@@ -51,7 +51,6 @@ const testCases: TestCaseWithContext[] = [
51
51
  ];
52
52
 
53
53
  const SECONDS = 10000;
54
- jest.setTimeout(15 * SECONDS);
55
54
 
56
55
  const modelConfig: ModelConfig = {
57
56
  provider: 'OPEN_AI',
@@ -60,27 +59,33 @@ const modelConfig: ModelConfig = {
60
59
  apiKey: process.env.OPENAI_API_KEY,
61
60
  };
62
61
 
63
- describe('ContextualRecallMetric', () => {
64
- it('should succeed when context is relevant', async () => {
65
- const testCase = testCases[0]!;
66
- const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
67
- const result = await metric.measure(testCase.input, testCase.output);
68
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
69
- });
62
+ describe(
63
+ 'ContextualRecallMetric',
64
+ () => {
65
+ it('should succeed when context is relevant', async () => {
66
+ const testCase = testCases[0]!;
67
+ const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
68
+ const result = await metric.measure(testCase.input, testCase.output);
69
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
70
+ });
70
71
 
71
- it('should be mixed', async () => {
72
- const testCase = testCases[1]!;
73
- const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
74
- const result = await metric.measure(testCase.input, testCase.output);
72
+ it('should be mixed', async () => {
73
+ const testCase = testCases[1]!;
74
+ const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
75
+ const result = await metric.measure(testCase.input, testCase.output);
75
76
 
76
- expect(isCloserTo(result.score, testCase.expectedResult.score, 1)).toBe(true);
77
- expect(result.score - testCase.expectedResult.score).toBeGreaterThan(0);
78
- });
77
+ expect(isCloserTo(result.score, testCase.expectedResult.score, 1)).toBe(true);
78
+ expect(result.score - testCase.expectedResult.score).toBeGreaterThan(0);
79
+ });
79
80
 
80
- it('should be none', async () => {
81
- const testCase = testCases[2]!;
82
- const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
83
- const result = await metric.measure(testCase.input, testCase.output);
84
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
85
- });
86
- });
81
+ it('should be none', async () => {
82
+ const testCase = testCases[2]!;
83
+ const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
84
+ const result = await metric.measure(testCase.input, testCase.output);
85
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
86
+ });
87
+ },
88
+ {
89
+ timeout: 15 * SECONDS,
90
+ },
91
+ );