@mastra/evals 0.1.0-alpha.17 → 0.1.0-alpha.21

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
- import { describe, it, expect, jest } from '@jest/globals';
2
1
  import { type ModelConfig } from '@mastra/core';
2
+ import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCaseWithContext } from '../utils';
5
5
 
@@ -146,7 +146,6 @@ const testCases: TestCaseWithContext[] = [
146
146
  ];
147
147
 
148
148
  const SECONDS = 10000;
149
- jest.setTimeout(15 * SECONDS);
150
149
 
151
150
  const modelConfig: ModelConfig = {
152
151
  provider: 'OPEN_AI',
@@ -155,100 +154,106 @@ const modelConfig: ModelConfig = {
155
154
  apiKey: process.env.OPENAI_API_KEY,
156
155
  };
157
156
 
158
- describe('FaithfulnessMetric', () => {
159
- it('should handle perfect faithfulness', async () => {
160
- const testCase = testCases[0]!;
161
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
162
- const result = await metric.measure(testCase.input, testCase.output);
157
+ describe(
158
+ 'FaithfulnessMetric',
159
+ () => {
160
+ it('should handle perfect faithfulness', async () => {
161
+ const testCase = testCases[0]!;
162
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
163
+ const result = await metric.measure(testCase.input, testCase.output);
163
164
 
164
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
165
- });
165
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
166
+ });
166
167
 
167
- it('should handle mixed faithfulness with contradictions', async () => {
168
- const testCase = testCases[1]!;
169
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
170
- const result = await metric.measure(testCase.input, testCase.output);
168
+ it('should handle mixed faithfulness with contradictions', async () => {
169
+ const testCase = testCases[1]!;
170
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
171
+ const result = await metric.measure(testCase.input, testCase.output);
171
172
 
172
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
173
- });
173
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
174
+ });
174
175
 
175
- it('should handle claims with speculative language', async () => {
176
- const testCase = testCases[2]!;
177
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
178
- const result = await metric.measure(testCase.input, testCase.output);
176
+ it('should handle claims with speculative language', async () => {
177
+ const testCase = testCases[2]!;
178
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
179
+ const result = await metric.measure(testCase.input, testCase.output);
179
180
 
180
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
181
- });
181
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
182
+ });
182
183
 
183
- it('should handle empty output', async () => {
184
- const testCase = testCases[3]!;
185
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
186
- const result = await metric.measure(testCase.input, testCase.output);
184
+ it('should handle empty output', async () => {
185
+ const testCase = testCases[3]!;
186
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
187
+ const result = await metric.measure(testCase.input, testCase.output);
187
188
 
188
- expect(result.score).toBe(testCase.expectedResult.score);
189
- });
189
+ expect(result.score).toBe(testCase.expectedResult.score);
190
+ });
190
191
 
191
- it('should handle empty context', async () => {
192
- const testCase = testCases[4]!;
193
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
194
- const result = await metric.measure(testCase.input, testCase.output);
192
+ it('should handle empty context', async () => {
193
+ const testCase = testCases[4]!;
194
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
195
+ const result = await metric.measure(testCase.input, testCase.output);
195
196
 
196
- expect(result.score).toBe(testCase.expectedResult.score);
197
- });
197
+ expect(result.score).toBe(testCase.expectedResult.score);
198
+ });
198
199
 
199
- it('should handle subjective claims', async () => {
200
- const testCase = testCases[5]!;
201
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
202
- const result = await metric.measure(testCase.input, testCase.output);
200
+ it('should handle subjective claims', async () => {
201
+ const testCase = testCases[5]!;
202
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
203
+ const result = await metric.measure(testCase.input, testCase.output);
203
204
 
204
- expect(result.score).toBe(testCase.expectedResult.score);
205
- });
205
+ expect(result.score).toBe(testCase.expectedResult.score);
206
+ });
206
207
 
207
- it('should handle claims with speculative language appropriately', async () => {
208
- const testCase = testCases[6]!;
209
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
210
- const result = await metric.measure(testCase.input, testCase.output);
208
+ it('should handle claims with speculative language appropriately', async () => {
209
+ const testCase = testCases[6]!;
210
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
211
+ const result = await metric.measure(testCase.input, testCase.output);
211
212
 
212
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
213
- });
213
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
214
+ });
214
215
 
215
- it('should handle compound statements correctly', async () => {
216
- const testCase = testCases[7]!;
217
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
218
- const result = await metric.measure(testCase.input, testCase.output);
216
+ it('should handle compound statements correctly', async () => {
217
+ const testCase = testCases[7]!;
218
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
219
+ const result = await metric.measure(testCase.input, testCase.output);
219
220
 
220
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
221
- });
221
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
222
+ });
222
223
 
223
- it('should handle precise numerical claims', async () => {
224
- const testCase = testCases[8]!;
225
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
226
- const result = await metric.measure(testCase.input, testCase.output);
224
+ it('should handle precise numerical claims', async () => {
225
+ const testCase = testCases[8]!;
226
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
227
+ const result = await metric.measure(testCase.input, testCase.output);
227
228
 
228
- expect(result.score).toBe(testCase.expectedResult.score);
229
- });
229
+ expect(result.score).toBe(testCase.expectedResult.score);
230
+ });
230
231
 
231
- it('should handle partially supported claims', async () => {
232
- const testCase = testCases[9]!;
233
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
234
- const result = await metric.measure(testCase.input, testCase.output);
232
+ it('should handle partially supported claims', async () => {
233
+ const testCase = testCases[9]!;
234
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
235
+ const result = await metric.measure(testCase.input, testCase.output);
235
236
 
236
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
237
- });
237
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
238
+ });
238
239
 
239
- it('should handle mixed factual and speculative claims', async () => {
240
- const testCase = testCases[10]!;
241
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
242
- const result = await metric.measure(testCase.input, testCase.output);
240
+ it('should handle mixed factual and speculative claims', async () => {
241
+ const testCase = testCases[10]!;
242
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
243
+ const result = await metric.measure(testCase.input, testCase.output);
243
244
 
244
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
245
- });
245
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
246
+ });
246
247
 
247
- it('should handle implicit information appropriately', async () => {
248
- const testCase = testCases[11]!;
249
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
250
- const result = await metric.measure(testCase.input, testCase.output);
248
+ it('should handle implicit information appropriately', async () => {
249
+ const testCase = testCases[11]!;
250
+ const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
251
+ const result = await metric.measure(testCase.input, testCase.output);
251
252
 
252
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
253
- });
254
- });
253
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
254
+ });
255
+ },
256
+ {
257
+ timeout: 15 * SECONDS,
258
+ },
259
+ );
@@ -1,5 +1,5 @@
1
- import { describe, it, expect, jest } from '@jest/globals';
2
1
  import { type ModelConfig } from '@mastra/core';
2
+ import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCaseWithContext } from '../utils';
5
5
 
@@ -128,7 +128,6 @@ const testCases: TestCaseWithContext[] = [
128
128
  ];
129
129
 
130
130
  const SECONDS = 10000;
131
- jest.setTimeout(15 * SECONDS);
132
131
 
133
132
  const modelConfig: ModelConfig = {
134
133
  provider: 'OPEN_AI',
@@ -137,81 +136,87 @@ const modelConfig: ModelConfig = {
137
136
  apiKey: process.env.OPENAI_API_KEY,
138
137
  };
139
138
 
140
- describe('HallucinationMetric', () => {
141
- it('should handle perfect alignment', async () => {
142
- const testCase = testCases[0]!;
143
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
144
- const result = await metric.measure(testCase.input, testCase.output);
145
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
146
- });
147
-
148
- it('should handle complete hallucination', async () => {
149
- const testCase = testCases[1]!;
150
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
151
- const result = await metric.measure(testCase.input, testCase.output);
152
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
153
- });
154
-
155
- it('should handle partial hallucination', async () => {
156
- const testCase = testCases[2]!;
157
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
158
- const result = await metric.measure(testCase.input, testCase.output);
159
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
160
- });
161
-
162
- it('should handle empty output', async () => {
163
- const testCase = testCases[3]!;
164
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
165
- const result = await metric.measure(testCase.input, testCase.output);
166
- expect(result.score).toBe(testCase.expectedResult.score);
167
- });
168
-
169
- it('should handle speculative language', async () => {
170
- const testCase = testCases[4]!;
171
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
172
- const result = await metric.measure(testCase.input, testCase.output);
173
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
174
- });
175
-
176
- it('should handle empty context', async () => {
177
- const testCase = testCases[5]!;
178
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
179
- const result = await metric.measure(testCase.input, testCase.output);
180
- expect(result.score).toBe(testCase.expectedResult.score);
181
- });
182
-
183
- it('should handle implicit contradictions', async () => {
184
- const testCase = testCases[6]!;
185
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
186
- const result = await metric.measure(testCase.input, testCase.output);
187
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
188
- });
189
-
190
- it('should handle numerical approximations', async () => {
191
- const testCase = testCases[7]!;
192
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
193
- const result = await metric.measure(testCase.input, testCase.output);
194
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
195
- });
196
-
197
- it('should handle out of scope additions', async () => {
198
- const testCase = testCases[8]!;
199
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
200
- const result = await metric.measure(testCase.input, testCase.output);
201
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
202
- });
203
-
204
- it('should handle temporal contradictions', async () => {
205
- const testCase = testCases[9]!;
206
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
207
- const result = await metric.measure(testCase.input, testCase.output);
208
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
209
- });
210
-
211
- it('should handle numerical contradiction despite approximation', async () => {
212
- const testCase = testCases[10]!;
213
- const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
214
- const result = await metric.measure(testCase.input, testCase.output);
215
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
216
- });
217
- });
139
+ describe(
140
+ 'HallucinationMetric',
141
+ () => {
142
+ it('should handle perfect alignment', async () => {
143
+ const testCase = testCases[0]!;
144
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
145
+ const result = await metric.measure(testCase.input, testCase.output);
146
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
147
+ });
148
+
149
+ it('should handle complete hallucination', async () => {
150
+ const testCase = testCases[1]!;
151
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
152
+ const result = await metric.measure(testCase.input, testCase.output);
153
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
154
+ });
155
+
156
+ it('should handle partial hallucination', async () => {
157
+ const testCase = testCases[2]!;
158
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
159
+ const result = await metric.measure(testCase.input, testCase.output);
160
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
161
+ });
162
+
163
+ it('should handle empty output', async () => {
164
+ const testCase = testCases[3]!;
165
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
166
+ const result = await metric.measure(testCase.input, testCase.output);
167
+ expect(result.score).toBe(testCase.expectedResult.score);
168
+ });
169
+
170
+ it('should handle speculative language', async () => {
171
+ const testCase = testCases[4]!;
172
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
173
+ const result = await metric.measure(testCase.input, testCase.output);
174
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
175
+ });
176
+
177
+ it('should handle empty context', async () => {
178
+ const testCase = testCases[5]!;
179
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
180
+ const result = await metric.measure(testCase.input, testCase.output);
181
+ expect(result.score).toBe(testCase.expectedResult.score);
182
+ });
183
+
184
+ it('should handle implicit contradictions', async () => {
185
+ const testCase = testCases[6]!;
186
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
187
+ const result = await metric.measure(testCase.input, testCase.output);
188
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
189
+ });
190
+
191
+ it('should handle numerical approximations', async () => {
192
+ const testCase = testCases[7]!;
193
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
194
+ const result = await metric.measure(testCase.input, testCase.output);
195
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
196
+ });
197
+
198
+ it('should handle out of scope additions', async () => {
199
+ const testCase = testCases[8]!;
200
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
201
+ const result = await metric.measure(testCase.input, testCase.output);
202
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
203
+ });
204
+
205
+ it('should handle temporal contradictions', async () => {
206
+ const testCase = testCases[9]!;
207
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
208
+ const result = await metric.measure(testCase.input, testCase.output);
209
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
210
+ });
211
+
212
+ it('should handle numerical contradiction despite approximation', async () => {
213
+ const testCase = testCases[10]!;
214
+ const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
215
+ const result = await metric.measure(testCase.input, testCase.output);
216
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
217
+ });
218
+ },
219
+ {
220
+ timeout: 15 * SECONDS,
221
+ },
222
+ );
@@ -1,5 +1,5 @@
1
- import { describe, it, expect, jest } from '@jest/globals';
2
1
  import { type ModelConfig } from '@mastra/core';
2
+ import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCaseWithInstructions } from '../utils';
5
5
 
@@ -69,7 +69,6 @@ const testCases: TestCaseWithInstructions[] = [
69
69
  ];
70
70
 
71
71
  const SECONDS = 10000;
72
- jest.setTimeout(15 * SECONDS);
73
72
 
74
73
  const modelConfig: ModelConfig = {
75
74
  provider: 'OPEN_AI',
@@ -78,69 +77,75 @@ const modelConfig: ModelConfig = {
78
77
  apiKey: process.env.OPENAI_API_KEY,
79
78
  };
80
79
 
81
- describe('PromptAlignmentMetric', () => {
82
- it('should measure perfect alignment with single instruction', async () => {
83
- const testCase = testCases[0]!;
84
- const metric = new PromptAlignmentMetric(modelConfig, {
85
- instructions: testCase.instructions,
80
+ describe(
81
+ 'PromptAlignmentMetric',
82
+ () => {
83
+ it('should measure perfect alignment with single instruction', async () => {
84
+ const testCase = testCases[0]!;
85
+ const metric = new PromptAlignmentMetric(modelConfig, {
86
+ instructions: testCase.instructions,
87
+ });
88
+
89
+ const result = await metric.measure(testCase.input, testCase.output);
90
+ expect(result.score).toBe(testCase.expectedResult.score);
86
91
  });
87
92
 
88
- const result = await metric.measure(testCase.input, testCase.output);
89
- expect(result.score).toBe(testCase.expectedResult.score);
90
- });
93
+ it('should measure zero alignment with single instruction', async () => {
94
+ const testCase = testCases[1]!;
95
+ const metric = new PromptAlignmentMetric(modelConfig, {
96
+ instructions: testCase.instructions,
97
+ });
91
98
 
92
- it('should measure zero alignment with single instruction', async () => {
93
- const testCase = testCases[1]!;
94
- const metric = new PromptAlignmentMetric(modelConfig, {
95
- instructions: testCase.instructions,
99
+ const result = await metric.measure(testCase.input, testCase.output);
100
+
101
+ expect(result.score).toBe(testCase.expectedResult.score);
96
102
  });
97
103
 
98
- const result = await metric.measure(testCase.input, testCase.output);
104
+ it('should measure perfect alignment with multiple instructions', async () => {
105
+ const testCase = testCases[2]!;
106
+ const metric = new PromptAlignmentMetric(modelConfig, {
107
+ instructions: testCase.instructions,
108
+ });
99
109
 
100
- expect(result.score).toBe(testCase.expectedResult.score);
101
- });
110
+ const result = await metric.measure(testCase.input, testCase.output);
102
111
 
103
- it('should measure perfect alignment with multiple instructions', async () => {
104
- const testCase = testCases[2]!;
105
- const metric = new PromptAlignmentMetric(modelConfig, {
106
- instructions: testCase.instructions,
112
+ expect(result.score).toBe(testCase.expectedResult.score);
107
113
  });
108
114
 
109
- const result = await metric.measure(testCase.input, testCase.output);
115
+ it('should measure partial alignment with multiple instructions', async () => {
116
+ const testCase = testCases[3]!;
117
+ const metric = new PromptAlignmentMetric(modelConfig, {
118
+ instructions: testCase.instructions,
119
+ });
110
120
 
111
- expect(result.score).toBe(testCase.expectedResult.score);
112
- });
121
+ const result = await metric.measure(testCase.input, testCase.output);
113
122
 
114
- it('should measure partial alignment with multiple instructions', async () => {
115
- const testCase = testCases[3]!;
116
- const metric = new PromptAlignmentMetric(modelConfig, {
117
- instructions: testCase.instructions,
123
+ expect(result.score).toBe(testCase.expectedResult.score);
118
124
  });
119
125
 
120
- const result = await metric.measure(testCase.input, testCase.output);
126
+ it('should measure alignment with complex formatting instructions', async () => {
127
+ const testCase = testCases[4]!;
128
+ const metric = new PromptAlignmentMetric(modelConfig, {
129
+ instructions: testCase.instructions,
130
+ });
121
131
 
122
- expect(result.score).toBe(testCase.expectedResult.score);
123
- });
132
+ const result = await metric.measure(testCase.input, testCase.output);
124
133
 
125
- it('should measure alignment with complex formatting instructions', async () => {
126
- const testCase = testCases[4]!;
127
- const metric = new PromptAlignmentMetric(modelConfig, {
128
- instructions: testCase.instructions,
134
+ expect(result.score).toBe(testCase.expectedResult.score);
129
135
  });
130
136
 
131
- const result = await metric.measure(testCase.input, testCase.output);
137
+ it('should handle empty output', async () => {
138
+ const testCase = testCases[5]!;
139
+ const metric = new PromptAlignmentMetric(modelConfig, {
140
+ instructions: testCase.instructions,
141
+ });
132
142
 
133
- expect(result.score).toBe(testCase.expectedResult.score);
134
- });
143
+ const result = await metric.measure(testCase.input, testCase.output);
135
144
 
136
- it('should handle empty output', async () => {
137
- const testCase = testCases[5]!;
138
- const metric = new PromptAlignmentMetric(modelConfig, {
139
- instructions: testCase.instructions,
145
+ expect(result.score).toBe(testCase.expectedResult.score);
140
146
  });
141
-
142
- const result = await metric.measure(testCase.input, testCase.output);
143
-
144
- expect(result.score).toBe(testCase.expectedResult.score);
145
- });
146
- });
147
+ },
148
+ {
149
+ timeout: 15 * SECONDS,
150
+ },
151
+ );