@mastra/evals 0.1.0-alpha.17 → 0.1.0-alpha.21
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +29 -0
- package/README.md +186 -0
- package/dist/evals.cjs.development.js +1 -0
- package/dist/evals.cjs.development.js.map +1 -1
- package/dist/evals.cjs.production.min.js.map +1 -1
- package/dist/evals.esm.js +1 -0
- package/dist/evals.esm.js.map +1 -1
- package/dist/evaluation.d.ts +2 -2
- package/dist/evaluation.d.ts.map +1 -1
- package/package.json +4 -7
- package/src/evaluation.test.ts +1 -1
- package/src/evaluation.ts +2 -0
- package/src/metrics/llm/answer-relevancy/index.test.ts +49 -44
- package/src/metrics/llm/bias/index.test.ts +13 -12
- package/src/metrics/llm/context-position/index.test.ts +92 -87
- package/src/metrics/llm/context-precision/index.test.ts +69 -64
- package/src/metrics/llm/context-relevancy/index.test.ts +27 -22
- package/src/metrics/llm/contextual-recall/index.test.ts +28 -23
- package/src/metrics/llm/faithfulness/index.test.ts +81 -76
- package/src/metrics/llm/hallucination/index.test.ts +85 -80
- package/src/metrics/llm/prompt-alignment/index.test.ts +53 -48
- package/src/metrics/llm/summarization/index.test.ts +85 -80
- package/src/metrics/llm/toxicity/index.test.ts +22 -17
- package/src/metrics/nlp/completeness/index.test.ts +1 -1
- package/src/metrics/nlp/content-similarity/index.test.ts +1 -1
- package/src/metrics/nlp/keyword-coverage/index.test.ts +1 -1
- package/src/metrics/nlp/textual-difference/index.test.ts +1 -1
- package/src/metrics/nlp/tone/index.test.ts +1 -1
- package/vitest.config.ts +9 -0
- package/jest.config.ts +0 -21
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { describe, it, expect, jest } from '@jest/globals';
|
|
2
1
|
import { type ModelConfig } from '@mastra/core';
|
|
2
|
+
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCaseWithContext } from '../utils';
|
|
5
5
|
|
|
@@ -146,7 +146,6 @@ const testCases: TestCaseWithContext[] = [
|
|
|
146
146
|
];
|
|
147
147
|
|
|
148
148
|
const SECONDS = 10000;
|
|
149
|
-
jest.setTimeout(15 * SECONDS);
|
|
150
149
|
|
|
151
150
|
const modelConfig: ModelConfig = {
|
|
152
151
|
provider: 'OPEN_AI',
|
|
@@ -155,100 +154,106 @@ const modelConfig: ModelConfig = {
|
|
|
155
154
|
apiKey: process.env.OPENAI_API_KEY,
|
|
156
155
|
};
|
|
157
156
|
|
|
158
|
-
describe(
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
157
|
+
describe(
|
|
158
|
+
'FaithfulnessMetric',
|
|
159
|
+
() => {
|
|
160
|
+
it('should handle perfect faithfulness', async () => {
|
|
161
|
+
const testCase = testCases[0]!;
|
|
162
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
163
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
163
164
|
|
|
164
|
-
|
|
165
|
-
|
|
165
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
166
|
+
});
|
|
166
167
|
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
168
|
+
it('should handle mixed faithfulness with contradictions', async () => {
|
|
169
|
+
const testCase = testCases[1]!;
|
|
170
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
171
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
171
172
|
|
|
172
|
-
|
|
173
|
-
|
|
173
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
174
|
+
});
|
|
174
175
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
176
|
+
it('should handle claims with speculative language', async () => {
|
|
177
|
+
const testCase = testCases[2]!;
|
|
178
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
179
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
179
180
|
|
|
180
|
-
|
|
181
|
-
|
|
181
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
182
|
+
});
|
|
182
183
|
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
184
|
+
it('should handle empty output', async () => {
|
|
185
|
+
const testCase = testCases[3]!;
|
|
186
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
187
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
187
188
|
|
|
188
|
-
|
|
189
|
-
|
|
189
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
190
|
+
});
|
|
190
191
|
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
192
|
+
it('should handle empty context', async () => {
|
|
193
|
+
const testCase = testCases[4]!;
|
|
194
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
195
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
195
196
|
|
|
196
|
-
|
|
197
|
-
|
|
197
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
198
|
+
});
|
|
198
199
|
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
200
|
+
it('should handle subjective claims', async () => {
|
|
201
|
+
const testCase = testCases[5]!;
|
|
202
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
203
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
203
204
|
|
|
204
|
-
|
|
205
|
-
|
|
205
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
206
|
+
});
|
|
206
207
|
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
208
|
+
it('should handle claims with speculative language appropriately', async () => {
|
|
209
|
+
const testCase = testCases[6]!;
|
|
210
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
211
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
211
212
|
|
|
212
|
-
|
|
213
|
-
|
|
213
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
214
|
+
});
|
|
214
215
|
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
216
|
+
it('should handle compound statements correctly', async () => {
|
|
217
|
+
const testCase = testCases[7]!;
|
|
218
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
219
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
219
220
|
|
|
220
|
-
|
|
221
|
-
|
|
221
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
222
|
+
});
|
|
222
223
|
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
224
|
+
it('should handle precise numerical claims', async () => {
|
|
225
|
+
const testCase = testCases[8]!;
|
|
226
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
227
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
227
228
|
|
|
228
|
-
|
|
229
|
-
|
|
229
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
230
|
+
});
|
|
230
231
|
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
232
|
+
it('should handle partially supported claims', async () => {
|
|
233
|
+
const testCase = testCases[9]!;
|
|
234
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
235
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
235
236
|
|
|
236
|
-
|
|
237
|
-
|
|
237
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
238
|
+
});
|
|
238
239
|
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
240
|
+
it('should handle mixed factual and speculative claims', async () => {
|
|
241
|
+
const testCase = testCases[10]!;
|
|
242
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
243
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
243
244
|
|
|
244
|
-
|
|
245
|
-
|
|
245
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
246
|
+
});
|
|
246
247
|
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
248
|
+
it('should handle implicit information appropriately', async () => {
|
|
249
|
+
const testCase = testCases[11]!;
|
|
250
|
+
const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
|
|
251
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
251
252
|
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
}
|
|
253
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
254
|
+
});
|
|
255
|
+
},
|
|
256
|
+
{
|
|
257
|
+
timeout: 15 * SECONDS,
|
|
258
|
+
},
|
|
259
|
+
);
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { describe, it, expect, jest } from '@jest/globals';
|
|
2
1
|
import { type ModelConfig } from '@mastra/core';
|
|
2
|
+
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCaseWithContext } from '../utils';
|
|
5
5
|
|
|
@@ -128,7 +128,6 @@ const testCases: TestCaseWithContext[] = [
|
|
|
128
128
|
];
|
|
129
129
|
|
|
130
130
|
const SECONDS = 10000;
|
|
131
|
-
jest.setTimeout(15 * SECONDS);
|
|
132
131
|
|
|
133
132
|
const modelConfig: ModelConfig = {
|
|
134
133
|
provider: 'OPEN_AI',
|
|
@@ -137,81 +136,87 @@ const modelConfig: ModelConfig = {
|
|
|
137
136
|
apiKey: process.env.OPENAI_API_KEY,
|
|
138
137
|
};
|
|
139
138
|
|
|
140
|
-
describe(
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
139
|
+
describe(
|
|
140
|
+
'HallucinationMetric',
|
|
141
|
+
() => {
|
|
142
|
+
it('should handle perfect alignment', async () => {
|
|
143
|
+
const testCase = testCases[0]!;
|
|
144
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
145
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
146
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
147
|
+
});
|
|
148
|
+
|
|
149
|
+
it('should handle complete hallucination', async () => {
|
|
150
|
+
const testCase = testCases[1]!;
|
|
151
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
152
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
153
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
154
|
+
});
|
|
155
|
+
|
|
156
|
+
it('should handle partial hallucination', async () => {
|
|
157
|
+
const testCase = testCases[2]!;
|
|
158
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
159
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
160
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
161
|
+
});
|
|
162
|
+
|
|
163
|
+
it('should handle empty output', async () => {
|
|
164
|
+
const testCase = testCases[3]!;
|
|
165
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
166
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
167
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
168
|
+
});
|
|
169
|
+
|
|
170
|
+
it('should handle speculative language', async () => {
|
|
171
|
+
const testCase = testCases[4]!;
|
|
172
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
173
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
174
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
175
|
+
});
|
|
176
|
+
|
|
177
|
+
it('should handle empty context', async () => {
|
|
178
|
+
const testCase = testCases[5]!;
|
|
179
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
180
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
181
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
182
|
+
});
|
|
183
|
+
|
|
184
|
+
it('should handle implicit contradictions', async () => {
|
|
185
|
+
const testCase = testCases[6]!;
|
|
186
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
187
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
188
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
189
|
+
});
|
|
190
|
+
|
|
191
|
+
it('should handle numerical approximations', async () => {
|
|
192
|
+
const testCase = testCases[7]!;
|
|
193
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
194
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
195
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
196
|
+
});
|
|
197
|
+
|
|
198
|
+
it('should handle out of scope additions', async () => {
|
|
199
|
+
const testCase = testCases[8]!;
|
|
200
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
201
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
202
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
203
|
+
});
|
|
204
|
+
|
|
205
|
+
it('should handle temporal contradictions', async () => {
|
|
206
|
+
const testCase = testCases[9]!;
|
|
207
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
208
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
209
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
210
|
+
});
|
|
211
|
+
|
|
212
|
+
it('should handle numerical contradiction despite approximation', async () => {
|
|
213
|
+
const testCase = testCases[10]!;
|
|
214
|
+
const metric = new HallucinationMetric(modelConfig, { context: testCase.context });
|
|
215
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
216
|
+
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
217
|
+
});
|
|
218
|
+
},
|
|
219
|
+
{
|
|
220
|
+
timeout: 15 * SECONDS,
|
|
221
|
+
},
|
|
222
|
+
);
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { describe, it, expect, jest } from '@jest/globals';
|
|
2
1
|
import { type ModelConfig } from '@mastra/core';
|
|
2
|
+
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCaseWithInstructions } from '../utils';
|
|
5
5
|
|
|
@@ -69,7 +69,6 @@ const testCases: TestCaseWithInstructions[] = [
|
|
|
69
69
|
];
|
|
70
70
|
|
|
71
71
|
const SECONDS = 10000;
|
|
72
|
-
jest.setTimeout(15 * SECONDS);
|
|
73
72
|
|
|
74
73
|
const modelConfig: ModelConfig = {
|
|
75
74
|
provider: 'OPEN_AI',
|
|
@@ -78,69 +77,75 @@ const modelConfig: ModelConfig = {
|
|
|
78
77
|
apiKey: process.env.OPENAI_API_KEY,
|
|
79
78
|
};
|
|
80
79
|
|
|
81
|
-
describe(
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
80
|
+
describe(
|
|
81
|
+
'PromptAlignmentMetric',
|
|
82
|
+
() => {
|
|
83
|
+
it('should measure perfect alignment with single instruction', async () => {
|
|
84
|
+
const testCase = testCases[0]!;
|
|
85
|
+
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
86
|
+
instructions: testCase.instructions,
|
|
87
|
+
});
|
|
88
|
+
|
|
89
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
90
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
86
91
|
});
|
|
87
92
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
93
|
+
it('should measure zero alignment with single instruction', async () => {
|
|
94
|
+
const testCase = testCases[1]!;
|
|
95
|
+
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
96
|
+
instructions: testCase.instructions,
|
|
97
|
+
});
|
|
91
98
|
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
instructions: testCase.instructions,
|
|
99
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
100
|
+
|
|
101
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
96
102
|
});
|
|
97
103
|
|
|
98
|
-
|
|
104
|
+
it('should measure perfect alignment with multiple instructions', async () => {
|
|
105
|
+
const testCase = testCases[2]!;
|
|
106
|
+
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
107
|
+
instructions: testCase.instructions,
|
|
108
|
+
});
|
|
99
109
|
|
|
100
|
-
|
|
101
|
-
});
|
|
110
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
102
111
|
|
|
103
|
-
|
|
104
|
-
const testCase = testCases[2]!;
|
|
105
|
-
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
106
|
-
instructions: testCase.instructions,
|
|
112
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
107
113
|
});
|
|
108
114
|
|
|
109
|
-
|
|
115
|
+
it('should measure partial alignment with multiple instructions', async () => {
|
|
116
|
+
const testCase = testCases[3]!;
|
|
117
|
+
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
118
|
+
instructions: testCase.instructions,
|
|
119
|
+
});
|
|
110
120
|
|
|
111
|
-
|
|
112
|
-
});
|
|
121
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
113
122
|
|
|
114
|
-
|
|
115
|
-
const testCase = testCases[3]!;
|
|
116
|
-
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
117
|
-
instructions: testCase.instructions,
|
|
123
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
118
124
|
});
|
|
119
125
|
|
|
120
|
-
|
|
126
|
+
it('should measure alignment with complex formatting instructions', async () => {
|
|
127
|
+
const testCase = testCases[4]!;
|
|
128
|
+
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
129
|
+
instructions: testCase.instructions,
|
|
130
|
+
});
|
|
121
131
|
|
|
122
|
-
|
|
123
|
-
});
|
|
132
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
124
133
|
|
|
125
|
-
|
|
126
|
-
const testCase = testCases[4]!;
|
|
127
|
-
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
128
|
-
instructions: testCase.instructions,
|
|
134
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
129
135
|
});
|
|
130
136
|
|
|
131
|
-
|
|
137
|
+
it('should handle empty output', async () => {
|
|
138
|
+
const testCase = testCases[5]!;
|
|
139
|
+
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
140
|
+
instructions: testCase.instructions,
|
|
141
|
+
});
|
|
132
142
|
|
|
133
|
-
|
|
134
|
-
});
|
|
143
|
+
const result = await metric.measure(testCase.input, testCase.output);
|
|
135
144
|
|
|
136
|
-
|
|
137
|
-
const testCase = testCases[5]!;
|
|
138
|
-
const metric = new PromptAlignmentMetric(modelConfig, {
|
|
139
|
-
instructions: testCase.instructions,
|
|
145
|
+
expect(result.score).toBe(testCase.expectedResult.score);
|
|
140
146
|
});
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
});
|
|
147
|
+
},
|
|
148
|
+
{
|
|
149
|
+
timeout: 15 * SECONDS,
|
|
150
|
+
},
|
|
151
|
+
);
|