@mastra/evals 0.1.0-alpha.30 → 0.1.0-alpha.33
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/dist/{dist-56AYDN4X.js → dist-XPBCCWOM.js} +8 -8
- package/dist/index.js +1 -1
- package/dist/metrics/llm/index.d.ts +12 -11
- package/dist/metrics/llm/index.js +51 -49
- package/package.json +4 -3
- package/src/evaluation.test.ts +4 -6
- package/src/metrics/judge/index.ts +5 -4
- package/src/metrics/llm/answer-relevancy/index.test.ts +4 -7
- package/src/metrics/llm/answer-relevancy/index.ts +4 -3
- package/src/metrics/llm/answer-relevancy/metricJudge.ts +3 -3
- package/src/metrics/llm/bias/index.test.ts +4 -6
- package/src/metrics/llm/bias/index.ts +4 -3
- package/src/metrics/llm/bias/metricJudge.ts +3 -3
- package/src/metrics/llm/context-position/index.test.ts +15 -17
- package/src/metrics/llm/context-position/index.ts +6 -4
- package/src/metrics/llm/context-position/metricJudge.ts +3 -3
- package/src/metrics/llm/context-precision/index.test.ts +13 -15
- package/src/metrics/llm/context-precision/index.ts +6 -4
- package/src/metrics/llm/context-precision/metricJudge.ts +3 -3
- package/src/metrics/llm/context-relevancy/index.test.ts +7 -9
- package/src/metrics/llm/context-relevancy/index.ts +6 -4
- package/src/metrics/llm/context-relevancy/metricJudge.ts +3 -3
- package/src/metrics/llm/contextual-recall/index.test.ts +6 -8
- package/src/metrics/llm/contextual-recall/index.ts +6 -4
- package/src/metrics/llm/contextual-recall/metricJudge.ts +3 -3
- package/src/metrics/llm/faithfulness/index.test.ts +15 -17
- package/src/metrics/llm/faithfulness/index.ts +6 -4
- package/src/metrics/llm/faithfulness/metricJudge.ts +3 -3
- package/src/metrics/llm/hallucination/index.test.ts +15 -19
- package/src/metrics/llm/hallucination/index.ts +7 -5
- package/src/metrics/llm/hallucination/metricJudge.ts +3 -3
- package/src/metrics/llm/prompt-alignment/index.test.ts +9 -11
- package/src/metrics/llm/prompt-alignment/index.ts +4 -3
- package/src/metrics/llm/prompt-alignment/metricJudge.ts +3 -3
- package/src/metrics/llm/summarization/index.test.ts +4 -6
- package/src/metrics/llm/summarization/index.ts +4 -3
- package/src/metrics/llm/summarization/metricJudge.ts +3 -3
- package/src/metrics/llm/toxicity/index.test.ts +4 -6
- package/src/metrics/llm/toxicity/index.ts +4 -3
- package/src/metrics/llm/toxicity/metricJudge.ts +3 -3
- package/src/metrics/llm/types.ts +1 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { type
|
|
1
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
@@ -11,8 +11,8 @@ import {
|
|
|
11
11
|
} from './prompts';
|
|
12
12
|
|
|
13
13
|
export class BiasJudge extends MastraAgentJudge {
|
|
14
|
-
constructor(
|
|
15
|
-
super('Bias', BIAS_AGENT_INSTRUCTIONS,
|
|
14
|
+
constructor(llm: MastraLLMBase) {
|
|
15
|
+
super('Bias', BIAS_AGENT_INSTRUCTIONS, llm);
|
|
16
16
|
}
|
|
17
17
|
|
|
18
18
|
async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { OpenAI } from '@mastra/core/llm/openai';
|
|
2
2
|
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCaseWithContext } from '../utils';
|
|
@@ -151,96 +151,94 @@ const testCases: TestCaseWithContext[] = [
|
|
|
151
151
|
|
|
152
152
|
const SECONDS = 10000;
|
|
153
153
|
|
|
154
|
-
const
|
|
155
|
-
provider: 'OPEN_AI',
|
|
154
|
+
const llm = new OpenAI({
|
|
156
155
|
name: 'gpt-4o',
|
|
157
|
-
toolChoice: 'auto',
|
|
158
156
|
apiKey: process.env.OPENAI_API_KEY,
|
|
159
|
-
};
|
|
157
|
+
});
|
|
160
158
|
|
|
161
159
|
describe(
|
|
162
160
|
'ContextPositionMetric',
|
|
163
161
|
() => {
|
|
164
162
|
it('should handle perfect ordering with all relevant pieces', async () => {
|
|
165
163
|
const testCase = testCases[0]!;
|
|
166
|
-
const metric = new ContextPositionMetric(
|
|
164
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
167
165
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
168
166
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
169
167
|
});
|
|
170
168
|
|
|
171
169
|
it('should handle mixed relevance case', async () => {
|
|
172
170
|
const testCase = testCases[1]!;
|
|
173
|
-
const metric = new ContextPositionMetric(
|
|
171
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
174
172
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
175
173
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
176
174
|
});
|
|
177
175
|
|
|
178
176
|
it('should handle domain knowledge relevance', async () => {
|
|
179
177
|
const testCase = testCases[2]!;
|
|
180
|
-
const metric = new ContextPositionMetric(
|
|
178
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
181
179
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
182
180
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
183
181
|
});
|
|
184
182
|
|
|
185
183
|
it('should handle mixed relevance with good ordering', async () => {
|
|
186
184
|
const testCase = testCases[3]!;
|
|
187
|
-
const metric = new ContextPositionMetric(
|
|
185
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
188
186
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
189
187
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
190
188
|
});
|
|
191
189
|
|
|
192
190
|
it('should handle single relevant piece at start', async () => {
|
|
193
191
|
const testCase = testCases[4]!;
|
|
194
|
-
const metric = new ContextPositionMetric(
|
|
192
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
195
193
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
196
194
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
197
195
|
});
|
|
198
196
|
|
|
199
197
|
it('should handle single relevant piece in middle', async () => {
|
|
200
198
|
const testCase = testCases[5]!;
|
|
201
|
-
const metric = new ContextPositionMetric(
|
|
199
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
202
200
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
203
201
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
204
202
|
});
|
|
205
203
|
|
|
206
204
|
it('should handle single relevant piece at end', async () => {
|
|
207
205
|
const testCase = testCases[6]!;
|
|
208
|
-
const metric = new ContextPositionMetric(
|
|
206
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
209
207
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
210
208
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
211
209
|
});
|
|
212
210
|
|
|
213
211
|
it('should handle empty context', async () => {
|
|
214
212
|
const testCase = testCases[7]!;
|
|
215
|
-
const metric = new ContextPositionMetric(
|
|
213
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
216
214
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
217
215
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
218
216
|
});
|
|
219
217
|
|
|
220
218
|
it('should handle all irrelevant context', async () => {
|
|
221
219
|
const testCase = testCases[8]!;
|
|
222
|
-
const metric = new ContextPositionMetric(
|
|
220
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
223
221
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
224
222
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
225
223
|
});
|
|
226
224
|
|
|
227
225
|
it('should handle complex interdependent context', async () => {
|
|
228
226
|
const testCase = testCases[9]!;
|
|
229
|
-
const metric = new ContextPositionMetric(
|
|
227
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
230
228
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
231
229
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
232
230
|
});
|
|
233
231
|
|
|
234
232
|
it('should handle single piece context', async () => {
|
|
235
233
|
const testCase = testCases[10]!;
|
|
236
|
-
const metric = new ContextPositionMetric(
|
|
234
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
237
235
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
238
236
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
239
237
|
});
|
|
240
238
|
|
|
241
239
|
it('should handle two relevant pieces at end', async () => {
|
|
242
240
|
const testCase = testCases[11]!;
|
|
243
|
-
const metric = new ContextPositionMetric(
|
|
241
|
+
const metric = new ContextPositionMetric(llm, { context: testCase.context });
|
|
244
242
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
245
243
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
246
244
|
});
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
3
|
|
|
3
4
|
import { type MetricResultWithReason } from '../types';
|
|
4
5
|
import { roundToTwoDecimals } from '../utils';
|
|
@@ -15,11 +16,12 @@ export class ContextPositionMetric extends Metric {
|
|
|
15
16
|
private scale: number;
|
|
16
17
|
private context: string[];
|
|
17
18
|
|
|
18
|
-
constructor(
|
|
19
|
+
constructor(llm: MastraLLMBase, { scale = 1, context }: ContextPositionMetricOptions) {
|
|
19
20
|
super();
|
|
20
|
-
|
|
21
|
-
this.scale = scale;
|
|
21
|
+
|
|
22
22
|
this.context = context;
|
|
23
|
+
this.judge = new ContextPositionJudge(llm);
|
|
24
|
+
this.scale = scale;
|
|
23
25
|
}
|
|
24
26
|
|
|
25
27
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { type
|
|
1
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
@@ -6,8 +6,8 @@ import { MastraAgentJudge } from '../../judge';
|
|
|
6
6
|
import { CONTEXT_POSITION_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
|
|
7
7
|
|
|
8
8
|
export class ContextPositionJudge extends MastraAgentJudge {
|
|
9
|
-
constructor(
|
|
10
|
-
super('Context Position', CONTEXT_POSITION_AGENT_INSTRUCTIONS,
|
|
9
|
+
constructor(llm: MastraLLMBase) {
|
|
10
|
+
super('Context Position', CONTEXT_POSITION_AGENT_INSTRUCTIONS, llm);
|
|
11
11
|
}
|
|
12
12
|
|
|
13
13
|
async evaluate(
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { OpenAI } from '@mastra/core/llm/openai';
|
|
2
2
|
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCaseWithContext } from '../utils';
|
|
@@ -128,82 +128,80 @@ const testCases: TestCaseWithContext[] = [
|
|
|
128
128
|
|
|
129
129
|
const SECONDS = 10000;
|
|
130
130
|
|
|
131
|
-
const
|
|
132
|
-
provider: 'OPEN_AI',
|
|
131
|
+
const llm = new OpenAI({
|
|
133
132
|
name: 'gpt-4o',
|
|
134
|
-
toolChoice: 'auto',
|
|
135
133
|
apiKey: process.env.OPENAI_API_KEY,
|
|
136
|
-
};
|
|
134
|
+
});
|
|
137
135
|
|
|
138
136
|
describe(
|
|
139
137
|
'ContextPrecisionMetric',
|
|
140
138
|
() => {
|
|
141
139
|
it('should measure perfect context precision with all relevant items', async () => {
|
|
142
140
|
const testCase = testCases[0]!;
|
|
143
|
-
const metric = new ContextPrecisionMetric(
|
|
141
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
144
142
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
145
143
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
146
144
|
});
|
|
147
145
|
|
|
148
146
|
it('should measure high precision with irrelevant item at end', async () => {
|
|
149
147
|
const testCase = testCases[1]!;
|
|
150
|
-
const metric = new ContextPrecisionMetric(
|
|
148
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
151
149
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
152
150
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
153
151
|
});
|
|
154
152
|
|
|
155
153
|
it('should measure precision with two relevant items after irrelevant start', async () => {
|
|
156
154
|
const testCase = testCases[2]!;
|
|
157
|
-
const metric = new ContextPrecisionMetric(
|
|
155
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
158
156
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
159
157
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
160
158
|
});
|
|
161
159
|
|
|
162
160
|
it('should measure precision with alternating relevant items', async () => {
|
|
163
161
|
const testCase = testCases[3]!;
|
|
164
|
-
const metric = new ContextPrecisionMetric(
|
|
162
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
165
163
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
166
164
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
167
165
|
});
|
|
168
166
|
|
|
169
167
|
it('should measure precision with single relevant item at start', async () => {
|
|
170
168
|
const testCase = testCases[4]!;
|
|
171
|
-
const metric = new ContextPrecisionMetric(
|
|
169
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
172
170
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
173
171
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
174
172
|
});
|
|
175
173
|
|
|
176
174
|
it('should handle completely irrelevant context', async () => {
|
|
177
175
|
const testCase = testCases[5]!;
|
|
178
|
-
const metric = new ContextPrecisionMetric(
|
|
176
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
179
177
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
180
178
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
181
179
|
});
|
|
182
180
|
|
|
183
181
|
it('should handle single relevant context perfectly', async () => {
|
|
184
182
|
const testCase = testCases[6]!;
|
|
185
|
-
const metric = new ContextPrecisionMetric(
|
|
183
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
186
184
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
187
185
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
188
186
|
});
|
|
189
187
|
|
|
190
188
|
it('should measure precision with single relevant item at end', async () => {
|
|
191
189
|
const testCase = testCases[7]!;
|
|
192
|
-
const metric = new ContextPrecisionMetric(
|
|
190
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
193
191
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
194
192
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
195
193
|
});
|
|
196
194
|
|
|
197
195
|
it('should handle empty context', async () => {
|
|
198
196
|
const testCase = testCases[8]!;
|
|
199
|
-
const metric = new ContextPrecisionMetric(
|
|
197
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
200
198
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
201
199
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
202
200
|
});
|
|
203
201
|
|
|
204
202
|
it('should handle single irrelevant context', async () => {
|
|
205
203
|
const testCase = testCases[9]!;
|
|
206
|
-
const metric = new ContextPrecisionMetric(
|
|
204
|
+
const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
|
|
207
205
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
208
206
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
209
207
|
});
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
3
|
|
|
3
4
|
import { type MetricResultWithReason } from '../types';
|
|
4
5
|
import { roundToTwoDecimals } from '../utils';
|
|
@@ -15,11 +16,12 @@ export class ContextPrecisionMetric extends Metric {
|
|
|
15
16
|
private scale: number;
|
|
16
17
|
private context: string[];
|
|
17
18
|
|
|
18
|
-
constructor(
|
|
19
|
+
constructor(llm: MastraLLMBase, { scale = 1, context }: ContextPrecisionMetricOptions) {
|
|
19
20
|
super();
|
|
20
|
-
|
|
21
|
-
this.scale = scale;
|
|
21
|
+
|
|
22
22
|
this.context = context;
|
|
23
|
+
this.judge = new ContextPrecisionJudge(llm);
|
|
24
|
+
this.scale = scale;
|
|
23
25
|
}
|
|
24
26
|
|
|
25
27
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { type
|
|
1
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
@@ -7,8 +7,8 @@ import './prompts';
|
|
|
7
7
|
import { CONTEXT_PRECISION_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
|
|
8
8
|
|
|
9
9
|
export class ContextPrecisionJudge extends MastraAgentJudge {
|
|
10
|
-
constructor(
|
|
11
|
-
super('Context Precision', CONTEXT_PRECISION_AGENT_INSTRUCTIONS,
|
|
10
|
+
constructor(llm: MastraLLMBase) {
|
|
11
|
+
super('Context Precision', CONTEXT_PRECISION_AGENT_INSTRUCTIONS, llm);
|
|
12
12
|
}
|
|
13
13
|
|
|
14
14
|
async evaluate(
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { OpenAI } from '@mastra/core/llm/openai';
|
|
2
2
|
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { isCloserTo } from '../utils';
|
|
@@ -56,33 +56,31 @@ const testCases: TestCaseWithContext[] = [
|
|
|
56
56
|
|
|
57
57
|
const SECONDS = 10000;
|
|
58
58
|
|
|
59
|
-
const
|
|
60
|
-
provider: 'OPEN_AI',
|
|
59
|
+
const llm = new OpenAI({
|
|
61
60
|
name: 'gpt-4o',
|
|
62
|
-
toolChoice: 'auto',
|
|
63
61
|
apiKey: process.env.OPENAI_API_KEY,
|
|
64
|
-
};
|
|
62
|
+
});
|
|
65
63
|
|
|
66
64
|
describe(
|
|
67
|
-
'
|
|
65
|
+
'ContextRelevancyMetric',
|
|
68
66
|
() => {
|
|
69
67
|
it('should measure perfect context relevancy with all relevant items', async () => {
|
|
70
68
|
const testCase = testCases[0]!;
|
|
71
|
-
const metric = new ContextRelevancyMetric(
|
|
69
|
+
const metric = new ContextRelevancyMetric(llm, { context: testCase.context });
|
|
72
70
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
73
71
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
74
72
|
});
|
|
75
73
|
|
|
76
74
|
it('should measure mixed relevancy where only some contexts are relevant', async () => {
|
|
77
75
|
const testCase = testCases[1]!;
|
|
78
|
-
const metric = new ContextRelevancyMetric(
|
|
76
|
+
const metric = new ContextRelevancyMetric(llm, { context: testCase.context });
|
|
79
77
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
80
78
|
expect(isCloserTo(result.score, testCase.expectedResult.score, 0)).toBe(true);
|
|
81
79
|
});
|
|
82
80
|
|
|
83
81
|
it('should measure no relevancy where contexts are completely unrelated', async () => {
|
|
84
82
|
const testCase = testCases[2]!;
|
|
85
|
-
const metric = new ContextRelevancyMetric(
|
|
83
|
+
const metric = new ContextRelevancyMetric(llm, { context: testCase.context });
|
|
86
84
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
87
85
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
88
86
|
});
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
3
|
|
|
3
4
|
import { type MetricResultWithReason } from '../types';
|
|
4
5
|
import { roundToTwoDecimals } from '../utils';
|
|
@@ -15,11 +16,12 @@ export class ContextRelevancyMetric extends Metric {
|
|
|
15
16
|
private scale: number;
|
|
16
17
|
private context: string[];
|
|
17
18
|
|
|
18
|
-
constructor(
|
|
19
|
+
constructor(llm: MastraLLMBase, { scale = 1, context }: ContextRelevancyOptions) {
|
|
19
20
|
super();
|
|
20
|
-
|
|
21
|
-
this.scale = scale;
|
|
21
|
+
|
|
22
22
|
this.context = context;
|
|
23
|
+
this.judge = new ContextRelevancyJudge(llm);
|
|
24
|
+
this.scale = scale;
|
|
23
25
|
}
|
|
24
26
|
|
|
25
27
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { type
|
|
1
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
@@ -6,8 +6,8 @@ import { MastraAgentJudge } from '../../judge';
|
|
|
6
6
|
import { CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
|
|
7
7
|
|
|
8
8
|
export class ContextRelevancyJudge extends MastraAgentJudge {
|
|
9
|
-
constructor(
|
|
10
|
-
super('Context Relevancy', CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS,
|
|
9
|
+
constructor(llm: MastraLLMBase) {
|
|
10
|
+
super('Context Relevancy', CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, llm);
|
|
11
11
|
}
|
|
12
12
|
|
|
13
13
|
async evaluate(
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { OpenAI } from '@mastra/core/llm/openai';
|
|
2
2
|
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { isCloserTo } from '../utils';
|
|
@@ -52,26 +52,24 @@ const testCases: TestCaseWithContext[] = [
|
|
|
52
52
|
|
|
53
53
|
const SECONDS = 10000;
|
|
54
54
|
|
|
55
|
-
const
|
|
56
|
-
provider: 'OPEN_AI',
|
|
55
|
+
const llm = new OpenAI({
|
|
57
56
|
name: 'gpt-4o',
|
|
58
|
-
toolChoice: 'auto',
|
|
59
57
|
apiKey: process.env.OPENAI_API_KEY,
|
|
60
|
-
};
|
|
58
|
+
});
|
|
61
59
|
|
|
62
60
|
describe(
|
|
63
61
|
'ContextualRecallMetric',
|
|
64
62
|
() => {
|
|
65
63
|
it('should succeed when context is relevant', async () => {
|
|
66
64
|
const testCase = testCases[0]!;
|
|
67
|
-
const metric = new ContextualRecallMetric(
|
|
65
|
+
const metric = new ContextualRecallMetric(llm, { context: testCase.context });
|
|
68
66
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
69
67
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
|
|
70
68
|
});
|
|
71
69
|
|
|
72
70
|
it('should be mixed', async () => {
|
|
73
71
|
const testCase = testCases[1]!;
|
|
74
|
-
const metric = new ContextualRecallMetric(
|
|
72
|
+
const metric = new ContextualRecallMetric(llm, { context: testCase.context });
|
|
75
73
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
76
74
|
|
|
77
75
|
expect(isCloserTo(result.score, testCase.expectedResult.score, 1)).toBe(true);
|
|
@@ -80,7 +78,7 @@ describe(
|
|
|
80
78
|
|
|
81
79
|
it('should be none', async () => {
|
|
82
80
|
const testCase = testCases[2]!;
|
|
83
|
-
const metric = new ContextualRecallMetric(
|
|
81
|
+
const metric = new ContextualRecallMetric(llm, { context: testCase.context });
|
|
84
82
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
85
83
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
86
84
|
});
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
3
|
|
|
3
4
|
import { type MetricResultWithReason } from '../types';
|
|
4
5
|
import { roundToTwoDecimals } from '../utils';
|
|
@@ -15,11 +16,12 @@ export class ContextualRecallMetric extends Metric {
|
|
|
15
16
|
private scale: number;
|
|
16
17
|
private context: string[];
|
|
17
18
|
|
|
18
|
-
constructor(
|
|
19
|
+
constructor(llm: MastraLLMBase, { scale = 1, context }: ContextualRecallMetricOptions) {
|
|
19
20
|
super();
|
|
20
|
-
|
|
21
|
-
this.scale = scale;
|
|
21
|
+
|
|
22
22
|
this.context = context;
|
|
23
|
+
this.judge = new ContextualRecallJudge(llm);
|
|
24
|
+
this.scale = scale;
|
|
23
25
|
}
|
|
24
26
|
|
|
25
27
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { type
|
|
1
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
2
|
import { z } from 'zod';
|
|
3
3
|
|
|
4
4
|
import { MastraAgentJudge } from '../../judge';
|
|
@@ -6,8 +6,8 @@ import { MastraAgentJudge } from '../../judge';
|
|
|
6
6
|
import { CONTEXT_RECALL_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
|
|
7
7
|
|
|
8
8
|
export class ContextualRecallJudge extends MastraAgentJudge {
|
|
9
|
-
constructor(
|
|
10
|
-
super('Contextual Recall', CONTEXT_RECALL_AGENT_INSTRUCTIONS,
|
|
9
|
+
constructor(llm: MastraLLMBase) {
|
|
10
|
+
super('Contextual Recall', CONTEXT_RECALL_AGENT_INSTRUCTIONS, llm);
|
|
11
11
|
}
|
|
12
12
|
|
|
13
13
|
async evaluate(
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { OpenAI } from '@mastra/core/llm/openai';
|
|
2
2
|
import { describe, it, expect } from 'vitest';
|
|
3
3
|
|
|
4
4
|
import { TestCaseWithContext } from '../utils';
|
|
@@ -147,19 +147,17 @@ const testCases: TestCaseWithContext[] = [
|
|
|
147
147
|
|
|
148
148
|
const SECONDS = 10000;
|
|
149
149
|
|
|
150
|
-
const
|
|
151
|
-
provider: 'OPEN_AI',
|
|
150
|
+
const llm = new OpenAI({
|
|
152
151
|
name: 'gpt-4o',
|
|
153
|
-
toolChoice: 'auto',
|
|
154
152
|
apiKey: process.env.OPENAI_API_KEY,
|
|
155
|
-
};
|
|
153
|
+
});
|
|
156
154
|
|
|
157
155
|
describe(
|
|
158
156
|
'FaithfulnessMetric',
|
|
159
157
|
() => {
|
|
160
158
|
it('should handle perfect faithfulness', async () => {
|
|
161
159
|
const testCase = testCases[0]!;
|
|
162
|
-
const metric = new FaithfulnessMetric(
|
|
160
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
163
161
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
164
162
|
|
|
165
163
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
@@ -167,7 +165,7 @@ describe(
|
|
|
167
165
|
|
|
168
166
|
it('should handle mixed faithfulness with contradictions', async () => {
|
|
169
167
|
const testCase = testCases[1]!;
|
|
170
|
-
const metric = new FaithfulnessMetric(
|
|
168
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
171
169
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
172
170
|
|
|
173
171
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
@@ -175,7 +173,7 @@ describe(
|
|
|
175
173
|
|
|
176
174
|
it('should handle claims with speculative language', async () => {
|
|
177
175
|
const testCase = testCases[2]!;
|
|
178
|
-
const metric = new FaithfulnessMetric(
|
|
176
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
179
177
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
180
178
|
|
|
181
179
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
@@ -183,7 +181,7 @@ describe(
|
|
|
183
181
|
|
|
184
182
|
it('should handle empty output', async () => {
|
|
185
183
|
const testCase = testCases[3]!;
|
|
186
|
-
const metric = new FaithfulnessMetric(
|
|
184
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
187
185
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
188
186
|
|
|
189
187
|
expect(result.score).toBe(testCase.expectedResult.score);
|
|
@@ -191,7 +189,7 @@ describe(
|
|
|
191
189
|
|
|
192
190
|
it('should handle empty context', async () => {
|
|
193
191
|
const testCase = testCases[4]!;
|
|
194
|
-
const metric = new FaithfulnessMetric(
|
|
192
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
195
193
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
196
194
|
|
|
197
195
|
expect(result.score).toBe(testCase.expectedResult.score);
|
|
@@ -199,7 +197,7 @@ describe(
|
|
|
199
197
|
|
|
200
198
|
it('should handle subjective claims', async () => {
|
|
201
199
|
const testCase = testCases[5]!;
|
|
202
|
-
const metric = new FaithfulnessMetric(
|
|
200
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
203
201
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
204
202
|
|
|
205
203
|
expect(result.score).toBe(testCase.expectedResult.score);
|
|
@@ -207,7 +205,7 @@ describe(
|
|
|
207
205
|
|
|
208
206
|
it('should handle claims with speculative language appropriately', async () => {
|
|
209
207
|
const testCase = testCases[6]!;
|
|
210
|
-
const metric = new FaithfulnessMetric(
|
|
208
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
211
209
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
212
210
|
|
|
213
211
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
@@ -215,7 +213,7 @@ describe(
|
|
|
215
213
|
|
|
216
214
|
it('should handle compound statements correctly', async () => {
|
|
217
215
|
const testCase = testCases[7]!;
|
|
218
|
-
const metric = new FaithfulnessMetric(
|
|
216
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
219
217
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
220
218
|
|
|
221
219
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
@@ -223,7 +221,7 @@ describe(
|
|
|
223
221
|
|
|
224
222
|
it('should handle precise numerical claims', async () => {
|
|
225
223
|
const testCase = testCases[8]!;
|
|
226
|
-
const metric = new FaithfulnessMetric(
|
|
224
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
227
225
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
228
226
|
|
|
229
227
|
expect(result.score).toBe(testCase.expectedResult.score);
|
|
@@ -231,7 +229,7 @@ describe(
|
|
|
231
229
|
|
|
232
230
|
it('should handle partially supported claims', async () => {
|
|
233
231
|
const testCase = testCases[9]!;
|
|
234
|
-
const metric = new FaithfulnessMetric(
|
|
232
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
235
233
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
236
234
|
|
|
237
235
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
@@ -239,7 +237,7 @@ describe(
|
|
|
239
237
|
|
|
240
238
|
it('should handle mixed factual and speculative claims', async () => {
|
|
241
239
|
const testCase = testCases[10]!;
|
|
242
|
-
const metric = new FaithfulnessMetric(
|
|
240
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
243
241
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
244
242
|
|
|
245
243
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
@@ -247,7 +245,7 @@ describe(
|
|
|
247
245
|
|
|
248
246
|
it('should handle implicit information appropriately', async () => {
|
|
249
247
|
const testCase = testCases[11]!;
|
|
250
|
-
const metric = new FaithfulnessMetric(
|
|
248
|
+
const metric = new FaithfulnessMetric(llm, { context: testCase.context });
|
|
251
249
|
const result = await metric.measure(testCase.input, testCase.output);
|
|
252
250
|
|
|
253
251
|
expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
|
|
@@ -1,4 +1,5 @@
|
|
|
1
|
-
import { Metric
|
|
1
|
+
import { Metric } from '@mastra/core/eval';
|
|
2
|
+
import { type MastraLLMBase } from '@mastra/core/llm';
|
|
2
3
|
|
|
3
4
|
import { type MetricResultWithReason } from '../types';
|
|
4
5
|
import { roundToTwoDecimals } from '../utils';
|
|
@@ -15,11 +16,12 @@ export class FaithfulnessMetric extends Metric {
|
|
|
15
16
|
private scale: number;
|
|
16
17
|
private context: string[];
|
|
17
18
|
|
|
18
|
-
constructor(
|
|
19
|
+
constructor(llm: MastraLLMBase, { scale = 1, context }: FaithfulnessMetricOptions) {
|
|
19
20
|
super();
|
|
20
|
-
|
|
21
|
+
|
|
21
22
|
this.context = context;
|
|
22
|
-
this.judge = new FaithfulnessJudge(
|
|
23
|
+
this.judge = new FaithfulnessJudge(llm);
|
|
24
|
+
this.scale = scale;
|
|
23
25
|
}
|
|
24
26
|
|
|
25
27
|
async measure(input: string, output: string): Promise<MetricResultWithReason> {
|