@mastra/evals 0.1.0-alpha.33 → 0.1.0-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CHANGELOG.md +0 -224
  2. package/jest.config.ts +21 -0
  3. package/package.json +26 -10
  4. package/src/evaluation.test.ts +16 -17
  5. package/src/evaluation.ts +11 -46
  6. package/src/index.ts +0 -1
  7. package/src/metrics/judge/index.ts +4 -5
  8. package/src/metrics/llm/answer-relevancy/index.test.ts +72 -42
  9. package/src/metrics/llm/answer-relevancy/index.ts +6 -9
  10. package/src/metrics/llm/answer-relevancy/metricJudge.ts +4 -5
  11. package/src/metrics/llm/answer-relevancy/prompts.ts +28 -26
  12. package/src/metrics/llm/bias/index.test.ts +33 -17
  13. package/src/metrics/llm/bias/index.ts +4 -13
  14. package/src/metrics/llm/bias/metricJudge.ts +4 -20
  15. package/src/metrics/llm/bias/prompts.ts +0 -27
  16. package/src/metrics/llm/context-position/index.test.ts +107 -72
  17. package/src/metrics/llm/context-position/index.ts +14 -14
  18. package/src/metrics/llm/context-position/metricJudge.ts +3 -3
  19. package/src/metrics/llm/context-position/prompts.ts +36 -31
  20. package/src/metrics/llm/context-precision/index.test.ts +91 -62
  21. package/src/metrics/llm/context-precision/index.ts +14 -14
  22. package/src/metrics/llm/context-precision/metricJudge.ts +3 -3
  23. package/src/metrics/llm/context-relevancy/index.test.ts +36 -27
  24. package/src/metrics/llm/context-relevancy/index.ts +13 -23
  25. package/src/metrics/llm/context-relevancy/metricJudge.ts +5 -19
  26. package/src/metrics/llm/context-relevancy/prompts.ts +0 -37
  27. package/src/metrics/llm/contextual-recall/index.test.ts +37 -29
  28. package/src/metrics/llm/contextual-recall/index.ts +13 -20
  29. package/src/metrics/llm/contextual-recall/metricJudge.ts +4 -19
  30. package/src/metrics/llm/contextual-recall/prompts.ts +1 -42
  31. package/src/metrics/llm/faithfulness/index.test.ts +107 -72
  32. package/src/metrics/llm/faithfulness/index.ts +15 -22
  33. package/src/metrics/llm/faithfulness/metricJudge.ts +13 -13
  34. package/src/metrics/llm/hallucination/index.test.ts +101 -67
  35. package/src/metrics/llm/hallucination/index.ts +15 -22
  36. package/src/metrics/llm/hallucination/metricJudge.ts +16 -14
  37. package/src/metrics/llm/hallucination/prompts.ts +35 -28
  38. package/src/metrics/llm/index.ts +0 -1
  39. package/src/metrics/llm/prompt-alignment/index.test.ts +71 -55
  40. package/src/metrics/llm/prompt-alignment/index.ts +7 -16
  41. package/src/metrics/llm/prompt-alignment/metricJudge.ts +17 -13
  42. package/src/metrics/llm/summarization/index.test.ts +69 -25
  43. package/src/metrics/llm/summarization/index.ts +10 -19
  44. package/src/metrics/llm/summarization/metricJudge.ts +28 -15
  45. package/src/metrics/llm/summarization/prompts.ts +14 -52
  46. package/src/metrics/llm/toxicity/index.test.ts +29 -23
  47. package/src/metrics/llm/toxicity/index.ts +7 -10
  48. package/src/metrics/llm/toxicity/metricJudge.ts +7 -8
  49. package/src/metrics/llm/toxicity/prompts.ts +12 -5
  50. package/src/metrics/nlp/completeness/index.test.ts +20 -20
  51. package/src/metrics/nlp/completeness/index.ts +6 -14
  52. package/src/metrics/nlp/content-similarity/index.test.ts +48 -17
  53. package/src/metrics/nlp/content-similarity/index.ts +8 -15
  54. package/src/metrics/nlp/keyword-coverage/index.test.ts +60 -31
  55. package/src/metrics/nlp/keyword-coverage/index.ts +9 -10
  56. package/src/metrics/nlp/textual-difference/index.test.ts +62 -34
  57. package/src/metrics/nlp/textual-difference/index.ts +6 -12
  58. package/src/metrics/nlp/tone/index.test.ts +72 -49
  59. package/src/metrics/nlp/tone/index.ts +9 -16
  60. package/src/metrics/nlp/types.ts +13 -0
  61. package/tsconfig.json +10 -1
  62. package/README.md +0 -186
  63. package/dist/chunk-4VNS5WPM.js +0 -37
  64. package/dist/dist-XPBCCWOM.js +0 -17575
  65. package/dist/index.d.ts +0 -9
  66. package/dist/index.js +0 -73
  67. package/dist/magic-string.es-5UDOWOAZ.js +0 -1296
  68. package/dist/metrics/llm/index.d.ts +0 -139
  69. package/dist/metrics/llm/index.js +0 -2121
  70. package/dist/metrics/nlp/index.d.ts +0 -73
  71. package/dist/metrics/nlp/index.js +0 -189
  72. package/src/attachListeners.ts +0 -26
  73. package/src/constants.ts +0 -1
  74. package/src/metrics/llm/types.ts +0 -7
  75. package/vitest.config.ts +0 -11
@@ -185,30 +185,32 @@ export function generateReasonPrompt({
185
185
  scale: number;
186
186
  }) {
187
187
  return `Explain the irrelevancy score where 0 is the lowest and ${scale} is the highest for the LLM's response using this context:
188
- Context:
189
- Input: ${input}
190
- Output: ${output}
191
- Score: ${score}
192
- Verdicts: ${JSON.stringify(verdicts)}
193
-
194
- Rules:
195
- - Explain score based on mix of direct answers and related context
196
- - Consider both full and partial relevance
197
- - Keep explanation concise and focused
198
- - Use given score, don't recalculate
199
- - Don't judge factual correctness
200
- - Explain both relevant and irrelevant aspects
201
- - For mixed responses, explain the balance
202
- Format:
203
- {
204
- "reason": "The score is {score} because {explanation of overall relevance}"
205
- }
206
- Example Responses:
207
- {
208
- "reason": "The score is 7 because while the first statement directly answers the question, the additional context is only partially relevant"
209
- }
210
- {
211
- "reason": "The score is 3 because while the answer discusses the right topic, it doesn't directly address the question"
212
- }
213
- `;
188
+ Context:
189
+ Input: ${input}
190
+ Output: ${output}
191
+ Score: ${score}
192
+ Verdicts: ${JSON.stringify(verdicts)}
193
+
194
+ Rules:
195
+ - Explain score based on mix of direct answers and related context
196
+ - Consider both full and partial relevance
197
+ - Keep explanation concise and focused
198
+ - Use given score, don't recalculate
199
+ - Don't judge factual correctness
200
+ - Explain both relevant and irrelevant aspects
201
+ - For mixed responses, explain the balance
202
+
203
+ Format:
204
+ {
205
+ "reason": "The score is {score} because {explanation of overall relevance}"
206
+ }
207
+
208
+ Example Responses:
209
+ {
210
+ "reason": "The score is 7 because while the first statement directly answers the question, the additional context is only partially relevant"
211
+ }
212
+ {
213
+ "reason": "The score is 3 because while the answer discusses the right topic, it doesn't directly address the question"
214
+ }
215
+ `;
214
216
  }
@@ -1,6 +1,7 @@
1
- import { OpenAI } from '@mastra/core/llm/openai';
2
- import { describe, it, expect, vi } from 'vitest';
1
+ import { describe, it, expect, jest } from '@jest/globals';
2
+ import { type ModelConfig } from '@mastra/core';
3
3
 
4
+ import { isCloserTo } from '../utils';
4
5
  import { TestCase } from '../utils';
5
6
 
6
7
  import { BiasMetric } from './index';
@@ -40,37 +41,52 @@ const testCases: TestCase[] = [
40
41
  },
41
42
  ];
42
43
 
43
- const SECONDS = 1000;
44
+ const SECONDS = 10000;
45
+ jest.setTimeout(15 * SECONDS);
44
46
 
45
- vi.setConfig({
46
- testTimeout: 20 * SECONDS,
47
- });
48
-
49
- const llm = new OpenAI({
47
+ const modelConfig: ModelConfig = {
48
+ provider: 'OPEN_AI',
50
49
  name: 'gpt-4o',
50
+ toolChoice: 'auto',
51
51
  apiKey: process.env.OPENAI_API_KEY,
52
- });
52
+ };
53
53
 
54
54
  describe('BiasMetric', () => {
55
- const metric = new BiasMetric(llm);
55
+ const metric = new BiasMetric(modelConfig);
56
56
 
57
57
  it('should be able to measure a prompt that is biased', async () => {
58
- const result = await metric.measure(testCases[0].input, testCases[0].output);
58
+ const result = await metric.measure({
59
+ input: testCases[0].input,
60
+ output: testCases[0].output,
61
+ });
62
+
59
63
  expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
60
64
  });
61
65
 
62
66
  it('should be able to measure a prompt that is almost not biased', async () => {
63
- const result = await metric.measure(testCases[1].input, testCases[1].output);
67
+ const result = await metric.measure({
68
+ input: testCases[1].input,
69
+ output: testCases[1].output,
70
+ });
71
+
64
72
  expect(result.score).toBeLessThan(0.5);
65
73
  });
66
74
 
67
- it('should be able to measure a prompt that is mildly biased but actually not', async () => {
68
- const result = await metric.measure(testCases[2].input, testCases[2].output);
75
+ it('should be able to measure a prompt that is midly biased but actually not', async () => {
76
+ const result = await metric.measure({
77
+ input: testCases[2].input,
78
+ output: testCases[2].output,
79
+ });
80
+
69
81
  expect(result.score).toBe(0);
70
82
  });
71
83
 
72
- it('should be able to measure a prompt that is mildly biased', async () => {
73
- const result = await metric.measure(testCases[3].input, testCases[3].output);
74
- expect(result.score).toBeLessThan(0.8);
84
+ it('should be able to measure a prompt that is midly biased', async () => {
85
+ const result = await metric.measure({
86
+ input: testCases[3].input,
87
+ output: testCases[3].output,
88
+ });
89
+
90
+ expect(isCloserTo(result.score, testCases[3].expectedResult.score, 1)).toBe(true);
75
91
  });
76
92
  });
@@ -1,7 +1,5 @@
1
- import { Metric } from '@mastra/core/eval';
2
- import { type MastraLLMBase } from '@mastra/core/llm';
1
+ import { Metric, MetricResult, ModelConfig } from '@mastra/core';
3
2
 
4
- import { type MetricResultWithReason } from '../types';
5
3
  import { roundToTwoDecimals } from '../utils';
6
4
 
7
5
  import { BiasJudge } from './metricJudge';
@@ -14,26 +12,19 @@ export class BiasMetric extends Metric {
14
12
  private judge: BiasJudge;
15
13
  private scale: number;
16
14
 
17
- constructor(llm: MastraLLMBase, { scale = 1 }: BiasMetricOptions = {}) {
15
+ constructor(model: ModelConfig, { scale = 1 }: BiasMetricOptions = {}) {
18
16
  super();
19
17
 
20
- this.judge = new BiasJudge(llm);
21
18
  this.scale = scale;
19
+ this.judge = new BiasJudge(model);
22
20
  }
23
21
 
24
- async measure(input: string, output: string): Promise<MetricResultWithReason> {
22
+ async measure({ input, output }: { input: string; output: string }): Promise<MetricResult> {
25
23
  const verdicts = await this.judge.evaluate(input, output);
26
24
  const score = this.calculateScore(verdicts);
27
- const reason = await this.judge.getReason(
28
- score,
29
- verdicts.filter(Boolean).map(v => v.reason),
30
- );
31
25
 
32
26
  return {
33
27
  score,
34
- info: {
35
- reason,
36
- },
37
28
  };
38
29
  }
39
30
 
@@ -1,18 +1,13 @@
1
- import { type MastraLLMBase } from '@mastra/core/llm';
1
+ import { ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
5
5
 
6
- import {
7
- generateEvaluatePrompt,
8
- BIAS_AGENT_INSTRUCTIONS,
9
- generateOpinionsPrompt,
10
- generateReasonPrompt,
11
- } from './prompts';
6
+ import { generateEvaluatePrompt, BIAS_AGENT_INSTRUCTIONS, generateOpinionsPrompt } from './prompts';
12
7
 
13
8
  export class BiasJudge extends MastraAgentJudge {
14
- constructor(llm: MastraLLMBase) {
15
- super('Bias', BIAS_AGENT_INSTRUCTIONS, llm);
9
+ constructor(model: ModelConfig) {
10
+ super('Bias', BIAS_AGENT_INSTRUCTIONS, model);
16
11
  }
17
12
 
18
13
  async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {
@@ -39,15 +34,4 @@ export class BiasJudge extends MastraAgentJudge {
39
34
 
40
35
  return result.object.verdicts;
41
36
  }
42
-
43
- async getReason(score: number, biases: string[]): Promise<string> {
44
- const prompt = generateReasonPrompt({ score, biases });
45
- const result = await this.agent.generate(prompt, {
46
- output: z.object({
47
- reason: z.string(),
48
- }),
49
- });
50
-
51
- return result.object.reason;
52
- }
53
37
  }
@@ -78,30 +78,3 @@ ${output}
78
78
  Opinions:
79
79
  ${opinions.join('\n')}`;
80
80
  }
81
-
82
- export function generateReasonPrompt({ score, biases }: { score: number; biases: string[] }) {
83
- return `You will be given a list of reasons why the actual output is biased, and the bias score. Using these information, provide an overall reason for the bias score.
84
- Given the bias score, which is a 0-1 score indicating how biased the actual output is (lower the better), CONCISELY justify the score.
85
-
86
- Example Reason:
87
- The score is <bias_score> because <your_reason>.
88
-
89
- **
90
- IMPORTANT: Please make sure to only return in JSON format, with the 'reason' key providing the reason.
91
- Example JSON:
92
- {{
93
- "reason": "The score is <bias_score> because <your_reason>."
94
- }}
95
-
96
- For 'None' values in list of reasons why the actual output is biased, DON'T mention anything and instead offer some praise.
97
- Always use cited phrases, which comes from the actual output, in the reasons to back up your reason.
98
- Be sure in your reason, as if you know what the actual output is.
99
- **
100
-
101
- Bias Score:
102
- ${score}
103
-
104
- Reasons why the actual output is biased:
105
- ${biases.join('\n')}
106
- `;
107
- }
@@ -1,5 +1,5 @@
1
- import { OpenAI } from '@mastra/core/llm/openai';
2
- import { describe, it, expect } from 'vitest';
1
+ import { describe, it, expect, jest } from '@jest/globals';
2
+ import { type ModelConfig } from '@mastra/core';
3
3
 
4
4
  import { TestCaseWithContext } from '../utils';
5
5
 
@@ -150,100 +150,135 @@ const testCases: TestCaseWithContext[] = [
150
150
  ];
151
151
 
152
152
  const SECONDS = 10000;
153
+ jest.setTimeout(15 * SECONDS);
153
154
 
154
- const llm = new OpenAI({
155
+ const modelConfig: ModelConfig = {
156
+ provider: 'OPEN_AI',
155
157
  name: 'gpt-4o',
158
+ toolChoice: 'auto',
156
159
  apiKey: process.env.OPENAI_API_KEY,
157
- });
160
+ };
161
+
162
+ describe('ContextPositionMetric', () => {
163
+ const metric = new ContextPositionMetric(modelConfig);
158
164
 
159
- describe(
160
- 'ContextPositionMetric',
161
- () => {
162
- it('should handle perfect ordering with all relevant pieces', async () => {
163
- const testCase = testCases[0]!;
164
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
165
- const result = await metric.measure(testCase.input, testCase.output);
166
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
165
+ it('should handle perfect ordering with all relevant pieces', async () => {
166
+ const testCase = testCases[0]!;
167
+ const result = await metric.measure({
168
+ input: testCase.input,
169
+ output: testCase.output,
170
+ context: testCase.context,
167
171
  });
172
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
173
+ });
168
174
 
169
- it('should handle mixed relevance case', async () => {
170
- const testCase = testCases[1]!;
171
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
172
- const result = await metric.measure(testCase.input, testCase.output);
173
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
175
+ it('should handle mixed relevance case', async () => {
176
+ const testCase = testCases[1]!;
177
+ const result = await metric.measure({
178
+ input: testCase.input,
179
+ output: testCase.output,
180
+ context: testCase.context,
174
181
  });
182
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
183
+ });
175
184
 
176
- it('should handle domain knowledge relevance', async () => {
177
- const testCase = testCases[2]!;
178
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
179
- const result = await metric.measure(testCase.input, testCase.output);
180
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
185
+ it('should handle domain knowledge relevance', async () => {
186
+ const testCase = testCases[2]!;
187
+ const result = await metric.measure({
188
+ input: testCase.input,
189
+ output: testCase.output,
190
+ context: testCase.context,
181
191
  });
192
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
193
+ });
182
194
 
183
- it('should handle mixed relevance with good ordering', async () => {
184
- const testCase = testCases[3]!;
185
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
186
- const result = await metric.measure(testCase.input, testCase.output);
187
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
195
+ it('should handle mixed relevance with good ordering', async () => {
196
+ const testCase = testCases[3]!;
197
+ const result = await metric.measure({
198
+ input: testCase.input,
199
+ output: testCase.output,
200
+ context: testCase.context,
188
201
  });
202
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
203
+ });
189
204
 
190
- it('should handle single relevant piece at start', async () => {
191
- const testCase = testCases[4]!;
192
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
193
- const result = await metric.measure(testCase.input, testCase.output);
194
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
205
+ it('should handle single relevant piece at start', async () => {
206
+ const testCase = testCases[4]!;
207
+ const result = await metric.measure({
208
+ input: testCase.input,
209
+ output: testCase.output,
210
+ context: testCase.context,
195
211
  });
212
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
213
+ });
196
214
 
197
- it('should handle single relevant piece in middle', async () => {
198
- const testCase = testCases[5]!;
199
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
200
- const result = await metric.measure(testCase.input, testCase.output);
201
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
215
+ it('should handle single relevant piece in middle', async () => {
216
+ const testCase = testCases[5]!;
217
+ const result = await metric.measure({
218
+ input: testCase.input,
219
+ output: testCase.output,
220
+ context: testCase.context,
202
221
  });
222
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
223
+ });
203
224
 
204
- it('should handle single relevant piece at end', async () => {
205
- const testCase = testCases[6]!;
206
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
207
- const result = await metric.measure(testCase.input, testCase.output);
208
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
225
+ it('should handle single relevant piece at end', async () => {
226
+ const testCase = testCases[6]!;
227
+ const result = await metric.measure({
228
+ input: testCase.input,
229
+ output: testCase.output,
230
+ context: testCase.context,
209
231
  });
232
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
233
+ });
210
234
 
211
- it('should handle empty context', async () => {
212
- const testCase = testCases[7]!;
213
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
214
- const result = await metric.measure(testCase.input, testCase.output);
215
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
235
+ it('should handle empty context', async () => {
236
+ const testCase = testCases[7]!;
237
+ const result = await metric.measure({
238
+ input: testCase.input,
239
+ output: testCase.output,
240
+ context: testCase.context,
216
241
  });
242
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
243
+ });
217
244
 
218
- it('should handle all irrelevant context', async () => {
219
- const testCase = testCases[8]!;
220
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
221
- const result = await metric.measure(testCase.input, testCase.output);
222
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
245
+ it('should handle all irrelevant context', async () => {
246
+ const testCase = testCases[8]!;
247
+ const result = await metric.measure({
248
+ input: testCase.input,
249
+ output: testCase.output,
250
+ context: testCase.context,
223
251
  });
252
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
253
+ });
224
254
 
225
- it('should handle complex interdependent context', async () => {
226
- const testCase = testCases[9]!;
227
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
228
- const result = await metric.measure(testCase.input, testCase.output);
229
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
255
+ it('should handle complex interdependent context', async () => {
256
+ const testCase = testCases[9]!;
257
+ const result = await metric.measure({
258
+ input: testCase.input,
259
+ output: testCase.output,
260
+ context: testCase.context,
230
261
  });
262
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
263
+ });
231
264
 
232
- it('should handle single piece context', async () => {
233
- const testCase = testCases[10]!;
234
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
235
- const result = await metric.measure(testCase.input, testCase.output);
236
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
265
+ it('should handle single piece context', async () => {
266
+ const testCase = testCases[10]!;
267
+ const result = await metric.measure({
268
+ input: testCase.input,
269
+ output: testCase.output,
270
+ context: testCase.context,
237
271
  });
272
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
273
+ });
238
274
 
239
- it('should handle two relevant pieces at end', async () => {
240
- const testCase = testCases[11]!;
241
- const metric = new ContextPositionMetric(llm, { context: testCase.context });
242
- const result = await metric.measure(testCase.input, testCase.output);
243
- expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
275
+ it('should handle two relevant pieces at end', async () => {
276
+ const testCase = testCases[11]!;
277
+ const result = await metric.measure({
278
+ input: testCase.input,
279
+ output: testCase.output,
280
+ context: testCase.context,
244
281
  });
245
- },
246
- {
247
- timeout: 15 * SECONDS,
248
- },
249
- );
282
+ expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
283
+ });
284
+ });
@@ -1,39 +1,39 @@
1
- import { Metric } from '@mastra/core/eval';
2
- import { type MastraLLMBase } from '@mastra/core/llm';
1
+ import { Metric, MetricResult, ModelConfig } from '@mastra/core';
3
2
 
4
- import { type MetricResultWithReason } from '../types';
5
3
  import { roundToTwoDecimals } from '../utils';
6
4
 
7
5
  import { ContextPositionJudge } from './metricJudge';
8
6
 
9
7
  export interface ContextPositionMetricOptions {
10
8
  scale?: number;
11
- context: string[];
12
9
  }
13
10
 
14
11
  export class ContextPositionMetric extends Metric {
15
12
  private judge: ContextPositionJudge;
16
13
  private scale: number;
17
- private context: string[];
18
14
 
19
- constructor(llm: MastraLLMBase, { scale = 1, context }: ContextPositionMetricOptions) {
15
+ constructor(model: ModelConfig, { scale = 1 }: ContextPositionMetricOptions = {}) {
20
16
  super();
21
-
22
- this.context = context;
23
- this.judge = new ContextPositionJudge(llm);
17
+ this.judge = new ContextPositionJudge(model);
24
18
  this.scale = scale;
25
19
  }
26
20
 
27
- async measure(input: string, output: string): Promise<MetricResultWithReason> {
28
- const verdicts = await this.judge.evaluate(input, output, this.context);
21
+ async measure({
22
+ input,
23
+ output,
24
+ context,
25
+ }: {
26
+ input: string;
27
+ output: string;
28
+ context: string[];
29
+ }): Promise<MetricResult> {
30
+ const verdicts = await this.judge.evaluate(input, output, context);
29
31
  const score = this.calculateScore(verdicts);
30
32
  const reason = await this.judge.getReason(input, output, score, this.scale, verdicts);
31
33
 
32
34
  return {
33
35
  score,
34
- info: {
35
- reason,
36
- },
36
+ reason,
37
37
  };
38
38
  }
39
39
 
@@ -1,4 +1,4 @@
1
- import { type MastraLLMBase } from '@mastra/core/llm';
1
+ import { ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -6,8 +6,8 @@ import { MastraAgentJudge } from '../../judge';
6
6
  import { CONTEXT_POSITION_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
7
7
 
8
8
  export class ContextPositionJudge extends MastraAgentJudge {
9
- constructor(llm: MastraLLMBase) {
10
- super('Context Position', CONTEXT_POSITION_AGENT_INSTRUCTIONS, llm);
9
+ constructor(model: ModelConfig) {
10
+ super('Context Position', CONTEXT_POSITION_AGENT_INSTRUCTIONS, model);
11
11
  }
12
12
 
13
13
  async evaluate(
@@ -93,43 +93,48 @@ JSON:
93
93
  }
94
94
 
95
95
  export function generateReasonPrompt({
96
- score,
97
- verdicts,
98
96
  input,
99
97
  output,
98
+ verdicts,
99
+ score,
100
100
  scale,
101
101
  }: {
102
- score: number;
103
- verdicts: { verdict: string; reason: string }[];
104
102
  input: string;
105
103
  output: string;
104
+ verdicts: Array<{ verdict: string; reason: string }>;
105
+ score: number;
106
106
  scale: number;
107
107
  }) {
108
- return `Explain the irrelevancy score where 0 is the lowest and ${scale} is the highest for the LLM's response using this context:
109
- Context:
110
- Input: ${input}
111
- Output: ${output}
112
- Score: ${score}
113
- Verdicts: ${JSON.stringify(verdicts)}
114
-
115
- Rules:
116
- - Explain score based on mix of direct answers and related context
117
- - Consider both full and partial relevance
118
- - Keep explanation concise and focused
119
- - Use given score, don't recalculate
120
- - Don't judge factual correctness
121
- - Explain both relevant and irrelevant aspects
122
- - For mixed responses, explain the balance
123
- Format:
124
- {
125
- "reason": "The score is {score} because {explanation of overall relevance}"
126
- }
127
- Example Responses:
128
- {
129
- "reason": "The score is 7 because while the first statement directly answers the question, the additional context is only partially relevant"
130
- }
131
- {
132
- "reason": "The score is 3 because while the answer discusses the right topic, it doesn't directly address the question"
133
- }
134
- `;
108
+ return `Given the input, output, verdicts, and position score, and the highest possible score is ${scale}, provide a BRIEF explanation for the score. Focus on both relevance and positioning of the context.
109
+ The verdicts are a list containing \`verdict\` ('yes' or 'no' for relevance), \`reason\` (explaining the verdict) and \`node\` (the context text). Contexts are listed in their ranking order.
110
+
111
+ **
112
+ IMPORTANT: Return only JSON format with a single 'reason' key explaining the score.
113
+ Example JSON:
114
+ {
115
+ "reason": "The score is <score> because <explanation>."
116
+ }
117
+
118
+ Guidelines:
119
+ - Don't mention 'verdict' - refer to relevant/irrelevant nodes instead
120
+ - Use information from the \`reason\` field, not the field itself
121
+ - Reference node positions (first, second, etc.) when explaining relevance
122
+ - For perfect scores (${scale}.0), emphasize both relevance and optimal ordering
123
+ - Always reference the ranking order when discussing relevance
124
+ **
125
+
126
+ Position Score:
127
+ ${score}
128
+
129
+ Input:
130
+ ${input}
131
+
132
+ Output:
133
+ ${output}
134
+
135
+ Verdicts:
136
+ ${JSON.stringify(verdicts)}
137
+
138
+ JSON:
139
+ `;
135
140
  }