@mastra/evals 0.1.0-alpha.33 → 0.1.0-alpha.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. package/CHANGELOG.md +0 -224
  2. package/jest.config.ts +21 -0
  3. package/package.json +26 -10
  4. package/src/evaluation.test.ts +16 -17
  5. package/src/evaluation.ts +11 -46
  6. package/src/index.ts +0 -1
  7. package/src/metrics/judge/index.ts +4 -5
  8. package/src/metrics/llm/answer-relevancy/index.test.ts +72 -42
  9. package/src/metrics/llm/answer-relevancy/index.ts +6 -9
  10. package/src/metrics/llm/answer-relevancy/metricJudge.ts +4 -5
  11. package/src/metrics/llm/answer-relevancy/prompts.ts +28 -26
  12. package/src/metrics/llm/bias/index.test.ts +33 -17
  13. package/src/metrics/llm/bias/index.ts +4 -13
  14. package/src/metrics/llm/bias/metricJudge.ts +4 -20
  15. package/src/metrics/llm/bias/prompts.ts +0 -27
  16. package/src/metrics/llm/context-position/index.test.ts +107 -72
  17. package/src/metrics/llm/context-position/index.ts +14 -14
  18. package/src/metrics/llm/context-position/metricJudge.ts +3 -3
  19. package/src/metrics/llm/context-position/prompts.ts +36 -31
  20. package/src/metrics/llm/context-precision/index.test.ts +91 -62
  21. package/src/metrics/llm/context-precision/index.ts +14 -14
  22. package/src/metrics/llm/context-precision/metricJudge.ts +3 -3
  23. package/src/metrics/llm/context-relevancy/index.test.ts +36 -27
  24. package/src/metrics/llm/context-relevancy/index.ts +13 -23
  25. package/src/metrics/llm/context-relevancy/metricJudge.ts +5 -19
  26. package/src/metrics/llm/context-relevancy/prompts.ts +0 -37
  27. package/src/metrics/llm/contextual-recall/index.test.ts +37 -29
  28. package/src/metrics/llm/contextual-recall/index.ts +13 -20
  29. package/src/metrics/llm/contextual-recall/metricJudge.ts +4 -19
  30. package/src/metrics/llm/contextual-recall/prompts.ts +1 -42
  31. package/src/metrics/llm/faithfulness/index.test.ts +107 -72
  32. package/src/metrics/llm/faithfulness/index.ts +15 -22
  33. package/src/metrics/llm/faithfulness/metricJudge.ts +13 -13
  34. package/src/metrics/llm/hallucination/index.test.ts +101 -67
  35. package/src/metrics/llm/hallucination/index.ts +15 -22
  36. package/src/metrics/llm/hallucination/metricJudge.ts +16 -14
  37. package/src/metrics/llm/hallucination/prompts.ts +35 -28
  38. package/src/metrics/llm/index.ts +0 -1
  39. package/src/metrics/llm/prompt-alignment/index.test.ts +71 -55
  40. package/src/metrics/llm/prompt-alignment/index.ts +7 -16
  41. package/src/metrics/llm/prompt-alignment/metricJudge.ts +17 -13
  42. package/src/metrics/llm/summarization/index.test.ts +69 -25
  43. package/src/metrics/llm/summarization/index.ts +10 -19
  44. package/src/metrics/llm/summarization/metricJudge.ts +28 -15
  45. package/src/metrics/llm/summarization/prompts.ts +14 -52
  46. package/src/metrics/llm/toxicity/index.test.ts +29 -23
  47. package/src/metrics/llm/toxicity/index.ts +7 -10
  48. package/src/metrics/llm/toxicity/metricJudge.ts +7 -8
  49. package/src/metrics/llm/toxicity/prompts.ts +12 -5
  50. package/src/metrics/nlp/completeness/index.test.ts +20 -20
  51. package/src/metrics/nlp/completeness/index.ts +6 -14
  52. package/src/metrics/nlp/content-similarity/index.test.ts +48 -17
  53. package/src/metrics/nlp/content-similarity/index.ts +8 -15
  54. package/src/metrics/nlp/keyword-coverage/index.test.ts +60 -31
  55. package/src/metrics/nlp/keyword-coverage/index.ts +9 -10
  56. package/src/metrics/nlp/textual-difference/index.test.ts +62 -34
  57. package/src/metrics/nlp/textual-difference/index.ts +6 -12
  58. package/src/metrics/nlp/tone/index.test.ts +72 -49
  59. package/src/metrics/nlp/tone/index.ts +9 -16
  60. package/src/metrics/nlp/types.ts +13 -0
  61. package/tsconfig.json +10 -1
  62. package/README.md +0 -186
  63. package/dist/chunk-4VNS5WPM.js +0 -37
  64. package/dist/dist-XPBCCWOM.js +0 -17575
  65. package/dist/index.d.ts +0 -9
  66. package/dist/index.js +0 -73
  67. package/dist/magic-string.es-5UDOWOAZ.js +0 -1296
  68. package/dist/metrics/llm/index.d.ts +0 -139
  69. package/dist/metrics/llm/index.js +0 -2121
  70. package/dist/metrics/nlp/index.d.ts +0 -73
  71. package/dist/metrics/nlp/index.js +0 -189
  72. package/src/attachListeners.ts +0 -26
  73. package/src/constants.ts +0 -1
  74. package/src/metrics/llm/types.ts +0 -7
  75. package/vitest.config.ts +0 -11
package/CHANGELOG.md CHANGED
@@ -1,229 +1,5 @@
1
1
  # @mastra/evals
2
2
 
3
- ## 0.1.0-alpha.33
4
-
5
- ### Patch Changes
6
-
7
- - Updated dependencies [d7d465a]
8
- - Updated dependencies [d7d465a]
9
- - Updated dependencies [2017553]
10
- - Updated dependencies [a10b7a3]
11
- - Updated dependencies [16e5b04]
12
- - @mastra/core@0.2.0-alpha.91
13
-
14
- ## 0.1.0-alpha.32
15
-
16
- ### Patch Changes
17
-
18
- - Updated dependencies [8151f44]
19
- - Updated dependencies [e897f1c]
20
- - Updated dependencies [3700be1]
21
- - @mastra/core@0.2.0-alpha.90
22
-
23
- ## 0.1.0-alpha.31
24
-
25
- ### Patch Changes
26
-
27
- - Updated dependencies [27275c9]
28
- - @mastra/core@0.2.0-alpha.89
29
-
30
- ## 0.1.0-alpha.30
31
-
32
- ### Patch Changes
33
-
34
- - Updated dependencies [ccbc581]
35
- - @mastra/core@0.2.0-alpha.88
36
-
37
- ## 0.1.0-alpha.29
38
-
39
- ### Patch Changes
40
-
41
- - Updated dependencies [7365b6c]
42
- - @mastra/core@0.2.0-alpha.87
43
-
44
- ## 0.1.0-alpha.28
45
-
46
- ### Minor Changes
47
-
48
- - 5916f9d: Update deps from fixed to ^
49
-
50
- ### Patch Changes
51
-
52
- - Updated dependencies [6fa4bd2]
53
- - Updated dependencies [e2e76de]
54
- - Updated dependencies [7f24c29]
55
- - Updated dependencies [67637ba]
56
- - Updated dependencies [04f3171]
57
- - @mastra/core@0.2.0-alpha.86
58
-
59
- ## 0.1.0-alpha.27
60
-
61
- ### Patch Changes
62
-
63
- - Updated dependencies [e9d1b47]
64
- - @mastra/core@0.2.0-alpha.85
65
-
66
- ## 0.1.0-alpha.26
67
-
68
- ### Patch Changes
69
-
70
- - Updated dependencies [2f17a5f]
71
- - Updated dependencies [cb290ee]
72
- - Updated dependencies [b4d7416]
73
- - Updated dependencies [38b7f66]
74
- - @mastra/core@0.2.0-alpha.84
75
-
76
- ## 0.1.0-alpha.25
77
-
78
- ### Patch Changes
79
-
80
- - 9625602: Use mastra core split bundles in other packages
81
- - 8769a62: Split core into separate entry files
82
- - Updated dependencies [30322ce]
83
- - Updated dependencies [78eec7c]
84
- - Updated dependencies [9625602]
85
- - Updated dependencies [8769a62]
86
- - @mastra/core@0.2.0-alpha.83
87
-
88
- ## 0.1.0-alpha.24
89
-
90
- ### Patch Changes
91
-
92
- - Updated dependencies [73d112c]
93
- - @mastra/core@0.1.27-alpha.82
94
-
95
- ## 0.1.0-alpha.23
96
-
97
- ### Patch Changes
98
-
99
- - Updated dependencies [9fb3039]
100
- - @mastra/core@0.1.27-alpha.81
101
-
102
- ## 0.1.0-alpha.22
103
-
104
- ### Patch Changes
105
-
106
- - cb2e997: Bundle evals package with tsup
107
-
108
- ## 0.1.0-alpha.21
109
-
110
- ### Patch Changes
111
-
112
- - Updated dependencies [327ece7]
113
- - @mastra/core@0.1.27-alpha.80
114
-
115
- ## 0.1.0-alpha.20
116
-
117
- ### Patch Changes
118
-
119
- - Updated dependencies [21fe536]
120
- - @mastra/core@0.1.27-alpha.79
121
-
122
- ## 0.1.0-alpha.19
123
-
124
- ### Patch Changes
125
-
126
- - Updated dependencies [685108a]
127
- - Updated dependencies [685108a]
128
- - @mastra/core@0.1.27-alpha.78
129
-
130
- ## 0.1.0-alpha.18
131
-
132
- ### Patch Changes
133
-
134
- - Updated dependencies [8105fae]
135
- - @mastra/core@0.1.27-alpha.77
136
-
137
- ## 0.1.0-alpha.17
138
-
139
- ### Patch Changes
140
-
141
- - Updated dependencies [ae7bf94]
142
- - Updated dependencies [ae7bf94]
143
- - @mastra/core@0.1.27-alpha.76
144
-
145
- ## 0.1.0-alpha.16
146
-
147
- ### Patch Changes
148
-
149
- - Updated dependencies [23dcb23]
150
- - @mastra/core@0.1.27-alpha.75
151
-
152
- ## 0.1.0-alpha.15
153
-
154
- ### Patch Changes
155
-
156
- - Updated dependencies [7b87567]
157
- - @mastra/core@0.1.27-alpha.74
158
-
159
- ## 0.1.0-alpha.14
160
-
161
- ### Patch Changes
162
-
163
- - Updated dependencies [3427b95]
164
- - @mastra/core@0.1.27-alpha.73
165
-
166
- ## 0.1.0-alpha.13
167
-
168
- ### Patch Changes
169
-
170
- - 06b2c0a: Update summarization prompt and fix eval input
171
- - Updated dependencies [e4d4ede]
172
- - Updated dependencies [06b2c0a]
173
- - @mastra/core@0.1.27-alpha.72
174
-
175
- ## 0.1.0-alpha.12
176
-
177
- ### Patch Changes
178
-
179
- - Updated dependencies [d9c8dd0]
180
- - @mastra/core@0.1.27-alpha.71
181
-
182
- ## 0.1.0-alpha.11
183
-
184
- ### Patch Changes
185
-
186
- - bdaf834: publish packages
187
-
188
- ## 0.1.0-alpha.10
189
-
190
- ### Patch Changes
191
-
192
- - Updated dependencies [dd6d87f]
193
- - Updated dependencies [04434b6]
194
- - @mastra/core@0.1.27-alpha.70
195
-
196
- ## 0.1.0-alpha.9
197
-
198
- ### Patch Changes
199
-
200
- - 1944807: Unified logger and major step in better logs
201
- - 9ade36e: Changed measure for evals, added endpoints, attached metrics to agent, added ui for evals in playground, and updated docs
202
- - Updated dependencies [1944807]
203
- - Updated dependencies [9ade36e]
204
- - @mastra/core@0.1.27-alpha.69
205
-
206
- ## 0.1.0-alpha.8
207
-
208
- ### Patch Changes
209
-
210
- - Updated dependencies [0be7181]
211
- - Updated dependencies [0be7181]
212
- - @mastra/core@0.1.27-alpha.68
213
-
214
- ## 0.1.0-alpha.7
215
-
216
- ### Patch Changes
217
-
218
- - Updated dependencies [c8ff2f5]
219
- - @mastra/core@0.1.27-alpha.67
220
-
221
- ## 0.1.0-alpha.6
222
-
223
- ### Patch Changes
224
-
225
- - aea3c13: Fix evals export for llm and nlp
226
-
227
3
  ## 0.1.0-alpha.5
228
4
 
229
5
  ### Minor Changes
package/jest.config.ts ADDED
@@ -0,0 +1,21 @@
1
+ import { config } from 'dotenv';
2
+
3
+ config();
4
+
5
+ export default {
6
+ maxWorkers: 1,
7
+ preset: 'ts-jest',
8
+ extensionsToTreatAsEsm: ['.ts'],
9
+ moduleNameMapper: {
10
+ '^(\\.{1,2}/.*)\\.js$': '$1',
11
+ },
12
+ transform: {
13
+ '^.+\\.tsx?$': [
14
+ 'ts-jest',
15
+ {
16
+ useESM: true,
17
+ isolatedModules: true,
18
+ },
19
+ ],
20
+ },
21
+ };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mastra/evals",
3
- "version": "0.1.0-alpha.33",
3
+ "version": "0.1.0-alpha.5",
4
4
  "description": "",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -11,18 +11,30 @@
11
11
  "import": {
12
12
  "types": "./dist/index.d.ts",
13
13
  "default": "./dist/evals.esm.js"
14
+ },
15
+ "require": {
16
+ "types": "./dist/index.d.ts",
17
+ "default": "./dist/index.js"
14
18
  }
15
19
  },
16
20
  "./nlp": {
17
21
  "import": {
18
22
  "types": "./dist/metrics/nlp/index.d.ts",
19
- "default": "./dist/nlp.esm.js"
23
+ "default": "./dist/evals.esm.js"
24
+ },
25
+ "require": {
26
+ "types": "./dist/metrics/nlp/index.d.ts",
27
+ "default": "./dist/index.js"
20
28
  }
21
29
  },
22
30
  "./llm": {
23
31
  "import": {
24
32
  "types": "./dist/metrics/llm/index.d.ts",
25
- "default": "./dist/llm.esm.js"
33
+ "default": "./dist/evals.esm.js"
34
+ },
35
+ "require": {
36
+ "types": "./dist/metrics/llm/index.d.ts",
37
+ "default": "./dist/index.js"
26
38
  }
27
39
  },
28
40
  "./package.json": "./package.json"
@@ -38,21 +50,25 @@
38
50
  "sentiment": "^5.0.2",
39
51
  "string-similarity": "^4.0.4",
40
52
  "zod": "^3.24.1",
41
- "@mastra/core": "^0.2.0-alpha.91"
53
+ "@mastra/core": "0.1.27-alpha.66"
42
54
  },
43
55
  "devDependencies": {
56
+ "@babel/preset-env": "^7.26.0",
57
+ "@babel/preset-typescript": "^7.26.0",
58
+ "@jest/globals": "^29.7.0",
44
59
  "@tsconfig/recommended": "^1.0.7",
45
60
  "@types/difflib": "^0.2.7",
46
61
  "@types/fs-extra": "^11.0.4",
62
+ "@types/jest": "^29.5.12",
47
63
  "@types/sentiment": "^5.0.4",
48
64
  "@types/string-similarity": "^4.0.2",
49
- "tsup": "^8.0.1",
50
- "vitest": "^3.0.4"
65
+ "dts-cli": "^2.0.5",
66
+ "jest": "^29.7.0",
67
+ "ts-jest": "^29.2.5"
51
68
  },
52
69
  "scripts": {
53
- "check": "tsc --noEmit",
54
- "build": "pnpm check && tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --clean --treeshake",
55
- "dev": "tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --watch",
56
- "test": "vitest"
70
+ "build": "dts build",
71
+ "build:dev": "dts watch",
72
+ "test": "node --experimental-vm-modules node_modules/jest/bin/jest.js"
57
73
  }
58
74
  }
@@ -1,7 +1,6 @@
1
- import { Agent } from '@mastra/core/agent';
2
- import { Metric } from '@mastra/core/eval';
3
- import { OpenAI } from '@mastra/core/llm/openai';
4
- import { describe, expect, it } from 'vitest';
1
+ import { Agent } from '@mastra/core';
2
+ import { ModelConfig } from '@mastra/core';
3
+ import { Metric } from '@mastra/core';
5
4
 
6
5
  import { evaluate } from './evaluation';
7
6
 
@@ -14,20 +13,20 @@ class TestMetric extends Metric {
14
13
  }
15
14
  }
16
15
 
17
- const llm = new OpenAI({
16
+ const modelConfig: ModelConfig = {
17
+ provider: 'OPEN_AI',
18
18
  name: 'gpt-4o',
19
- });
19
+ toolChoice: 'auto',
20
+ };
20
21
 
21
- describe('evaluate', () => {
22
- it('should get a text response from the agent', async () => {
23
- const electionAgent = new Agent({
24
- name: 'US Election agent',
25
- instructions: 'You know about the past US elections',
26
- llm,
27
- });
22
+ it.skip('should get a text response from the agent', async () => {
23
+ const electionAgent = new Agent({
24
+ name: 'US Election agent',
25
+ instructions: 'You know about the past US elections',
26
+ model: modelConfig,
27
+ });
28
28
 
29
- const result = await evaluate(electionAgent, 'Who won the 2016 US presidential election?', new TestMetric());
29
+ const result = await evaluate(electionAgent, 'Who won the 2016 US presidential election?', new TestMetric());
30
30
 
31
- expect(result.score).toBe(1);
32
- }, 10000);
33
- });
31
+ expect(result.score).toBe(1);
32
+ }, 10000);
package/src/evaluation.ts CHANGED
@@ -1,55 +1,20 @@
1
- import { type Agent, type Metric, evaluate as coreEvaluate } from '@mastra/core';
2
-
3
- import { GLOBAL_RUN_ID_ENV_KEY } from './constants';
1
+ import { AvailableHooks, executeHook } from '@mastra/core';
2
+ import { type Agent, type Metric } from '@mastra/core';
4
3
 
5
4
  export async function evaluate<T extends Agent>(agent: T, input: Parameters<T['generate']>[0], metric: Metric) {
6
- const testInfo = await getCurrentTestInfo();
7
- let globalRunId = process.env[GLOBAL_RUN_ID_ENV_KEY];
8
- const runId = crypto.randomUUID();
9
- const agentOutput = await agent.generate(input, {
10
- runId,
11
- });
5
+ const agentOutput = await agent.generate(input);
12
6
 
13
- if (!globalRunId) {
14
- globalRunId = process.env[GLOBAL_RUN_ID_ENV_KEY] = crypto.randomUUID();
15
- console.warn('Global run id not set, you should run "globalSetup" from "@mastra/evals" before evaluating.');
16
- }
7
+ const metricResult = await metric.measure({
8
+ input: input.toString(),
9
+ output: agentOutput.text,
10
+ });
17
11
 
18
- const metricResult = await coreEvaluate({
19
- agentName: agent.name,
20
- input,
21
- metric,
12
+ // capture information about the evaluation
13
+ executeHook(AvailableHooks.ON_EVALUATION, {
14
+ input: input.toString(),
22
15
  output: agentOutput.text,
23
- globalRunId,
24
- runId,
25
- testInfo,
16
+ result: metricResult,
26
17
  });
27
18
 
28
19
  return metricResult;
29
20
  }
30
-
31
- export const getCurrentTestInfo = async () => {
32
- // Jest
33
- // @ts-ignore
34
- if (typeof expect !== 'undefined' && expect.getState) {
35
- // @ts-ignore
36
- const state = expect.getState();
37
- return {
38
- testName: state.currentTestName,
39
- testPath: state.testPath,
40
- };
41
- }
42
-
43
- try {
44
- const vitest = await import('vitest');
45
- if (typeof vitest !== 'undefined' && vitest.expect?.getState) {
46
- const state = vitest.expect.getState();
47
- return {
48
- testName: state.currentTestName,
49
- testPath: state.testPath,
50
- };
51
- }
52
- } catch {}
53
-
54
- return null;
55
- };
package/src/index.ts CHANGED
@@ -1,2 +1 @@
1
1
  export { evaluate } from './evaluation';
2
- export { attachListeners, globalSetup } from './attachListeners';
@@ -1,14 +1,13 @@
1
- import { Agent } from '@mastra/core/agent';
2
- import { type MastraLLMBase } from '@mastra/core/llm';
1
+ import { Agent, ModelConfig } from '@mastra/core';
3
2
 
4
3
  export abstract class MastraAgentJudge {
5
4
  protected readonly agent: Agent;
6
5
 
7
- constructor(name: string, instructions: string, llm: MastraLLMBase) {
6
+ constructor(name: string, instructions: string, model: ModelConfig) {
8
7
  this.agent = new Agent({
9
- name: `Mastra Eval Judge ${llm.name} ${name}`,
8
+ name: `Mastra Eval Judge ${model.provider} ${name}`,
10
9
  instructions: instructions,
11
- llm,
10
+ model,
12
11
  });
13
12
  }
14
13
  }
@@ -1,5 +1,5 @@
1
- import { OpenAI } from '@mastra/core/llm/openai';
2
- import { describe, it, expect } from 'vitest';
1
+ import { describe, it, expect, jest } from '@jest/globals';
2
+ import { type ModelConfig } from '@mastra/core';
3
3
 
4
4
  import { TestCase } from '../utils';
5
5
 
@@ -91,65 +91,95 @@ const testCases: TestCase[] = [
91
91
  ];
92
92
 
93
93
  const SECONDS = 10000;
94
+ jest.setTimeout(15 * SECONDS);
94
95
 
95
- const llm = new OpenAI({
96
+ const modelConfig: ModelConfig = {
97
+ provider: 'OPEN_AI',
96
98
  name: 'gpt-4o',
97
- });
99
+ toolChoice: 'auto',
100
+ apiKey: process.env.OPENAI_API_KEY,
101
+ };
98
102
 
99
- describe(
100
- 'AnswerRelevancyMetric',
101
- () => {
102
- const metric = new AnswerRelevancyMetric(llm);
103
+ describe('AnswerRelevancyMetric', () => {
104
+ const metric = new AnswerRelevancyMetric(modelConfig);
103
105
 
104
- it('should be able to measure a prompt with perfect relevancy', async () => {
105
- const result = await metric.measure(testCases[0].input, testCases[0].output);
106
- expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
106
+ it('should be able to measure a prompt with perfect relevancy', async () => {
107
+ const result = await metric.measure({
108
+ input: testCases[0].input,
109
+ output: testCases[0].output,
107
110
  });
108
111
 
109
- it('should be able to measure a prompt with mostly relevant information', async () => {
110
- const result = await metric.measure(testCases[1].input, testCases[1].output);
111
- const expectedScore = testCases[1].expectedResult.score;
112
- const difference = Math.abs(result.score - expectedScore);
112
+ expect(result.score).toBeCloseTo(testCases[0].expectedResult.score, 1);
113
+ });
113
114
 
114
- expect(Math.round(difference * 10) / 10).toBeLessThanOrEqual(0.1);
115
+ it('should be able to measure a prompt with mostly relevant information', async () => {
116
+ const result = await metric.measure({
117
+ input: testCases[1].input,
118
+ output: testCases[1].output,
115
119
  });
116
120
 
117
- it('should be able to measure a prompt with partial relevance', async () => {
118
- const result = await metric.measure(testCases[2].input, testCases[2].output);
119
- expect(result.score).toBeCloseTo(testCases[2].expectedResult.score, 1);
121
+ expect(result.score).toBeCloseTo(testCases[1].expectedResult.score, 1);
122
+ });
123
+
124
+ it('should be able to measure a prompt with partial relevance', async () => {
125
+ const result = await metric.measure({
126
+ input: testCases[2].input,
127
+ output: testCases[2].output,
120
128
  });
121
129
 
122
- it('should be able to measure a prompt with low relevance', async () => {
123
- const result = await metric.measure(testCases[3].input, testCases[3].output);
124
- expect(result.score).toBeCloseTo(testCases[3].expectedResult.score, 1);
130
+ expect(result.score).toBeCloseTo(testCases[2].expectedResult.score, 1);
131
+ });
132
+
133
+ it('should be able to measure a prompt with low relevance', async () => {
134
+ const result = await metric.measure({
135
+ input: testCases[3].input,
136
+ output: testCases[3].output,
125
137
  });
126
138
 
127
- it('should be able to measure a prompt with empty output', async () => {
128
- const result = await metric.measure(testCases[5].input, testCases[5].output);
129
- expect(result.score).toBeCloseTo(testCases[5].expectedResult.score, 1);
139
+ expect(result.score).toBeCloseTo(testCases[3].expectedResult.score, 1);
140
+ });
141
+
142
+ it('should be able to measure a prompt with empty output', async () => {
143
+ const result = await metric.measure({
144
+ input: testCases[5].input,
145
+ output: testCases[5].output,
130
146
  });
131
147
 
132
- it('should be able to measure a prompt with incorrect but relevant answer', async () => {
133
- const result = await metric.measure(testCases[6].input, testCases[6].output);
134
- expect(result.score).toBeCloseTo(testCases[6].expectedResult.score, 1);
148
+ expect(result.score).toBeCloseTo(testCases[5].expectedResult.score, 1);
149
+ });
150
+
151
+ it('should be able to measure a prompt with incorrect but relevant answer', async () => {
152
+ const result = await metric.measure({
153
+ input: testCases[6].input,
154
+ output: testCases[6].output,
135
155
  });
156
+ expect(result.score).toBeCloseTo(testCases[6].expectedResult.score, 1);
157
+ });
136
158
 
137
- it('should be able to measure a prompt with a single word correct answer', async () => {
138
- const result = await metric.measure(testCases[7].input, testCases[7].output);
139
- expect(result.score).toBeCloseTo(testCases[7].expectedResult.score, 1);
159
+ it('should be able to measure a prompt with a single word correct answer', async () => {
160
+ const result = await metric.measure({
161
+ input: testCases[7].input,
162
+ output: testCases[7].output,
140
163
  });
141
164
 
142
- it('should be able to measure a prompt with multiple questions', async () => {
143
- const result = await metric.measure(testCases[8].input, testCases[8].output);
144
- expect(result.score).toBeCloseTo(testCases[8].expectedResult.score, 1);
165
+ expect(result.score).toBeCloseTo(testCases[7].expectedResult.score, 1);
166
+ });
167
+
168
+ it('should be able to measure a prompt with multiple questions', async () => {
169
+ const result = await metric.measure({
170
+ input: testCases[8].input,
171
+ output: testCases[8].output,
145
172
  });
146
173
 
147
- it('should be able to measure a prompt with technical gibberish', async () => {
148
- const result = await metric.measure(testCases[9].input, testCases[9].output);
149
- expect(result.score).toBeCloseTo(testCases[9].expectedResult.score, 1);
174
+ expect(result.score).toBeCloseTo(testCases[8].expectedResult.score, 1);
175
+ });
176
+
177
+ it('should be able to measure a prompt with technical gibberish', async () => {
178
+ const result = await metric.measure({
179
+ input: testCases[9].input,
180
+ output: testCases[9].output,
150
181
  });
151
- },
152
- {
153
- timeout: 15 * SECONDS,
154
- },
155
- );
182
+
183
+ expect(result.score).toBeCloseTo(testCases[9].expectedResult.score, 1);
184
+ });
185
+ });
@@ -1,7 +1,5 @@
1
- import { Metric } from '@mastra/core/eval';
2
- import { type MastraLLMBase } from '@mastra/core/llm';
1
+ import { Metric, MetricResult, ModelConfig } from '@mastra/core';
3
2
 
4
- import { type MetricResultWithReason } from '../types';
5
3
  import { roundToTwoDecimals } from '../utils';
6
4
 
7
5
  import { AnswerRelevancyJudge } from './metricJudge';
@@ -16,24 +14,23 @@ export class AnswerRelevancyMetric extends Metric {
16
14
  private uncertaintyWeight: number;
17
15
  private scale: number;
18
16
 
19
- constructor(llm: MastraLLMBase, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
17
+ constructor(model: ModelConfig, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
20
18
  super();
21
19
 
22
20
  this.uncertaintyWeight = uncertaintyWeight;
23
- this.judge = new AnswerRelevancyJudge(llm);
21
+ this.judge = new AnswerRelevancyJudge(model);
24
22
  this.scale = scale;
25
23
  }
26
24
 
27
- async measure(input: string, output: string): Promise<MetricResultWithReason> {
25
+ async measure({ input, output }: { input: string; output: string }): Promise<MetricResult> {
28
26
  const verdicts = await this.judge.evaluate(input, output);
29
27
  const score = this.calculateScore(verdicts);
28
+
30
29
  const reason = await this.judge.getReason(input, output, score, this.scale, verdicts);
31
30
 
32
31
  return {
33
32
  score,
34
- info: {
35
- reason,
36
- },
33
+ reason,
37
34
  };
38
35
  }
39
36
 
@@ -1,18 +1,18 @@
1
- import { type MastraLLMBase } from '@mastra/core/llm';
1
+ import { ModelConfig } from '@mastra/core';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
5
5
 
6
6
  import {
7
7
  generateEvaluatePrompt,
8
+ generateReasonPrompt,
8
9
  ANSWER_RELEVANCY_AGENT_INSTRUCTIONS,
9
10
  generateEvaluationStatementsPrompt,
10
- generateReasonPrompt,
11
11
  } from './prompts';
12
12
 
13
13
  export class AnswerRelevancyJudge extends MastraAgentJudge {
14
- constructor(llm: MastraLLMBase) {
15
- super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, llm);
14
+ constructor(model: ModelConfig) {
15
+ super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, model);
16
16
  }
17
17
 
18
18
  async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {
@@ -50,7 +50,6 @@ export class AnswerRelevancyJudge extends MastraAgentJudge {
50
50
  reason: z.string(),
51
51
  }),
52
52
  });
53
-
54
53
  return result.object.reason;
55
54
  }
56
55
  }