@mastra/evals 0.1.0-alpha.30 → 0.1.0-alpha.33

This diff compares the contents of two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
Files changed (42)
  1. package/CHANGELOG.md +27 -0
  2. package/dist/{dist-56AYDN4X.js → dist-XPBCCWOM.js} +8 -8
  3. package/dist/index.js +1 -1
  4. package/dist/metrics/llm/index.d.ts +12 -11
  5. package/dist/metrics/llm/index.js +51 -49
  6. package/package.json +4 -3
  7. package/src/evaluation.test.ts +4 -6
  8. package/src/metrics/judge/index.ts +5 -4
  9. package/src/metrics/llm/answer-relevancy/index.test.ts +4 -7
  10. package/src/metrics/llm/answer-relevancy/index.ts +4 -3
  11. package/src/metrics/llm/answer-relevancy/metricJudge.ts +3 -3
  12. package/src/metrics/llm/bias/index.test.ts +4 -6
  13. package/src/metrics/llm/bias/index.ts +4 -3
  14. package/src/metrics/llm/bias/metricJudge.ts +3 -3
  15. package/src/metrics/llm/context-position/index.test.ts +15 -17
  16. package/src/metrics/llm/context-position/index.ts +6 -4
  17. package/src/metrics/llm/context-position/metricJudge.ts +3 -3
  18. package/src/metrics/llm/context-precision/index.test.ts +13 -15
  19. package/src/metrics/llm/context-precision/index.ts +6 -4
  20. package/src/metrics/llm/context-precision/metricJudge.ts +3 -3
  21. package/src/metrics/llm/context-relevancy/index.test.ts +7 -9
  22. package/src/metrics/llm/context-relevancy/index.ts +6 -4
  23. package/src/metrics/llm/context-relevancy/metricJudge.ts +3 -3
  24. package/src/metrics/llm/contextual-recall/index.test.ts +6 -8
  25. package/src/metrics/llm/contextual-recall/index.ts +6 -4
  26. package/src/metrics/llm/contextual-recall/metricJudge.ts +3 -3
  27. package/src/metrics/llm/faithfulness/index.test.ts +15 -17
  28. package/src/metrics/llm/faithfulness/index.ts +6 -4
  29. package/src/metrics/llm/faithfulness/metricJudge.ts +3 -3
  30. package/src/metrics/llm/hallucination/index.test.ts +15 -19
  31. package/src/metrics/llm/hallucination/index.ts +7 -5
  32. package/src/metrics/llm/hallucination/metricJudge.ts +3 -3
  33. package/src/metrics/llm/prompt-alignment/index.test.ts +9 -11
  34. package/src/metrics/llm/prompt-alignment/index.ts +4 -3
  35. package/src/metrics/llm/prompt-alignment/metricJudge.ts +3 -3
  36. package/src/metrics/llm/summarization/index.test.ts +4 -6
  37. package/src/metrics/llm/summarization/index.ts +4 -3
  38. package/src/metrics/llm/summarization/metricJudge.ts +3 -3
  39. package/src/metrics/llm/toxicity/index.test.ts +4 -6
  40. package/src/metrics/llm/toxicity/index.ts +4 -3
  41. package/src/metrics/llm/toxicity/metricJudge.ts +3 -3
  42. package/src/metrics/llm/types.ts +1 -1
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -11,8 +11,8 @@ import {
11
11
  } from './prompts';
12
12
 
13
13
  export class BiasJudge extends MastraAgentJudge {
14
- constructor(model: ModelConfig) {
15
- super('Bias', BIAS_AGENT_INSTRUCTIONS, model);
14
+ constructor(llm: MastraLLMBase) {
15
+ super('Bias', BIAS_AGENT_INSTRUCTIONS, llm);
16
16
  }
17
17
 
18
18
  async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { OpenAI } from '@mastra/core/llm/openai';
2
2
  import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCaseWithContext } from '../utils';
@@ -151,96 +151,94 @@ const testCases: TestCaseWithContext[] = [
151
151
 
152
152
  const SECONDS = 10000;
153
153
 
154
- const modelConfig: ModelConfig = {
155
- provider: 'OPEN_AI',
154
+ const llm = new OpenAI({
156
155
  name: 'gpt-4o',
157
- toolChoice: 'auto',
158
156
  apiKey: process.env.OPENAI_API_KEY,
159
- };
157
+ });
160
158
 
161
159
  describe(
162
160
  'ContextPositionMetric',
163
161
  () => {
164
162
  it('should handle perfect ordering with all relevant pieces', async () => {
165
163
  const testCase = testCases[0]!;
166
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
164
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
167
165
  const result = await metric.measure(testCase.input, testCase.output);
168
166
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
169
167
  });
170
168
 
171
169
  it('should handle mixed relevance case', async () => {
172
170
  const testCase = testCases[1]!;
173
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
171
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
174
172
  const result = await metric.measure(testCase.input, testCase.output);
175
173
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
176
174
  });
177
175
 
178
176
  it('should handle domain knowledge relevance', async () => {
179
177
  const testCase = testCases[2]!;
180
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
178
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
181
179
  const result = await metric.measure(testCase.input, testCase.output);
182
180
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
183
181
  });
184
182
 
185
183
  it('should handle mixed relevance with good ordering', async () => {
186
184
  const testCase = testCases[3]!;
187
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
185
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
188
186
  const result = await metric.measure(testCase.input, testCase.output);
189
187
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
190
188
  });
191
189
 
192
190
  it('should handle single relevant piece at start', async () => {
193
191
  const testCase = testCases[4]!;
194
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
192
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
195
193
  const result = await metric.measure(testCase.input, testCase.output);
196
194
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
197
195
  });
198
196
 
199
197
  it('should handle single relevant piece in middle', async () => {
200
198
  const testCase = testCases[5]!;
201
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
199
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
202
200
  const result = await metric.measure(testCase.input, testCase.output);
203
201
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
204
202
  });
205
203
 
206
204
  it('should handle single relevant piece at end', async () => {
207
205
  const testCase = testCases[6]!;
208
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
206
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
209
207
  const result = await metric.measure(testCase.input, testCase.output);
210
208
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
211
209
  });
212
210
 
213
211
  it('should handle empty context', async () => {
214
212
  const testCase = testCases[7]!;
215
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
213
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
216
214
  const result = await metric.measure(testCase.input, testCase.output);
217
215
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
218
216
  });
219
217
 
220
218
  it('should handle all irrelevant context', async () => {
221
219
  const testCase = testCases[8]!;
222
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
220
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
223
221
  const result = await metric.measure(testCase.input, testCase.output);
224
222
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
225
223
  });
226
224
 
227
225
  it('should handle complex interdependent context', async () => {
228
226
  const testCase = testCases[9]!;
229
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
227
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
230
228
  const result = await metric.measure(testCase.input, testCase.output);
231
229
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
232
230
  });
233
231
 
234
232
  it('should handle single piece context', async () => {
235
233
  const testCase = testCases[10]!;
236
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
234
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
237
235
  const result = await metric.measure(testCase.input, testCase.output);
238
236
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
239
237
  });
240
238
 
241
239
  it('should handle two relevant pieces at end', async () => {
242
240
  const testCase = testCases[11]!;
243
- const metric = new ContextPositionMetric(modelConfig, { context: testCase.context });
241
+ const metric = new ContextPositionMetric(llm, { context: testCase.context });
244
242
  const result = await metric.measure(testCase.input, testCase.output);
245
243
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
246
244
  });
@@ -1,4 +1,5 @@
1
- import { Metric, type ModelConfig } from '@mastra/core';
1
+ import { Metric } from '@mastra/core/eval';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  import { type MetricResultWithReason } from '../types';
4
5
  import { roundToTwoDecimals } from '../utils';
@@ -15,11 +16,12 @@ export class ContextPositionMetric extends Metric {
15
16
  private scale: number;
16
17
  private context: string[];
17
18
 
18
- constructor(model: ModelConfig, { scale = 1, context }: ContextPositionMetricOptions) {
19
+ constructor(llm: MastraLLMBase, { scale = 1, context }: ContextPositionMetricOptions) {
19
20
  super();
20
- this.judge = new ContextPositionJudge(model);
21
- this.scale = scale;
21
+
22
22
  this.context = context;
23
+ this.judge = new ContextPositionJudge(llm);
24
+ this.scale = scale;
23
25
  }
24
26
 
25
27
  async measure(input: string, output: string): Promise<MetricResultWithReason> {
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -6,8 +6,8 @@ import { MastraAgentJudge } from '../../judge';
6
6
  import { CONTEXT_POSITION_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
7
7
 
8
8
  export class ContextPositionJudge extends MastraAgentJudge {
9
- constructor(model: ModelConfig) {
10
- super('Context Position', CONTEXT_POSITION_AGENT_INSTRUCTIONS, model);
9
+ constructor(llm: MastraLLMBase) {
10
+ super('Context Position', CONTEXT_POSITION_AGENT_INSTRUCTIONS, llm);
11
11
  }
12
12
 
13
13
  async evaluate(
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { OpenAI } from '@mastra/core/llm/openai';
2
2
  import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCaseWithContext } from '../utils';
@@ -128,82 +128,80 @@ const testCases: TestCaseWithContext[] = [
128
128
 
129
129
  const SECONDS = 10000;
130
130
 
131
- const modelConfig: ModelConfig = {
132
- provider: 'OPEN_AI',
131
+ const llm = new OpenAI({
133
132
  name: 'gpt-4o',
134
- toolChoice: 'auto',
135
133
  apiKey: process.env.OPENAI_API_KEY,
136
- };
134
+ });
137
135
 
138
136
  describe(
139
137
  'ContextPrecisionMetric',
140
138
  () => {
141
139
  it('should measure perfect context precision with all relevant items', async () => {
142
140
  const testCase = testCases[0]!;
143
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
141
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
144
142
  const result = await metric.measure(testCase.input, testCase.output);
145
143
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
146
144
  });
147
145
 
148
146
  it('should measure high precision with irrelevant item at end', async () => {
149
147
  const testCase = testCases[1]!;
150
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
148
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
151
149
  const result = await metric.measure(testCase.input, testCase.output);
152
150
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
153
151
  });
154
152
 
155
153
  it('should measure precision with two relevant items after irrelevant start', async () => {
156
154
  const testCase = testCases[2]!;
157
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
155
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
158
156
  const result = await metric.measure(testCase.input, testCase.output);
159
157
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
160
158
  });
161
159
 
162
160
  it('should measure precision with alternating relevant items', async () => {
163
161
  const testCase = testCases[3]!;
164
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
162
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
165
163
  const result = await metric.measure(testCase.input, testCase.output);
166
164
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
167
165
  });
168
166
 
169
167
  it('should measure precision with single relevant item at start', async () => {
170
168
  const testCase = testCases[4]!;
171
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
169
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
172
170
  const result = await metric.measure(testCase.input, testCase.output);
173
171
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
174
172
  });
175
173
 
176
174
  it('should handle completely irrelevant context', async () => {
177
175
  const testCase = testCases[5]!;
178
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
176
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
179
177
  const result = await metric.measure(testCase.input, testCase.output);
180
178
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
181
179
  });
182
180
 
183
181
  it('should handle single relevant context perfectly', async () => {
184
182
  const testCase = testCases[6]!;
185
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
183
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
186
184
  const result = await metric.measure(testCase.input, testCase.output);
187
185
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
188
186
  });
189
187
 
190
188
  it('should measure precision with single relevant item at end', async () => {
191
189
  const testCase = testCases[7]!;
192
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
190
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
193
191
  const result = await metric.measure(testCase.input, testCase.output);
194
192
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
195
193
  });
196
194
 
197
195
  it('should handle empty context', async () => {
198
196
  const testCase = testCases[8]!;
199
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
197
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
200
198
  const result = await metric.measure(testCase.input, testCase.output);
201
199
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
202
200
  });
203
201
 
204
202
  it('should handle single irrelevant context', async () => {
205
203
  const testCase = testCases[9]!;
206
- const metric = new ContextPrecisionMetric(modelConfig, { context: testCase.context });
204
+ const metric = new ContextPrecisionMetric(llm, { context: testCase.context });
207
205
  const result = await metric.measure(testCase.input, testCase.output);
208
206
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
209
207
  });
@@ -1,4 +1,5 @@
1
- import { Metric, type ModelConfig } from '@mastra/core';
1
+ import { Metric } from '@mastra/core/eval';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  import { type MetricResultWithReason } from '../types';
4
5
  import { roundToTwoDecimals } from '../utils';
@@ -15,11 +16,12 @@ export class ContextPrecisionMetric extends Metric {
15
16
  private scale: number;
16
17
  private context: string[];
17
18
 
18
- constructor(model: ModelConfig, { scale = 1, context }: ContextPrecisionMetricOptions) {
19
+ constructor(llm: MastraLLMBase, { scale = 1, context }: ContextPrecisionMetricOptions) {
19
20
  super();
20
- this.judge = new ContextPrecisionJudge(model);
21
- this.scale = scale;
21
+
22
22
  this.context = context;
23
+ this.judge = new ContextPrecisionJudge(llm);
24
+ this.scale = scale;
23
25
  }
24
26
 
25
27
  async measure(input: string, output: string): Promise<MetricResultWithReason> {
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -7,8 +7,8 @@ import './prompts';
7
7
  import { CONTEXT_PRECISION_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
8
8
 
9
9
  export class ContextPrecisionJudge extends MastraAgentJudge {
10
- constructor(model: ModelConfig) {
11
- super('Context Precision', CONTEXT_PRECISION_AGENT_INSTRUCTIONS, model);
10
+ constructor(llm: MastraLLMBase) {
11
+ super('Context Precision', CONTEXT_PRECISION_AGENT_INSTRUCTIONS, llm);
12
12
  }
13
13
 
14
14
  async evaluate(
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { OpenAI } from '@mastra/core/llm/openai';
2
2
  import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { isCloserTo } from '../utils';
@@ -56,33 +56,31 @@ const testCases: TestCaseWithContext[] = [
56
56
 
57
57
  const SECONDS = 10000;
58
58
 
59
- const modelConfig: ModelConfig = {
60
- provider: 'OPEN_AI',
59
+ const llm = new OpenAI({
61
60
  name: 'gpt-4o',
62
- toolChoice: 'auto',
63
61
  apiKey: process.env.OPENAI_API_KEY,
64
- };
62
+ });
65
63
 
66
64
  describe(
67
- 'ContextPrecisionMetric',
65
+ 'ContextRelevancyMetric',
68
66
  () => {
69
67
  it('should measure perfect context relevancy with all relevant items', async () => {
70
68
  const testCase = testCases[0]!;
71
- const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
69
+ const metric = new ContextRelevancyMetric(llm, { context: testCase.context });
72
70
  const result = await metric.measure(testCase.input, testCase.output);
73
71
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
74
72
  });
75
73
 
76
74
  it('should measure mixed relevancy where only some contexts are relevant', async () => {
77
75
  const testCase = testCases[1]!;
78
- const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
76
+ const metric = new ContextRelevancyMetric(llm, { context: testCase.context });
79
77
  const result = await metric.measure(testCase.input, testCase.output);
80
78
  expect(isCloserTo(result.score, testCase.expectedResult.score, 0)).toBe(true);
81
79
  });
82
80
 
83
81
  it('should measure no relevancy where contexts are completely unrelated', async () => {
84
82
  const testCase = testCases[2]!;
85
- const metric = new ContextRelevancyMetric(modelConfig, { context: testCase.context });
83
+ const metric = new ContextRelevancyMetric(llm, { context: testCase.context });
86
84
  const result = await metric.measure(testCase.input, testCase.output);
87
85
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
88
86
  });
@@ -1,4 +1,5 @@
1
- import { Metric, type ModelConfig } from '@mastra/core';
1
+ import { Metric } from '@mastra/core/eval';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  import { type MetricResultWithReason } from '../types';
4
5
  import { roundToTwoDecimals } from '../utils';
@@ -15,11 +16,12 @@ export class ContextRelevancyMetric extends Metric {
15
16
  private scale: number;
16
17
  private context: string[];
17
18
 
18
- constructor(model: ModelConfig, { scale = 1, context }: ContextRelevancyOptions) {
19
+ constructor(llm: MastraLLMBase, { scale = 1, context }: ContextRelevancyOptions) {
19
20
  super();
20
- this.judge = new ContextRelevancyJudge(model);
21
- this.scale = scale;
21
+
22
22
  this.context = context;
23
+ this.judge = new ContextRelevancyJudge(llm);
24
+ this.scale = scale;
23
25
  }
24
26
 
25
27
  async measure(input: string, output: string): Promise<MetricResultWithReason> {
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -6,8 +6,8 @@ import { MastraAgentJudge } from '../../judge';
6
6
  import { CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
7
7
 
8
8
  export class ContextRelevancyJudge extends MastraAgentJudge {
9
- constructor(model: ModelConfig) {
10
- super('Context Relevancy', CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, model);
9
+ constructor(llm: MastraLLMBase) {
10
+ super('Context Relevancy', CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, llm);
11
11
  }
12
12
 
13
13
  async evaluate(
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { OpenAI } from '@mastra/core/llm/openai';
2
2
  import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { isCloserTo } from '../utils';
@@ -52,26 +52,24 @@ const testCases: TestCaseWithContext[] = [
52
52
 
53
53
  const SECONDS = 10000;
54
54
 
55
- const modelConfig: ModelConfig = {
56
- provider: 'OPEN_AI',
55
+ const llm = new OpenAI({
57
56
  name: 'gpt-4o',
58
- toolChoice: 'auto',
59
57
  apiKey: process.env.OPENAI_API_KEY,
60
- };
58
+ });
61
59
 
62
60
  describe(
63
61
  'ContextualRecallMetric',
64
62
  () => {
65
63
  it('should succeed when context is relevant', async () => {
66
64
  const testCase = testCases[0]!;
67
- const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
65
+ const metric = new ContextualRecallMetric(llm, { context: testCase.context });
68
66
  const result = await metric.measure(testCase.input, testCase.output);
69
67
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 2);
70
68
  });
71
69
 
72
70
  it('should be mixed', async () => {
73
71
  const testCase = testCases[1]!;
74
- const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
72
+ const metric = new ContextualRecallMetric(llm, { context: testCase.context });
75
73
  const result = await metric.measure(testCase.input, testCase.output);
76
74
 
77
75
  expect(isCloserTo(result.score, testCase.expectedResult.score, 1)).toBe(true);
@@ -80,7 +78,7 @@ describe(
80
78
 
81
79
  it('should be none', async () => {
82
80
  const testCase = testCases[2]!;
83
- const metric = new ContextualRecallMetric(modelConfig, { context: testCase.context });
81
+ const metric = new ContextualRecallMetric(llm, { context: testCase.context });
84
82
  const result = await metric.measure(testCase.input, testCase.output);
85
83
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
86
84
  });
@@ -1,4 +1,5 @@
1
- import { Metric, type ModelConfig } from '@mastra/core';
1
+ import { Metric } from '@mastra/core/eval';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  import { type MetricResultWithReason } from '../types';
4
5
  import { roundToTwoDecimals } from '../utils';
@@ -15,11 +16,12 @@ export class ContextualRecallMetric extends Metric {
15
16
  private scale: number;
16
17
  private context: string[];
17
18
 
18
- constructor(model: ModelConfig, { scale = 1, context }: ContextualRecallMetricOptions) {
19
+ constructor(llm: MastraLLMBase, { scale = 1, context }: ContextualRecallMetricOptions) {
19
20
  super();
20
- this.judge = new ContextualRecallJudge(model);
21
- this.scale = scale;
21
+
22
22
  this.context = context;
23
+ this.judge = new ContextualRecallJudge(llm);
24
+ this.scale = scale;
23
25
  }
24
26
 
25
27
  async measure(input: string, output: string): Promise<MetricResultWithReason> {
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -6,8 +6,8 @@ import { MastraAgentJudge } from '../../judge';
6
6
  import { CONTEXT_RECALL_AGENT_INSTRUCTIONS, generateEvaluatePrompt, generateReasonPrompt } from './prompts';
7
7
 
8
8
  export class ContextualRecallJudge extends MastraAgentJudge {
9
- constructor(model: ModelConfig) {
10
- super('Contextual Recall', CONTEXT_RECALL_AGENT_INSTRUCTIONS, model);
9
+ constructor(llm: MastraLLMBase) {
10
+ super('Contextual Recall', CONTEXT_RECALL_AGENT_INSTRUCTIONS, llm);
11
11
  }
12
12
 
13
13
  async evaluate(
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { OpenAI } from '@mastra/core/llm/openai';
2
2
  import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCaseWithContext } from '../utils';
@@ -147,19 +147,17 @@ const testCases: TestCaseWithContext[] = [
147
147
 
148
148
  const SECONDS = 10000;
149
149
 
150
- const modelConfig: ModelConfig = {
151
- provider: 'OPEN_AI',
150
+ const llm = new OpenAI({
152
151
  name: 'gpt-4o',
153
- toolChoice: 'auto',
154
152
  apiKey: process.env.OPENAI_API_KEY,
155
- };
153
+ });
156
154
 
157
155
  describe(
158
156
  'FaithfulnessMetric',
159
157
  () => {
160
158
  it('should handle perfect faithfulness', async () => {
161
159
  const testCase = testCases[0]!;
162
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
160
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
163
161
  const result = await metric.measure(testCase.input, testCase.output);
164
162
 
165
163
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
@@ -167,7 +165,7 @@ describe(
167
165
 
168
166
  it('should handle mixed faithfulness with contradictions', async () => {
169
167
  const testCase = testCases[1]!;
170
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
168
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
171
169
  const result = await metric.measure(testCase.input, testCase.output);
172
170
 
173
171
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
@@ -175,7 +173,7 @@ describe(
175
173
 
176
174
  it('should handle claims with speculative language', async () => {
177
175
  const testCase = testCases[2]!;
178
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
176
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
179
177
  const result = await metric.measure(testCase.input, testCase.output);
180
178
 
181
179
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
@@ -183,7 +181,7 @@ describe(
183
181
 
184
182
  it('should handle empty output', async () => {
185
183
  const testCase = testCases[3]!;
186
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
184
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
187
185
  const result = await metric.measure(testCase.input, testCase.output);
188
186
 
189
187
  expect(result.score).toBe(testCase.expectedResult.score);
@@ -191,7 +189,7 @@ describe(
191
189
 
192
190
  it('should handle empty context', async () => {
193
191
  const testCase = testCases[4]!;
194
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
192
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
195
193
  const result = await metric.measure(testCase.input, testCase.output);
196
194
 
197
195
  expect(result.score).toBe(testCase.expectedResult.score);
@@ -199,7 +197,7 @@ describe(
199
197
 
200
198
  it('should handle subjective claims', async () => {
201
199
  const testCase = testCases[5]!;
202
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
200
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
203
201
  const result = await metric.measure(testCase.input, testCase.output);
204
202
 
205
203
  expect(result.score).toBe(testCase.expectedResult.score);
@@ -207,7 +205,7 @@ describe(
207
205
 
208
206
  it('should handle claims with speculative language appropriately', async () => {
209
207
  const testCase = testCases[6]!;
210
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
208
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
211
209
  const result = await metric.measure(testCase.input, testCase.output);
212
210
 
213
211
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
@@ -215,7 +213,7 @@ describe(
215
213
 
216
214
  it('should handle compound statements correctly', async () => {
217
215
  const testCase = testCases[7]!;
218
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
216
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
219
217
  const result = await metric.measure(testCase.input, testCase.output);
220
218
 
221
219
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
@@ -223,7 +221,7 @@ describe(
223
221
 
224
222
  it('should handle precise numerical claims', async () => {
225
223
  const testCase = testCases[8]!;
226
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
224
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
227
225
  const result = await metric.measure(testCase.input, testCase.output);
228
226
 
229
227
  expect(result.score).toBe(testCase.expectedResult.score);
@@ -231,7 +229,7 @@ describe(
231
229
 
232
230
  it('should handle partially supported claims', async () => {
233
231
  const testCase = testCases[9]!;
234
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
232
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
235
233
  const result = await metric.measure(testCase.input, testCase.output);
236
234
 
237
235
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
@@ -239,7 +237,7 @@ describe(
239
237
 
240
238
  it('should handle mixed factual and speculative claims', async () => {
241
239
  const testCase = testCases[10]!;
242
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
240
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
243
241
  const result = await metric.measure(testCase.input, testCase.output);
244
242
 
245
243
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
@@ -247,7 +245,7 @@ describe(
247
245
 
248
246
  it('should handle implicit information appropriately', async () => {
249
247
  const testCase = testCases[11]!;
250
- const metric = new FaithfulnessMetric(modelConfig, { context: testCase.context });
248
+ const metric = new FaithfulnessMetric(llm, { context: testCase.context });
251
249
  const result = await metric.measure(testCase.input, testCase.output);
252
250
 
253
251
  expect(result.score).toBeCloseTo(testCase.expectedResult.score, 1);
@@ -1,4 +1,5 @@
1
- import { Metric, type ModelConfig } from '@mastra/core';
1
+ import { Metric } from '@mastra/core/eval';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  import { type MetricResultWithReason } from '../types';
4
5
  import { roundToTwoDecimals } from '../utils';
@@ -15,11 +16,12 @@ export class FaithfulnessMetric extends Metric {
15
16
  private scale: number;
16
17
  private context: string[];
17
18
 
18
- constructor(model: ModelConfig, { scale = 1, context }: FaithfulnessMetricOptions) {
19
+ constructor(llm: MastraLLMBase, { scale = 1, context }: FaithfulnessMetricOptions) {
19
20
  super();
20
- this.scale = scale;
21
+
21
22
  this.context = context;
22
- this.judge = new FaithfulnessJudge(model);
23
+ this.judge = new FaithfulnessJudge(llm);
24
+ this.scale = scale;
23
25
  }
24
26
 
25
27
  async measure(input: string, output: string): Promise<MetricResultWithReason> {