@mastra/evals 0.1.0-alpha.31 → 0.1.0-alpha.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/CHANGELOG.md +20 -0
  2. package/dist/metrics/llm/index.d.ts +12 -11
  3. package/dist/metrics/llm/index.js +51 -49
  4. package/package.json +4 -3
  5. package/src/evaluation.test.ts +4 -6
  6. package/src/metrics/judge/index.ts +5 -4
  7. package/src/metrics/llm/answer-relevancy/index.test.ts +4 -7
  8. package/src/metrics/llm/answer-relevancy/index.ts +4 -3
  9. package/src/metrics/llm/answer-relevancy/metricJudge.ts +3 -3
  10. package/src/metrics/llm/bias/index.test.ts +4 -6
  11. package/src/metrics/llm/bias/index.ts +4 -3
  12. package/src/metrics/llm/bias/metricJudge.ts +3 -3
  13. package/src/metrics/llm/context-position/index.test.ts +15 -17
  14. package/src/metrics/llm/context-position/index.ts +6 -4
  15. package/src/metrics/llm/context-position/metricJudge.ts +3 -3
  16. package/src/metrics/llm/context-precision/index.test.ts +13 -15
  17. package/src/metrics/llm/context-precision/index.ts +6 -4
  18. package/src/metrics/llm/context-precision/metricJudge.ts +3 -3
  19. package/src/metrics/llm/context-relevancy/index.test.ts +7 -9
  20. package/src/metrics/llm/context-relevancy/index.ts +6 -4
  21. package/src/metrics/llm/context-relevancy/metricJudge.ts +3 -3
  22. package/src/metrics/llm/contextual-recall/index.test.ts +6 -8
  23. package/src/metrics/llm/contextual-recall/index.ts +6 -4
  24. package/src/metrics/llm/contextual-recall/metricJudge.ts +3 -3
  25. package/src/metrics/llm/faithfulness/index.test.ts +15 -17
  26. package/src/metrics/llm/faithfulness/index.ts +6 -4
  27. package/src/metrics/llm/faithfulness/metricJudge.ts +3 -3
  28. package/src/metrics/llm/hallucination/index.test.ts +15 -19
  29. package/src/metrics/llm/hallucination/index.ts +7 -5
  30. package/src/metrics/llm/hallucination/metricJudge.ts +3 -3
  31. package/src/metrics/llm/prompt-alignment/index.test.ts +9 -11
  32. package/src/metrics/llm/prompt-alignment/index.ts +4 -3
  33. package/src/metrics/llm/prompt-alignment/metricJudge.ts +3 -3
  34. package/src/metrics/llm/summarization/index.test.ts +4 -6
  35. package/src/metrics/llm/summarization/index.ts +4 -3
  36. package/src/metrics/llm/summarization/metricJudge.ts +3 -3
  37. package/src/metrics/llm/toxicity/index.test.ts +4 -6
  38. package/src/metrics/llm/toxicity/index.ts +4 -3
  39. package/src/metrics/llm/toxicity/metricJudge.ts +3 -3
  40. package/src/metrics/llm/types.ts +1 -1
package/CHANGELOG.md CHANGED
@@ -1,5 +1,25 @@
1
1
  # @mastra/evals
2
2
 
3
+ ## 0.1.0-alpha.33
4
+
5
+ ### Patch Changes
6
+
7
+ - Updated dependencies [d7d465a]
8
+ - Updated dependencies [d7d465a]
9
+ - Updated dependencies [2017553]
10
+ - Updated dependencies [a10b7a3]
11
+ - Updated dependencies [16e5b04]
12
+ - @mastra/core@0.2.0-alpha.91
13
+
14
+ ## 0.1.0-alpha.32
15
+
16
+ ### Patch Changes
17
+
18
+ - Updated dependencies [8151f44]
19
+ - Updated dependencies [e897f1c]
20
+ - Updated dependencies [3700be1]
21
+ - @mastra/core@0.2.0-alpha.90
22
+
3
23
  ## 0.1.0-alpha.31
4
24
 
5
25
  ### Patch Changes
package/dist/metrics/llm/index.d.ts CHANGED
@@ -1,4 +1,5 @@
1
- import { MetricResult, Metric, ModelConfig } from '@mastra/core';
1
+ import { MetricResult, Metric } from '@mastra/core/eval';
2
+ import { MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  interface MetricResultWithReason extends MetricResult {
4
5
  info: {
@@ -14,7 +15,7 @@ declare class AnswerRelevancyMetric extends Metric {
14
15
  private judge;
15
16
  private uncertaintyWeight;
16
17
  private scale;
17
- constructor(model: ModelConfig, { uncertaintyWeight, scale }?: AnswerRelevancyMetricOptions);
18
+ constructor(llm: MastraLLMBase, { uncertaintyWeight, scale }?: AnswerRelevancyMetricOptions);
18
19
  measure(input: string, output: string): Promise<MetricResultWithReason>;
19
20
  private calculateScore;
20
21
  }
@@ -27,7 +28,7 @@ declare class ContextPositionMetric extends Metric {
27
28
  private judge;
28
29
  private scale;
29
30
  private context;
30
- constructor(model: ModelConfig, { scale, context }: ContextPositionMetricOptions);
31
+ constructor(llm: MastraLLMBase, { scale, context }: ContextPositionMetricOptions);
31
32
  measure(input: string, output: string): Promise<MetricResultWithReason>;
32
33
  private calculateScore;
33
34
  }
@@ -40,7 +41,7 @@ declare class ContextPrecisionMetric extends Metric {
40
41
  private judge;
41
42
  private scale;
42
43
  private context;
43
- constructor(model: ModelConfig, { scale, context }: ContextPrecisionMetricOptions);
44
+ constructor(llm: MastraLLMBase, { scale, context }: ContextPrecisionMetricOptions);
44
45
  measure(input: string, output: string): Promise<MetricResultWithReason>;
45
46
  private calculateScore;
46
47
  }
@@ -53,7 +54,7 @@ declare class FaithfulnessMetric extends Metric {
53
54
  private judge;
54
55
  private scale;
55
56
  private context;
56
- constructor(model: ModelConfig, { scale, context }: FaithfulnessMetricOptions);
57
+ constructor(llm: MastraLLMBase, { scale, context }: FaithfulnessMetricOptions);
57
58
  measure(input: string, output: string): Promise<MetricResultWithReason>;
58
59
  private calculateScore;
59
60
  }
@@ -66,7 +67,7 @@ declare class PromptAlignmentMetric extends Metric {
66
67
  private instructions;
67
68
  private judge;
68
69
  private scale;
69
- constructor(model: ModelConfig, { instructions, scale }: PromptAlignmentMetricOptions);
70
+ constructor(llm: MastraLLMBase, { instructions, scale }: PromptAlignmentMetricOptions);
70
71
  measure(input: string, output: string): Promise<MetricResultWithReason>;
71
72
  private calculateScore;
72
73
  }
@@ -77,7 +78,7 @@ interface ToxicityMetricOptions {
77
78
  declare class ToxicityMetric extends Metric {
78
79
  private judge;
79
80
  private scale;
80
- constructor(model: ModelConfig, { scale }?: ToxicityMetricOptions);
81
+ constructor(llm: MastraLLMBase, { scale }?: ToxicityMetricOptions);
81
82
  measure(input: string, output: string): Promise<MetricResultWithReason>;
82
83
  private calculateScore;
83
84
  }
@@ -90,7 +91,7 @@ declare class ContextRelevancyMetric extends Metric {
90
91
  private judge;
91
92
  private scale;
92
93
  private context;
93
- constructor(model: ModelConfig, { scale, context }: ContextRelevancyOptions);
94
+ constructor(llm: MastraLLMBase, { scale, context }: ContextRelevancyOptions);
94
95
  measure(input: string, output: string): Promise<MetricResultWithReason>;
95
96
  private calculateScore;
96
97
  }
@@ -103,7 +104,7 @@ declare class ContextualRecallMetric extends Metric {
103
104
  private judge;
104
105
  private scale;
105
106
  private context;
106
- constructor(model: ModelConfig, { scale, context }: ContextualRecallMetricOptions);
107
+ constructor(llm: MastraLLMBase, { scale, context }: ContextualRecallMetricOptions);
107
108
  measure(input: string, output: string): Promise<MetricResultWithReason>;
108
109
  private calculateScore;
109
110
  }
@@ -114,7 +115,7 @@ interface SummarizationMetricOptions {
114
115
  declare class SummarizationMetric extends Metric {
115
116
  private judge;
116
117
  private scale;
117
- constructor(model: ModelConfig, { scale }?: SummarizationMetricOptions);
118
+ constructor(llm: MastraLLMBase, { scale }?: SummarizationMetricOptions);
118
119
  measure(input: string, output: string): Promise<MetricResultWithReason & {
119
120
  info: {
120
121
  alignmentScore: number;
@@ -130,7 +131,7 @@ interface BiasMetricOptions {
130
131
  declare class BiasMetric extends Metric {
131
132
  private judge;
132
133
  private scale;
133
- constructor(model: ModelConfig, { scale }?: BiasMetricOptions);
134
+ constructor(llm: MastraLLMBase, { scale }?: BiasMetricOptions);
134
135
  measure(input: string, output: string): Promise<MetricResultWithReason>;
135
136
  private calculateScore;
136
137
  }
package/dist/metrics/llm/index.js CHANGED
@@ -1,6 +1,8 @@
1
1
  import '../../chunk-4VNS5WPM.js';
2
- import { Metric, Agent } from '@mastra/core';
2
+ import { Metric } from '@mastra/core/eval';
3
+ import '@mastra/core/llm';
3
4
  import { z } from 'zod';
5
+ import { Agent } from '@mastra/core/agent';
4
6
 
5
7
  // src/metrics/llm/utils.ts
6
8
  var roundToTwoDecimals = (num) => {
@@ -8,11 +10,11 @@ var roundToTwoDecimals = (num) => {
8
10
  };
9
11
  var MastraAgentJudge = class {
10
12
  agent;
11
- constructor(name, instructions, model) {
13
+ constructor(name, instructions, llm) {
12
14
  this.agent = new Agent({
13
- name: `Mastra Eval Judge ${model.provider} ${name}`,
15
+ name: `Mastra Eval Judge ${llm.name} ${name}`,
14
16
  instructions,
15
- model
17
+ llm
16
18
  });
17
19
  }
18
20
  };
@@ -226,8 +228,8 @@ function generateReasonPrompt({
226
228
 
227
229
  // src/metrics/llm/answer-relevancy/metricJudge.ts
228
230
  var AnswerRelevancyJudge = class extends MastraAgentJudge {
229
- constructor(model) {
230
- super("Answer Relevancy", ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, model);
231
+ constructor(llm) {
232
+ super("Answer Relevancy", ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, llm);
231
233
  }
232
234
  async evaluate(input, actualOutput) {
233
235
  const statementPrompt = generateEvaluationStatementsPrompt({ output: actualOutput });
@@ -265,10 +267,10 @@ var AnswerRelevancyMetric = class extends Metric {
265
267
  judge;
266
268
  uncertaintyWeight;
267
269
  scale;
268
- constructor(model, { uncertaintyWeight = 0.3, scale = 1 } = {}) {
270
+ constructor(llm, { uncertaintyWeight = 0.3, scale = 1 } = {}) {
269
271
  super();
270
272
  this.uncertaintyWeight = uncertaintyWeight;
271
- this.judge = new AnswerRelevancyJudge(model);
273
+ this.judge = new AnswerRelevancyJudge(llm);
272
274
  this.scale = scale;
273
275
  }
274
276
  async measure(input, output) {
@@ -427,8 +429,8 @@ function generateReasonPrompt2({
427
429
 
428
430
  // src/metrics/llm/context-position/metricJudge.ts
429
431
  var ContextPositionJudge = class extends MastraAgentJudge {
430
- constructor(model) {
431
- super("Context Position", CONTEXT_POSITION_AGENT_INSTRUCTIONS, model);
432
+ constructor(llm) {
433
+ super("Context Position", CONTEXT_POSITION_AGENT_INSTRUCTIONS, llm);
432
434
  }
433
435
  async evaluate(input, actualOutput, retrievalContext) {
434
436
  const prompt = generateEvaluatePrompt2({
@@ -464,11 +466,11 @@ var ContextPositionMetric = class extends Metric {
464
466
  judge;
465
467
  scale;
466
468
  context;
467
- constructor(model, { scale = 1, context }) {
469
+ constructor(llm, { scale = 1, context }) {
468
470
  super();
469
- this.judge = new ContextPositionJudge(model);
470
- this.scale = scale;
471
471
  this.context = context;
472
+ this.judge = new ContextPositionJudge(llm);
473
+ this.scale = scale;
472
474
  }
473
475
  async measure(input, output) {
474
476
  const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -635,8 +637,8 @@ JSON:
635
637
 
636
638
  // src/metrics/llm/context-precision/metricJudge.ts
637
639
  var ContextPrecisionJudge = class extends MastraAgentJudge {
638
- constructor(model) {
639
- super("Context Precision", CONTEXT_PRECISION_AGENT_INSTRUCTIONS, model);
640
+ constructor(llm) {
641
+ super("Context Precision", CONTEXT_PRECISION_AGENT_INSTRUCTIONS, llm);
640
642
  }
641
643
  async evaluate(input, actualOutput, retrievalContext) {
642
644
  const prompt = generateEvaluatePrompt3({
@@ -672,11 +674,11 @@ var ContextPrecisionMetric = class extends Metric {
672
674
  judge;
673
675
  scale;
674
676
  context;
675
- constructor(model, { scale = 1, context }) {
677
+ constructor(llm, { scale = 1, context }) {
676
678
  super();
677
- this.judge = new ContextPrecisionJudge(model);
678
- this.scale = scale;
679
679
  this.context = context;
680
+ this.judge = new ContextPrecisionJudge(llm);
681
+ this.scale = scale;
680
682
  }
681
683
  async measure(input, output) {
682
684
  const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -872,8 +874,8 @@ Example Responses:
872
874
 
873
875
  // src/metrics/llm/faithfulness/metricJudge.ts
874
876
  var FaithfulnessJudge = class extends MastraAgentJudge {
875
- constructor(model) {
876
- super("Faithfulness", FAITHFULNESS_AGENT_INSTRUCTIONS, model);
877
+ constructor(llm) {
878
+ super("Faithfulness", FAITHFULNESS_AGENT_INSTRUCTIONS, llm);
877
879
  }
878
880
  async evaluate(output, context) {
879
881
  const claimsPrompt = generateClaimExtractionPrompt({ output });
@@ -915,11 +917,11 @@ var FaithfulnessMetric = class extends Metric {
915
917
  judge;
916
918
  scale;
917
919
  context;
918
- constructor(model, { scale = 1, context }) {
920
+ constructor(llm, { scale = 1, context }) {
919
921
  super();
920
- this.scale = scale;
921
922
  this.context = context;
922
- this.judge = new FaithfulnessJudge(model);
923
+ this.judge = new FaithfulnessJudge(llm);
924
+ this.scale = scale;
923
925
  }
924
926
  async measure(input, output) {
925
927
  const verdicts = await this.judge.evaluate(output, this.context);
@@ -1067,8 +1069,8 @@ function generateReasonPrompt5({
1067
1069
 
1068
1070
  // src/metrics/llm/prompt-alignment/metricJudge.ts
1069
1071
  var PromptAlignmentJudge = class extends MastraAgentJudge {
1070
- constructor(model) {
1071
- super("Prompt Alignment", PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, model);
1072
+ constructor(llm) {
1073
+ super("Prompt Alignment", PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, llm);
1072
1074
  }
1073
1075
  async evaluate(input, actualOutput, instructions) {
1074
1076
  const prompt = generateEvaluatePrompt5({ input, output: actualOutput, instructions });
@@ -1096,10 +1098,10 @@ var PromptAlignmentMetric = class extends Metric {
1096
1098
  instructions;
1097
1099
  judge;
1098
1100
  scale;
1099
- constructor(model, { instructions, scale = 1 }) {
1101
+ constructor(llm, { instructions, scale = 1 }) {
1100
1102
  super();
1101
1103
  this.instructions = instructions;
1102
- this.judge = new PromptAlignmentJudge(model);
1104
+ this.judge = new PromptAlignmentJudge(llm);
1103
1105
  this.scale = scale;
1104
1106
  }
1105
1107
  async measure(input, output) {
@@ -1225,8 +1227,8 @@ ${toxics.join("\n")}`;
1225
1227
 
1226
1228
  // src/metrics/llm/toxicity/metricJudge.ts
1227
1229
  var ToxicityJudge = class extends MastraAgentJudge {
1228
- constructor(model) {
1229
- super("Toxicity", TOXICITY_AGENT_INSTRUCTIONS, model);
1230
+ constructor(llm) {
1231
+ super("Toxicity", TOXICITY_AGENT_INSTRUCTIONS, llm);
1230
1232
  }
1231
1233
  async evaluate(input, actualOutput) {
1232
1234
  const prompt = generateEvaluatePrompt6({ input, output: actualOutput });
@@ -1257,10 +1259,10 @@ var ToxicityJudge = class extends MastraAgentJudge {
1257
1259
  var ToxicityMetric = class extends Metric {
1258
1260
  judge;
1259
1261
  scale;
1260
- constructor(model, { scale = 1 } = {}) {
1262
+ constructor(llm, { scale = 1 } = {}) {
1261
1263
  super();
1264
+ this.judge = new ToxicityJudge(llm);
1262
1265
  this.scale = scale;
1263
- this.judge = new ToxicityJudge(model);
1264
1266
  }
1265
1267
  async measure(input, output) {
1266
1268
  const verdicts = await this.judge.evaluate(input, output);
@@ -1376,8 +1378,8 @@ ${relevantStatements}`;
1376
1378
 
1377
1379
  // src/metrics/llm/context-relevancy/metricJudge.ts
1378
1380
  var ContextRelevancyJudge = class extends MastraAgentJudge {
1379
- constructor(model) {
1380
- super("Context Relevancy", CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, model);
1381
+ constructor(llm) {
1382
+ super("Context Relevancy", CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, llm);
1381
1383
  }
1382
1384
  async evaluate(input, actualOutput, retrievalContext) {
1383
1385
  const prompt = generateEvaluatePrompt7({
@@ -1413,11 +1415,11 @@ var ContextRelevancyMetric = class extends Metric {
1413
1415
  judge;
1414
1416
  scale;
1415
1417
  context;
1416
- constructor(model, { scale = 1, context }) {
1418
+ constructor(llm, { scale = 1, context }) {
1417
1419
  super();
1418
- this.judge = new ContextRelevancyJudge(model);
1419
- this.scale = scale;
1420
1420
  this.context = context;
1421
+ this.judge = new ContextRelevancyJudge(llm);
1422
+ this.scale = scale;
1421
1423
  }
1422
1424
  async measure(input, output) {
1423
1425
  const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -1523,8 +1525,8 @@ ${unsupportiveReasons.join("\n")}
1523
1525
 
1524
1526
  // src/metrics/llm/contextual-recall/metricJudge.ts
1525
1527
  var ContextualRecallJudge = class extends MastraAgentJudge {
1526
- constructor(model) {
1527
- super("Contextual Recall", CONTEXT_RECALL_AGENT_INSTRUCTIONS, model);
1528
+ constructor(llm) {
1529
+ super("Contextual Recall", CONTEXT_RECALL_AGENT_INSTRUCTIONS, llm);
1528
1530
  }
1529
1531
  async evaluate(input, actualOutput, retrievalContext) {
1530
1532
  const prompt = generateEvaluatePrompt8({
@@ -1560,11 +1562,11 @@ var ContextualRecallMetric = class extends Metric {
1560
1562
  judge;
1561
1563
  scale;
1562
1564
  context;
1563
- constructor(model, { scale = 1, context }) {
1565
+ constructor(llm, { scale = 1, context }) {
1564
1566
  super();
1565
- this.judge = new ContextualRecallJudge(model);
1566
- this.scale = scale;
1567
1567
  this.context = context;
1568
+ this.judge = new ContextualRecallJudge(llm);
1569
+ this.scale = scale;
1568
1570
  }
1569
1571
  async measure(input, output) {
1570
1572
  const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -1829,8 +1831,8 @@ function generateReasonPrompt8({
1829
1831
 
1830
1832
  // src/metrics/llm/summarization/metricJudge.ts
1831
1833
  var SummarizationJudge = class extends MastraAgentJudge {
1832
- constructor(model) {
1833
- super("Summarization", SUMMARIZATION_AGENT_INSTRUCTIONS, model);
1834
+ constructor(llm) {
1835
+ super("Summarization", SUMMARIZATION_AGENT_INSTRUCTIONS, llm);
1834
1836
  }
1835
1837
  async evaluateAlignment(originalText, summary) {
1836
1838
  const claimsPrompt = generateClaimExtractionPrompt({ output: summary });
@@ -1894,9 +1896,9 @@ var SummarizationJudge = class extends MastraAgentJudge {
1894
1896
  var SummarizationMetric = class extends Metric {
1895
1897
  judge;
1896
1898
  scale;
1897
- constructor(model, { scale = 1 } = {}) {
1899
+ constructor(llm, { scale = 1 } = {}) {
1898
1900
  super();
1899
- this.judge = new SummarizationJudge(model);
1901
+ this.judge = new SummarizationJudge(llm);
1900
1902
  this.scale = scale;
1901
1903
  }
1902
1904
  async measure(input, output) {
@@ -2048,8 +2050,8 @@ ${biases.join("\n")}
2048
2050
 
2049
2051
  // src/metrics/llm/bias/metricJudge.ts
2050
2052
  var BiasJudge = class extends MastraAgentJudge {
2051
- constructor(model) {
2052
- super("Bias", BIAS_AGENT_INSTRUCTIONS, model);
2053
+ constructor(llm) {
2054
+ super("Bias", BIAS_AGENT_INSTRUCTIONS, llm);
2053
2055
  }
2054
2056
  async evaluate(input, actualOutput) {
2055
2057
  const opinionsPrompt = generateOpinionsPrompt({ input, output: actualOutput });
@@ -2086,10 +2088,10 @@ var BiasJudge = class extends MastraAgentJudge {
2086
2088
  var BiasMetric = class extends Metric {
2087
2089
  judge;
2088
2090
  scale;
2089
- constructor(model, { scale = 1 } = {}) {
2091
+ constructor(llm, { scale = 1 } = {}) {
2090
2092
  super();
2093
+ this.judge = new BiasJudge(llm);
2091
2094
  this.scale = scale;
2092
- this.judge = new BiasJudge(model);
2093
2095
  }
2094
2096
  async measure(input, output) {
2095
2097
  const verdicts = await this.judge.evaluate(input, output);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mastra/evals",
3
- "version": "0.1.0-alpha.31",
3
+ "version": "0.1.0-alpha.33",
4
4
  "description": "",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -38,7 +38,7 @@
38
38
  "sentiment": "^5.0.2",
39
39
  "string-similarity": "^4.0.4",
40
40
  "zod": "^3.24.1",
41
- "@mastra/core": "^0.2.0-alpha.89"
41
+ "@mastra/core": "^0.2.0-alpha.91"
42
42
  },
43
43
  "devDependencies": {
44
44
  "@tsconfig/recommended": "^1.0.7",
@@ -50,7 +50,8 @@
50
50
  "vitest": "^3.0.4"
51
51
  },
52
52
  "scripts": {
53
- "build": "tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --clean --treeshake",
53
+ "check": "tsc --noEmit",
54
+ "build": "pnpm check && tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --clean --treeshake",
54
55
  "dev": "tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --watch",
55
56
  "test": "vitest"
56
57
  }
package/src/evaluation.test.ts CHANGED
@@ -1,6 +1,6 @@
1
- import { type ModelConfig } from '@mastra/core';
2
1
  import { Agent } from '@mastra/core/agent';
3
2
  import { Metric } from '@mastra/core/eval';
3
+ import { OpenAI } from '@mastra/core/llm/openai';
4
4
  import { describe, expect, it } from 'vitest';
5
5
 
6
6
  import { evaluate } from './evaluation';
@@ -14,18 +14,16 @@ class TestMetric extends Metric {
14
14
  }
15
15
  }
16
16
 
17
- const modelConfig: ModelConfig = {
18
- provider: 'OPEN_AI',
17
+ const llm = new OpenAI({
19
18
  name: 'gpt-4o',
20
- toolChoice: 'auto',
21
- };
19
+ });
22
20
 
23
21
  describe('evaluate', () => {
24
22
  it('should get a text response from the agent', async () => {
25
23
  const electionAgent = new Agent({
26
24
  name: 'US Election agent',
27
25
  instructions: 'You know about the past US elections',
28
- model: modelConfig,
26
+ llm,
29
27
  });
30
28
 
31
29
  const result = await evaluate(electionAgent, 'Who won the 2016 US presidential election?', new TestMetric());
package/src/metrics/judge/index.ts CHANGED
@@ -1,13 +1,14 @@
1
- import { Agent, type ModelConfig } from '@mastra/core';
1
+ import { Agent } from '@mastra/core/agent';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  export abstract class MastraAgentJudge {
4
5
  protected readonly agent: Agent;
5
6
 
6
- constructor(name: string, instructions: string, model: ModelConfig) {
7
+ constructor(name: string, instructions: string, llm: MastraLLMBase) {
7
8
  this.agent = new Agent({
8
- name: `Mastra Eval Judge ${model.provider} ${name}`,
9
+ name: `Mastra Eval Judge ${llm.name} ${name}`,
9
10
  instructions: instructions,
10
- model,
11
+ llm,
11
12
  });
12
13
  }
13
14
  }
package/src/metrics/llm/answer-relevancy/index.test.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { OpenAI } from '@mastra/core/llm/openai';
2
2
  import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCase } from '../utils';
@@ -92,17 +92,14 @@ const testCases: TestCase[] = [
92
92
 
93
93
  const SECONDS = 10000;
94
94
 
95
- const modelConfig: ModelConfig = {
96
- provider: 'OPEN_AI',
95
+ const llm = new OpenAI({
97
96
  name: 'gpt-4o',
98
- toolChoice: 'auto',
99
- apiKey: process.env.OPENAI_API_KEY,
100
- };
97
+ });
101
98
 
102
99
  describe(
103
100
  'AnswerRelevancyMetric',
104
101
  () => {
105
- const metric = new AnswerRelevancyMetric(modelConfig);
102
+ const metric = new AnswerRelevancyMetric(llm);
106
103
 
107
104
  it('should be able to measure a prompt with perfect relevancy', async () => {
108
105
  const result = await metric.measure(testCases[0].input, testCases[0].output);
package/src/metrics/llm/answer-relevancy/index.ts CHANGED
@@ -1,4 +1,5 @@
1
- import { Metric, type ModelConfig } from '@mastra/core';
1
+ import { Metric } from '@mastra/core/eval';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  import { type MetricResultWithReason } from '../types';
4
5
  import { roundToTwoDecimals } from '../utils';
@@ -15,11 +16,11 @@ export class AnswerRelevancyMetric extends Metric {
15
16
  private uncertaintyWeight: number;
16
17
  private scale: number;
17
18
 
18
- constructor(model: ModelConfig, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
19
+ constructor(llm: MastraLLMBase, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
19
20
  super();
20
21
 
21
22
  this.uncertaintyWeight = uncertaintyWeight;
22
- this.judge = new AnswerRelevancyJudge(model);
23
+ this.judge = new AnswerRelevancyJudge(llm);
23
24
  this.scale = scale;
24
25
  }
25
26
 
package/src/metrics/llm/answer-relevancy/metricJudge.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -11,8 +11,8 @@ import {
11
11
  } from './prompts';
12
12
 
13
13
  export class AnswerRelevancyJudge extends MastraAgentJudge {
14
- constructor(model: ModelConfig) {
15
- super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, model);
14
+ constructor(llm: MastraLLMBase) {
15
+ super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, llm);
16
16
  }
17
17
 
18
18
  async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {
package/src/metrics/llm/bias/index.test.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { OpenAI } from '@mastra/core/llm/openai';
2
2
  import { describe, it, expect, vi } from 'vitest';
3
3
 
4
4
  import { TestCase } from '../utils';
@@ -46,15 +46,13 @@ vi.setConfig({
46
46
  testTimeout: 20 * SECONDS,
47
47
  });
48
48
 
49
- const modelConfig: ModelConfig = {
50
- provider: 'OPEN_AI',
49
+ const llm = new OpenAI({
51
50
  name: 'gpt-4o',
52
- toolChoice: 'auto',
53
51
  apiKey: process.env.OPENAI_API_KEY,
54
- };
52
+ });
55
53
 
56
54
  describe('BiasMetric', () => {
57
- const metric = new BiasMetric(modelConfig);
55
+ const metric = new BiasMetric(llm);
58
56
 
59
57
  it('should be able to measure a prompt that is biased', async () => {
60
58
  const result = await metric.measure(testCases[0].input, testCases[0].output);
package/src/metrics/llm/bias/index.ts CHANGED
@@ -1,4 +1,5 @@
1
- import { Metric, type ModelConfig } from '@mastra/core';
1
+ import { Metric } from '@mastra/core/eval';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  import { type MetricResultWithReason } from '../types';
4
5
  import { roundToTwoDecimals } from '../utils';
@@ -13,11 +14,11 @@ export class BiasMetric extends Metric {
13
14
  private judge: BiasJudge;
14
15
  private scale: number;
15
16
 
16
- constructor(model: ModelConfig, { scale = 1 }: BiasMetricOptions = {}) {
17
+ constructor(llm: MastraLLMBase, { scale = 1 }: BiasMetricOptions = {}) {
17
18
  super();
18
19
 
20
+ this.judge = new BiasJudge(llm);
19
21
  this.scale = scale;
20
- this.judge = new BiasJudge(model);
21
22
  }
22
23
 
23
24
  async measure(input: string, output: string): Promise<MetricResultWithReason> {
package/src/metrics/llm/bias/metricJudge.ts CHANGED
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -11,8 +11,8 @@ import {
11
11
  } from './prompts';
12
12
 
13
13
  export class BiasJudge extends MastraAgentJudge {
14
- constructor(model: ModelConfig) {
15
- super('Bias', BIAS_AGENT_INSTRUCTIONS, model);
14
+ constructor(llm: MastraLLMBase) {
15
+ super('Bias', BIAS_AGENT_INSTRUCTIONS, llm);
16
16
  }
17
17
 
18
18
  async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {