@mastra/evals 0.1.0-alpha.30 → 0.1.0-alpha.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42) hide show
  1. package/CHANGELOG.md +27 -0
  2. package/dist/{dist-56AYDN4X.js → dist-XPBCCWOM.js} +8 -8
  3. package/dist/index.js +1 -1
  4. package/dist/metrics/llm/index.d.ts +12 -11
  5. package/dist/metrics/llm/index.js +51 -49
  6. package/package.json +4 -3
  7. package/src/evaluation.test.ts +4 -6
  8. package/src/metrics/judge/index.ts +5 -4
  9. package/src/metrics/llm/answer-relevancy/index.test.ts +4 -7
  10. package/src/metrics/llm/answer-relevancy/index.ts +4 -3
  11. package/src/metrics/llm/answer-relevancy/metricJudge.ts +3 -3
  12. package/src/metrics/llm/bias/index.test.ts +4 -6
  13. package/src/metrics/llm/bias/index.ts +4 -3
  14. package/src/metrics/llm/bias/metricJudge.ts +3 -3
  15. package/src/metrics/llm/context-position/index.test.ts +15 -17
  16. package/src/metrics/llm/context-position/index.ts +6 -4
  17. package/src/metrics/llm/context-position/metricJudge.ts +3 -3
  18. package/src/metrics/llm/context-precision/index.test.ts +13 -15
  19. package/src/metrics/llm/context-precision/index.ts +6 -4
  20. package/src/metrics/llm/context-precision/metricJudge.ts +3 -3
  21. package/src/metrics/llm/context-relevancy/index.test.ts +7 -9
  22. package/src/metrics/llm/context-relevancy/index.ts +6 -4
  23. package/src/metrics/llm/context-relevancy/metricJudge.ts +3 -3
  24. package/src/metrics/llm/contextual-recall/index.test.ts +6 -8
  25. package/src/metrics/llm/contextual-recall/index.ts +6 -4
  26. package/src/metrics/llm/contextual-recall/metricJudge.ts +3 -3
  27. package/src/metrics/llm/faithfulness/index.test.ts +15 -17
  28. package/src/metrics/llm/faithfulness/index.ts +6 -4
  29. package/src/metrics/llm/faithfulness/metricJudge.ts +3 -3
  30. package/src/metrics/llm/hallucination/index.test.ts +15 -19
  31. package/src/metrics/llm/hallucination/index.ts +7 -5
  32. package/src/metrics/llm/hallucination/metricJudge.ts +3 -3
  33. package/src/metrics/llm/prompt-alignment/index.test.ts +9 -11
  34. package/src/metrics/llm/prompt-alignment/index.ts +4 -3
  35. package/src/metrics/llm/prompt-alignment/metricJudge.ts +3 -3
  36. package/src/metrics/llm/summarization/index.test.ts +4 -6
  37. package/src/metrics/llm/summarization/index.ts +4 -3
  38. package/src/metrics/llm/summarization/metricJudge.ts +3 -3
  39. package/src/metrics/llm/toxicity/index.test.ts +4 -6
  40. package/src/metrics/llm/toxicity/index.ts +4 -3
  41. package/src/metrics/llm/toxicity/metricJudge.ts +3 -3
  42. package/src/metrics/llm/types.ts +1 -1
package/CHANGELOG.md CHANGED
@@ -1,5 +1,32 @@
1
1
  # @mastra/evals
2
2
 
3
+ ## 0.1.0-alpha.33
4
+
5
+ ### Patch Changes
6
+
7
+ - Updated dependencies [d7d465a]
8
+ - Updated dependencies [d7d465a]
9
+ - Updated dependencies [2017553]
10
+ - Updated dependencies [a10b7a3]
11
+ - Updated dependencies [16e5b04]
12
+ - @mastra/core@0.2.0-alpha.91
13
+
14
+ ## 0.1.0-alpha.32
15
+
16
+ ### Patch Changes
17
+
18
+ - Updated dependencies [8151f44]
19
+ - Updated dependencies [e897f1c]
20
+ - Updated dependencies [3700be1]
21
+ - @mastra/core@0.2.0-alpha.90
22
+
23
+ ## 0.1.0-alpha.31
24
+
25
+ ### Patch Changes
26
+
27
+ - Updated dependencies [27275c9]
28
+ - @mastra/core@0.2.0-alpha.89
29
+
3
30
  ## 0.1.0-alpha.30
4
31
 
5
32
  ### Patch Changes
@@ -12068,7 +12068,7 @@ function createTestHook(name, handler) {
12068
12068
  globalThis.performance ? globalThis.performance.now.bind(globalThis.performance) : Date.now;
12069
12069
  globalThis.performance ? globalThis.performance.now.bind(globalThis.performance) : Date.now;
12070
12070
 
12071
- // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/utils.C8RiOc4B.js
12071
+ // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/utils.C8RiOc4B.js
12072
12072
  var NAME_WORKER_STATE = "__vitest_worker__";
12073
12073
  function getWorkerState() {
12074
12074
  const workerState = globalThis[NAME_WORKER_STATE];
@@ -12128,7 +12128,7 @@ async function waitForImportsToResolve() {
12128
12128
  await waitForImportsToResolve();
12129
12129
  }
12130
12130
 
12131
- // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/_commonjsHelpers.BFTU3MAI.js
12131
+ // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/_commonjsHelpers.BFTU3MAI.js
12132
12132
  var commonjsGlobal = typeof globalThis !== "undefined" ? globalThis : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : {};
12133
12133
  function getDefaultExportFromCjs3(x2) {
12134
12134
  return x2 && x2.__esModule && Object.prototype.hasOwnProperty.call(x2, "default") ? x2["default"] : x2;
@@ -14197,7 +14197,7 @@ var SnapshotClient = class {
14197
14197
  }
14198
14198
  };
14199
14199
 
14200
- // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/date.W2xKR2qe.js
14200
+ // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/date.W2xKR2qe.js
14201
14201
  var RealDate = Date;
14202
14202
  var now2 = null;
14203
14203
  var MockDate = class _MockDate extends RealDate {
@@ -14250,7 +14250,7 @@ function resetDate() {
14250
14250
  globalThis.Date = RealDate;
14251
14251
  }
14252
14252
 
14253
- // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/vi.CjhMlMwf.js
14253
+ // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/vi.CjhMlMwf.js
14254
14254
  var unsupported = [
14255
14255
  // .poll is meant to retry matchers until they succeed, and
14256
14256
  // snapshots will always succeed as long as the poll method doesn't throw an error
@@ -17214,7 +17214,7 @@ function getImporter(name) {
17214
17214
  return stack?.file || "";
17215
17215
  }
17216
17216
 
17217
- // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/run-once.2ogXb3JV.js
17217
+ // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/run-once.2ogXb3JV.js
17218
17218
  var filesCount = /* @__PURE__ */ new Map();
17219
17219
  var cache = /* @__PURE__ */ new Map();
17220
17220
  function runOnce(fn2, key) {
@@ -17237,7 +17237,7 @@ function isFirstRun() {
17237
17237
  return firstRun;
17238
17238
  }
17239
17239
 
17240
- // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/benchmark.Cdu9hjj4.js
17240
+ // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/benchmark.Cdu9hjj4.js
17241
17241
  var benchFns = /* @__PURE__ */ new WeakMap();
17242
17242
  var benchOptsMap = /* @__PURE__ */ new WeakMap();
17243
17243
  var bench = createBenchmark(function(name, fn2 = noop, options = {}) {
@@ -17266,7 +17266,7 @@ function formatName2(name) {
17266
17266
  return typeof name === "string" ? name : name instanceof Function ? name.name || "<anonymous>" : String(name);
17267
17267
  }
17268
17268
 
17269
- // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/index.Bf4FgyZN.js
17269
+ // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/index.Bf4FgyZN.js
17270
17270
  __toESM(require_dist(), 1);
17271
17271
  function getRunningMode() {
17272
17272
  return process.env.VITEST_MODE === "WATCH" ? "watch" : "run";
@@ -17277,7 +17277,7 @@ function isWatchMode() {
17277
17277
  var assertType = function assertType2() {
17278
17278
  };
17279
17279
 
17280
- // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/index.js
17280
+ // ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/index.js
17281
17281
  var import_expect_type2 = __toESM(require_dist(), 1);
17282
17282
  var export_expectTypeOf = import_expect_type2.expectTypeOf;
17283
17283
  /*! Bundled license information:
package/dist/index.js CHANGED
@@ -39,7 +39,7 @@ var getCurrentTestInfo = async () => {
39
39
  };
40
40
  }
41
41
  try {
42
- const vitest = await import('./dist-56AYDN4X.js');
42
+ const vitest = await import('./dist-XPBCCWOM.js');
43
43
  if (typeof vitest !== "undefined" && vitest.expect?.getState) {
44
44
  const state = vitest.expect.getState();
45
45
  return {
@@ -1,4 +1,5 @@
1
- import { MetricResult, Metric, ModelConfig } from '@mastra/core';
1
+ import { MetricResult, Metric } from '@mastra/core/eval';
2
+ import { MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  interface MetricResultWithReason extends MetricResult {
4
5
  info: {
@@ -14,7 +15,7 @@ declare class AnswerRelevancyMetric extends Metric {
14
15
  private judge;
15
16
  private uncertaintyWeight;
16
17
  private scale;
17
- constructor(model: ModelConfig, { uncertaintyWeight, scale }?: AnswerRelevancyMetricOptions);
18
+ constructor(llm: MastraLLMBase, { uncertaintyWeight, scale }?: AnswerRelevancyMetricOptions);
18
19
  measure(input: string, output: string): Promise<MetricResultWithReason>;
19
20
  private calculateScore;
20
21
  }
@@ -27,7 +28,7 @@ declare class ContextPositionMetric extends Metric {
27
28
  private judge;
28
29
  private scale;
29
30
  private context;
30
- constructor(model: ModelConfig, { scale, context }: ContextPositionMetricOptions);
31
+ constructor(llm: MastraLLMBase, { scale, context }: ContextPositionMetricOptions);
31
32
  measure(input: string, output: string): Promise<MetricResultWithReason>;
32
33
  private calculateScore;
33
34
  }
@@ -40,7 +41,7 @@ declare class ContextPrecisionMetric extends Metric {
40
41
  private judge;
41
42
  private scale;
42
43
  private context;
43
- constructor(model: ModelConfig, { scale, context }: ContextPrecisionMetricOptions);
44
+ constructor(llm: MastraLLMBase, { scale, context }: ContextPrecisionMetricOptions);
44
45
  measure(input: string, output: string): Promise<MetricResultWithReason>;
45
46
  private calculateScore;
46
47
  }
@@ -53,7 +54,7 @@ declare class FaithfulnessMetric extends Metric {
53
54
  private judge;
54
55
  private scale;
55
56
  private context;
56
- constructor(model: ModelConfig, { scale, context }: FaithfulnessMetricOptions);
57
+ constructor(llm: MastraLLMBase, { scale, context }: FaithfulnessMetricOptions);
57
58
  measure(input: string, output: string): Promise<MetricResultWithReason>;
58
59
  private calculateScore;
59
60
  }
@@ -66,7 +67,7 @@ declare class PromptAlignmentMetric extends Metric {
66
67
  private instructions;
67
68
  private judge;
68
69
  private scale;
69
- constructor(model: ModelConfig, { instructions, scale }: PromptAlignmentMetricOptions);
70
+ constructor(llm: MastraLLMBase, { instructions, scale }: PromptAlignmentMetricOptions);
70
71
  measure(input: string, output: string): Promise<MetricResultWithReason>;
71
72
  private calculateScore;
72
73
  }
@@ -77,7 +78,7 @@ interface ToxicityMetricOptions {
77
78
  declare class ToxicityMetric extends Metric {
78
79
  private judge;
79
80
  private scale;
80
- constructor(model: ModelConfig, { scale }?: ToxicityMetricOptions);
81
+ constructor(llm: MastraLLMBase, { scale }?: ToxicityMetricOptions);
81
82
  measure(input: string, output: string): Promise<MetricResultWithReason>;
82
83
  private calculateScore;
83
84
  }
@@ -90,7 +91,7 @@ declare class ContextRelevancyMetric extends Metric {
90
91
  private judge;
91
92
  private scale;
92
93
  private context;
93
- constructor(model: ModelConfig, { scale, context }: ContextRelevancyOptions);
94
+ constructor(llm: MastraLLMBase, { scale, context }: ContextRelevancyOptions);
94
95
  measure(input: string, output: string): Promise<MetricResultWithReason>;
95
96
  private calculateScore;
96
97
  }
@@ -103,7 +104,7 @@ declare class ContextualRecallMetric extends Metric {
103
104
  private judge;
104
105
  private scale;
105
106
  private context;
106
- constructor(model: ModelConfig, { scale, context }: ContextualRecallMetricOptions);
107
+ constructor(llm: MastraLLMBase, { scale, context }: ContextualRecallMetricOptions);
107
108
  measure(input: string, output: string): Promise<MetricResultWithReason>;
108
109
  private calculateScore;
109
110
  }
@@ -114,7 +115,7 @@ interface SummarizationMetricOptions {
114
115
  declare class SummarizationMetric extends Metric {
115
116
  private judge;
116
117
  private scale;
117
- constructor(model: ModelConfig, { scale }?: SummarizationMetricOptions);
118
+ constructor(llm: MastraLLMBase, { scale }?: SummarizationMetricOptions);
118
119
  measure(input: string, output: string): Promise<MetricResultWithReason & {
119
120
  info: {
120
121
  alignmentScore: number;
@@ -130,7 +131,7 @@ interface BiasMetricOptions {
130
131
  declare class BiasMetric extends Metric {
131
132
  private judge;
132
133
  private scale;
133
- constructor(model: ModelConfig, { scale }?: BiasMetricOptions);
134
+ constructor(llm: MastraLLMBase, { scale }?: BiasMetricOptions);
134
135
  measure(input: string, output: string): Promise<MetricResultWithReason>;
135
136
  private calculateScore;
136
137
  }
@@ -1,6 +1,8 @@
1
1
  import '../../chunk-4VNS5WPM.js';
2
- import { Metric, Agent } from '@mastra/core';
2
+ import { Metric } from '@mastra/core/eval';
3
+ import '@mastra/core/llm';
3
4
  import { z } from 'zod';
5
+ import { Agent } from '@mastra/core/agent';
4
6
 
5
7
  // src/metrics/llm/utils.ts
6
8
  var roundToTwoDecimals = (num) => {
@@ -8,11 +10,11 @@ var roundToTwoDecimals = (num) => {
8
10
  };
9
11
  var MastraAgentJudge = class {
10
12
  agent;
11
- constructor(name, instructions, model) {
13
+ constructor(name, instructions, llm) {
12
14
  this.agent = new Agent({
13
- name: `Mastra Eval Judge ${model.provider} ${name}`,
15
+ name: `Mastra Eval Judge ${llm.name} ${name}`,
14
16
  instructions,
15
- model
17
+ llm
16
18
  });
17
19
  }
18
20
  };
@@ -226,8 +228,8 @@ function generateReasonPrompt({
226
228
 
227
229
  // src/metrics/llm/answer-relevancy/metricJudge.ts
228
230
  var AnswerRelevancyJudge = class extends MastraAgentJudge {
229
- constructor(model) {
230
- super("Answer Relevancy", ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, model);
231
+ constructor(llm) {
232
+ super("Answer Relevancy", ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, llm);
231
233
  }
232
234
  async evaluate(input, actualOutput) {
233
235
  const statementPrompt = generateEvaluationStatementsPrompt({ output: actualOutput });
@@ -265,10 +267,10 @@ var AnswerRelevancyMetric = class extends Metric {
265
267
  judge;
266
268
  uncertaintyWeight;
267
269
  scale;
268
- constructor(model, { uncertaintyWeight = 0.3, scale = 1 } = {}) {
270
+ constructor(llm, { uncertaintyWeight = 0.3, scale = 1 } = {}) {
269
271
  super();
270
272
  this.uncertaintyWeight = uncertaintyWeight;
271
- this.judge = new AnswerRelevancyJudge(model);
273
+ this.judge = new AnswerRelevancyJudge(llm);
272
274
  this.scale = scale;
273
275
  }
274
276
  async measure(input, output) {
@@ -427,8 +429,8 @@ function generateReasonPrompt2({
427
429
 
428
430
  // src/metrics/llm/context-position/metricJudge.ts
429
431
  var ContextPositionJudge = class extends MastraAgentJudge {
430
- constructor(model) {
431
- super("Context Position", CONTEXT_POSITION_AGENT_INSTRUCTIONS, model);
432
+ constructor(llm) {
433
+ super("Context Position", CONTEXT_POSITION_AGENT_INSTRUCTIONS, llm);
432
434
  }
433
435
  async evaluate(input, actualOutput, retrievalContext) {
434
436
  const prompt = generateEvaluatePrompt2({
@@ -464,11 +466,11 @@ var ContextPositionMetric = class extends Metric {
464
466
  judge;
465
467
  scale;
466
468
  context;
467
- constructor(model, { scale = 1, context }) {
469
+ constructor(llm, { scale = 1, context }) {
468
470
  super();
469
- this.judge = new ContextPositionJudge(model);
470
- this.scale = scale;
471
471
  this.context = context;
472
+ this.judge = new ContextPositionJudge(llm);
473
+ this.scale = scale;
472
474
  }
473
475
  async measure(input, output) {
474
476
  const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -635,8 +637,8 @@ JSON:
635
637
 
636
638
  // src/metrics/llm/context-precision/metricJudge.ts
637
639
  var ContextPrecisionJudge = class extends MastraAgentJudge {
638
- constructor(model) {
639
- super("Context Precision", CONTEXT_PRECISION_AGENT_INSTRUCTIONS, model);
640
+ constructor(llm) {
641
+ super("Context Precision", CONTEXT_PRECISION_AGENT_INSTRUCTIONS, llm);
640
642
  }
641
643
  async evaluate(input, actualOutput, retrievalContext) {
642
644
  const prompt = generateEvaluatePrompt3({
@@ -672,11 +674,11 @@ var ContextPrecisionMetric = class extends Metric {
672
674
  judge;
673
675
  scale;
674
676
  context;
675
- constructor(model, { scale = 1, context }) {
677
+ constructor(llm, { scale = 1, context }) {
676
678
  super();
677
- this.judge = new ContextPrecisionJudge(model);
678
- this.scale = scale;
679
679
  this.context = context;
680
+ this.judge = new ContextPrecisionJudge(llm);
681
+ this.scale = scale;
680
682
  }
681
683
  async measure(input, output) {
682
684
  const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -872,8 +874,8 @@ Example Responses:
872
874
 
873
875
  // src/metrics/llm/faithfulness/metricJudge.ts
874
876
  var FaithfulnessJudge = class extends MastraAgentJudge {
875
- constructor(model) {
876
- super("Faithfulness", FAITHFULNESS_AGENT_INSTRUCTIONS, model);
877
+ constructor(llm) {
878
+ super("Faithfulness", FAITHFULNESS_AGENT_INSTRUCTIONS, llm);
877
879
  }
878
880
  async evaluate(output, context) {
879
881
  const claimsPrompt = generateClaimExtractionPrompt({ output });
@@ -915,11 +917,11 @@ var FaithfulnessMetric = class extends Metric {
915
917
  judge;
916
918
  scale;
917
919
  context;
918
- constructor(model, { scale = 1, context }) {
920
+ constructor(llm, { scale = 1, context }) {
919
921
  super();
920
- this.scale = scale;
921
922
  this.context = context;
922
- this.judge = new FaithfulnessJudge(model);
923
+ this.judge = new FaithfulnessJudge(llm);
924
+ this.scale = scale;
923
925
  }
924
926
  async measure(input, output) {
925
927
  const verdicts = await this.judge.evaluate(output, this.context);
@@ -1067,8 +1069,8 @@ function generateReasonPrompt5({
1067
1069
 
1068
1070
  // src/metrics/llm/prompt-alignment/metricJudge.ts
1069
1071
  var PromptAlignmentJudge = class extends MastraAgentJudge {
1070
- constructor(model) {
1071
- super("Prompt Alignment", PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, model);
1072
+ constructor(llm) {
1073
+ super("Prompt Alignment", PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, llm);
1072
1074
  }
1073
1075
  async evaluate(input, actualOutput, instructions) {
1074
1076
  const prompt = generateEvaluatePrompt5({ input, output: actualOutput, instructions });
@@ -1096,10 +1098,10 @@ var PromptAlignmentMetric = class extends Metric {
1096
1098
  instructions;
1097
1099
  judge;
1098
1100
  scale;
1099
- constructor(model, { instructions, scale = 1 }) {
1101
+ constructor(llm, { instructions, scale = 1 }) {
1100
1102
  super();
1101
1103
  this.instructions = instructions;
1102
- this.judge = new PromptAlignmentJudge(model);
1104
+ this.judge = new PromptAlignmentJudge(llm);
1103
1105
  this.scale = scale;
1104
1106
  }
1105
1107
  async measure(input, output) {
@@ -1225,8 +1227,8 @@ ${toxics.join("\n")}`;
1225
1227
 
1226
1228
  // src/metrics/llm/toxicity/metricJudge.ts
1227
1229
  var ToxicityJudge = class extends MastraAgentJudge {
1228
- constructor(model) {
1229
- super("Toxicity", TOXICITY_AGENT_INSTRUCTIONS, model);
1230
+ constructor(llm) {
1231
+ super("Toxicity", TOXICITY_AGENT_INSTRUCTIONS, llm);
1230
1232
  }
1231
1233
  async evaluate(input, actualOutput) {
1232
1234
  const prompt = generateEvaluatePrompt6({ input, output: actualOutput });
@@ -1257,10 +1259,10 @@ var ToxicityJudge = class extends MastraAgentJudge {
1257
1259
  var ToxicityMetric = class extends Metric {
1258
1260
  judge;
1259
1261
  scale;
1260
- constructor(model, { scale = 1 } = {}) {
1262
+ constructor(llm, { scale = 1 } = {}) {
1261
1263
  super();
1264
+ this.judge = new ToxicityJudge(llm);
1262
1265
  this.scale = scale;
1263
- this.judge = new ToxicityJudge(model);
1264
1266
  }
1265
1267
  async measure(input, output) {
1266
1268
  const verdicts = await this.judge.evaluate(input, output);
@@ -1376,8 +1378,8 @@ ${relevantStatements}`;
1376
1378
 
1377
1379
  // src/metrics/llm/context-relevancy/metricJudge.ts
1378
1380
  var ContextRelevancyJudge = class extends MastraAgentJudge {
1379
- constructor(model) {
1380
- super("Context Relevancy", CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, model);
1381
+ constructor(llm) {
1382
+ super("Context Relevancy", CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, llm);
1381
1383
  }
1382
1384
  async evaluate(input, actualOutput, retrievalContext) {
1383
1385
  const prompt = generateEvaluatePrompt7({
@@ -1413,11 +1415,11 @@ var ContextRelevancyMetric = class extends Metric {
1413
1415
  judge;
1414
1416
  scale;
1415
1417
  context;
1416
- constructor(model, { scale = 1, context }) {
1418
+ constructor(llm, { scale = 1, context }) {
1417
1419
  super();
1418
- this.judge = new ContextRelevancyJudge(model);
1419
- this.scale = scale;
1420
1420
  this.context = context;
1421
+ this.judge = new ContextRelevancyJudge(llm);
1422
+ this.scale = scale;
1421
1423
  }
1422
1424
  async measure(input, output) {
1423
1425
  const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -1523,8 +1525,8 @@ ${unsupportiveReasons.join("\n")}
1523
1525
 
1524
1526
  // src/metrics/llm/contextual-recall/metricJudge.ts
1525
1527
  var ContextualRecallJudge = class extends MastraAgentJudge {
1526
- constructor(model) {
1527
- super("Contextual Recall", CONTEXT_RECALL_AGENT_INSTRUCTIONS, model);
1528
+ constructor(llm) {
1529
+ super("Contextual Recall", CONTEXT_RECALL_AGENT_INSTRUCTIONS, llm);
1528
1530
  }
1529
1531
  async evaluate(input, actualOutput, retrievalContext) {
1530
1532
  const prompt = generateEvaluatePrompt8({
@@ -1560,11 +1562,11 @@ var ContextualRecallMetric = class extends Metric {
1560
1562
  judge;
1561
1563
  scale;
1562
1564
  context;
1563
- constructor(model, { scale = 1, context }) {
1565
+ constructor(llm, { scale = 1, context }) {
1564
1566
  super();
1565
- this.judge = new ContextualRecallJudge(model);
1566
- this.scale = scale;
1567
1567
  this.context = context;
1568
+ this.judge = new ContextualRecallJudge(llm);
1569
+ this.scale = scale;
1568
1570
  }
1569
1571
  async measure(input, output) {
1570
1572
  const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -1829,8 +1831,8 @@ function generateReasonPrompt8({
1829
1831
 
1830
1832
  // src/metrics/llm/summarization/metricJudge.ts
1831
1833
  var SummarizationJudge = class extends MastraAgentJudge {
1832
- constructor(model) {
1833
- super("Summarization", SUMMARIZATION_AGENT_INSTRUCTIONS, model);
1834
+ constructor(llm) {
1835
+ super("Summarization", SUMMARIZATION_AGENT_INSTRUCTIONS, llm);
1834
1836
  }
1835
1837
  async evaluateAlignment(originalText, summary) {
1836
1838
  const claimsPrompt = generateClaimExtractionPrompt({ output: summary });
@@ -1894,9 +1896,9 @@ var SummarizationJudge = class extends MastraAgentJudge {
1894
1896
  var SummarizationMetric = class extends Metric {
1895
1897
  judge;
1896
1898
  scale;
1897
- constructor(model, { scale = 1 } = {}) {
1899
+ constructor(llm, { scale = 1 } = {}) {
1898
1900
  super();
1899
- this.judge = new SummarizationJudge(model);
1901
+ this.judge = new SummarizationJudge(llm);
1900
1902
  this.scale = scale;
1901
1903
  }
1902
1904
  async measure(input, output) {
@@ -2048,8 +2050,8 @@ ${biases.join("\n")}
2048
2050
 
2049
2051
  // src/metrics/llm/bias/metricJudge.ts
2050
2052
  var BiasJudge = class extends MastraAgentJudge {
2051
- constructor(model) {
2052
- super("Bias", BIAS_AGENT_INSTRUCTIONS, model);
2053
+ constructor(llm) {
2054
+ super("Bias", BIAS_AGENT_INSTRUCTIONS, llm);
2053
2055
  }
2054
2056
  async evaluate(input, actualOutput) {
2055
2057
  const opinionsPrompt = generateOpinionsPrompt({ input, output: actualOutput });
@@ -2086,10 +2088,10 @@ var BiasJudge = class extends MastraAgentJudge {
2086
2088
  var BiasMetric = class extends Metric {
2087
2089
  judge;
2088
2090
  scale;
2089
- constructor(model, { scale = 1 } = {}) {
2091
+ constructor(llm, { scale = 1 } = {}) {
2090
2092
  super();
2093
+ this.judge = new BiasJudge(llm);
2091
2094
  this.scale = scale;
2092
- this.judge = new BiasJudge(model);
2093
2095
  }
2094
2096
  async measure(input, output) {
2095
2097
  const verdicts = await this.judge.evaluate(input, output);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@mastra/evals",
3
- "version": "0.1.0-alpha.30",
3
+ "version": "0.1.0-alpha.33",
4
4
  "description": "",
5
5
  "type": "module",
6
6
  "main": "dist/index.js",
@@ -38,7 +38,7 @@
38
38
  "sentiment": "^5.0.2",
39
39
  "string-similarity": "^4.0.4",
40
40
  "zod": "^3.24.1",
41
- "@mastra/core": "^0.2.0-alpha.88"
41
+ "@mastra/core": "^0.2.0-alpha.91"
42
42
  },
43
43
  "devDependencies": {
44
44
  "@tsconfig/recommended": "^1.0.7",
@@ -50,7 +50,8 @@
50
50
  "vitest": "^3.0.4"
51
51
  },
52
52
  "scripts": {
53
- "build": "tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --clean --treeshake",
53
+ "check": "tsc --noEmit",
54
+ "build": "pnpm check && tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --clean --treeshake",
54
55
  "dev": "tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --watch",
55
56
  "test": "vitest"
56
57
  }
@@ -1,6 +1,6 @@
1
- import { type ModelConfig } from '@mastra/core';
2
1
  import { Agent } from '@mastra/core/agent';
3
2
  import { Metric } from '@mastra/core/eval';
3
+ import { OpenAI } from '@mastra/core/llm/openai';
4
4
  import { describe, expect, it } from 'vitest';
5
5
 
6
6
  import { evaluate } from './evaluation';
@@ -14,18 +14,16 @@ class TestMetric extends Metric {
14
14
  }
15
15
  }
16
16
 
17
- const modelConfig: ModelConfig = {
18
- provider: 'OPEN_AI',
17
+ const llm = new OpenAI({
19
18
  name: 'gpt-4o',
20
- toolChoice: 'auto',
21
- };
19
+ });
22
20
 
23
21
  describe('evaluate', () => {
24
22
  it('should get a text response from the agent', async () => {
25
23
  const electionAgent = new Agent({
26
24
  name: 'US Election agent',
27
25
  instructions: 'You know about the past US elections',
28
- model: modelConfig,
26
+ llm,
29
27
  });
30
28
 
31
29
  const result = await evaluate(electionAgent, 'Who won the 2016 US presidential election?', new TestMetric());
@@ -1,13 +1,14 @@
1
- import { Agent, type ModelConfig } from '@mastra/core';
1
+ import { Agent } from '@mastra/core/agent';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  export abstract class MastraAgentJudge {
4
5
  protected readonly agent: Agent;
5
6
 
6
- constructor(name: string, instructions: string, model: ModelConfig) {
7
+ constructor(name: string, instructions: string, llm: MastraLLMBase) {
7
8
  this.agent = new Agent({
8
- name: `Mastra Eval Judge ${model.provider} ${name}`,
9
+ name: `Mastra Eval Judge ${llm.name} ${name}`,
9
10
  instructions: instructions,
10
- model,
11
+ llm,
11
12
  });
12
13
  }
13
14
  }
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { OpenAI } from '@mastra/core/llm/openai';
2
2
  import { describe, it, expect } from 'vitest';
3
3
 
4
4
  import { TestCase } from '../utils';
@@ -92,17 +92,14 @@ const testCases: TestCase[] = [
92
92
 
93
93
  const SECONDS = 10000;
94
94
 
95
- const modelConfig: ModelConfig = {
96
- provider: 'OPEN_AI',
95
+ const llm = new OpenAI({
97
96
  name: 'gpt-4o',
98
- toolChoice: 'auto',
99
- apiKey: process.env.OPENAI_API_KEY,
100
- };
97
+ });
101
98
 
102
99
  describe(
103
100
  'AnswerRelevancyMetric',
104
101
  () => {
105
- const metric = new AnswerRelevancyMetric(modelConfig);
102
+ const metric = new AnswerRelevancyMetric(llm);
106
103
 
107
104
  it('should be able to measure a prompt with perfect relevancy', async () => {
108
105
  const result = await metric.measure(testCases[0].input, testCases[0].output);
@@ -1,4 +1,5 @@
1
- import { Metric, type ModelConfig } from '@mastra/core';
1
+ import { Metric } from '@mastra/core/eval';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  import { type MetricResultWithReason } from '../types';
4
5
  import { roundToTwoDecimals } from '../utils';
@@ -15,11 +16,11 @@ export class AnswerRelevancyMetric extends Metric {
15
16
  private uncertaintyWeight: number;
16
17
  private scale: number;
17
18
 
18
- constructor(model: ModelConfig, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
19
+ constructor(llm: MastraLLMBase, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
19
20
  super();
20
21
 
21
22
  this.uncertaintyWeight = uncertaintyWeight;
22
- this.judge = new AnswerRelevancyJudge(model);
23
+ this.judge = new AnswerRelevancyJudge(llm);
23
24
  this.scale = scale;
24
25
  }
25
26
 
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
2
  import { z } from 'zod';
3
3
 
4
4
  import { MastraAgentJudge } from '../../judge';
@@ -11,8 +11,8 @@ import {
11
11
  } from './prompts';
12
12
 
13
13
  export class AnswerRelevancyJudge extends MastraAgentJudge {
14
- constructor(model: ModelConfig) {
15
- super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, model);
14
+ constructor(llm: MastraLLMBase) {
15
+ super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, llm);
16
16
  }
17
17
 
18
18
  async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {
@@ -1,4 +1,4 @@
1
- import { type ModelConfig } from '@mastra/core';
1
+ import { OpenAI } from '@mastra/core/llm/openai';
2
2
  import { describe, it, expect, vi } from 'vitest';
3
3
 
4
4
  import { TestCase } from '../utils';
@@ -46,15 +46,13 @@ vi.setConfig({
46
46
  testTimeout: 20 * SECONDS,
47
47
  });
48
48
 
49
- const modelConfig: ModelConfig = {
50
- provider: 'OPEN_AI',
49
+ const llm = new OpenAI({
51
50
  name: 'gpt-4o',
52
- toolChoice: 'auto',
53
51
  apiKey: process.env.OPENAI_API_KEY,
54
- };
52
+ });
55
53
 
56
54
  describe('BiasMetric', () => {
57
- const metric = new BiasMetric(modelConfig);
55
+ const metric = new BiasMetric(llm);
58
56
 
59
57
  it('should be able to measure a prompt that is biased', async () => {
60
58
  const result = await metric.measure(testCases[0].input, testCases[0].output);
@@ -1,4 +1,5 @@
1
- import { Metric, type ModelConfig } from '@mastra/core';
1
+ import { Metric } from '@mastra/core/eval';
2
+ import { type MastraLLMBase } from '@mastra/core/llm';
2
3
 
3
4
  import { type MetricResultWithReason } from '../types';
4
5
  import { roundToTwoDecimals } from '../utils';
@@ -13,11 +14,11 @@ export class BiasMetric extends Metric {
13
14
  private judge: BiasJudge;
14
15
  private scale: number;
15
16
 
16
- constructor(model: ModelConfig, { scale = 1 }: BiasMetricOptions = {}) {
17
+ constructor(llm: MastraLLMBase, { scale = 1 }: BiasMetricOptions = {}) {
17
18
  super();
18
19
 
20
+ this.judge = new BiasJudge(llm);
19
21
  this.scale = scale;
20
- this.judge = new BiasJudge(model);
21
22
  }
22
23
 
23
24
  async measure(input: string, output: string): Promise<MetricResultWithReason> {