npm - @mastra/evals - Versions diffs - 0.1.0-alpha.30 → 0.1.0-alpha.33 - Mend

@mastra/evals 0.1.0-alpha.30 → 0.1.0-alpha.33

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42)

package/CHANGELOG.md +27 -0
package/dist/{dist-56AYDN4X.js → dist-XPBCCWOM.js} +8 -8
package/dist/index.js +1 -1
package/dist/metrics/llm/index.d.ts +12 -11
package/dist/metrics/llm/index.js +51 -49
package/package.json +4 -3
package/src/evaluation.test.ts +4 -6
package/src/metrics/judge/index.ts +5 -4
package/src/metrics/llm/answer-relevancy/index.test.ts +4 -7
package/src/metrics/llm/answer-relevancy/index.ts +4 -3
package/src/metrics/llm/answer-relevancy/metricJudge.ts +3 -3
package/src/metrics/llm/bias/index.test.ts +4 -6
package/src/metrics/llm/bias/index.ts +4 -3
package/src/metrics/llm/bias/metricJudge.ts +3 -3
package/src/metrics/llm/context-position/index.test.ts +15 -17
package/src/metrics/llm/context-position/index.ts +6 -4
package/src/metrics/llm/context-position/metricJudge.ts +3 -3
package/src/metrics/llm/context-precision/index.test.ts +13 -15
package/src/metrics/llm/context-precision/index.ts +6 -4
package/src/metrics/llm/context-precision/metricJudge.ts +3 -3
package/src/metrics/llm/context-relevancy/index.test.ts +7 -9
package/src/metrics/llm/context-relevancy/index.ts +6 -4
package/src/metrics/llm/context-relevancy/metricJudge.ts +3 -3
package/src/metrics/llm/contextual-recall/index.test.ts +6 -8
package/src/metrics/llm/contextual-recall/index.ts +6 -4
package/src/metrics/llm/contextual-recall/metricJudge.ts +3 -3
package/src/metrics/llm/faithfulness/index.test.ts +15 -17
package/src/metrics/llm/faithfulness/index.ts +6 -4
package/src/metrics/llm/faithfulness/metricJudge.ts +3 -3
package/src/metrics/llm/hallucination/index.test.ts +15 -19
package/src/metrics/llm/hallucination/index.ts +7 -5
package/src/metrics/llm/hallucination/metricJudge.ts +3 -3
package/src/metrics/llm/prompt-alignment/index.test.ts +9 -11
package/src/metrics/llm/prompt-alignment/index.ts +4 -3
package/src/metrics/llm/prompt-alignment/metricJudge.ts +3 -3
package/src/metrics/llm/summarization/index.test.ts +4 -6
package/src/metrics/llm/summarization/index.ts +4 -3
package/src/metrics/llm/summarization/metricJudge.ts +3 -3
package/src/metrics/llm/toxicity/index.test.ts +4 -6
package/src/metrics/llm/toxicity/index.ts +4 -3
package/src/metrics/llm/toxicity/metricJudge.ts +3 -3
package/src/metrics/llm/types.ts +1 -1

package/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,32 @@
 # @mastra/evals
+## 0.1.0-alpha.33
+### Patch Changes
+- Updated dependencies [d7d465a]
+- Updated dependencies [d7d465a]
+- Updated dependencies [2017553]
+- Updated dependencies [a10b7a3]
+- Updated dependencies [16e5b04]
+  - @mastra/core@0.2.0-alpha.91
+## 0.1.0-alpha.32
+### Patch Changes
+- Updated dependencies [8151f44]
+- Updated dependencies [e897f1c]
+- Updated dependencies [3700be1]
+  - @mastra/core@0.2.0-alpha.90
+## 0.1.0-alpha.31
+### Patch Changes
+- Updated dependencies [27275c9]
+  - @mastra/core@0.2.0-alpha.89
 ## 0.1.0-alpha.30
 ### Patch Changes

package/dist/{dist-56AYDN4X.js → dist-XPBCCWOM.js} RENAMED Viewed

@@ -12068,7 +12068,7 @@ function createTestHook(name, handler) {
 globalThis.performance ? globalThis.performance.now.bind(globalThis.performance) : Date.now;
 globalThis.performance ? globalThis.performance.now.bind(globalThis.performance) : Date.now;
-// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/utils.C8RiOc4B.js
+// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/utils.C8RiOc4B.js
 var NAME_WORKER_STATE = "__vitest_worker__";
 function getWorkerState() {
   const workerState = globalThis[NAME_WORKER_STATE];
@@ -12128,7 +12128,7 @@ async function waitForImportsToResolve() {
   await waitForImportsToResolve();
 }
-// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/_commonjsHelpers.BFTU3MAI.js
+// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/_commonjsHelpers.BFTU3MAI.js
 var commonjsGlobal = typeof globalThis !== "undefined" ? globalThis : typeof window !== "undefined" ? window : typeof global !== "undefined" ? global : typeof self !== "undefined" ? self : {};
 function getDefaultExportFromCjs3(x2) {
   return x2 && x2.__esModule && Object.prototype.hasOwnProperty.call(x2, "default") ? x2["default"] : x2;
@@ -14197,7 +14197,7 @@ var SnapshotClient = class {
   }
 };
-// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/date.W2xKR2qe.js
+// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/date.W2xKR2qe.js
 var RealDate = Date;
 var now2 = null;
 var MockDate = class _MockDate extends RealDate {
@@ -14250,7 +14250,7 @@ function resetDate() {
   globalThis.Date = RealDate;
 }
-// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/vi.CjhMlMwf.js
+// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/vi.CjhMlMwf.js
 var unsupported = [
   // .poll is meant to retry matchers until they succeed, and
   // snapshots will always succeed as long as the poll method doesn't throw an error
@@ -17214,7 +17214,7 @@ function getImporter(name) {
   return stack?.file || "";
 }
-// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/run-once.2ogXb3JV.js
+// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/run-once.2ogXb3JV.js
 var filesCount = /* @__PURE__ */ new Map();
 var cache = /* @__PURE__ */ new Map();
 function runOnce(fn2, key) {
@@ -17237,7 +17237,7 @@ function isFirstRun() {
   return firstRun;
 }
-// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/benchmark.Cdu9hjj4.js
+// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/benchmark.Cdu9hjj4.js
 var benchFns = /* @__PURE__ */ new WeakMap();
 var benchOptsMap = /* @__PURE__ */ new WeakMap();
 var bench = createBenchmark(function(name, fn2 = noop, options = {}) {
@@ -17266,7 +17266,7 @@ function formatName2(name) {
   return typeof name === "string" ? name : name instanceof Function ? name.name || "<anonymous>" : String(name);
 }
-// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/chunks/index.Bf4FgyZN.js
+// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/chunks/index.Bf4FgyZN.js
 __toESM(require_dist(), 1);
 function getRunningMode() {
   return process.env.VITEST_MODE === "WATCH" ? "watch" : "run";
@@ -17277,7 +17277,7 @@ function isWatchMode() {
 var assertType = function assertType2() {
 };
-// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_tsdoks2dgqz77qqt66opozk4oi/node_modules/vitest/dist/index.js
+// ../../node_modules/.pnpm/vitest@3.0.4_@edge-runtime+vm@3.2.0_@types+debug@4.1.12_@types+node@22.13.1_jiti@2.4.2_jsdom@_hiqcd72qe6fxfy2i7gs4jbo6ry/node_modules/vitest/dist/index.js
 var import_expect_type2 = __toESM(require_dist(), 1);
 var export_expectTypeOf = import_expect_type2.expectTypeOf;
 /*! Bundled license information:

package/dist/index.js CHANGED Viewed

@@ -39,7 +39,7 @@ var getCurrentTestInfo = async () => {
     };
   }
   try {
-    const vitest = await import('./dist-56AYDN4X.js');
+    const vitest = await import('./dist-XPBCCWOM.js');
     if (typeof vitest !== "undefined" && vitest.expect?.getState) {
       const state = vitest.expect.getState();
       return {

package/dist/metrics/llm/index.d.ts CHANGED Viewed

@@ -1,4 +1,5 @@
-import { MetricResult, Metric, ModelConfig } from '@mastra/core';
+import { MetricResult, Metric } from '@mastra/core/eval';
+import { MastraLLMBase } from '@mastra/core/llm';
 interface MetricResultWithReason extends MetricResult {
     info: {
@@ -14,7 +15,7 @@ declare class AnswerRelevancyMetric extends Metric {
     private judge;
     private uncertaintyWeight;
     private scale;
-    constructor(model: ModelConfig, { uncertaintyWeight, scale }?: AnswerRelevancyMetricOptions);
+    constructor(llm: MastraLLMBase, { uncertaintyWeight, scale }?: AnswerRelevancyMetricOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason>;
     private calculateScore;
 }
@@ -27,7 +28,7 @@ declare class ContextPositionMetric extends Metric {
     private judge;
     private scale;
     private context;
-    constructor(model: ModelConfig, { scale, context }: ContextPositionMetricOptions);
+    constructor(llm: MastraLLMBase, { scale, context }: ContextPositionMetricOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason>;
     private calculateScore;
 }
@@ -40,7 +41,7 @@ declare class ContextPrecisionMetric extends Metric {
     private judge;
     private scale;
     private context;
-    constructor(model: ModelConfig, { scale, context }: ContextPrecisionMetricOptions);
+    constructor(llm: MastraLLMBase, { scale, context }: ContextPrecisionMetricOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason>;
     private calculateScore;
 }
@@ -53,7 +54,7 @@ declare class FaithfulnessMetric extends Metric {
     private judge;
     private scale;
     private context;
-    constructor(model: ModelConfig, { scale, context }: FaithfulnessMetricOptions);
+    constructor(llm: MastraLLMBase, { scale, context }: FaithfulnessMetricOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason>;
     private calculateScore;
 }
@@ -66,7 +67,7 @@ declare class PromptAlignmentMetric extends Metric {
     private instructions;
     private judge;
     private scale;
-    constructor(model: ModelConfig, { instructions, scale }: PromptAlignmentMetricOptions);
+    constructor(llm: MastraLLMBase, { instructions, scale }: PromptAlignmentMetricOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason>;
     private calculateScore;
 }
@@ -77,7 +78,7 @@ interface ToxicityMetricOptions {
 declare class ToxicityMetric extends Metric {
     private judge;
     private scale;
-    constructor(model: ModelConfig, { scale }?: ToxicityMetricOptions);
+    constructor(llm: MastraLLMBase, { scale }?: ToxicityMetricOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason>;
     private calculateScore;
 }
@@ -90,7 +91,7 @@ declare class ContextRelevancyMetric extends Metric {
     private judge;
     private scale;
     private context;
-    constructor(model: ModelConfig, { scale, context }: ContextRelevancyOptions);
+    constructor(llm: MastraLLMBase, { scale, context }: ContextRelevancyOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason>;
     private calculateScore;
 }
@@ -103,7 +104,7 @@ declare class ContextualRecallMetric extends Metric {
     private judge;
     private scale;
     private context;
-    constructor(model: ModelConfig, { scale, context }: ContextualRecallMetricOptions);
+    constructor(llm: MastraLLMBase, { scale, context }: ContextualRecallMetricOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason>;
     private calculateScore;
 }
@@ -114,7 +115,7 @@ interface SummarizationMetricOptions {
 declare class SummarizationMetric extends Metric {
     private judge;
     private scale;
-    constructor(model: ModelConfig, { scale }?: SummarizationMetricOptions);
+    constructor(llm: MastraLLMBase, { scale }?: SummarizationMetricOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason & {
         info: {
             alignmentScore: number;
@@ -130,7 +131,7 @@ interface BiasMetricOptions {
 declare class BiasMetric extends Metric {
     private judge;
     private scale;
-    constructor(model: ModelConfig, { scale }?: BiasMetricOptions);
+    constructor(llm: MastraLLMBase, { scale }?: BiasMetricOptions);
     measure(input: string, output: string): Promise<MetricResultWithReason>;
     private calculateScore;
 }

package/dist/metrics/llm/index.js CHANGED Viewed

@@ -1,6 +1,8 @@
 import '../../chunk-4VNS5WPM.js';
-import { Metric, Agent } from '@mastra/core';
+import { Metric } from '@mastra/core/eval';
+import '@mastra/core/llm';
 import { z } from 'zod';
+import { Agent } from '@mastra/core/agent';
 // src/metrics/llm/utils.ts
 var roundToTwoDecimals = (num) => {
@@ -8,11 +10,11 @@ var roundToTwoDecimals = (num) => {
 };
 var MastraAgentJudge = class {
   agent;
-  constructor(name, instructions, model) {
+  constructor(name, instructions, llm) {
     this.agent = new Agent({
-      name: `Mastra Eval Judge ${model.provider} ${name}`,
+      name: `Mastra Eval Judge ${llm.name} ${name}`,
       instructions,
-      model
+      llm
     });
   }
 };
@@ -226,8 +228,8 @@ function generateReasonPrompt({
 // src/metrics/llm/answer-relevancy/metricJudge.ts
 var AnswerRelevancyJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Answer Relevancy", ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Answer Relevancy", ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(input, actualOutput) {
     const statementPrompt = generateEvaluationStatementsPrompt({ output: actualOutput });
@@ -265,10 +267,10 @@ var AnswerRelevancyMetric = class extends Metric {
   judge;
   uncertaintyWeight;
   scale;
-  constructor(model, { uncertaintyWeight = 0.3, scale = 1 } = {}) {
+  constructor(llm, { uncertaintyWeight = 0.3, scale = 1 } = {}) {
     super();
     this.uncertaintyWeight = uncertaintyWeight;
-    this.judge = new AnswerRelevancyJudge(model);
+    this.judge = new AnswerRelevancyJudge(llm);
     this.scale = scale;
   }
   async measure(input, output) {
@@ -427,8 +429,8 @@ function generateReasonPrompt2({
 // src/metrics/llm/context-position/metricJudge.ts
 var ContextPositionJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Context Position", CONTEXT_POSITION_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Context Position", CONTEXT_POSITION_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(input, actualOutput, retrievalContext) {
     const prompt = generateEvaluatePrompt2({
@@ -464,11 +466,11 @@ var ContextPositionMetric = class extends Metric {
   judge;
   scale;
   context;
-  constructor(model, { scale = 1, context }) {
+  constructor(llm, { scale = 1, context }) {
     super();
-    this.judge = new ContextPositionJudge(model);
-    this.scale = scale;
     this.context = context;
+    this.judge = new ContextPositionJudge(llm);
+    this.scale = scale;
   }
   async measure(input, output) {
     const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -635,8 +637,8 @@ JSON:
 // src/metrics/llm/context-precision/metricJudge.ts
 var ContextPrecisionJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Context Precision", CONTEXT_PRECISION_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Context Precision", CONTEXT_PRECISION_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(input, actualOutput, retrievalContext) {
     const prompt = generateEvaluatePrompt3({
@@ -672,11 +674,11 @@ var ContextPrecisionMetric = class extends Metric {
   judge;
   scale;
   context;
-  constructor(model, { scale = 1, context }) {
+  constructor(llm, { scale = 1, context }) {
     super();
-    this.judge = new ContextPrecisionJudge(model);
-    this.scale = scale;
     this.context = context;
+    this.judge = new ContextPrecisionJudge(llm);
+    this.scale = scale;
   }
   async measure(input, output) {
     const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -872,8 +874,8 @@ Example Responses:
 // src/metrics/llm/faithfulness/metricJudge.ts
 var FaithfulnessJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Faithfulness", FAITHFULNESS_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Faithfulness", FAITHFULNESS_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(output, context) {
     const claimsPrompt = generateClaimExtractionPrompt({ output });
@@ -915,11 +917,11 @@ var FaithfulnessMetric = class extends Metric {
   judge;
   scale;
   context;
-  constructor(model, { scale = 1, context }) {
+  constructor(llm, { scale = 1, context }) {
     super();
-    this.scale = scale;
     this.context = context;
-    this.judge = new FaithfulnessJudge(model);
+    this.judge = new FaithfulnessJudge(llm);
+    this.scale = scale;
   }
   async measure(input, output) {
     const verdicts = await this.judge.evaluate(output, this.context);
@@ -1067,8 +1069,8 @@ function generateReasonPrompt5({
 // src/metrics/llm/prompt-alignment/metricJudge.ts
 var PromptAlignmentJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Prompt Alignment", PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Prompt Alignment", PROMPT_ALIGNMENT_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(input, actualOutput, instructions) {
     const prompt = generateEvaluatePrompt5({ input, output: actualOutput, instructions });
@@ -1096,10 +1098,10 @@ var PromptAlignmentMetric = class extends Metric {
   instructions;
   judge;
   scale;
-  constructor(model, { instructions, scale = 1 }) {
+  constructor(llm, { instructions, scale = 1 }) {
     super();
     this.instructions = instructions;
-    this.judge = new PromptAlignmentJudge(model);
+    this.judge = new PromptAlignmentJudge(llm);
     this.scale = scale;
   }
   async measure(input, output) {
@@ -1225,8 +1227,8 @@ ${toxics.join("\n")}`;
 // src/metrics/llm/toxicity/metricJudge.ts
 var ToxicityJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Toxicity", TOXICITY_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Toxicity", TOXICITY_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(input, actualOutput) {
     const prompt = generateEvaluatePrompt6({ input, output: actualOutput });
@@ -1257,10 +1259,10 @@ var ToxicityJudge = class extends MastraAgentJudge {
 var ToxicityMetric = class extends Metric {
   judge;
   scale;
-  constructor(model, { scale = 1 } = {}) {
+  constructor(llm, { scale = 1 } = {}) {
     super();
+    this.judge = new ToxicityJudge(llm);
     this.scale = scale;
-    this.judge = new ToxicityJudge(model);
   }
   async measure(input, output) {
     const verdicts = await this.judge.evaluate(input, output);
@@ -1376,8 +1378,8 @@ ${relevantStatements}`;
 // src/metrics/llm/context-relevancy/metricJudge.ts
 var ContextRelevancyJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Context Relevancy", CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Context Relevancy", CONTEXT_RELEVANCY_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(input, actualOutput, retrievalContext) {
     const prompt = generateEvaluatePrompt7({
@@ -1413,11 +1415,11 @@ var ContextRelevancyMetric = class extends Metric {
   judge;
   scale;
   context;
-  constructor(model, { scale = 1, context }) {
+  constructor(llm, { scale = 1, context }) {
     super();
-    this.judge = new ContextRelevancyJudge(model);
-    this.scale = scale;
     this.context = context;
+    this.judge = new ContextRelevancyJudge(llm);
+    this.scale = scale;
   }
   async measure(input, output) {
     const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -1523,8 +1525,8 @@ ${unsupportiveReasons.join("\n")}
 // src/metrics/llm/contextual-recall/metricJudge.ts
 var ContextualRecallJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Contextual Recall", CONTEXT_RECALL_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Contextual Recall", CONTEXT_RECALL_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(input, actualOutput, retrievalContext) {
     const prompt = generateEvaluatePrompt8({
@@ -1560,11 +1562,11 @@ var ContextualRecallMetric = class extends Metric {
   judge;
   scale;
   context;
-  constructor(model, { scale = 1, context }) {
+  constructor(llm, { scale = 1, context }) {
     super();
-    this.judge = new ContextualRecallJudge(model);
-    this.scale = scale;
     this.context = context;
+    this.judge = new ContextualRecallJudge(llm);
+    this.scale = scale;
   }
   async measure(input, output) {
     const verdicts = await this.judge.evaluate(input, output, this.context);
@@ -1829,8 +1831,8 @@ function generateReasonPrompt8({
 // src/metrics/llm/summarization/metricJudge.ts
 var SummarizationJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Summarization", SUMMARIZATION_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Summarization", SUMMARIZATION_AGENT_INSTRUCTIONS, llm);
   }
   async evaluateAlignment(originalText, summary) {
     const claimsPrompt = generateClaimExtractionPrompt({ output: summary });
@@ -1894,9 +1896,9 @@ var SummarizationJudge = class extends MastraAgentJudge {
 var SummarizationMetric = class extends Metric {
   judge;
   scale;
-  constructor(model, { scale = 1 } = {}) {
+  constructor(llm, { scale = 1 } = {}) {
     super();
-    this.judge = new SummarizationJudge(model);
+    this.judge = new SummarizationJudge(llm);
     this.scale = scale;
   }
   async measure(input, output) {
@@ -2048,8 +2050,8 @@ ${biases.join("\n")}
 // src/metrics/llm/bias/metricJudge.ts
 var BiasJudge = class extends MastraAgentJudge {
-  constructor(model) {
-    super("Bias", BIAS_AGENT_INSTRUCTIONS, model);
+  constructor(llm) {
+    super("Bias", BIAS_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(input, actualOutput) {
     const opinionsPrompt = generateOpinionsPrompt({ input, output: actualOutput });
@@ -2086,10 +2088,10 @@ var BiasJudge = class extends MastraAgentJudge {
 var BiasMetric = class extends Metric {
   judge;
   scale;
-  constructor(model, { scale = 1 } = {}) {
+  constructor(llm, { scale = 1 } = {}) {
     super();
+    this.judge = new BiasJudge(llm);
     this.scale = scale;
-    this.judge = new BiasJudge(model);
   }
   async measure(input, output) {
     const verdicts = await this.judge.evaluate(input, output);

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@mastra/evals",
-  "version": "0.1.0-alpha.30",
+  "version": "0.1.0-alpha.33",
   "description": "",
   "type": "module",
   "main": "dist/index.js",
@@ -38,7 +38,7 @@
     "sentiment": "^5.0.2",
     "string-similarity": "^4.0.4",
     "zod": "^3.24.1",
-    "@mastra/core": "^0.2.0-alpha.88"
+    "@mastra/core": "^0.2.0-alpha.91"
   },
   "devDependencies": {
     "@tsconfig/recommended": "^1.0.7",
@@ -50,7 +50,8 @@
     "vitest": "^3.0.4"
   },
   "scripts": {
-    "build": "tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --clean --treeshake",
+    "check": "tsc --noEmit",
+    "build": "pnpm check && tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --clean --treeshake",
     "dev": "tsup src/index.ts src/metrics/llm/index.ts src/metrics/nlp/index.ts --format esm --dts --watch",
     "test": "vitest"
   }

package/src/evaluation.test.ts CHANGED Viewed

@@ -1,6 +1,6 @@
-import { type ModelConfig } from '@mastra/core';
 import { Agent } from '@mastra/core/agent';
 import { Metric } from '@mastra/core/eval';
+import { OpenAI } from '@mastra/core/llm/openai';
 import { describe, expect, it } from 'vitest';
 import { evaluate } from './evaluation';
@@ -14,18 +14,16 @@ class TestMetric extends Metric {
   }
 }
-const modelConfig: ModelConfig = {
-  provider: 'OPEN_AI',
+const llm = new OpenAI({
   name: 'gpt-4o',
-  toolChoice: 'auto',
-};
+});
 describe('evaluate', () => {
   it('should get a text response from the agent', async () => {
     const electionAgent = new Agent({
       name: 'US Election agent',
       instructions: 'You know about the past US elections',
-      model: modelConfig,
+      llm,
     });
     const result = await evaluate(electionAgent, 'Who won the 2016 US presidential election?', new TestMetric());

package/src/metrics/judge/index.ts CHANGED Viewed

@@ -1,13 +1,14 @@
-import { Agent, type ModelConfig } from '@mastra/core';
+import { Agent } from '@mastra/core/agent';
+import { type MastraLLMBase } from '@mastra/core/llm';
 export abstract class MastraAgentJudge {
   protected readonly agent: Agent;
-  constructor(name: string, instructions: string, model: ModelConfig) {
+  constructor(name: string, instructions: string, llm: MastraLLMBase) {
     this.agent = new Agent({
-      name: `Mastra Eval Judge ${model.provider} ${name}`,
+      name: `Mastra Eval Judge ${llm.name} ${name}`,
       instructions: instructions,
-      model,
+      llm,
     });
   }
 }

package/src/metrics/llm/answer-relevancy/index.test.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { type ModelConfig } from '@mastra/core';
+import { OpenAI } from '@mastra/core/llm/openai';
 import { describe, it, expect } from 'vitest';
 import { TestCase } from '../utils';
@@ -92,17 +92,14 @@ const testCases: TestCase[] = [
 const SECONDS = 10000;
-const modelConfig: ModelConfig = {
-  provider: 'OPEN_AI',
+const llm = new OpenAI({
   name: 'gpt-4o',
-  toolChoice: 'auto',
-  apiKey: process.env.OPENAI_API_KEY,
-};
+});
 describe(
   'AnswerRelevancyMetric',
   () => {
-    const metric = new AnswerRelevancyMetric(modelConfig);
+    const metric = new AnswerRelevancyMetric(llm);
     it('should be able to measure a prompt with perfect relevancy', async () => {
       const result = await metric.measure(testCases[0].input, testCases[0].output);

package/src/metrics/llm/answer-relevancy/index.ts CHANGED Viewed

@@ -1,4 +1,5 @@
-import { Metric, type ModelConfig } from '@mastra/core';
+import { Metric } from '@mastra/core/eval';
+import { type MastraLLMBase } from '@mastra/core/llm';
 import { type MetricResultWithReason } from '../types';
 import { roundToTwoDecimals } from '../utils';
@@ -15,11 +16,11 @@ export class AnswerRelevancyMetric extends Metric {
   private uncertaintyWeight: number;
   private scale: number;
-  constructor(model: ModelConfig, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
+  constructor(llm: MastraLLMBase, { uncertaintyWeight = 0.3, scale = 1 }: AnswerRelevancyMetricOptions = {}) {
     super();
     this.uncertaintyWeight = uncertaintyWeight;
-    this.judge = new AnswerRelevancyJudge(model);
+    this.judge = new AnswerRelevancyJudge(llm);
     this.scale = scale;
   }

package/src/metrics/llm/answer-relevancy/metricJudge.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { type ModelConfig } from '@mastra/core';
+import { type MastraLLMBase } from '@mastra/core/llm';
 import { z } from 'zod';
 import { MastraAgentJudge } from '../../judge';
@@ -11,8 +11,8 @@ import {
 } from './prompts';
 export class AnswerRelevancyJudge extends MastraAgentJudge {
-  constructor(model: ModelConfig) {
-    super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, model);
+  constructor(llm: MastraLLMBase) {
+    super('Answer Relevancy', ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, llm);
   }
   async evaluate(input: string, actualOutput: string): Promise<{ verdict: string; reason: string }[]> {

package/src/metrics/llm/bias/index.test.ts CHANGED Viewed

@@ -1,4 +1,4 @@
-import { type ModelConfig } from '@mastra/core';
+import { OpenAI } from '@mastra/core/llm/openai';
 import { describe, it, expect, vi } from 'vitest';
 import { TestCase } from '../utils';
@@ -46,15 +46,13 @@ vi.setConfig({
   testTimeout: 20 * SECONDS,
 });
-const modelConfig: ModelConfig = {
-  provider: 'OPEN_AI',
+const llm = new OpenAI({
   name: 'gpt-4o',
-  toolChoice: 'auto',
   apiKey: process.env.OPENAI_API_KEY,
-};
+});
 describe('BiasMetric', () => {
-  const metric = new BiasMetric(modelConfig);
+  const metric = new BiasMetric(llm);
   it('should be able to measure a prompt that is biased', async () => {
     const result = await metric.measure(testCases[0].input, testCases[0].output);

package/src/metrics/llm/bias/index.ts CHANGED Viewed

@@ -1,4 +1,5 @@
-import { Metric, type ModelConfig } from '@mastra/core';
+import { Metric } from '@mastra/core/eval';
+import { type MastraLLMBase } from '@mastra/core/llm';
 import { type MetricResultWithReason } from '../types';
 import { roundToTwoDecimals } from '../utils';
@@ -13,11 +14,11 @@ export class BiasMetric extends Metric {
   private judge: BiasJudge;
   private scale: number;
-  constructor(model: ModelConfig, { scale = 1 }: BiasMetricOptions = {}) {
+  constructor(llm: MastraLLMBase, { scale = 1 }: BiasMetricOptions = {}) {
     super();
+    this.judge = new BiasJudge(llm);
     this.scale = scale;
-    this.judge = new BiasJudge(model);
   }
   async measure(input: string, output: string): Promise<MetricResultWithReason> {