npm - @mastra/evals - Versions diffs - 1.2.0-alpha.0 → 1.2.0 - Mend

@mastra/evals 1.2.0-alpha.0 → 1.2.0

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Files changed (23)

package/CHANGELOG.md +97 -0
package/dist/{chunk-XRUR5PBK.cjs → chunk-AY4K3J4R.cjs} +44 -95
package/dist/chunk-AY4K3J4R.cjs.map +1 -0
package/dist/{chunk-EVBNIL5M.js → chunk-X4MKZ735.js} +44 -95
package/dist/chunk-X4MKZ735.js.map +1 -0
package/dist/docs/SKILL.md +1 -1
package/dist/docs/assets/SOURCE_MAP.json +1 -1
package/dist/docs/references/reference-evals-scorer-utils.md +9 -5
package/dist/docs/references/reference-evals-trajectory-accuracy.md +29 -15
package/dist/scorers/code/trajectory/index.d.ts +18 -1
package/dist/scorers/code/trajectory/index.d.ts.map +1 -1
package/dist/scorers/llm/trajectory/index.d.ts.map +1 -1
package/dist/scorers/prebuilt/index.cjs +110 -121
package/dist/scorers/prebuilt/index.cjs.map +1 -1
package/dist/scorers/prebuilt/index.js +39 -50
package/dist/scorers/prebuilt/index.js.map +1 -1
package/dist/scorers/utils.cjs +23 -23
package/dist/scorers/utils.d.ts +1 -4
package/dist/scorers/utils.d.ts.map +1 -1
package/dist/scorers/utils.js +1 -1
package/package.json +7 -7
package/dist/chunk-EVBNIL5M.js.map +0 -1
package/dist/chunk-XRUR5PBK.cjs.map +0 -1

package/dist/scorers/prebuilt/index.js (changed)

@@ -1,4 +1,4 @@
-import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-EVBNIL5M.js';
+import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-X4MKZ735.js';
 import { createScorer } from '@mastra/core/evals';
 import { z } from 'zod';
 import nlp from 'compromise';
@@ -2735,7 +2735,8 @@ function formatExpectedSteps(steps, indent = 0) {
   const prefix = "  ".repeat(indent);
   return steps.map((step, i) => {
     const typeStr = step.stepType ? `[${step.stepType}] ` : "";
-    const dataStr = step.data ? ` (data: ${JSON.stringify(step.data)})` : "";
+    const { name: _, stepType: _t, children: _c, ...fields } = step;
+    const dataStr = Object.keys(fields).length > 0 ? ` (${JSON.stringify(fields)})` : "";
     let line = `${prefix}${i + 1}. ${typeStr}${step.name}${dataStr}`;
     if (step.children?.steps && step.children.steps.length > 0) {
       line += `
@@ -2764,22 +2765,15 @@ function createTrajectoryAccuracyScorerLLM({
       if (Array.isArray(staticExpectedTrajectory)) {
         expectedSteps = staticExpectedTrajectory;
       } else {
-        expectedSteps = staticExpectedTrajectory.steps.map((s) => {
-          const result = { name: s.name, stepType: s.stepType };
-          const data = {};
-          if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolArgs !== void 0)
-            data.input = s.toolArgs;
-          if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolResult !== void 0)
-            data.output = s.toolResult;
-          if (s.stepType === "workflow_step" && s.output !== void 0) data.output = s.output;
-          if (Object.keys(data).length > 0) result.data = data;
-          if (s.children && s.children.length > 0) {
-            result.children = {
-              steps: s.children.map((c) => ({ name: c.name, stepType: c.stepType }))
-            };
+        const toExpectedStep = (s) => {
+          const { durationMs: _, metadata: _m, children, ...rest } = s;
+          const result = rest;
+          if (children && children.length > 0) {
+            result.children = { steps: children.map(toExpectedStep) };
           }
           return result;
-        });
+        };
+        expectedSteps = staticExpectedTrajectory.steps.map(toExpectedStep);
       }
     } else if (run.expectedTrajectory) {
       const expectation = run.expectedTrajectory;
@@ -3226,18 +3220,11 @@ function createToolCallAccuracyScorerCode(options) {
   });
 }
 function trajectoryStepToExpectedStep(step) {
-  const result = { name: step.name, stepType: step.stepType };
-  const data = {};
-  if (step.stepType === "tool_call" || step.stepType === "mcp_tool_call") {
-    if (step.toolArgs !== void 0) data.input = step.toolArgs;
-    if (step.toolResult !== void 0) data.output = step.toolResult;
-  } else if (step.stepType === "workflow_step") {
-    if (step.output !== void 0) data.output = step.output;
-  }
-  if (Object.keys(data).length > 0) result.data = data;
-  if (step.children && step.children.length > 0) {
+  const { durationMs: _, metadata: _m, children, ...rest } = step;
+  const result = rest;
+  if (children && children.length > 0) {
     result.children = {
-      steps: step.children.map(trajectoryStepToExpectedStep)
+      steps: children.map(trajectoryStepToExpectedStep)
     };
   }
   return result;
@@ -3248,15 +3235,14 @@ function expectationToExpectedSteps(expectation) {
 }
 function createTrajectoryAccuracyScorerCode(options = {}) {
   const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
-  const { ordering, strictOrder, compareStepData = false, allowRepeatedSteps = true } = comparisonOptions;
-  const resolvedOrdering = ordering ?? (strictOrder ? "strict" : "relaxed");
+  const { ordering = "relaxed", allowRepeatedSteps = true } = comparisonOptions;
   const staticExpectedSteps = staticExpectedTrajectory ? Array.isArray(staticExpectedTrajectory) && staticExpectedTrajectory.length > 0 && !("steps" in staticExpectedTrajectory[0] || false) ? staticExpectedTrajectory : "steps" in staticExpectedTrajectory ? staticExpectedTrajectory.steps.map(trajectoryStepToExpectedStep) : void 0 : void 0;
   const getDescription = () => {
     if (staticExpectedSteps) {
       const expectedStepNames = staticExpectedSteps.map((s) => s.name).join(" \u2192 ");
-      return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${resolvedOrdering} ordering)`;
+      return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${ordering} ordering)`;
     }
-    return `Evaluates trajectory accuracy against expected trajectory from dataset items (${resolvedOrdering} ordering)`;
+    return `Evaluates trajectory accuracy against expected trajectory from dataset items (${ordering} ordering)`;
   };
   return createScorer({
     id: "code-trajectory-accuracy-scorer",
@@ -3281,15 +3267,13 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
       };
     }
     const itemExpectation = run.expectedTrajectory;
-    const effectiveOrdering = itemExpectation?.ordering ?? resolvedOrdering;
-    const effectiveCompareData = itemExpectation?.compareStepData ?? compareStepData;
+    const effectiveOrdering = itemExpectation?.ordering ?? ordering;
     const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
     const comparison = compareTrajectories(
       actualTrajectory,
       { steps: resolvedExpectedSteps },
       {
         ordering: effectiveOrdering,
-        compareStepData: effectiveCompareData,
         allowRepeatedSteps: effectiveAllowRepeated
       }
     );
@@ -3308,7 +3292,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
     return preprocessResult.comparison.score;
   });
 }
-function evaluateNestedExpectations(expectedSteps, actualSteps) {
+function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accuracy: 0.4, efficiency: 0.3, toolFailures: 0.2, blacklist: 0.1 }) {
   const results = [];
   const matchedIndices = /* @__PURE__ */ new Set();
   for (const expectedStep of expectedSteps) {
@@ -3348,7 +3332,6 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
         { steps: childConfig.steps },
         {
           ordering: childConfig.ordering ?? "relaxed",
-          compareStepData: childConfig.compareStepData ?? false,
           allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
         }
       );
@@ -3368,22 +3351,23 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
     const toolFailures = analyzeToolFailures(childTrajectory, {
       maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
     });
-    const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children) : [];
+    const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
     const scores = [];
-    if (accuracy) scores.push({ weight: 0.4, value: accuracy.score });
-    if (efficiency) scores.push({ weight: 0.3, value: efficiency.score });
-    if (toolFailures && toolFailures.patterns.length > 0) scores.push({ weight: 0.2, value: toolFailures.score });
+    if (accuracy) scores.push({ weight: weights.accuracy, value: accuracy.score });
+    if (efficiency) scores.push({ weight: weights.efficiency, value: efficiency.score });
+    if (toolFailures && toolFailures.patterns.length > 0)
+      scores.push({ weight: weights.toolFailures, value: toolFailures.score });
     if (blacklist) {
       if (blacklist.score === 0) {
         results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
         continue;
       }
-      scores.push({ weight: 0.1, value: blacklist.score });
+      scores.push({ weight: weights.blacklist, value: blacklist.score });
     }
     let levelScore = 1;
     if (scores.length > 0) {
       const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
-      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
+      levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
     }
     let finalScore = levelScore;
     if (nested.length > 0) {
@@ -3408,7 +3392,13 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
   return results;
 }
 function createTrajectoryScorerCode(options = {}) {
-  const { defaults = {} } = options;
+  const { defaults = {}, weights: userWeights = {} } = options;
+  const w = {
+    accuracy: Math.max(0, userWeights.accuracy ?? 0.4),
+    efficiency: Math.max(0, userWeights.efficiency ?? 0.3),
+    toolFailures: Math.max(0, userWeights.toolFailures ?? 0.2),
+    blacklist: Math.max(0, userWeights.blacklist ?? 0.1)
+  };
   return createScorer({
     id: "code-trajectory-scorer",
     name: "Trajectory Scorer",
@@ -3428,7 +3418,6 @@ function createTrajectoryScorerCode(options = {}) {
         { steps: config.steps },
         {
           ordering: config.ordering ?? "relaxed",
-          compareStepData: config.compareStepData ?? false,
           allowRepeatedSteps: config.allowRepeatedSteps ?? true
         }
       );
@@ -3448,7 +3437,7 @@ function createTrajectoryScorerCode(options = {}) {
     const toolFailures = analyzeToolFailures(actualTrajectory, {
       maxRetriesPerTool: config.maxRetriesPerTool ?? 2
     });
-    const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps) : void 0;
+    const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;
     return {
       accuracy,
       efficiency,
@@ -3464,16 +3453,16 @@ function createTrajectoryScorerCode(options = {}) {
     }
     const scores = [];
     if (accuracy) {
-      scores.push({ weight: 0.4, value: accuracy.score });
+      scores.push({ weight: w.accuracy, value: accuracy.score });
     }
     if (efficiency) {
-      scores.push({ weight: 0.3, value: efficiency.score });
+      scores.push({ weight: w.efficiency, value: efficiency.score });
     }
     if (toolFailures && toolFailures.patterns.length > 0) {
-      scores.push({ weight: 0.2, value: toolFailures.score });
+      scores.push({ weight: w.toolFailures, value: toolFailures.score });
     }
     if (blacklist) {
-      scores.push({ weight: 0.1, value: blacklist.score });
+      scores.push({ weight: w.blacklist, value: blacklist.score });
     }
     if (scores.length === 0 && !nested) {
       return 1;
@@ -3481,7 +3470,7 @@ function createTrajectoryScorerCode(options = {}) {
     let levelScore = 1;
     if (scores.length > 0) {
       const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
-      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
+      levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
     }
     if (nested && nested.length > 0) {
       const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);