npm - @mastra/evals - Versions diffs - 1.2.0-alpha.0 → 1.2.0 - Mend

@mastra/evals 1.2.0-alpha.0 → 1.2.0

This diff compares the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (23)

package/CHANGELOG.md +97 -0
package/dist/{chunk-XRUR5PBK.cjs → chunk-AY4K3J4R.cjs} +44 -95
package/dist/chunk-AY4K3J4R.cjs.map +1 -0
package/dist/{chunk-EVBNIL5M.js → chunk-X4MKZ735.js} +44 -95
package/dist/chunk-X4MKZ735.js.map +1 -0
package/dist/docs/SKILL.md +1 -1
package/dist/docs/assets/SOURCE_MAP.json +1 -1
package/dist/docs/references/reference-evals-scorer-utils.md +9 -5
package/dist/docs/references/reference-evals-trajectory-accuracy.md +29 -15
package/dist/scorers/code/trajectory/index.d.ts +18 -1
package/dist/scorers/code/trajectory/index.d.ts.map +1 -1
package/dist/scorers/llm/trajectory/index.d.ts.map +1 -1
package/dist/scorers/prebuilt/index.cjs +110 -121
package/dist/scorers/prebuilt/index.cjs.map +1 -1
package/dist/scorers/prebuilt/index.js +39 -50
package/dist/scorers/prebuilt/index.js.map +1 -1
package/dist/scorers/utils.cjs +23 -23
package/dist/scorers/utils.d.ts +1 -4
package/dist/scorers/utils.d.ts.map +1 -1
package/dist/scorers/utils.js +1 -1
package/package.json +7 -7
package/dist/chunk-EVBNIL5M.js.map +0 -1
package/dist/chunk-XRUR5PBK.cjs.map +0 -1

package/dist/scorers/prebuilt/index.cjs CHANGED Viewed

@@ -1,6 +1,6 @@
 'use strict';
-var chunkXRUR5PBK_cjs = require('../../chunk-XRUR5PBK.cjs');
+var chunkAY4K3J4R_cjs = require('../../chunk-AY4K3J4R.cjs');
 var evals = require('@mastra/core/evals');
 var zod = require('zod');
 var nlp = require('compromise');
@@ -239,14 +239,14 @@ function createAnswerRelevancyScorer({
     description: "Extract relevant statements from the LLM output",
     outputSchema: extractOutputSchema,
     createPrompt: ({ run }) => {
-      const assistantMessage = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const assistantMessage = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       return createExtractPrompt(assistantMessage);
     }
   }).analyze({
     description: "Score the relevance of the statements to the input",
     outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ run, results }) => {
-      const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const input = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
       return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
     }
   }).generateScore(({ results }) => {
@@ -263,13 +263,13 @@ function createAnswerRelevancyScorer({
       }
     }
     const score = relevancyCount / numberOfResults;
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * options.scale);
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * options.scale);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
       return createReasonPrompt({
-        input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
-        output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+        input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         score,
         results: results.analyzeStepResult.results,
         scale: options.scale
@@ -466,7 +466,7 @@ function createAnswerSimilarityScorer({
           groundTruth: ""
         });
       }
-      const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
       return createExtractPrompt2({
         output,
@@ -524,14 +524,14 @@ function createAnswerSimilarityScorer({
     );
     score -= extraInfoPenalty;
     score = Math.max(0, Math.min(1, score));
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * mergedOptions.scale);
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * mergedOptions.scale);
   }).generateReason({
     description: "Generate explanation of similarity score",
     createPrompt: ({ run, results, score }) => {
       if (!run.groundTruth) {
         return "No ground truth was provided for comparison. Score is 0 by default.";
       }
-      const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
       return createReasonPrompt2({
         output,
@@ -717,7 +717,7 @@ function createFaithfulnessScorer({
       claims: zod.z.array(zod.z.string())
     }),
     createPrompt: ({ run }) => {
-      const prompt = createFaithfulnessExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+      const prompt = createFaithfulnessExtractPrompt({ output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
@@ -741,14 +741,14 @@ function createFaithfulnessScorer({
       return 0;
     }
     const score = supportedClaims / totalClaims * (options?.scale || 1);
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
       const assistantMessage = run.output.find(({ role }) => role === "assistant");
       const prompt = createFaithfulnessReasonPrompt({
-        input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
-        output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+        input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
         score,
         scale: options?.scale || 1,
@@ -881,13 +881,13 @@ function createBiasScorer({ model, options }) {
     outputSchema: zod.z.object({
       opinions: zod.z.array(zod.z.string())
     }),
-    createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
+    createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
   }).analyze({
     description: "Score the relevance of the statements to the input",
     outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ run, results }) => {
       const prompt = createBiasAnalyzePrompt({
-        output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+        output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         opinions: results.preprocessStepResult?.opinions || []
       });
       return prompt;
@@ -898,7 +898,7 @@ function createBiasScorer({ model, options }) {
     }
     const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
     const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ score, results }) => {
@@ -1117,7 +1117,7 @@ function createHallucinationScorer({
       claims: zod.z.array(zod.z.string())
     }),
     createPrompt: ({ run }) => {
-      const prompt = createHallucinationExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+      const prompt = createHallucinationExtractPrompt({ output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
@@ -1145,7 +1145,7 @@ function createHallucinationScorer({
       return 0;
     }
     const score = contradictedStatements / totalStatements * (options?.scale || 1);
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: async ({ run, results, score }) => {
@@ -1156,8 +1156,8 @@ function createHallucinationScorer({
         context = options?.context ?? [];
       }
       const prompt = createHallucinationReasonPrompt({
-        input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
-        output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+        input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
         context,
         score,
         scale: options?.scale || 1,
@@ -1271,8 +1271,8 @@ function createToxicityScorer({
     outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
     createPrompt: ({ run }) => {
       const prompt = createToxicityAnalyzePrompt({
-        input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
-        output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
+        input: chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "",
+        output: chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
       });
       return prompt;
     }
@@ -1288,7 +1288,7 @@ function createToxicityScorer({
       }
     }
     const score = toxicityCount / numberOfVerdicts;
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ results, score }) => {
@@ -1422,7 +1422,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     if (isInputInvalid || isOutputInvalid) {
       throw new Error("Input and output messages cannot be null or empty");
     }
-    const { tools: actualTools, toolCallInfos } = chunkXRUR5PBK_cjs.extractToolCalls(run.output);
+    const { tools: actualTools, toolCallInfos } = chunkAY4K3J4R_cjs.extractToolCalls(run.output);
     return {
       actualTools,
       hasToolCalls: actualTools.length > 0,
@@ -1432,8 +1432,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     description: "Analyze the appropriateness of tool selections",
     outputSchema: analyzeOutputSchema2,
     createPrompt: ({ run, results }) => {
-      const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const toolsCalled = results.preprocessStepResult?.actualTools || [];
       return createAnalyzePrompt2({
         userInput,
@@ -1450,11 +1450,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     }
     const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
     const totalToolCalls = evaluations.length;
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
   }).generateReason({
     description: "Generate human-readable explanation of tool selection evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const evaluations = results.analyzeStepResult?.evaluations || [];
       const missingTools = results.analyzeStepResult?.missingTools || [];
       return createReasonPrompt3({
@@ -1659,8 +1659,8 @@ function createContextRelevanceScorerLLM({
     description: "Analyze the relevance and utility of provided context",
     outputSchema: analyzeOutputSchema3,
     createPrompt: ({ run }) => {
-      const userQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const userQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       if (context.length === 0) {
         return createAnalyzePrompt3({
@@ -1708,11 +1708,11 @@ function createContextRelevanceScorerLLM({
     const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
     const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
     const scaledScore = finalScore * (options.scale || 1);
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(scaledScore);
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(scaledScore);
   }).generateReason({
     description: "Generate human-readable explanation of context relevance evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const userQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       if (context.length === 0) {
         return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
@@ -1883,8 +1883,8 @@ function createContextPrecisionScorer({
     description: "Evaluate the relevance of each context piece for generating the expected output",
     outputSchema: contextRelevanceOutputSchema,
     createPrompt: ({ run }) => {
-      const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const input = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       if (context.length === 0) {
         throw new Error("No context available for evaluation");
@@ -1917,12 +1917,12 @@ function createContextPrecisionScorer({
     }
     const map = sumPrecision / relevantCount;
     const score = map * (options.scale || 1);
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the context precision results",
     createPrompt: ({ run, results, score }) => {
-      const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const input = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const output = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
       return createContextPrecisionReasonPrompt({
         input,
@@ -2177,8 +2177,8 @@ function createNoiseSensitivityScorerLLM({
     description: "Analyze the impact of noise on agent response quality",
     outputSchema: analyzeOutputSchema4,
     createPrompt: ({ run }) => {
-      const originalQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const noisyResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const originalQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const noisyResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       if (!originalQuery || !noisyResponse) {
         throw new Error("Both original query and noisy response are required for evaluation");
       }
@@ -2221,11 +2221,11 @@ function createNoiseSensitivityScorerLLM({
     const majorIssues = analysisResult.majorIssues || [];
     const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
     finalScore = Math.max(0, finalScore - issuesPenalty);
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(finalScore);
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(finalScore);
   }).generateReason({
     description: "Generate human-readable explanation of noise sensitivity evaluation",
     createPrompt: ({ run, results, score }) => {
-      const originalQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const originalQuery = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const analysisResult = results.analyzeStepResult;
       if (!analysisResult) {
         throw new Error("Analysis step failed to produce results for reason generation");
@@ -2550,9 +2550,9 @@ function createPromptAlignmentScorerLLM({
     description: "Analyze prompt-response alignment across multiple dimensions",
     outputSchema: analyzeOutputSchema5,
     createPrompt: ({ run }) => {
-      const userPrompt = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const systemPrompt = chunkXRUR5PBK_cjs.getCombinedSystemPrompt(run.input) ?? "";
-      const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+      const userPrompt = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const systemPrompt = chunkAY4K3J4R_cjs.getCombinedSystemPrompt(run.input) ?? "";
+      const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
       if (evaluationMode === "user" && !userPrompt) {
         throw new Error("User prompt is required for user prompt alignment scoring");
       }
@@ -2588,12 +2588,12 @@ function createPromptAlignmentScorerLLM({
       weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
     }
     const finalScore = weightedScore * scale;
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(finalScore);
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(finalScore);
   }).generateReason({
     description: "Generate human-readable explanation of prompt alignment evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userPrompt = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const systemPrompt = chunkXRUR5PBK_cjs.getCombinedSystemPrompt(run.input) ?? "";
+      const userPrompt = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const systemPrompt = chunkAY4K3J4R_cjs.getCombinedSystemPrompt(run.input) ?? "";
       const analysis = results.analyzeStepResult;
       if (!analysis) {
         return `Unable to analyze prompt alignment. Score: ${score}`;
@@ -2744,7 +2744,8 @@ function formatExpectedSteps(steps, indent = 0) {
   const prefix = "  ".repeat(indent);
   return steps.map((step, i) => {
     const typeStr = step.stepType ? `[${step.stepType}] ` : "";
-    const dataStr = step.data ? ` (data: ${JSON.stringify(step.data)})` : "";
+    const { name: _, stepType: _t, children: _c, ...fields } = step;
+    const dataStr = Object.keys(fields).length > 0 ? ` (${JSON.stringify(fields)})` : "";
     let line = `${prefix}${i + 1}. ${typeStr}${step.name}${dataStr}`;
     if (step.children?.steps && step.children.steps.length > 0) {
       line += `
@@ -2773,22 +2774,15 @@ function createTrajectoryAccuracyScorerLLM({
       if (Array.isArray(staticExpectedTrajectory)) {
         expectedSteps = staticExpectedTrajectory;
       } else {
-        expectedSteps = staticExpectedTrajectory.steps.map((s) => {
-          const result = { name: s.name, stepType: s.stepType };
-          const data = {};
-          if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolArgs !== void 0)
-            data.input = s.toolArgs;
-          if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolResult !== void 0)
-            data.output = s.toolResult;
-          if (s.stepType === "workflow_step" && s.output !== void 0) data.output = s.output;
-          if (Object.keys(data).length > 0) result.data = data;
-          if (s.children && s.children.length > 0) {
-            result.children = {
-              steps: s.children.map((c) => ({ name: c.name, stepType: c.stepType }))
-            };
+        const toExpectedStep = (s) => {
+          const { durationMs: _, metadata: _m, children, ...rest } = s;
+          const result = rest;
+          if (children && children.length > 0) {
+            result.children = { steps: children.map(toExpectedStep) };
           }
           return result;
-        });
+        };
+        expectedSteps = staticExpectedTrajectory.steps.map(toExpectedStep);
       }
     } else if (run.expectedTrajectory) {
       const expectation = run.expectedTrajectory;
@@ -2804,8 +2798,8 @@ function createTrajectoryAccuracyScorerLLM({
     description: "Analyze the quality and appropriateness of the agent trajectory",
     outputSchema: analyzeOutputSchema6,
     createPrompt: ({ run, results }) => {
-      const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
-      const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
+      const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const agentResponse = chunkAY4K3J4R_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
       return createAnalyzePrompt6({
         userInput,
         agentResponse,
@@ -2830,11 +2824,11 @@ function createTrajectoryAccuracyScorerLLM({
     const necessityScore = necessarySteps / totalSteps;
     const orderScore = orderedSteps / totalSteps;
     const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
-    return chunkXRUR5PBK_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
+    return chunkAY4K3J4R_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
   }).generateReason({
     description: "Generate human-readable explanation of trajectory evaluation",
     createPrompt: ({ run, results, score }) => {
-      const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
+      const userInput = chunkAY4K3J4R_cjs.getUserMessageFromRunInput(run.input) ?? "";
       const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
       const missingSteps = results.analyzeStepResult?.missingSteps || [];
       const extraSteps = results.analyzeStepResult?.extraSteps || [];
@@ -2897,18 +2891,18 @@ function createCompletenessScorer() {
     type: "agent"
   }).preprocess(async ({ run }) => {
     const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
-      const content = chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i);
+      const content = chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i);
       return content === null || content === void 0;
     });
     const isOutputInvalid = !run.output || run.output.some((i) => {
-      const content = chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i);
+      const content = chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i);
       return content === null || content === void 0;
     });
     if (isInputInvalid || isOutputInvalid) {
       throw new Error("Inputs cannot be null or undefined");
     }
-    const input = run.input?.inputMessages.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-    const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const input = run.input?.inputMessages.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     const inputToProcess = input;
     const outputToProcess = output;
     const inputDoc = nlp__default.default(inputToProcess.trim());
@@ -3013,8 +3007,8 @@ function createTextualDifferenceScorer() {
     description: "Calculate textual difference between input and output using sequence matching algorithms.",
     type: "agent"
   }).preprocess(async ({ run }) => {
-    const input = run.input?.inputMessages?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-    const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const input = run.input?.inputMessages?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     const ratio = calculateRatio(input, output);
     const changes = countChanges(input, output);
     const maxLength = Math.max(input.length, output.length);
@@ -3037,8 +3031,8 @@ function createKeywordCoverageScorer() {
     description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
     type: "agent"
   }).preprocess(async ({ run }) => {
-    const input = run.input?.inputMessages?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-    const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const input = run.input?.inputMessages?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     if (!input && !output) {
       return {
         result: {
@@ -3091,8 +3085,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
     description: "Calculates content similarity between input and output messages using string comparison algorithms.",
     type: "agent"
   }).preprocess(async ({ run }) => {
-    let processedInput = run.input?.inputMessages.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
-    let processedOutput = run.output.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    let processedInput = run.input?.inputMessages.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    let processedOutput = run.output.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     if (ignoreCase) {
       processedInput = processedInput.toLowerCase();
       processedOutput = processedOutput.toLowerCase();
@@ -3122,7 +3116,7 @@ function createToneScorer(config = {}) {
     type: "agent"
   }).preprocess(async ({ run }) => {
     const sentiment = new Sentiment__default.default();
-    const agentMessage = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const agentMessage = run.output?.map((i) => chunkAY4K3J4R_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
     const responseSentiment = sentiment.analyze(agentMessage);
     if (referenceTone) {
       const referenceSentiment = sentiment.analyze(referenceTone);
@@ -3209,7 +3203,7 @@ function createToolCallAccuracyScorerCode(options) {
     if (isInputInvalid || isOutputInvalid) {
       throw new Error("Input and output messages cannot be null or empty");
     }
-    const { tools: actualTools, toolCallInfos } = chunkXRUR5PBK_cjs.extractToolCalls(run.output);
+    const { tools: actualTools, toolCallInfos } = chunkAY4K3J4R_cjs.extractToolCalls(run.output);
     const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
     return {
       expectedTool,
@@ -3235,18 +3229,11 @@ function createToolCallAccuracyScorerCode(options) {
   });
 }
 function trajectoryStepToExpectedStep(step) {
-  const result = { name: step.name, stepType: step.stepType };
-  const data = {};
-  if (step.stepType === "tool_call" || step.stepType === "mcp_tool_call") {
-    if (step.toolArgs !== void 0) data.input = step.toolArgs;
-    if (step.toolResult !== void 0) data.output = step.toolResult;
-  } else if (step.stepType === "workflow_step") {
-    if (step.output !== void 0) data.output = step.output;
-  }
-  if (Object.keys(data).length > 0) result.data = data;
-  if (step.children && step.children.length > 0) {
+  const { durationMs: _, metadata: _m, children, ...rest } = step;
+  const result = rest;
+  if (children && children.length > 0) {
     result.children = {
-      steps: step.children.map(trajectoryStepToExpectedStep)
+      steps: children.map(trajectoryStepToExpectedStep)
     };
   }
   return result;
@@ -3257,15 +3244,14 @@ function expectationToExpectedSteps(expectation) {
 }
 function createTrajectoryAccuracyScorerCode(options = {}) {
   const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
-  const { ordering, strictOrder, compareStepData = false, allowRepeatedSteps = true } = comparisonOptions;
-  const resolvedOrdering = ordering ?? (strictOrder ? "strict" : "relaxed");
+  const { ordering = "relaxed", allowRepeatedSteps = true } = comparisonOptions;
   const staticExpectedSteps = staticExpectedTrajectory ? Array.isArray(staticExpectedTrajectory) && staticExpectedTrajectory.length > 0 && !("steps" in staticExpectedTrajectory[0] || false) ? staticExpectedTrajectory : "steps" in staticExpectedTrajectory ? staticExpectedTrajectory.steps.map(trajectoryStepToExpectedStep) : void 0 : void 0;
   const getDescription = () => {
     if (staticExpectedSteps) {
       const expectedStepNames = staticExpectedSteps.map((s) => s.name).join(" \u2192 ");
-      return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${resolvedOrdering} ordering)`;
+      return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${ordering} ordering)`;
     }
-    return `Evaluates trajectory accuracy against expected trajectory from dataset items (${resolvedOrdering} ordering)`;
+    return `Evaluates trajectory accuracy against expected trajectory from dataset items (${ordering} ordering)`;
   };
   return evals.createScorer({
     id: "code-trajectory-accuracy-scorer",
@@ -3290,15 +3276,13 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
       };
     }
     const itemExpectation = run.expectedTrajectory;
-    const effectiveOrdering = itemExpectation?.ordering ?? resolvedOrdering;
-    const effectiveCompareData = itemExpectation?.compareStepData ?? compareStepData;
+    const effectiveOrdering = itemExpectation?.ordering ?? ordering;
     const effectiveAllowRepeated = itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps;
-    const comparison = chunkXRUR5PBK_cjs.compareTrajectories(
+    const comparison = chunkAY4K3J4R_cjs.compareTrajectories(
       actualTrajectory,
       { steps: resolvedExpectedSteps },
       {
         ordering: effectiveOrdering,
-        compareStepData: effectiveCompareData,
         allowRepeatedSteps: effectiveAllowRepeated
       }
     );
@@ -3317,7 +3301,7 @@ function createTrajectoryAccuracyScorerCode(options = {}) {
     return preprocessResult.comparison.score;
   });
 }
-function evaluateNestedExpectations(expectedSteps, actualSteps) {
+function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accuracy: 0.4, efficiency: 0.3, toolFailures: 0.2, blacklist: 0.1 }) {
   const results = [];
   const matchedIndices = /* @__PURE__ */ new Set();
   for (const expectedStep of expectedSteps) {
@@ -3352,47 +3336,47 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
     const childConfig = expectedStep.children;
     let accuracy;
     if (childConfig.steps && childConfig.steps.length > 0) {
-      accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
+      accuracy = chunkAY4K3J4R_cjs.compareTrajectories(
         childTrajectory,
         { steps: childConfig.steps },
         {
           ordering: childConfig.ordering ?? "relaxed",
-          compareStepData: childConfig.compareStepData ?? false,
           allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
         }
       );
     }
     const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
-    const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(childTrajectory, {
+    const efficiency = hasEfficiencyConfig ? chunkAY4K3J4R_cjs.checkTrajectoryEfficiency(childTrajectory, {
       maxSteps: childConfig.maxSteps,
       maxTotalTokens: childConfig.maxTotalTokens,
       maxTotalDurationMs: childConfig.maxTotalDurationMs,
       noRedundantCalls: childConfig.noRedundantCalls ?? true
     }) : void 0;
     const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
-    const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(childTrajectory, {
+    const blacklist = hasBlacklistConfig ? chunkAY4K3J4R_cjs.checkTrajectoryBlacklist(childTrajectory, {
       blacklistedTools: childConfig.blacklistedTools,
       blacklistedSequences: childConfig.blacklistedSequences
     }) : void 0;
-    const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(childTrajectory, {
+    const toolFailures = chunkAY4K3J4R_cjs.analyzeToolFailures(childTrajectory, {
       maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
     });
-    const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children) : [];
+    const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children, weights) : [];
     const scores = [];
-    if (accuracy) scores.push({ weight: 0.4, value: accuracy.score });
-    if (efficiency) scores.push({ weight: 0.3, value: efficiency.score });
-    if (toolFailures && toolFailures.patterns.length > 0) scores.push({ weight: 0.2, value: toolFailures.score });
+    if (accuracy) scores.push({ weight: weights.accuracy, value: accuracy.score });
+    if (efficiency) scores.push({ weight: weights.efficiency, value: efficiency.score });
+    if (toolFailures && toolFailures.patterns.length > 0)
+      scores.push({ weight: weights.toolFailures, value: toolFailures.score });
     if (blacklist) {
       if (blacklist.score === 0) {
         results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
         continue;
       }
-      scores.push({ weight: 0.1, value: blacklist.score });
+      scores.push({ weight: weights.blacklist, value: blacklist.score });
     }
     let levelScore = 1;
     if (scores.length > 0) {
       const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
-      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
+      levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
     }
     let finalScore = levelScore;
     if (nested.length > 0) {
@@ -3417,7 +3401,13 @@ function evaluateNestedExpectations(expectedSteps, actualSteps) {
   return results;
 }
 function createTrajectoryScorerCode(options = {}) {
-  const { defaults = {} } = options;
+  const { defaults = {}, weights: userWeights = {} } = options;
+  const w = {
+    accuracy: Math.max(0, userWeights.accuracy ?? 0.4),
+    efficiency: Math.max(0, userWeights.efficiency ?? 0.3),
+    toolFailures: Math.max(0, userWeights.toolFailures ?? 0.2),
+    blacklist: Math.max(0, userWeights.blacklist ?? 0.1)
+  };
   return evals.createScorer({
     id: "code-trajectory-scorer",
     name: "Trajectory Scorer",
@@ -3432,32 +3422,31 @@ function createTrajectoryScorerCode(options = {}) {
     }
     let accuracy;
     if (config.steps && config.steps.length > 0) {
-      accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
+      accuracy = chunkAY4K3J4R_cjs.compareTrajectories(
         actualTrajectory,
         { steps: config.steps },
         {
           ordering: config.ordering ?? "relaxed",
-          compareStepData: config.compareStepData ?? false,
           allowRepeatedSteps: config.allowRepeatedSteps ?? true
         }
       );
     }
     const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
-    const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(actualTrajectory, {
+    const efficiency = hasEfficiencyConfig ? chunkAY4K3J4R_cjs.checkTrajectoryEfficiency(actualTrajectory, {
       maxSteps: config.maxSteps,
       maxTotalTokens: config.maxTotalTokens,
       maxTotalDurationMs: config.maxTotalDurationMs,
       noRedundantCalls: config.noRedundantCalls ?? true
     }) : void 0;
     const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
-    const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(actualTrajectory, {
+    const blacklist = hasBlacklistConfig ? chunkAY4K3J4R_cjs.checkTrajectoryBlacklist(actualTrajectory, {
       blacklistedTools: config.blacklistedTools,
       blacklistedSequences: config.blacklistedSequences
     }) : void 0;
-    const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(actualTrajectory, {
+    const toolFailures = chunkAY4K3J4R_cjs.analyzeToolFailures(actualTrajectory, {
       maxRetriesPerTool: config.maxRetriesPerTool ?? 2
     });
-    const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps) : void 0;
+    const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w) : void 0;
     return {
       accuracy,
       efficiency,
@@ -3473,16 +3462,16 @@ function createTrajectoryScorerCode(options = {}) {
     }
     const scores = [];
     if (accuracy) {
-      scores.push({ weight: 0.4, value: accuracy.score });
+      scores.push({ weight: w.accuracy, value: accuracy.score });
     }
     if (efficiency) {
-      scores.push({ weight: 0.3, value: efficiency.score });
+      scores.push({ weight: w.efficiency, value: efficiency.score });
     }
     if (toolFailures && toolFailures.patterns.length > 0) {
-      scores.push({ weight: 0.2, value: toolFailures.score });
+      scores.push({ weight: w.toolFailures, value: toolFailures.score });
     }
     if (blacklist) {
-      scores.push({ weight: 0.1, value: blacklist.score });
+      scores.push({ weight: w.blacklist, value: blacklist.score });
     }
     if (scores.length === 0 && !nested) {
       return 1;
@@ -3490,7 +3479,7 @@ function createTrajectoryScorerCode(options = {}) {
     let levelScore = 1;
     if (scores.length > 0) {
       const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
-      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
+      levelScore = totalWeight > 0 ? scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0) : 1;
     }
     if (nested && nested.length > 0) {
       const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);