@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/CHANGELOG.md +59 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
@@ -1,6 +1,6 @@
1
1
  'use strict';
2
2
 
3
- var chunkW3U7MMDX_cjs = require('../../chunk-W3U7MMDX.cjs');
3
+ var chunkXRUR5PBK_cjs = require('../../chunk-XRUR5PBK.cjs');
4
4
  var evals = require('@mastra/core/evals');
5
5
  var zod = require('zod');
6
6
  var nlp = require('compromise');
@@ -239,14 +239,14 @@ function createAnswerRelevancyScorer({
239
239
  description: "Extract relevant statements from the LLM output",
240
240
  outputSchema: extractOutputSchema,
241
241
  createPrompt: ({ run }) => {
242
- const assistantMessage = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
242
+ const assistantMessage = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
243
243
  return createExtractPrompt(assistantMessage);
244
244
  }
245
245
  }).analyze({
246
246
  description: "Score the relevance of the statements to the input",
247
247
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
248
248
  createPrompt: ({ run, results }) => {
249
- const input = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
249
+ const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
250
250
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
251
251
  }
252
252
  }).generateScore(({ results }) => {
@@ -263,13 +263,13 @@ function createAnswerRelevancyScorer({
263
263
  }
264
264
  }
265
265
  const score = relevancyCount / numberOfResults;
266
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * options.scale);
266
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * options.scale);
267
267
  }).generateReason({
268
268
  description: "Reason about the results",
269
269
  createPrompt: ({ run, results, score }) => {
270
270
  return createReasonPrompt({
271
- input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
272
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
271
+ input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
272
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
273
273
  score,
274
274
  results: results.analyzeStepResult.results,
275
275
  scale: options.scale
@@ -466,7 +466,7 @@ function createAnswerSimilarityScorer({
466
466
  groundTruth: ""
467
467
  });
468
468
  }
469
- const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
469
+ const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
470
470
  const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
471
471
  return createExtractPrompt2({
472
472
  output,
@@ -524,14 +524,14 @@ function createAnswerSimilarityScorer({
524
524
  );
525
525
  score -= extraInfoPenalty;
526
526
  score = Math.max(0, Math.min(1, score));
527
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * mergedOptions.scale);
527
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * mergedOptions.scale);
528
528
  }).generateReason({
529
529
  description: "Generate explanation of similarity score",
530
530
  createPrompt: ({ run, results, score }) => {
531
531
  if (!run.groundTruth) {
532
532
  return "No ground truth was provided for comparison. Score is 0 by default.";
533
533
  }
534
- const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
534
+ const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
535
535
  const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
536
536
  return createReasonPrompt2({
537
537
  output,
@@ -717,7 +717,7 @@ function createFaithfulnessScorer({
717
717
  claims: zod.z.array(zod.z.string())
718
718
  }),
719
719
  createPrompt: ({ run }) => {
720
- const prompt = createFaithfulnessExtractPrompt({ output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
720
+ const prompt = createFaithfulnessExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
721
721
  return prompt;
722
722
  }
723
723
  }).analyze({
@@ -741,14 +741,14 @@ function createFaithfulnessScorer({
741
741
  return 0;
742
742
  }
743
743
  const score = supportedClaims / totalClaims * (options?.scale || 1);
744
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score);
744
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
745
745
  }).generateReason({
746
746
  description: "Reason about the results",
747
747
  createPrompt: ({ run, results, score }) => {
748
748
  const assistantMessage = run.output.find(({ role }) => role === "assistant");
749
749
  const prompt = createFaithfulnessReasonPrompt({
750
- input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
751
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
750
+ input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
751
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
752
752
  context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
753
753
  score,
754
754
  scale: options?.scale || 1,
@@ -881,13 +881,13 @@ function createBiasScorer({ model, options }) {
881
881
  outputSchema: zod.z.object({
882
882
  opinions: zod.z.array(zod.z.string())
883
883
  }),
884
- createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
884
+ createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
885
885
  }).analyze({
886
886
  description: "Score the relevance of the statements to the input",
887
887
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
888
888
  createPrompt: ({ run, results }) => {
889
889
  const prompt = createBiasAnalyzePrompt({
890
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
890
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
891
891
  opinions: results.preprocessStepResult?.opinions || []
892
892
  });
893
893
  return prompt;
@@ -898,7 +898,7 @@ function createBiasScorer({ model, options }) {
898
898
  }
899
899
  const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
900
900
  const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
901
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * (options?.scale || 1));
901
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * (options?.scale || 1));
902
902
  }).generateReason({
903
903
  description: "Reason about the results",
904
904
  createPrompt: ({ score, results }) => {
@@ -1117,7 +1117,7 @@ function createHallucinationScorer({
1117
1117
  claims: zod.z.array(zod.z.string())
1118
1118
  }),
1119
1119
  createPrompt: ({ run }) => {
1120
- const prompt = createHallucinationExtractPrompt({ output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
1120
+ const prompt = createHallucinationExtractPrompt({ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
1121
1121
  return prompt;
1122
1122
  }
1123
1123
  }).analyze({
@@ -1145,7 +1145,7 @@ function createHallucinationScorer({
1145
1145
  return 0;
1146
1146
  }
1147
1147
  const score = contradictedStatements / totalStatements * (options?.scale || 1);
1148
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score);
1148
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
1149
1149
  }).generateReason({
1150
1150
  description: "Reason about the results",
1151
1151
  createPrompt: async ({ run, results, score }) => {
@@ -1156,8 +1156,8 @@ function createHallucinationScorer({
1156
1156
  context = options?.context ?? [];
1157
1157
  }
1158
1158
  const prompt = createHallucinationReasonPrompt({
1159
- input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
1160
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
1159
+ input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
1160
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
1161
1161
  context,
1162
1162
  score,
1163
1163
  scale: options?.scale || 1,
@@ -1271,8 +1271,8 @@ function createToxicityScorer({
1271
1271
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
1272
1272
  createPrompt: ({ run }) => {
1273
1273
  const prompt = createToxicityAnalyzePrompt({
1274
- input: chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "",
1275
- output: chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
1274
+ input: chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "",
1275
+ output: chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
1276
1276
  });
1277
1277
  return prompt;
1278
1278
  }
@@ -1288,7 +1288,7 @@ function createToxicityScorer({
1288
1288
  }
1289
1289
  }
1290
1290
  const score = toxicityCount / numberOfVerdicts;
1291
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score * (options?.scale || 1));
1291
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score * (options?.scale || 1));
1292
1292
  }).generateReason({
1293
1293
  description: "Reason about the results",
1294
1294
  createPrompt: ({ results, score }) => {
@@ -1422,7 +1422,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1422
1422
  if (isInputInvalid || isOutputInvalid) {
1423
1423
  throw new Error("Input and output messages cannot be null or empty");
1424
1424
  }
1425
- const { tools: actualTools, toolCallInfos } = chunkW3U7MMDX_cjs.extractToolCalls(run.output);
1425
+ const { tools: actualTools, toolCallInfos } = chunkXRUR5PBK_cjs.extractToolCalls(run.output);
1426
1426
  return {
1427
1427
  actualTools,
1428
1428
  hasToolCalls: actualTools.length > 0,
@@ -1432,8 +1432,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1432
1432
  description: "Analyze the appropriateness of tool selections",
1433
1433
  outputSchema: analyzeOutputSchema2,
1434
1434
  createPrompt: ({ run, results }) => {
1435
- const userInput = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1436
- const agentResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1435
+ const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1436
+ const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1437
1437
  const toolsCalled = results.preprocessStepResult?.actualTools || [];
1438
1438
  return createAnalyzePrompt2({
1439
1439
  userInput,
@@ -1450,11 +1450,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1450
1450
  }
1451
1451
  const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
1452
1452
  const totalToolCalls = evaluations.length;
1453
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
1453
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
1454
1454
  }).generateReason({
1455
1455
  description: "Generate human-readable explanation of tool selection evaluation",
1456
1456
  createPrompt: ({ run, results, score }) => {
1457
- const userInput = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1457
+ const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1458
1458
  const evaluations = results.analyzeStepResult?.evaluations || [];
1459
1459
  const missingTools = results.analyzeStepResult?.missingTools || [];
1460
1460
  return createReasonPrompt3({
@@ -1659,8 +1659,8 @@ function createContextRelevanceScorerLLM({
1659
1659
  description: "Analyze the relevance and utility of provided context",
1660
1660
  outputSchema: analyzeOutputSchema3,
1661
1661
  createPrompt: ({ run }) => {
1662
- const userQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1663
- const agentResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1662
+ const userQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1663
+ const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1664
1664
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1665
1665
  if (context.length === 0) {
1666
1666
  return createAnalyzePrompt3({
@@ -1708,11 +1708,11 @@ function createContextRelevanceScorerLLM({
1708
1708
  const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
1709
1709
  const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
1710
1710
  const scaledScore = finalScore * (options.scale || 1);
1711
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(scaledScore);
1711
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(scaledScore);
1712
1712
  }).generateReason({
1713
1713
  description: "Generate human-readable explanation of context relevance evaluation",
1714
1714
  createPrompt: ({ run, results, score }) => {
1715
- const userQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1715
+ const userQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1716
1716
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1717
1717
  if (context.length === 0) {
1718
1718
  return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
@@ -1883,8 +1883,8 @@ function createContextPrecisionScorer({
1883
1883
  description: "Evaluate the relevance of each context piece for generating the expected output",
1884
1884
  outputSchema: contextRelevanceOutputSchema,
1885
1885
  createPrompt: ({ run }) => {
1886
- const input = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1887
- const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1886
+ const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1887
+ const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1888
1888
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1889
1889
  if (context.length === 0) {
1890
1890
  throw new Error("No context available for evaluation");
@@ -1917,12 +1917,12 @@ function createContextPrecisionScorer({
1917
1917
  }
1918
1918
  const map = sumPrecision / relevantCount;
1919
1919
  const score = map * (options.scale || 1);
1920
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(score);
1920
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(score);
1921
1921
  }).generateReason({
1922
1922
  description: "Reason about the context precision results",
1923
1923
  createPrompt: ({ run, results, score }) => {
1924
- const input = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
1925
- const output = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1924
+ const input = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
1925
+ const output = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
1926
1926
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
1927
1927
  return createContextPrecisionReasonPrompt({
1928
1928
  input,
@@ -2177,8 +2177,8 @@ function createNoiseSensitivityScorerLLM({
2177
2177
  description: "Analyze the impact of noise on agent response quality",
2178
2178
  outputSchema: analyzeOutputSchema4,
2179
2179
  createPrompt: ({ run }) => {
2180
- const originalQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
2181
- const noisyResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2180
+ const originalQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2181
+ const noisyResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2182
2182
  if (!originalQuery || !noisyResponse) {
2183
2183
  throw new Error("Both original query and noisy response are required for evaluation");
2184
2184
  }
@@ -2221,11 +2221,11 @@ function createNoiseSensitivityScorerLLM({
2221
2221
  const majorIssues = analysisResult.majorIssues || [];
2222
2222
  const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
2223
2223
  finalScore = Math.max(0, finalScore - issuesPenalty);
2224
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(finalScore);
2224
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(finalScore);
2225
2225
  }).generateReason({
2226
2226
  description: "Generate human-readable explanation of noise sensitivity evaluation",
2227
2227
  createPrompt: ({ run, results, score }) => {
2228
- const originalQuery = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
2228
+ const originalQuery = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2229
2229
  const analysisResult = results.analyzeStepResult;
2230
2230
  if (!analysisResult) {
2231
2231
  throw new Error("Analysis step failed to produce results for reason generation");
@@ -2550,9 +2550,9 @@ function createPromptAlignmentScorerLLM({
2550
2550
  description: "Analyze prompt-response alignment across multiple dimensions",
2551
2551
  outputSchema: analyzeOutputSchema5,
2552
2552
  createPrompt: ({ run }) => {
2553
- const userPrompt = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
2554
- const systemPrompt = chunkW3U7MMDX_cjs.getCombinedSystemPrompt(run.input) ?? "";
2555
- const agentResponse = chunkW3U7MMDX_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2553
+ const userPrompt = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2554
+ const systemPrompt = chunkXRUR5PBK_cjs.getCombinedSystemPrompt(run.input) ?? "";
2555
+ const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
2556
2556
  if (evaluationMode === "user" && !userPrompt) {
2557
2557
  throw new Error("User prompt is required for user prompt alignment scoring");
2558
2558
  }
@@ -2588,12 +2588,12 @@ function createPromptAlignmentScorerLLM({
2588
2588
  weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
2589
2589
  }
2590
2590
  const finalScore = weightedScore * scale;
2591
- return chunkW3U7MMDX_cjs.roundToTwoDecimals(finalScore);
2591
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(finalScore);
2592
2592
  }).generateReason({
2593
2593
  description: "Generate human-readable explanation of prompt alignment evaluation",
2594
2594
  createPrompt: ({ run, results, score }) => {
2595
- const userPrompt = chunkW3U7MMDX_cjs.getUserMessageFromRunInput(run.input) ?? "";
2596
- const systemPrompt = chunkW3U7MMDX_cjs.getCombinedSystemPrompt(run.input) ?? "";
2595
+ const userPrompt = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2596
+ const systemPrompt = chunkXRUR5PBK_cjs.getCombinedSystemPrompt(run.input) ?? "";
2597
2597
  const analysis = results.analyzeStepResult;
2598
2598
  if (!analysis) {
2599
2599
  return `Unable to analyze prompt alignment. Score: ${score}`;
@@ -2609,6 +2609,245 @@ function createPromptAlignmentScorerLLM({
2609
2609
  }
2610
2610
  });
2611
2611
  }
2612
+
2613
+ // src/scorers/llm/trajectory/prompts.ts
2614
+ var TRAJECTORY_EVALUATION_INSTRUCTIONS = `
2615
+ You are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.
2616
+
2617
+ CORE RESPONSIBILITIES:
2618
+ - Analyze the full sequence of actions the agent took
2619
+ - Evaluate whether each step was necessary and well-ordered
2620
+ - Identify unnecessary, redundant, or missing steps
2621
+ - Assess the overall quality of the agent's action path
2622
+
2623
+ EVALUATION PHILOSOPHY:
2624
+ - Consider both the individual steps AND the overall flow
2625
+ - A good trajectory is efficient, logical, and complete
2626
+ - Redundant steps reduce quality even if the final result is correct
2627
+ - Missing critical steps are a significant issue
2628
+ - Order matters: logical dependencies should be respected
2629
+
2630
+ OUTPUT REQUIREMENTS:
2631
+ - Provide clear reasoning for your trajectory assessment
2632
+ - Use provided JSON schema exactly as specified
2633
+ - Be consistent in your evaluation standards
2634
+ `;
2635
+ var createAnalyzePrompt6 = ({
2636
+ userInput,
2637
+ agentResponse,
2638
+ actualTrajectory,
2639
+ expectedTrajectory
2640
+ }) => {
2641
+ let prompt = `
2642
+ You are evaluating whether an AI agent took an appropriate sequence of actions to fulfill a user request.
2643
+
2644
+ USER REQUEST: "${userInput}"
2645
+ AGENT FINAL RESPONSE: "${agentResponse}"
2646
+
2647
+ ACTUAL TRAJECTORY (sequence of actions the agent took):
2648
+ ${actualTrajectory}
2649
+ `;
2650
+ if (expectedTrajectory) {
2651
+ prompt += `
2652
+ EXPECTED TRAJECTORY (the ideal sequence):
2653
+ ${expectedTrajectory}
2654
+
2655
+ EVALUATION CRITERIA:
2656
+ 1. STEP PRESENCE: Did the agent perform all expected steps?
2657
+ 2. STEP ORDER: Were the steps in a logical order? (Expected order is a guideline, not absolute)
2658
+ 3. EXTRA STEPS: Did the agent take unnecessary steps not in the expected trajectory?
2659
+ 4. MISSING STEPS: Are any expected steps missing from the actual trajectory?
2660
+ 5. STEP QUALITY: For each step that matches, was it executed appropriately?
2661
+
2662
+ For each actual step, evaluate:
2663
+ - Does it correspond to an expected step?
2664
+ - Was it necessary for the task?
2665
+ - Was it in the right position in the sequence?
2666
+ `;
2667
+ } else {
2668
+ prompt += `
2669
+ EVALUATION CRITERIA (no expected trajectory provided - evaluate based on the task):
2670
+ 1. COMPLETENESS: Did the agent take all necessary steps to fulfill the request?
2671
+ 2. EFFICIENCY: Were there any redundant or unnecessary steps?
2672
+ 3. ORDERING: Were the steps in a logical order given their dependencies?
2673
+ 4. APPROPRIATENESS: Was each step appropriate for the task?
2674
+ `;
2675
+ }
2676
+ prompt += `
2677
+ Evaluate each step and the overall trajectory quality.
2678
+ `;
2679
+ return prompt;
2680
+ };
2681
+ var createReasonPrompt7 = ({
2682
+ userInput,
2683
+ score,
2684
+ stepEvaluations,
2685
+ missingSteps,
2686
+ extraSteps
2687
+ }) => {
2688
+ return `
2689
+ Explain this trajectory evaluation in ONE SENTENCE.
2690
+
2691
+ User Request: "${userInput}"
2692
+ Score: ${score}/1
2693
+ Steps Evaluated: ${JSON.stringify(stepEvaluations)}
2694
+ Missing Steps: ${JSON.stringify(missingSteps)}
2695
+ Extra/Unnecessary Steps: ${JSON.stringify(extraSteps)}
2696
+
2697
+ Provide a single, concise sentence explaining why this score was given.
2698
+ `;
2699
+ };
2700
+
2701
+ // src/scorers/llm/trajectory/index.ts
2702
+ var analyzeOutputSchema6 = zod.z.object({
2703
+ stepEvaluations: zod.z.array(
2704
+ zod.z.object({
2705
+ stepName: zod.z.string().describe("Name of the step (tool name or action)"),
2706
+ wasNecessary: zod.z.boolean().describe("Whether this step was necessary for the task"),
2707
+ wasInOrder: zod.z.boolean().describe("Whether this step was in a logical position in the sequence"),
2708
+ reasoning: zod.z.string().describe("Brief explanation of the evaluation")
2709
+ })
2710
+ ),
2711
+ missingSteps: zod.z.array(zod.z.string()).optional().describe("Steps that should have been taken but were not"),
2712
+ extraSteps: zod.z.array(zod.z.string()).optional().describe("Steps that were unnecessary or redundant"),
2713
+ overallAssessment: zod.z.string().describe("Brief overall assessment of the trajectory quality")
2714
+ });
2715
+ function formatStepDetails(step) {
2716
+ switch (step.stepType) {
2717
+ case "tool_call":
2718
+ case "mcp_tool_call": {
2719
+ const parts = [];
2720
+ if (step.toolArgs !== void 0) parts.push(`args: ${JSON.stringify(step.toolArgs)}`);
2721
+ if (step.toolResult !== void 0) parts.push(`result: ${JSON.stringify(step.toolResult)}`);
2722
+ return parts.length > 0 ? ` (${parts.join(", ")})` : "";
2723
+ }
2724
+ case "model_generation":
2725
+ return step.modelId ? ` (model: ${step.modelId})` : "";
2726
+ case "workflow_step":
2727
+ return step.output !== void 0 ? ` (output: ${JSON.stringify(step.output)})` : "";
2728
+ default:
2729
+ return "";
2730
+ }
2731
+ }
2732
+ function formatTrajectory(trajectory, indent = 0) {
2733
+ const prefix = " ".repeat(indent);
2734
+ return trajectory.steps.map((step, i) => {
2735
+ let line = `${prefix}${i + 1}. [${step.stepType}] ${step.name}${formatStepDetails(step)}`;
2736
+ if (step.children && step.children.length > 0) {
2737
+ line += `
2738
+ ${formatTrajectory({ steps: step.children }, indent + 1)}`;
2739
+ }
2740
+ return line;
2741
+ }).join("\n");
2742
+ }
2743
+ function formatExpectedSteps(steps, indent = 0) {
2744
+ const prefix = " ".repeat(indent);
2745
+ return steps.map((step, i) => {
2746
+ const typeStr = step.stepType ? `[${step.stepType}] ` : "";
2747
+ const dataStr = step.data ? ` (data: ${JSON.stringify(step.data)})` : "";
2748
+ let line = `${prefix}${i + 1}. ${typeStr}${step.name}${dataStr}`;
2749
+ if (step.children?.steps && step.children.steps.length > 0) {
2750
+ line += `
2751
+ ${formatExpectedSteps(step.children.steps, indent + 1)}`;
2752
+ }
2753
+ return line;
2754
+ }).join("\n");
2755
+ }
2756
+ function createTrajectoryAccuracyScorerLLM({
2757
+ model,
2758
+ expectedTrajectory: staticExpectedTrajectory
2759
+ }) {
2760
+ return evals.createScorer({
2761
+ id: "llm-trajectory-accuracy-scorer",
2762
+ name: "Trajectory Accuracy (LLM)",
2763
+ description: staticExpectedTrajectory ? "Evaluates the trajectory against an expected trajectory using LLM analysis" : "Evaluates the quality and appropriateness of the trajectory using LLM analysis",
2764
+ judge: {
2765
+ model,
2766
+ instructions: TRAJECTORY_EVALUATION_INSTRUCTIONS
2767
+ },
2768
+ type: "trajectory"
2769
+ }).preprocess(async ({ run }) => {
2770
+ const actualTrajectory = run.output;
2771
+ let expectedSteps;
2772
+ if (staticExpectedTrajectory) {
2773
+ if (Array.isArray(staticExpectedTrajectory)) {
2774
+ expectedSteps = staticExpectedTrajectory;
2775
+ } else {
2776
+ expectedSteps = staticExpectedTrajectory.steps.map((s) => {
2777
+ const result = { name: s.name, stepType: s.stepType };
2778
+ const data = {};
2779
+ if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolArgs !== void 0)
2780
+ data.input = s.toolArgs;
2781
+ if ((s.stepType === "tool_call" || s.stepType === "mcp_tool_call") && s.toolResult !== void 0)
2782
+ data.output = s.toolResult;
2783
+ if (s.stepType === "workflow_step" && s.output !== void 0) data.output = s.output;
2784
+ if (Object.keys(data).length > 0) result.data = data;
2785
+ if (s.children && s.children.length > 0) {
2786
+ result.children = {
2787
+ steps: s.children.map((c) => ({ name: c.name, stepType: c.stepType }))
2788
+ };
2789
+ }
2790
+ return result;
2791
+ });
2792
+ }
2793
+ } else if (run.expectedTrajectory) {
2794
+ const expectation = run.expectedTrajectory;
2795
+ expectedSteps = expectation.steps && expectation.steps.length > 0 ? expectation.steps : void 0;
2796
+ }
2797
+ return {
2798
+ actualTrajectory,
2799
+ actualTrajectoryFormatted: formatTrajectory(actualTrajectory),
2800
+ expectedTrajectoryFormatted: expectedSteps ? formatExpectedSteps(expectedSteps) : void 0,
2801
+ hasSteps: actualTrajectory.steps.length > 0
2802
+ };
2803
+ }).analyze({
2804
+ description: "Analyze the quality and appropriateness of the agent trajectory",
2805
+ outputSchema: analyzeOutputSchema6,
2806
+ createPrompt: ({ run, results }) => {
2807
+ const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2808
+ const agentResponse = chunkXRUR5PBK_cjs.getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
2809
+ return createAnalyzePrompt6({
2810
+ userInput,
2811
+ agentResponse,
2812
+ actualTrajectory: results.preprocessStepResult?.actualTrajectoryFormatted ?? "No steps taken",
2813
+ expectedTrajectory: results.preprocessStepResult?.expectedTrajectoryFormatted
2814
+ });
2815
+ }
2816
+ }).generateScore(({ results }) => {
2817
+ const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
2818
+ if (stepEvaluations.length === 0) {
2819
+ const missingSteps2 = results.analyzeStepResult?.missingSteps || [];
2820
+ const extraSteps = results.analyzeStepResult?.extraSteps || [];
2821
+ if (missingSteps2.length > 0) return 0;
2822
+ if (extraSteps.length > 0) return 0.5;
2823
+ return 1;
2824
+ }
2825
+ const necessarySteps = stepEvaluations.filter((e) => e.wasNecessary).length;
2826
+ const orderedSteps = stepEvaluations.filter((e) => e.wasInOrder).length;
2827
+ const totalSteps = stepEvaluations.length;
2828
+ const missingSteps = results.analyzeStepResult?.missingSteps || [];
2829
+ const missingPenalty = missingSteps.length > 0 ? missingSteps.length / (totalSteps + missingSteps.length) : 0;
2830
+ const necessityScore = necessarySteps / totalSteps;
2831
+ const orderScore = orderedSteps / totalSteps;
2832
+ const score = necessityScore * 0.6 + orderScore * 0.3 - missingPenalty * 0.1;
2833
+ return chunkXRUR5PBK_cjs.roundToTwoDecimals(Math.max(0, Math.min(1, score)));
2834
+ }).generateReason({
2835
+ description: "Generate human-readable explanation of trajectory evaluation",
2836
+ createPrompt: ({ run, results, score }) => {
2837
+ const userInput = chunkXRUR5PBK_cjs.getUserMessageFromRunInput(run.input) ?? "";
2838
+ const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
2839
+ const missingSteps = results.analyzeStepResult?.missingSteps || [];
2840
+ const extraSteps = results.analyzeStepResult?.extraSteps || [];
2841
+ return createReasonPrompt7({
2842
+ userInput,
2843
+ score,
2844
+ stepEvaluations,
2845
+ missingSteps,
2846
+ extraSteps
2847
+ });
2848
+ }
2849
+ });
2850
+ }
2612
2851
  function normalizeString(str) {
2613
2852
  return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
2614
2853
  }
@@ -2658,18 +2897,18 @@ function createCompletenessScorer() {
2658
2897
  type: "agent"
2659
2898
  }).preprocess(async ({ run }) => {
2660
2899
  const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
2661
- const content = chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i);
2900
+ const content = chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i);
2662
2901
  return content === null || content === void 0;
2663
2902
  });
2664
2903
  const isOutputInvalid = !run.output || run.output.some((i) => {
2665
- const content = chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i);
2904
+ const content = chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i);
2666
2905
  return content === null || content === void 0;
2667
2906
  });
2668
2907
  if (isInputInvalid || isOutputInvalid) {
2669
2908
  throw new Error("Inputs cannot be null or undefined");
2670
2909
  }
2671
- const input = run.input?.inputMessages.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2672
- const output = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2910
+ const input = run.input?.inputMessages.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2911
+ const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2673
2912
  const inputToProcess = input;
2674
2913
  const outputToProcess = output;
2675
2914
  const inputDoc = nlp__default.default(inputToProcess.trim());
@@ -2774,8 +3013,8 @@ function createTextualDifferenceScorer() {
2774
3013
  description: "Calculate textual difference between input and output using sequence matching algorithms.",
2775
3014
  type: "agent"
2776
3015
  }).preprocess(async ({ run }) => {
2777
- const input = run.input?.inputMessages?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2778
- const output = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3016
+ const input = run.input?.inputMessages?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3017
+ const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2779
3018
  const ratio = calculateRatio(input, output);
2780
3019
  const changes = countChanges(input, output);
2781
3020
  const maxLength = Math.max(input.length, output.length);
@@ -2798,8 +3037,8 @@ function createKeywordCoverageScorer() {
2798
3037
  description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
2799
3038
  type: "agent"
2800
3039
  }).preprocess(async ({ run }) => {
2801
- const input = run.input?.inputMessages?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2802
- const output = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3040
+ const input = run.input?.inputMessages?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3041
+ const output = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2803
3042
  if (!input && !output) {
2804
3043
  return {
2805
3044
  result: {
@@ -2852,8 +3091,8 @@ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { igno
2852
3091
  description: "Calculates content similarity between input and output messages using string comparison algorithms.",
2853
3092
  type: "agent"
2854
3093
  }).preprocess(async ({ run }) => {
2855
- let processedInput = run.input?.inputMessages.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2856
- let processedOutput = run.output.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3094
+ let processedInput = run.input?.inputMessages.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3095
+ let processedOutput = run.output.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2857
3096
  if (ignoreCase) {
2858
3097
  processedInput = processedInput.toLowerCase();
2859
3098
  processedOutput = processedOutput.toLowerCase();
@@ -2883,7 +3122,7 @@ function createToneScorer(config = {}) {
2883
3122
  type: "agent"
2884
3123
  }).preprocess(async ({ run }) => {
2885
3124
  const sentiment = new Sentiment__default.default();
2886
- const agentMessage = run.output?.map((i) => chunkW3U7MMDX_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
3125
+ const agentMessage = run.output?.map((i) => chunkXRUR5PBK_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
2887
3126
  const responseSentiment = sentiment.analyze(agentMessage);
2888
3127
  if (referenceTone) {
2889
3128
  const referenceSentiment = sentiment.analyze(referenceTone);
@@ -2970,7 +3209,7 @@ function createToolCallAccuracyScorerCode(options) {
2970
3209
  if (isInputInvalid || isOutputInvalid) {
2971
3210
  throw new Error("Input and output messages cannot be null or empty");
2972
3211
  }
2973
- const { tools: actualTools, toolCallInfos } = chunkW3U7MMDX_cjs.extractToolCalls(run.output);
3212
+ const { tools: actualTools, toolCallInfos } = chunkXRUR5PBK_cjs.extractToolCalls(run.output);
2974
3213
  const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
2975
3214
  return {
2976
3215
  expectedTool,
@@ -2995,6 +3234,343 @@ function createToolCallAccuracyScorerCode(options) {
2995
3234
  });
2996
3235
  });
2997
3236
  }
3237
/**
 * Converts a recorded trajectory step into an "expected step" descriptor.
 *
 * Keeps the step's name and stepType, and — depending on the step kind —
 * carries observed data forward as the expected data: tool calls contribute
 * `toolArgs`/`toolResult` as `data.input`/`data.output`, workflow steps
 * contribute `output` as `data.output`. Children are converted recursively.
 */
function trajectoryStepToExpectedStep(step) {
  const expected = { name: step.name, stepType: step.stepType };

  const payload = {};
  switch (step.stepType) {
    case "tool_call":
    case "mcp_tool_call":
      if (step.toolArgs !== undefined) payload.input = step.toolArgs;
      if (step.toolResult !== undefined) payload.output = step.toolResult;
      break;
    case "workflow_step":
      if (step.output !== undefined) payload.output = step.output;
      break;
    default:
      // Other step kinds carry no comparable data.
      break;
  }
  if (Object.keys(payload).length > 0) {
    expected.data = payload;
  }

  // Recurse into nested steps so sub-trajectories are comparable too.
  if (step.children?.length) {
    expected.children = {
      steps: step.children.map(trajectoryStepToExpectedStep)
    };
  }

  return expected;
}
3254
/**
 * Extracts the expected step list from a dataset-item expectation.
 * Returns the same `steps` array reference, or `undefined` when the
 * expectation has no steps or an empty list.
 */
function expectationToExpectedSteps(expectation) {
  const { steps } = expectation;
  return steps?.length ? steps : undefined;
}
3258
/**
 * Creates a code-based (non-LLM) trajectory accuracy scorer.
 *
 * The expected trajectory can come from two places:
 *  - statically via `options.expectedTrajectory` (either a bare list of
 *    expected steps or a recorded trajectory object with a `steps` array), or
 *  - per dataset item via `run.expectedTrajectory`.
 * Item-level `ordering` / `compareStepData` / `allowRepeatedSteps` settings
 * override the scorer-level comparison options. The score is the comparison
 * score from `compareTrajectories`, or 0 when no comparison could be made.
 */
function createTrajectoryAccuracyScorerCode(options = {}) {
  const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
  const { ordering, strictOrder, compareStepData = false, allowRepeatedSteps = true } = comparisonOptions;
  // An explicit `ordering` wins; otherwise the boolean `strictOrder` selects strict/relaxed.
  const resolvedOrdering = ordering ?? (strictOrder ? "strict" : "relaxed");

  // Normalize the statically-configured expectation into a flat step list.
  const resolveStaticSteps = (trajectory) => {
    if (!trajectory) return void 0;
    // A non-empty array whose first entry lacks a `steps` key is already a bare expected-step list.
    if (Array.isArray(trajectory) && trajectory.length > 0 && !("steps" in trajectory[0])) {
      return trajectory;
    }
    // A recorded trajectory object: convert each recorded step into an expected step.
    if ("steps" in trajectory) {
      return trajectory.steps.map(trajectoryStepToExpectedStep);
    }
    return void 0;
  };
  const staticExpectedSteps = resolveStaticSteps(staticExpectedTrajectory);

  const buildDescription = () => {
    if (!staticExpectedSteps) {
      return `Evaluates trajectory accuracy against expected trajectory from dataset items (${resolvedOrdering} ordering)`;
    }
    const expectedStepNames = staticExpectedSteps.map((s) => s.name).join(" \u2192 ");
    return `Evaluates whether the trajectory matches the expected path: [${expectedStepNames}] (${resolvedOrdering} ordering)`;
  };

  return evals.createScorer({
    id: "code-trajectory-accuracy-scorer",
    name: "Trajectory Accuracy Scorer",
    description: buildDescription(),
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    // Static configuration takes precedence; fall back to the dataset item's expectation.
    const expectedSteps = staticExpectedSteps ?? (run.expectedTrajectory ? expectationToExpectedSteps(run.expectedTrajectory) : void 0);
    if (!expectedSteps || expectedSteps.length === 0) {
      // Nothing to compare against — record an error; generateScore will yield 0.
      return {
        actualTrajectory,
        expectedTrajectory: void 0,
        comparison: void 0,
        actualStepNames: actualTrajectory.steps.map((s) => s.name),
        expectedStepNames: [],
        error: "No expected trajectory provided (pass via options or dataset item expectedTrajectory)"
      };
    }
    // Item-level comparison settings override the scorer-level ones.
    const itemExpectation = run.expectedTrajectory;
    const comparison = chunkXRUR5PBK_cjs.compareTrajectories(
      actualTrajectory,
      { steps: expectedSteps },
      {
        ordering: itemExpectation?.ordering ?? resolvedOrdering,
        compareStepData: itemExpectation?.compareStepData ?? compareStepData,
        allowRepeatedSteps: itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps
      }
    );
    return {
      actualTrajectory,
      expectedTrajectory: { steps: expectedSteps },
      comparison,
      actualStepNames: actualTrajectory.steps.map((s) => s.name),
      expectedStepNames: expectedSteps.map((s) => s.name)
    };
  }).generateScore(({ results }) => {
    const pre = results.preprocessStepResult;
    // No comparison (missing expectation) scores 0.
    if (!pre?.comparison) return 0;
    return pre.comparison.score;
  });
}
3320
/**
 * Recursively evaluates child-level expectations for each expected step that
 * declares `children`.
 *
 * For every such expected step it finds the first unmatched actual step with
 * the same name (and, when specified, the same stepType), then scores that
 * step's child trajectory on up to four dimensions: accuracy (weight 0.4),
 * efficiency (0.3), tool failures (0.2, only when failure patterns exist),
 * and blacklist (0.1). A blacklist violation at this level — or in any nested
 * result — forces the step's score to 0. When deeper nesting exists, the
 * final step score blends 70% of this level's weighted score with 30% of the
 * average nested score. Returns one result entry per expected step that had
 * children; expected steps without `children` are skipped entirely.
 */
function evaluateNestedExpectations(expectedSteps, actualSteps) {
  const results = [];
  // Indices of actual steps already consumed, so two expected steps with the
  // same name match distinct actual steps.
  const matchedIndices = /* @__PURE__ */ new Set();
  for (const expectedStep of expectedSteps) {
    // Only expected steps with child expectations participate.
    if (!expectedStep.children) continue;
    // First unmatched actual step with matching name (and stepType, if given).
    const matchIndex = actualSteps.findIndex(
      (s, i) => !matchedIndices.has(i) && s.name === expectedStep.name && (!expectedStep.stepType || s.stepType === expectedStep.stepType)
    );
    const actualStep = matchIndex >= 0 ? actualSteps[matchIndex] : void 0;
    if (matchIndex >= 0) matchedIndices.add(matchIndex);
    if (!actualStep?.children || actualStep.children.length === 0) {
      // The matched step has no children (or no step matched at all): score 0,
      // and report every expected child step as missing when any were expected.
      const expectedStepCount = expectedStep.children.steps?.length ?? 0;
      results.push({
        stepName: expectedStep.name,
        score: 0,
        accuracy: expectedStepCount > 0 ? {
          score: 0,
          matchedSteps: 0,
          totalExpectedSteps: expectedStepCount,
          totalActualSteps: 0,
          missingSteps: expectedStep.children.steps.map((s) => s.name),
          extraSteps: [],
          outOfOrderSteps: [],
          repeatedSteps: []
        } : void 0
      });
      continue;
    }
    // Wrap the actual step's children as a standalone trajectory for the helpers.
    const childTrajectory = {
      steps: actualStep.children,
      totalDurationMs: actualStep.durationMs
    };
    const childConfig = expectedStep.children;
    // Accuracy: only when explicit child steps were configured.
    let accuracy;
    if (childConfig.steps && childConfig.steps.length > 0) {
      accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
        childTrajectory,
        { steps: childConfig.steps },
        {
          ordering: childConfig.ordering ?? "relaxed",
          compareStepData: childConfig.compareStepData ?? false,
          allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
        }
      );
    }
    // Efficiency: only when at least one budget/redundancy setting is present.
    const hasEfficiencyConfig = childConfig.maxSteps !== void 0 || childConfig.maxTotalTokens !== void 0 || childConfig.maxTotalDurationMs !== void 0 || childConfig.noRedundantCalls !== void 0;
    const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(childTrajectory, {
      maxSteps: childConfig.maxSteps,
      maxTotalTokens: childConfig.maxTotalTokens,
      maxTotalDurationMs: childConfig.maxTotalDurationMs,
      noRedundantCalls: childConfig.noRedundantCalls ?? true
    }) : void 0;
    // Blacklist: only when forbidden tools or sequences are configured.
    const hasBlacklistConfig = childConfig.blacklistedTools && childConfig.blacklistedTools.length > 0 || childConfig.blacklistedSequences && childConfig.blacklistedSequences.length > 0;
    const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(childTrajectory, {
      blacklistedTools: childConfig.blacklistedTools,
      blacklistedSequences: childConfig.blacklistedSequences
    }) : void 0;
    // Tool-failure analysis always runs (default retry budget: 2 per tool).
    const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(childTrajectory, {
      maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
    });
    // Recurse one level deeper when the child config itself declares steps.
    const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children) : [];
    // Collect the applicable weighted components for this level.
    const scores = [];
    if (accuracy) scores.push({ weight: 0.4, value: accuracy.score });
    if (efficiency) scores.push({ weight: 0.3, value: efficiency.score });
    if (toolFailures && toolFailures.patterns.length > 0) scores.push({ weight: 0.2, value: toolFailures.score });
    if (blacklist) {
      if (blacklist.score === 0) {
        // Hard blacklist violation: step scores 0 regardless of other components.
        results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
        continue;
      }
      scores.push({ weight: 0.1, value: blacklist.score });
    }
    // Weighted average over whichever components apply; weights are
    // re-normalized so they always sum to 1. No components => perfect 1.
    let levelScore = 1;
    if (scores.length > 0) {
      const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
    }
    let finalScore = levelScore;
    if (nested.length > 0) {
      // A blacklist violation anywhere in the nested results zeroes this step too.
      const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);
      if (hasNestedBlacklistViolation) {
        results.push({ stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });
        continue;
      }
      // Blend this level (70%) with the average of the nested scores (30%).
      const nestedAvg = nested.reduce((sum, r) => sum + r.score, 0) / nested.length;
      finalScore = 0.7 * levelScore + 0.3 * nestedAvg;
    }
    results.push({
      stepName: expectedStep.name,
      // Rounded to two decimal places.
      score: Math.round(finalScore * 100) / 100,
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested: nested.length > 0 ? nested : void 0
    });
  }
  return results;
}
3419
/**
 * Creates the code-based multi-dimensional trajectory scorer.
 *
 * Evaluates an agent trajectory (`run.output`) on up to four dimensions —
 * accuracy against expected steps (weight 0.4), efficiency budgets (0.3),
 * tool-failure patterns (0.2), and blacklisted tools/sequences (0.1) — plus
 * recursively evaluated nested (child-step) expectations. Configuration is
 * merged from `options.defaults` and the dataset item's
 * `run.expectedTrajectory`, with item values winning. A blacklist violation
 * at the top level or in any nested result forces the score to 0; with
 * nothing configured at all the score is 1. `generateReason` renders a
 * line-per-dimension human-readable breakdown.
 */
function createTrajectoryScorerCode(options = {}) {
  const { defaults = {} } = options;
  return evals.createScorer({
    id: "code-trajectory-scorer",
    name: "Trajectory Scorer",
    description: "Multi-dimensional trajectory evaluation: accuracy, efficiency, blacklist, and tool failures",
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    // Item-level expectation (if any) overrides the scorer-level defaults.
    const itemExpectation = run.expectedTrajectory ?? {};
    const config = { ...defaults, ...itemExpectation };
    if (itemExpectation.steps !== void 0) {
      // NOTE(review): the spread above already copies `steps` when present,
      // so this reassignment looks redundant — kept as-is to preserve behavior.
      config.steps = itemExpectation.steps;
    }
    // Accuracy: only computed when expected steps are configured.
    let accuracy;
    if (config.steps && config.steps.length > 0) {
      accuracy = chunkXRUR5PBK_cjs.compareTrajectories(
        actualTrajectory,
        { steps: config.steps },
        {
          ordering: config.ordering ?? "relaxed",
          compareStepData: config.compareStepData ?? false,
          allowRepeatedSteps: config.allowRepeatedSteps ?? true
        }
      );
    }
    // Efficiency: only when at least one budget/redundancy setting is present.
    const hasEfficiencyConfig = config.maxSteps !== void 0 || config.maxTotalTokens !== void 0 || config.maxTotalDurationMs !== void 0 || config.noRedundantCalls !== void 0;
    const efficiency = hasEfficiencyConfig ? chunkXRUR5PBK_cjs.checkTrajectoryEfficiency(actualTrajectory, {
      maxSteps: config.maxSteps,
      maxTotalTokens: config.maxTotalTokens,
      maxTotalDurationMs: config.maxTotalDurationMs,
      noRedundantCalls: config.noRedundantCalls ?? true
    }) : void 0;
    // Blacklist: only when forbidden tools or sequences are configured.
    const hasBlacklistConfig = config.blacklistedTools && config.blacklistedTools.length > 0 || config.blacklistedSequences && config.blacklistedSequences.length > 0;
    const blacklist = hasBlacklistConfig ? chunkXRUR5PBK_cjs.checkTrajectoryBlacklist(actualTrajectory, {
      blacklistedTools: config.blacklistedTools,
      blacklistedSequences: config.blacklistedSequences
    }) : void 0;
    // Tool-failure analysis always runs (default retry budget: 2 per tool).
    const toolFailures = chunkXRUR5PBK_cjs.analyzeToolFailures(actualTrajectory, {
      maxRetriesPerTool: config.maxRetriesPerTool ?? 2
    });
    // Child-step expectations are evaluated recursively off the same config.steps.
    const nested = config.steps && config.steps.length > 0 ? evaluateNestedExpectations(config.steps, actualTrajectory.steps) : void 0;
    return {
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested: nested && nested.length > 0 ? nested : void 0,
      config
    };
  }).generateScore(({ results }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};
    // Hard top-level blacklist violation zeroes the whole score.
    if (blacklist && blacklist.score === 0) {
      return 0;
    }
    // Collect only the dimensions that were actually evaluated.
    const scores = [];
    if (accuracy) {
      scores.push({ weight: 0.4, value: accuracy.score });
    }
    if (efficiency) {
      scores.push({ weight: 0.3, value: efficiency.score });
    }
    if (toolFailures && toolFailures.patterns.length > 0) {
      scores.push({ weight: 0.2, value: toolFailures.score });
    }
    if (blacklist) {
      scores.push({ weight: 0.1, value: blacklist.score });
    }
    // Nothing configured at all: trivially perfect.
    if (scores.length === 0 && !nested) {
      return 1;
    }
    // Weighted average over the applicable dimensions; weights are
    // re-normalized so they always sum to 1.
    let levelScore = 1;
    if (scores.length > 0) {
      const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
      levelScore = scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
    }
    if (nested && nested.length > 0) {
      // Any nested blacklist violation also zeroes the overall score.
      const hasNestedBlacklistViolation = nested.some((r) => r.blacklist && r.blacklist.score === 0);
      if (hasNestedBlacklistViolation) {
        return 0;
      }
      // Blend top level (70%) with the average nested score (30%).
      const nestedAvg = nested.reduce((sum, r) => sum + r.score, 0) / nested.length;
      levelScore = 0.7 * levelScore + 0.3 * nestedAvg;
    }
    // Rounded to two decimal places.
    return Math.round(levelScore * 100) / 100;
  }).generateReason(({ results, score }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};
    const parts = [];
    parts.push(`Score: ${score}`);
    // Blacklist violations short-circuit the explanation: report and return.
    if (blacklist && blacklist.score === 0) {
      const violations = [];
      if (blacklist.violatedTools.length > 0) {
        violations.push(`forbidden tools used: ${blacklist.violatedTools.join(", ")}`);
      }
      if (blacklist.violatedSequences.length > 0) {
        violations.push(`forbidden sequences: ${blacklist.violatedSequences.map((s) => s.join(" \u2192 ")).join("; ")}`);
      }
      parts.push(`Blacklist violation: ${violations.join(". ")}.`);
      return parts.join("\n");
    }
    if (nested && nested.some((r) => r.blacklist && r.blacklist.score === 0)) {
      const violating = nested.filter((r) => r.blacklist && r.blacklist.score === 0).map((r) => r.stepName);
      parts.push(`Nested blacklist violation in: ${violating.join(", ")}.`);
      return parts.join("\n");
    }
    // One summary line per evaluated dimension.
    if (accuracy) {
      const details = [`${accuracy.matchedSteps}/${accuracy.totalExpectedSteps} expected steps matched`];
      if (accuracy.missingSteps.length > 0) {
        details.push(`missing: ${accuracy.missingSteps.join(", ")}`);
      }
      if (accuracy.extraSteps.length > 0) {
        details.push(`extra: ${accuracy.extraSteps.join(", ")}`);
      }
      if (accuracy.outOfOrderSteps.length > 0) {
        details.push(`out of order: ${accuracy.outOfOrderSteps.join(", ")}`);
      }
      parts.push(`Accuracy (${accuracy.score}): ${details.join(". ")}.`);
    }
    if (efficiency) {
      const details = [];
      if (efficiency.overStepBudget) {
        details.push(`over step budget (${efficiency.totalSteps} steps)`);
      }
      if (efficiency.overTokenBudget) {
        details.push(`over token budget (${efficiency.totalTokens} tokens)`);
      }
      if (efficiency.overDurationBudget) {
        details.push(`over duration budget (${efficiency.totalDurationMs}ms)`);
      }
      if (efficiency.redundantCalls.length > 0) {
        details.push(`redundant calls: ${efficiency.redundantCalls.map((c) => c.name).join(", ")}`);
      }
      if (details.length > 0) {
        parts.push(`Efficiency (${efficiency.score}): ${details.join(". ")}.`);
      } else {
        parts.push(`Efficiency (${efficiency.score}): all budgets met, no redundant calls.`);
      }
    }
    if (toolFailures && toolFailures.patterns.length > 0) {
      const details = [];
      if (toolFailures.totalRetries > 0) {
        details.push(`${toolFailures.totalRetries} total retries`);
      }
      if (toolFailures.excessiveRetryTools.length > 0) {
        details.push(`excessive retries: ${toolFailures.excessiveRetryTools.join(", ")}`);
      }
      parts.push(`Tool failures (${toolFailures.score}): ${details.join(". ")}.`);
    }
    if (nested && nested.length > 0) {
      const nestedSummary = nested.map((r) => `${r.stepName}: ${r.score}`).join(", ");
      parts.push(`Nested scores: ${nestedSummary}.`);
    }
    return parts.join("\n");
  });
}
2998
3574
 
2999
3575
  exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
3000
3576
  exports.ANSWER_SIMILARITY_DEFAULT_OPTIONS = ANSWER_SIMILARITY_DEFAULT_OPTIONS;
@@ -3017,5 +3593,8 @@ exports.createToneScorer = createToneScorer;
3017
3593
  exports.createToolCallAccuracyScorerCode = createToolCallAccuracyScorerCode;
3018
3594
  exports.createToolCallAccuracyScorerLLM = createToolCallAccuracyScorerLLM;
3019
3595
  exports.createToxicityScorer = createToxicityScorer;
3596
+ exports.createTrajectoryAccuracyScorerCode = createTrajectoryAccuracyScorerCode;
3597
+ exports.createTrajectoryAccuracyScorerLLM = createTrajectoryAccuracyScorerLLM;
3598
+ exports.createTrajectoryScorerCode = createTrajectoryScorerCode;
3020
3599
  //# sourceMappingURL=index.cjs.map
3021
3600
  //# sourceMappingURL=index.cjs.map