@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/CHANGELOG.md +59 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
@@ -1,4 +1,4 @@
1
- import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage } from '../../chunk-OEOE7ZHN.js';
1
+ import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-EVBNIL5M.js';
2
2
  import { createScorer } from '@mastra/core/evals';
3
3
  import { z } from 'zod';
4
4
  import nlp from 'compromise';
@@ -2600,6 +2600,245 @@ function createPromptAlignmentScorerLLM({
2600
2600
  }
2601
2601
  });
2602
2602
  }
2603
+
2604
+ // src/scorers/llm/trajectory/prompts.ts
2605
// System instructions handed to the judge model of the LLM trajectory scorer.
// The text is assembled line by line; the joined result starts and ends with
// a newline.
var TRAJECTORY_EVALUATION_INSTRUCTIONS = [
  "",
  "You are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.",
  "",
  "CORE RESPONSIBILITIES:",
  "- Analyze the full sequence of actions the agent took",
  "- Evaluate whether each step was necessary and well-ordered",
  "- Identify unnecessary, redundant, or missing steps",
  "- Assess the overall quality of the agent's action path",
  "",
  "EVALUATION PHILOSOPHY:",
  "- Consider both the individual steps AND the overall flow",
  "- A good trajectory is efficient, logical, and complete",
  "- Redundant steps reduce quality even if the final result is correct",
  "- Missing critical steps are a significant issue",
  "- Order matters: logical dependencies should be respected",
  "",
  "OUTPUT REQUIREMENTS:",
  "- Provide clear reasoning for your trajectory assessment",
  "- Use provided JSON schema exactly as specified",
  "- Be consistent in your evaluation standards",
  ""
].join("\n");
2626
/**
 * Builds the prompt that asks the judge model to analyze a trajectory.
 *
 * The prompt is made of three parts: a header with the request, the final
 * response and the actual trajectory; a criteria section that depends on
 * whether an expected trajectory is available; and a closing instruction.
 *
 * @param {object} params
 * @param {string} params.userInput - Text interpolated as the user request.
 * @param {string} params.agentResponse - Text interpolated as the final response.
 * @param {string} params.actualTrajectory - Pre-formatted actual trajectory.
 * @param {string} [params.expectedTrajectory] - Pre-formatted expected
 *   trajectory; any falsy value selects the task-based criteria instead.
 * @returns {string} The complete prompt text.
 */
var createAnalyzePrompt6 = ({
  userInput,
  agentResponse,
  actualTrajectory,
  expectedTrajectory
}) => {
  const header = `
You are evaluating whether an AI agent took an appropriate sequence of actions to fulfill a user request.

USER REQUEST: "${userInput}"
AGENT FINAL RESPONSE: "${agentResponse}"

ACTUAL TRAJECTORY (sequence of actions the agent took):
${actualTrajectory}
`;
  // With an expectation the judge compares against it; without one it judges
  // the steps against the task itself.
  const criteria = expectedTrajectory ? `
EXPECTED TRAJECTORY (the ideal sequence):
${expectedTrajectory}

EVALUATION CRITERIA:
1. STEP PRESENCE: Did the agent perform all expected steps?
2. STEP ORDER: Were the steps in a logical order? (Expected order is a guideline, not absolute)
3. EXTRA STEPS: Did the agent take unnecessary steps not in the expected trajectory?
4. MISSING STEPS: Are any expected steps missing from the actual trajectory?
5. STEP QUALITY: For each step that matches, was it executed appropriately?

For each actual step, evaluate:
- Does it correspond to an expected step?
- Was it necessary for the task?
- Was it in the right position in the sequence?
` : `
EVALUATION CRITERIA (no expected trajectory provided - evaluate based on the task):
1. COMPLETENESS: Did the agent take all necessary steps to fulfill the request?
2. EFFICIENCY: Were there any redundant or unnecessary steps?
3. ORDERING: Were the steps in a logical order given their dependencies?
4. APPROPRIATENESS: Was each step appropriate for the task?
`;
  const footer = `
Evaluate each step and the overall trajectory quality.
`;
  return header + criteria + footer;
};
2672
/**
 * Builds the prompt that asks the judge model for a one-sentence explanation
 * of a trajectory score.
 *
 * @param {object} params
 * @param {string} params.userInput - Text interpolated as the user request.
 * @param {number} params.score - Score shown as `<score>/1`.
 * @param {*} params.stepEvaluations - Serialized with `JSON.stringify`.
 * @param {*} params.missingSteps - Serialized with `JSON.stringify`.
 * @param {*} params.extraSteps - Serialized with `JSON.stringify`.
 * @returns {string} The complete prompt text.
 */
var createReasonPrompt7 = ({
  userInput,
  score,
  stepEvaluations,
  missingSteps,
  extraSteps
}) => {
  const promptLines = [
    "",
    "Explain this trajectory evaluation in ONE SENTENCE.",
    "",
    `User Request: "${userInput}"`,
    `Score: ${score}/1`,
    `Steps Evaluated: ${JSON.stringify(stepEvaluations)}`,
    `Missing Steps: ${JSON.stringify(missingSteps)}`,
    `Extra/Unnecessary Steps: ${JSON.stringify(extraSteps)}`,
    "",
    "Provide a single, concise sentence explaining why this score was given.",
    ""
  ];
  return promptLines.join("\n");
};
2691
+
2692
+ // src/scorers/llm/trajectory/index.ts
2693
// Schema for the judge's verdict on one step of the trajectory.
var trajectoryStepEvaluationSchema = z.object({
  stepName: z.string().describe("Name of the step (tool name or action)"),
  wasNecessary: z.boolean().describe("Whether this step was necessary for the task"),
  wasInOrder: z.boolean().describe("Whether this step was in a logical position in the sequence"),
  reasoning: z.string().describe("Brief explanation of the evaluation")
});

// Schema for the full analysis returned by the judge: one verdict per step,
// optional lists of missing and extra steps, and an overall assessment.
var analyzeOutputSchema6 = z.object({
  stepEvaluations: z.array(trajectoryStepEvaluationSchema),
  missingSteps: z.array(z.string()).optional().describe("Steps that should have been taken but were not"),
  extraSteps: z.array(z.string()).optional().describe("Steps that were unnecessary or redundant"),
  overallAssessment: z.string().describe("Brief overall assessment of the trajectory quality")
});
2706
/**
 * Renders the payload of a trajectory step as a parenthesised suffix.
 *
 * - `tool_call` / `mcp_tool_call`: the defined values among `toolArgs` and
 *   `toolResult`, JSON-serialized.
 * - `model_generation`: the `modelId`, when truthy.
 * - `workflow_step`: the `output`, when defined, JSON-serialized.
 *
 * @param {object} step - Trajectory step; only `stepType` and the fields
 *   listed above are read.
 * @returns {string} A suffix beginning with a space, or `""` when there is
 *   nothing to show or the step type is not one of the above.
 */
function formatStepDetails(step) {
  const { stepType } = step;

  if (stepType === "tool_call" || stepType === "mcp_tool_call") {
    const details = [
      ["args", step.toolArgs],
      ["result", step.toolResult]
    ].filter(([, value]) => value !== void 0).map(([label, value]) => `${label}: ${JSON.stringify(value)}`);
    return details.length > 0 ? ` (${details.join(", ")})` : "";
  }

  if (stepType === "model_generation") {
    return step.modelId ? ` (model: ${step.modelId})` : "";
  }

  if (stepType === "workflow_step") {
    return step.output !== void 0 ? ` (output: ${JSON.stringify(step.output)})` : "";
  }

  return "";
}
2723
/**
 * Renders a trajectory as numbered lines of the form
 * `<n>. [<stepType>] <name><details>`, one line per step.
 *
 * Steps that have a non-empty `children` array are followed by their children,
 * rendered the same way one indent level deeper.
 *
 * @param {{steps: Array}} trajectory - Object whose `steps` are rendered.
 * @param {number} [indent=0] - Nesting depth; each level repeats the pad
 *   string once in front of the line.
 * @returns {string} The rendered lines joined with newlines.
 */
function formatTrajectory(trajectory, indent = 0) {
  const pad = " ".repeat(indent);
  const renderStep = (step, index) => {
    const header = `${pad}${index + 1}. [${step.stepType}] ${step.name}${formatStepDetails(step)}`;
    const hasChildren = step.children && step.children.length > 0;
    if (!hasChildren) {
      return header;
    }
    const renderedChildren = formatTrajectory({ steps: step.children }, indent + 1);
    return `${header}
${renderedChildren}`;
  };
  return trajectory.steps.map(renderStep).join("\n");
}
2734
/**
 * Renders expected steps as numbered lines of the form
 * `<n>. [<stepType>] <name> (data: <json>)`. The type tag and the data suffix
 * are omitted when the step has no truthy `stepType` / `data`.
 *
 * Steps whose `children.steps` is a non-empty list are followed by those
 * child steps, rendered one indent level deeper.
 *
 * @param {Array} steps - Expected steps to render.
 * @param {number} [indent=0] - Nesting depth; each level repeats the pad
 *   string once in front of the line.
 * @returns {string} The rendered lines joined with newlines.
 */
function formatExpectedSteps(steps, indent = 0) {
  const pad = " ".repeat(indent);
  const renderStep = (step, index) => {
    const typeTag = step.stepType ? `[${step.stepType}] ` : "";
    const dataSuffix = step.data ? ` (data: ${JSON.stringify(step.data)})` : "";
    const header = `${pad}${index + 1}. ${typeTag}${step.name}${dataSuffix}`;
    const childSteps = step.children?.steps;
    if (!(childSteps && childSteps.length > 0)) {
      return header;
    }
    return `${header}
${formatExpectedSteps(childSteps, indent + 1)}`;
  };
  return steps.map(renderStep).join("\n");
}
2747
/**
 * Creates an LLM-judged scorer for agent trajectories.
 *
 * Pipeline:
 * 1. preprocess - formats the actual trajectory (and the expected steps, when
 *    available) as numbered text for the prompt.
 * 2. analyze - asks the judge model for a per-step verdict matching
 *    `analyzeOutputSchema6`.
 * 3. generateScore - turns the verdicts into a number between 0 and 1.
 * 4. generateReason - asks the judge model for a one-sentence explanation.
 *
 * @param {object} options
 * @param {*} options.model - Judge model, passed through to `createScorer`.
 * @param {Array|{steps: Array}} [options.expectedTrajectory] - Optional static
 *   expectation. An array is used as the list of expected steps as-is; any
 *   other truthy value is treated as a trajectory whose `steps` are converted
 *   with `trajectoryStepToExpectedStep`. When omitted, `run.expectedTrajectory`
 *   is used if it has a non-empty `steps` list.
 * @returns The scorer produced by the `createScorer` pipeline.
 */
function createTrajectoryAccuracyScorerLLM({
  model,
  expectedTrajectory: staticExpectedTrajectory
}) {
  return createScorer({
    id: "llm-trajectory-accuracy-scorer",
    name: "Trajectory Accuracy (LLM)",
    description: staticExpectedTrajectory ? "Evaluates the trajectory against an expected trajectory using LLM analysis" : "Evaluates the quality and appropriateness of the trajectory using LLM analysis",
    judge: {
      model,
      instructions: TRAJECTORY_EVALUATION_INSTRUCTIONS
    },
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    let expectedSteps;
    if (staticExpectedTrajectory) {
      // Reuse the shared converter so nested children keep their data and
      // deeper levels instead of being flattened to name/stepType one level down.
      expectedSteps = Array.isArray(staticExpectedTrajectory) ? staticExpectedTrajectory : staticExpectedTrajectory.steps.map((step) => trajectoryStepToExpectedStep(step));
    } else if (run.expectedTrajectory) {
      const expectation = run.expectedTrajectory;
      expectedSteps = expectation.steps && expectation.steps.length > 0 ? expectation.steps : void 0;
    }
    return {
      actualTrajectory,
      actualTrajectoryFormatted: formatTrajectory(actualTrajectory),
      expectedTrajectoryFormatted: expectedSteps ? formatExpectedSteps(expectedSteps) : void 0,
      hasSteps: actualTrajectory.steps.length > 0
    };
  }).analyze({
    description: "Analyze the quality and appropriateness of the agent trajectory",
    outputSchema: analyzeOutputSchema6,
    createPrompt: ({ run, results }) => {
      const userInput = getUserMessageFromRunInput(run.input) ?? "";
      const agentResponse = getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
      return createAnalyzePrompt6({
        userInput,
        agentResponse,
        actualTrajectory: results.preprocessStepResult?.actualTrajectoryFormatted ?? "No steps taken",
        expectedTrajectory: results.preprocessStepResult?.expectedTrajectoryFormatted
      });
    }
  }).generateScore(({ results }) => {
    const analysis = results.analyzeStepResult;
    const stepEvaluations = analysis?.stepEvaluations || [];
    const missingSteps = analysis?.missingSteps || [];
    if (stepEvaluations.length === 0) {
      // Nothing was evaluated: missing steps mean failure, extra steps a
      // partial score, and an empty analysis counts as a pass.
      const extraSteps = analysis?.extraSteps || [];
      if (missingSteps.length > 0) return 0;
      if (extraSteps.length > 0) return 0.5;
      return 1;
    }
    const totalSteps = stepEvaluations.length;
    const necessityScore = stepEvaluations.filter((e) => e.wasNecessary).length / totalSteps;
    const orderScore = stepEvaluations.filter((e) => e.wasInOrder).length / totalSteps;
    const missingPenalty = missingSteps.length > 0 ? missingSteps.length / (totalSteps + missingSteps.length) : 0;
    // Weights: necessity 0.6, ordering 0.3, completeness 0.1. Completeness is
    // added as (1 - penalty) so the weights sum to 1 and a trajectory with
    // every step necessary, in order and nothing missing scores 1 rather
    // than being capped at 0.9.
    const score = necessityScore * 0.6 + orderScore * 0.3 + (1 - missingPenalty) * 0.1;
    return roundToTwoDecimals(Math.max(0, Math.min(1, score)));
  }).generateReason({
    description: "Generate human-readable explanation of trajectory evaluation",
    createPrompt: ({ run, results, score }) => {
      const userInput = getUserMessageFromRunInput(run.input) ?? "";
      const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
      const missingSteps = results.analyzeStepResult?.missingSteps || [];
      const extraSteps = results.analyzeStepResult?.extraSteps || [];
      return createReasonPrompt7({
        userInput,
        score,
        stepEvaluations,
        missingSteps,
        extraSteps
      });
    }
  });
}
2603
2842
  function normalizeString(str) {
2604
2843
  return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
2605
2844
  }
@@ -2986,7 +3225,344 @@ function createToolCallAccuracyScorerCode(options) {
2986
3225
  });
2987
3226
  });
2988
3227
  }
3228
/**
 * Converts a recorded trajectory step into an expected-step description.
 *
 * The result always carries `name` and `stepType`. A `data` object is attached
 * only when it has at least one entry:
 * - tool calls (`tool_call` / `mcp_tool_call`) map `toolArgs` to `data.input`
 *   and `toolResult` to `data.output`, each only when defined;
 * - `workflow_step` maps a defined `output` to `data.output`.
 * A non-empty `children` array is converted recursively into
 * `children.steps`.
 *
 * @param {object} step - Recorded trajectory step.
 * @returns {object} The expected-step description.
 */
function trajectoryStepToExpectedStep(step) {
  const { name, stepType, children } = step;
  const expected = { name, stepType };
  const data = {};
  const isToolCall = stepType === "tool_call" || stepType === "mcp_tool_call";

  if (isToolCall) {
    if (step.toolArgs !== void 0) {
      data.input = step.toolArgs;
    }
    if (step.toolResult !== void 0) {
      data.output = step.toolResult;
    }
  } else if (stepType === "workflow_step" && step.output !== void 0) {
    data.output = step.output;
  }

  if (Object.keys(data).length > 0) {
    expected.data = data;
  }
  if (children && children.length > 0) {
    expected.children = { steps: children.map(trajectoryStepToExpectedStep) };
  }
  return expected;
}
3245
/**
 * Extracts the expected steps from a trajectory expectation.
 *
 * @param {{steps?: Array}} expectation - Expectation object to read.
 * @returns {Array|undefined} `expectation.steps`, or `undefined` when it is
 *   falsy or has length 0.
 */
function expectationToExpectedSteps(expectation) {
  const { steps } = expectation;
  return steps && steps.length !== 0 ? steps : void 0;
}
3249
/**
 * Creates a code-based scorer that compares the actual trajectory with an
 * expected list of steps via `compareTrajectories`.
 *
 * Expected steps come from `options.expectedTrajectory` when it yields any;
 * otherwise from `run.expectedTrajectory`. Comparison settings on the run's
 * expectation (`ordering`, `compareStepData`, `allowRepeatedSteps`) take
 * precedence over those in `options.comparisonOptions`.
 *
 * @param {object} [options]
 * @param {Array|{steps: Array}} [options.expectedTrajectory] - Static
 *   expectation: a non-empty array whose first element has no `steps` key is
 *   used as-is; an object with a `steps` key has its steps converted with
 *   `trajectoryStepToExpectedStep`.
 * @param {object} [options.comparisonOptions] - `ordering`, `strictOrder`,
 *   `compareStepData` (default `false`), `allowRepeatedSteps` (default `true`).
 *   When `ordering` is nullish, `strictOrder` selects "strict" or "relaxed".
 * @returns The scorer produced by the `createScorer` pipeline. Its score is
 *   the comparison score, or 0 when no comparison could be made.
 */
function createTrajectoryAccuracyScorerCode(options = {}) {
  const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
  const { ordering, strictOrder, compareStepData = false, allowRepeatedSteps = true } = comparisonOptions;
  const resolvedOrdering = ordering ?? (strictOrder ? "strict" : "relaxed");
  const namesOf = (steps) => steps.map((s) => s.name);

  // Resolve the static expectation once, at scorer-creation time.
  let staticExpectedSteps;
  if (staticExpectedTrajectory) {
    const isExpectedStepList = Array.isArray(staticExpectedTrajectory) && staticExpectedTrajectory.length > 0 && !("steps" in staticExpectedTrajectory[0]);
    if (isExpectedStepList) {
      staticExpectedSteps = staticExpectedTrajectory;
    } else if ("steps" in staticExpectedTrajectory) {
      staticExpectedSteps = staticExpectedTrajectory.steps.map(trajectoryStepToExpectedStep);
    }
  }

  const orderingLabel = `${resolvedOrdering} ordering`;
  const description = staticExpectedSteps ? `Evaluates whether the trajectory matches the expected path: [${namesOf(staticExpectedSteps).join(" \u2192 ")}] (${orderingLabel})` : `Evaluates trajectory accuracy against expected trajectory from dataset items (${orderingLabel})`;

  const scorer = createScorer({
    id: "code-trajectory-accuracy-scorer",
    name: "Trajectory Accuracy Scorer",
    description,
    type: "trajectory"
  });

  return scorer.preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    const itemExpectation = run.expectedTrajectory;

    let expectedSteps = staticExpectedSteps;
    if (!expectedSteps && itemExpectation) {
      expectedSteps = expectationToExpectedSteps(itemExpectation);
    }

    // Without expected steps there is nothing to compare; report the error
    // and leave `comparison` unset so the score falls back to 0.
    if (!expectedSteps || expectedSteps.length === 0) {
      return {
        actualTrajectory,
        expectedTrajectory: void 0,
        comparison: void 0,
        actualStepNames: namesOf(actualTrajectory.steps),
        expectedStepNames: [],
        error: "No expected trajectory provided (pass via options or dataset item expectedTrajectory)"
      };
    }

    const expectedTrajectory = { steps: expectedSteps };
    const comparison = compareTrajectories(actualTrajectory, { steps: expectedSteps }, {
      ordering: itemExpectation?.ordering ?? resolvedOrdering,
      compareStepData: itemExpectation?.compareStepData ?? compareStepData,
      allowRepeatedSteps: itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps
    });
    return {
      actualTrajectory,
      expectedTrajectory,
      comparison,
      actualStepNames: namesOf(actualTrajectory.steps),
      expectedStepNames: namesOf(expectedSteps)
    };
  }).generateScore(({ results }) => {
    const comparison = results.preprocessStepResult ? results.preprocessStepResult.comparison : void 0;
    return comparison ? comparison.score : 0;
  });
}
3311
/**
 * Evaluates the children of every expected step that declares a `children`
 * expectation against the children of the matching actual step.
 *
 * An expected step is matched to the first not-yet-matched actual step with
 * the same `name` (and the same `stepType`, when the expected step has one).
 * When no match exists, or the match has no children, the result scores 0.
 * Otherwise the child trajectory is checked for accuracy, efficiency,
 * blacklist violations and tool failures, the available checks are combined
 * by weight (0.4 / 0.3 / 0.2 / 0.1, renormalized over those present), and
 * deeper nesting contributes 30% of the final score. Any blacklist score of 0,
 * at this level or in a nested result, forces the score to 0.
 *
 * @param {Array} expectedSteps - Expected steps; those without `children` are skipped.
 * @param {Array} actualSteps - Actual steps at the same nesting level.
 * @returns {Array<object>} One result per expected step that has `children`.
 */
function evaluateNestedExpectations(expectedSteps, actualSteps) {
  const claimedIndices = new Set();
  const isBlacklistViolation = (result) => result.blacklist && result.blacklist.score === 0;
  const weightedAverage = (entries) => {
    const totalWeight = entries.reduce((sum, entry) => sum + entry.weight, 0);
    return entries.reduce((sum, entry) => sum + entry.weight / totalWeight * entry.value, 0);
  };

  const evaluateStep = (expectedStep) => {
    const childConfig = expectedStep.children;
    const stepName = expectedStep.name;

    const matchIndex = actualSteps.findIndex((candidate, index) => {
      if (claimedIndices.has(index)) return false;
      if (candidate.name !== expectedStep.name) return false;
      return !expectedStep.stepType || candidate.stepType === expectedStep.stepType;
    });
    const actualStep = matchIndex >= 0 ? actualSteps[matchIndex] : void 0;
    // Claim the match so a later expected step with the same name picks the
    // next actual occurrence.
    if (matchIndex >= 0) claimedIndices.add(matchIndex);

    if (!actualStep?.children || actualStep.children.length === 0) {
      const expectedStepCount = childConfig.steps?.length ?? 0;
      return {
        stepName,
        score: 0,
        accuracy: expectedStepCount > 0 ? {
          score: 0,
          matchedSteps: 0,
          totalExpectedSteps: expectedStepCount,
          totalActualSteps: 0,
          missingSteps: childConfig.steps.map((s) => s.name),
          extraSteps: [],
          outOfOrderSteps: [],
          repeatedSteps: []
        } : void 0
      };
    }

    const childTrajectory = {
      steps: actualStep.children,
      totalDurationMs: actualStep.durationMs
    };

    const hasExpectedChildSteps = childConfig.steps && childConfig.steps.length > 0;
    const accuracy = hasExpectedChildSteps ? compareTrajectories(childTrajectory, { steps: childConfig.steps }, {
      ordering: childConfig.ordering ?? "relaxed",
      compareStepData: childConfig.compareStepData ?? false,
      allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
    }) : void 0;

    // Efficiency is only checked when at least one budget/flag is configured.
    const wantsEfficiency = [
      childConfig.maxSteps,
      childConfig.maxTotalTokens,
      childConfig.maxTotalDurationMs,
      childConfig.noRedundantCalls
    ].some((setting) => setting !== void 0);
    const efficiency = wantsEfficiency ? checkTrajectoryEfficiency(childTrajectory, {
      maxSteps: childConfig.maxSteps,
      maxTotalTokens: childConfig.maxTotalTokens,
      maxTotalDurationMs: childConfig.maxTotalDurationMs,
      noRedundantCalls: childConfig.noRedundantCalls ?? true
    }) : void 0;

    const wantsBlacklist = childConfig.blacklistedTools?.length > 0 || childConfig.blacklistedSequences?.length > 0;
    const blacklist = wantsBlacklist ? checkTrajectoryBlacklist(childTrajectory, {
      blacklistedTools: childConfig.blacklistedTools,
      blacklistedSequences: childConfig.blacklistedSequences
    }) : void 0;

    const toolFailures = analyzeToolFailures(childTrajectory, {
      maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
    });

    const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualStep.children) : [];

    // Result used whenever a blacklist violation zeroes this step.
    const zeroedResult = () => ({ stepName, score: 0, accuracy, efficiency, blacklist, toolFailures, nested });

    const weighted = [];
    if (accuracy) weighted.push({ weight: 0.4, value: accuracy.score });
    if (efficiency) weighted.push({ weight: 0.3, value: efficiency.score });
    if (toolFailures && toolFailures.patterns.length > 0) weighted.push({ weight: 0.2, value: toolFailures.score });
    if (blacklist) {
      if (blacklist.score === 0) return zeroedResult();
      weighted.push({ weight: 0.1, value: blacklist.score });
    }

    const levelScore = weighted.length > 0 ? weightedAverage(weighted) : 1;
    let finalScore = levelScore;
    if (nested.length > 0) {
      if (nested.some((r) => isBlacklistViolation(r))) return zeroedResult();
      const nestedAverage = nested.reduce((sum, r) => sum + r.score, 0) / nested.length;
      finalScore = 0.7 * levelScore + 0.3 * nestedAverage;
    }

    return {
      stepName,
      score: Math.round(finalScore * 100) / 100,
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested: nested.length > 0 ? nested : void 0
    };
  };

  const results = [];
  for (const expectedStep of expectedSteps) {
    if (!expectedStep.children) continue;
    results.push(evaluateStep(expectedStep));
  }
  return results;
}
3410
/**
 * Creates a code-based scorer that evaluates a trajectory along several
 * dimensions: accuracy against expected steps, efficiency budgets, blacklist
 * violations, tool failures, and nested (child) expectations.
 *
 * The effective configuration is `options.defaults` overlaid with
 * `run.expectedTrajectory`. Checks that are present are combined by weight
 * (accuracy 0.4, efficiency 0.3, tool failures 0.2, blacklist 0.1,
 * renormalized over those present); nested results contribute 30% when
 * available. A blacklist score of 0, at the top level or in a nested result,
 * makes the score 0. With no checks and no nested results the score is 1.
 *
 * @param {object} [options]
 * @param {object} [options.defaults] - Default expectation settings.
 * @returns The scorer produced by the `createScorer` pipeline.
 */
function createTrajectoryScorerCode(options = {}) {
  const { defaults = {} } = options;
  const isBlacklistViolation = (result) => result.blacklist && result.blacklist.score === 0;
  const roundScore = (value) => Math.round(value * 100) / 100;

  // Runs every configured check against the actual trajectory.
  const runChecks = async ({ run }) => {
    const actualTrajectory = run.output;
    const itemExpectation = run.expectedTrajectory ?? {};
    const config = { ...defaults, ...itemExpectation };
    if (itemExpectation.steps !== void 0) {
      config.steps = itemExpectation.steps;
    }

    const hasExpectedSteps = config.steps && config.steps.length > 0;
    const accuracy = hasExpectedSteps ? compareTrajectories(actualTrajectory, { steps: config.steps }, {
      ordering: config.ordering ?? "relaxed",
      compareStepData: config.compareStepData ?? false,
      allowRepeatedSteps: config.allowRepeatedSteps ?? true
    }) : void 0;

    // Efficiency is only checked when at least one budget/flag is configured.
    const wantsEfficiency = [
      config.maxSteps,
      config.maxTotalTokens,
      config.maxTotalDurationMs,
      config.noRedundantCalls
    ].some((setting) => setting !== void 0);
    const efficiency = wantsEfficiency ? checkTrajectoryEfficiency(actualTrajectory, {
      maxSteps: config.maxSteps,
      maxTotalTokens: config.maxTotalTokens,
      maxTotalDurationMs: config.maxTotalDurationMs,
      noRedundantCalls: config.noRedundantCalls ?? true
    }) : void 0;

    const wantsBlacklist = config.blacklistedTools?.length > 0 || config.blacklistedSequences?.length > 0;
    const blacklist = wantsBlacklist ? checkTrajectoryBlacklist(actualTrajectory, {
      blacklistedTools: config.blacklistedTools,
      blacklistedSequences: config.blacklistedSequences
    }) : void 0;

    const toolFailures = analyzeToolFailures(actualTrajectory, {
      maxRetriesPerTool: config.maxRetriesPerTool ?? 2
    });

    const nested = hasExpectedSteps ? evaluateNestedExpectations(config.steps, actualTrajectory.steps) : void 0;

    return {
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested: nested && nested.length > 0 ? nested : void 0,
      config
    };
  };

  // Combines the check results into a single score.
  const computeScore = ({ results }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};
    if (blacklist && blacklist.score === 0) {
      return 0;
    }

    const weighted = [];
    if (accuracy) weighted.push({ weight: 0.4, value: accuracy.score });
    if (efficiency) weighted.push({ weight: 0.3, value: efficiency.score });
    if (toolFailures && toolFailures.patterns.length > 0) weighted.push({ weight: 0.2, value: toolFailures.score });
    if (blacklist) weighted.push({ weight: 0.1, value: blacklist.score });

    if (weighted.length === 0 && !nested) {
      return 1;
    }

    let levelScore = 1;
    if (weighted.length > 0) {
      const totalWeight = weighted.reduce((sum, entry) => sum + entry.weight, 0);
      levelScore = weighted.reduce((sum, entry) => sum + entry.weight / totalWeight * entry.value, 0);
    }
    if (nested && nested.length > 0) {
      if (nested.some((r) => isBlacklistViolation(r))) {
        return 0;
      }
      const nestedAverage = nested.reduce((sum, r) => sum + r.score, 0) / nested.length;
      levelScore = 0.7 * levelScore + 0.3 * nestedAverage;
    }
    return roundScore(levelScore);
  };

  // Section builders for the human-readable reason.
  const describeBlacklistViolation = (blacklist) => {
    const violations = [];
    if (blacklist.violatedTools.length > 0) {
      violations.push(`forbidden tools used: ${blacklist.violatedTools.join(", ")}`);
    }
    if (blacklist.violatedSequences.length > 0) {
      violations.push(`forbidden sequences: ${blacklist.violatedSequences.map((s) => s.join(" \u2192 ")).join("; ")}`);
    }
    return `Blacklist violation: ${violations.join(". ")}.`;
  };
  const describeAccuracy = (accuracy) => {
    const details = [`${accuracy.matchedSteps}/${accuracy.totalExpectedSteps} expected steps matched`];
    if (accuracy.missingSteps.length > 0) details.push(`missing: ${accuracy.missingSteps.join(", ")}`);
    if (accuracy.extraSteps.length > 0) details.push(`extra: ${accuracy.extraSteps.join(", ")}`);
    if (accuracy.outOfOrderSteps.length > 0) details.push(`out of order: ${accuracy.outOfOrderSteps.join(", ")}`);
    return `Accuracy (${accuracy.score}): ${details.join(". ")}.`;
  };
  const describeEfficiency = (efficiency) => {
    const details = [];
    if (efficiency.overStepBudget) details.push(`over step budget (${efficiency.totalSteps} steps)`);
    if (efficiency.overTokenBudget) details.push(`over token budget (${efficiency.totalTokens} tokens)`);
    if (efficiency.overDurationBudget) details.push(`over duration budget (${efficiency.totalDurationMs}ms)`);
    if (efficiency.redundantCalls.length > 0) {
      details.push(`redundant calls: ${efficiency.redundantCalls.map((c) => c.name).join(", ")}`);
    }
    return details.length > 0 ? `Efficiency (${efficiency.score}): ${details.join(". ")}.` : `Efficiency (${efficiency.score}): all budgets met, no redundant calls.`;
  };
  const describeToolFailures = (toolFailures) => {
    const details = [];
    if (toolFailures.totalRetries > 0) details.push(`${toolFailures.totalRetries} total retries`);
    if (toolFailures.excessiveRetryTools.length > 0) {
      details.push(`excessive retries: ${toolFailures.excessiveRetryTools.join(", ")}`);
    }
    return `Tool failures (${toolFailures.score}): ${details.join(". ")}.`;
  };

  // Explains the score; blacklist violations short-circuit the explanation.
  const explainScore = ({ results, score }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};
    const parts = [`Score: ${score}`];

    if (blacklist && blacklist.score === 0) {
      parts.push(describeBlacklistViolation(blacklist));
      return parts.join("\n");
    }
    if (nested && nested.some((r) => isBlacklistViolation(r))) {
      const violating = nested.filter((r) => isBlacklistViolation(r)).map((r) => r.stepName);
      parts.push(`Nested blacklist violation in: ${violating.join(", ")}.`);
      return parts.join("\n");
    }

    if (accuracy) parts.push(describeAccuracy(accuracy));
    if (efficiency) parts.push(describeEfficiency(efficiency));
    if (toolFailures && toolFailures.patterns.length > 0) parts.push(describeToolFailures(toolFailures));
    if (nested && nested.length > 0) {
      const nestedSummary = nested.map((r) => `${r.stepName}: ${r.score}`).join(", ");
      parts.push(`Nested scores: ${nestedSummary}.`);
    }
    return parts.join("\n");
  };

  return createScorer({
    id: "code-trajectory-scorer",
    name: "Trajectory Scorer",
    description: "Multi-dimensional trajectory evaluation: accuracy, efficiency, blacklist, and tool failures",
    type: "trajectory"
  }).preprocess(runChecks).generateScore(computeScore).generateReason(explainScore);
}
2989
3565
 
2990
- export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer };
3566
+ export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer, createTrajectoryAccuracyScorerCode, createTrajectoryAccuracyScorerLLM, createTrajectoryScorerCode };
2991
3567
  //# sourceMappingURL=index.js.map
2992
3568
  //# sourceMappingURL=index.js.map