@mastra/evals 1.1.2 → 1.2.0-alpha.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +78 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-AY4K3J4R.cjs +581 -0
- package/dist/chunk-AY4K3J4R.cjs.map +1 -0
- package/dist/chunk-X4MKZ735.js +555 -0
- package/dist/chunk-X4MKZ735.js.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +289 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +627 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +164 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +627 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +567 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +168 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage } from '../../chunk-
|
|
1
|
+
import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures } from '../../chunk-X4MKZ735.js';
|
|
2
2
|
import { createScorer } from '@mastra/core/evals';
|
|
3
3
|
import { z } from 'zod';
|
|
4
4
|
import nlp from 'compromise';
|
|
@@ -2600,6 +2600,239 @@ function createPromptAlignmentScorerLLM({
|
|
|
2600
2600
|
}
|
|
2601
2601
|
});
|
|
2602
2602
|
}
|
|
2603
|
+
|
|
2604
|
+
// src/scorers/llm/trajectory/prompts.ts
|
|
2605
|
+
/**
 * Standing instructions for the LLM judge used by the trajectory accuracy
 * scorer. The text tells the judge what a trajectory is, how to weigh
 * necessity, ordering and completeness of steps, and that its answer must
 * follow the JSON schema supplied alongside the prompt.
 *
 * The string begins and ends with a newline; it is passed through verbatim.
 */
const TRAJECTORY_EVALUATION_INSTRUCTIONS = `
You are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.

CORE RESPONSIBILITIES:
- Analyze the full sequence of actions the agent took
- Evaluate whether each step was necessary and well-ordered
- Identify unnecessary, redundant, or missing steps
- Assess the overall quality of the agent's action path

EVALUATION PHILOSOPHY:
- Consider both the individual steps AND the overall flow
- A good trajectory is efficient, logical, and complete
- Redundant steps reduce quality even if the final result is correct
- Missing critical steps are a significant issue
- Order matters: logical dependencies should be respected

OUTPUT REQUIREMENTS:
- Provide clear reasoning for your trajectory assessment
- Use provided JSON schema exactly as specified
- Be consistent in your evaluation standards
`;
|
|
2626
|
+
/**
 * Builds the analysis prompt sent to the LLM judge.
 *
 * The prompt is three sections glued together: a header carrying the user
 * request, the agent's final response and the actual trajectory; a criteria
 * section that depends on whether an expected trajectory is available; and a
 * fixed closing instruction.
 *
 * @param {object} params
 * @param {string} params.userInput - Text of the user's request.
 * @param {string} params.agentResponse - Text of the agent's final response.
 * @param {string} params.actualTrajectory - Pre-formatted list of the steps taken.
 * @param {string} [params.expectedTrajectory] - Pre-formatted ideal sequence. Any
 *   falsy value (including an empty string) selects the "no expected trajectory"
 *   criteria.
 * @returns {string} The complete prompt text.
 */
const createAnalyzePrompt6 = ({
  userInput,
  agentResponse,
  actualTrajectory,
  expectedTrajectory
}) => {
  const header = `
You are evaluating whether an AI agent took an appropriate sequence of actions to fulfill a user request.

USER REQUEST: "${userInput}"
AGENT FINAL RESPONSE: "${agentResponse}"

ACTUAL TRAJECTORY (sequence of actions the agent took):
${actualTrajectory}
`;

  // With a reference sequence the judge compares against it; without one it
  // has to judge the path purely on the task itself.
  const criteria = expectedTrajectory ? `
EXPECTED TRAJECTORY (the ideal sequence):
${expectedTrajectory}

EVALUATION CRITERIA:
1. STEP PRESENCE: Did the agent perform all expected steps?
2. STEP ORDER: Were the steps in a logical order? (Expected order is a guideline, not absolute)
3. EXTRA STEPS: Did the agent take unnecessary steps not in the expected trajectory?
4. MISSING STEPS: Are any expected steps missing from the actual trajectory?
5. STEP QUALITY: For each step that matches, was it executed appropriately?

For each actual step, evaluate:
- Does it correspond to an expected step?
- Was it necessary for the task?
- Was it in the right position in the sequence?
` : `
EVALUATION CRITERIA (no expected trajectory provided - evaluate based on the task):
1. COMPLETENESS: Did the agent take all necessary steps to fulfill the request?
2. EFFICIENCY: Were there any redundant or unnecessary steps?
3. ORDERING: Were the steps in a logical order given their dependencies?
4. APPROPRIATENESS: Was each step appropriate for the task?
`;

  const closing = `
Evaluate each step and the overall trajectory quality.
`;

  return header + criteria + closing;
};
|
|
2672
|
+
/**
 * Builds the prompt that asks the LLM judge for a one-sentence explanation of
 * a trajectory score.
 *
 * @param {object} params
 * @param {string} params.userInput - Text of the user's request.
 * @param {number} params.score - Score being explained (rendered as "<score>/1").
 * @param {Array<object>} params.stepEvaluations - Per-step verdicts; embedded as JSON.
 * @param {string[]} params.missingSteps - Steps reported missing; embedded as JSON.
 * @param {string[]} params.extraSteps - Steps reported unnecessary; embedded as JSON.
 * @returns {string} The complete prompt text.
 */
const createReasonPrompt7 = ({
  userInput,
  score,
  stepEvaluations,
  missingSteps,
  extraSteps
}) => {
  const evaluationsJson = JSON.stringify(stepEvaluations);
  const missingJson = JSON.stringify(missingSteps);
  const extraJson = JSON.stringify(extraSteps);

  return `
Explain this trajectory evaluation in ONE SENTENCE.

User Request: "${userInput}"
Score: ${score}/1
Steps Evaluated: ${evaluationsJson}
Missing Steps: ${missingJson}
Extra/Unnecessary Steps: ${extraJson}

Provide a single, concise sentence explaining why this score was given.
`;
};
|
|
2691
|
+
|
|
2692
|
+
// src/scorers/llm/trajectory/index.ts
|
|
2693
|
+
// Verdict the judge returns for a single step of the actual trajectory.
const stepEvaluationSchema6 = z.object({
  stepName: z.string().describe("Name of the step (tool name or action)"),
  wasNecessary: z.boolean().describe("Whether this step was necessary for the task"),
  wasInOrder: z.boolean().describe("Whether this step was in a logical position in the sequence"),
  reasoning: z.string().describe("Brief explanation of the evaluation")
});

/**
 * Shape of the structured answer requested from the LLM judge in the analyze
 * step: one verdict per step, optional lists of missing and superfluous
 * steps, and a short overall assessment.
 */
const analyzeOutputSchema6 = z.object({
  stepEvaluations: z.array(stepEvaluationSchema6),
  missingSteps: z.array(z.string()).optional().describe("Steps that should have been taken but were not"),
  extraSteps: z.array(z.string()).optional().describe("Steps that were unnecessary or redundant"),
  overallAssessment: z.string().describe("Brief overall assessment of the trajectory quality")
});
|
|
2706
|
+
/**
 * Renders the type-specific details of a trajectory step as a parenthesised
 * suffix for a prompt line.
 *
 * - "tool_call" / "mcp_tool_call": the arguments and/or result as JSON.
 * - "model_generation": the model id, when present.
 * - "workflow_step": the output as JSON, when present.
 * - any other step type: no details.
 *
 * @param {object} step - Trajectory step; `stepType` selects which fields are read.
 * @returns {string} A string such as ` (args: {...}, result: ...)`, or "" when
 *   there is nothing to show. Never throws because of unserializable payloads.
 */
function formatStepDetails(step) {
  // Tool arguments/results are arbitrary runtime values. JSON.stringify throws
  // on circular structures and BigInt values; formatting a prompt line should
  // not abort the whole evaluation because of one such payload.
  const toJson = (value) => {
    try {
      return JSON.stringify(value);
    } catch {
      return typeof value === "bigint" ? `${value}` : "[unserializable]";
    }
  };

  switch (step.stepType) {
    case "tool_call":
    case "mcp_tool_call": {
      const parts = [];
      if (step.toolArgs !== undefined) parts.push(`args: ${toJson(step.toolArgs)}`);
      if (step.toolResult !== undefined) parts.push(`result: ${toJson(step.toolResult)}`);
      return parts.length > 0 ? ` (${parts.join(", ")})` : "";
    }
    case "model_generation":
      return step.modelId ? ` (model: ${step.modelId})` : "";
    case "workflow_step":
      return step.output !== undefined ? ` (output: ${toJson(step.output)})` : "";
    default:
      return "";
  }
}
|
|
2723
|
+
/**
 * Renders a trajectory as a numbered, newline-separated list for the judge
 * prompt. Each line has the form `N. [stepType] name<details>`; child steps
 * are rendered recursively on the following lines, one indent level deeper.
 *
 * @param {{ steps: Array<object> }} trajectory - Trajectory whose `steps` are listed.
 * @param {number} [indent=0] - Nesting depth; the line prefix is repeated this many times.
 * @returns {string} The formatted list ("" for an empty trajectory).
 */
function formatTrajectory(trajectory, indent = 0) {
  const pad = " ".repeat(indent);
  const rendered = [];

  trajectory.steps.forEach((step, index) => {
    const head = `${pad}${index + 1}. [${step.stepType}] ${step.name}${formatStepDetails(step)}`;
    const hasChildren = Boolean(step.children) && step.children.length > 0;
    rendered.push(
      hasChildren ? `${head}\n${formatTrajectory({ steps: step.children }, indent + 1)}` : head
    );
  });

  return rendered.join("\n");
}
|
|
2734
|
+
/**
 * Renders a list of expected steps as a numbered, newline-separated list for
 * the judge prompt. Each line has the form `N. [stepType] name (<json>)`:
 * the type tag is omitted when the step has no `stepType`, and the JSON part
 * holds every field other than `name`, `stepType` and `children` (omitted
 * when there are none). Nested expectations under `children.steps` are
 * rendered recursively, one indent level deeper.
 *
 * @param {Array<object>} steps - Expected steps to render.
 * @param {number} [indent=0] - Nesting depth; the line prefix is repeated this many times.
 * @returns {string} The formatted list ("" for an empty array).
 */
function formatExpectedSteps(steps, indent = 0) {
  const pad = " ".repeat(indent);

  const renderStep = (step, position) => {
    const { name, stepType, children, ...extraFields } = step;
    const typeLabel = stepType ? `[${stepType}] ` : "";
    const extras = Object.keys(extraFields).length > 0 ? ` (${JSON.stringify(extraFields)})` : "";
    const head = `${pad}${position + 1}. ${typeLabel}${name}${extras}`;

    const childSteps = children?.steps;
    if (childSteps && childSteps.length > 0) {
      return `${head}\n${formatExpectedSteps(childSteps, indent + 1)}`;
    }
    return head;
  };

  return steps.map((step, position) => renderStep(step, position)).join("\n");
}
|
|
2748
|
+
/**
 * Creates an LLM-judged trajectory accuracy scorer.
 *
 * The scorer is assembled from four steps chained onto `createScorer`:
 *  1. preprocess    - resolves the expected steps (static option first, then
 *                     `run.expectedTrajectory`) and formats both trajectories
 *                     as text;
 *  2. analyze       - asks the judge for per-step verdicts plus missing/extra
 *                     steps, using `analyzeOutputSchema6`;
 *  3. generateScore - turns the verdicts into a number in [0, 1];
 *  4. generateReason - asks the judge for a one-sentence explanation.
 *
 * Scoring, when the judge returned at least one step verdict:
 *   0.6 * (necessary steps / total)
 * + 0.3 * (in-order steps / total)
 * + 0.1 * (1 - missing / (total + missing))
 * so a trajectory with every step necessary, in order and nothing missing
 * scores exactly 1. With no step verdicts the score is 0 if anything is
 * missing, 0.5 if only extra steps were reported, and 1 otherwise.
 *
 * @param {object} params
 * @param {*} params.model - Judge model, forwarded unchanged to `createScorer`.
 * @param {Array<object>|{steps: Array<object>}} [params.expectedTrajectory] -
 *   Either a list of expected steps (used as-is) or a recorded trajectory,
 *   whose steps are stripped of `durationMs`/`metadata` and whose children are
 *   re-nested under `children.steps`.
 * @returns {*} The scorer produced by the `createScorer(...)` chain.
 */
function createTrajectoryAccuracyScorerLLM({
  model,
  expectedTrajectory: staticExpectedTrajectory
}) {
  // Converts a recorded trajectory step into the expected-step shape: run
  // specific fields are dropped and non-empty children become `{ steps }`.
  const toExpectedStep = (step) => {
    const { durationMs: _durationMs, metadata: _metadata, children, ...expected } = step;
    if (children && children.length > 0) {
      expected.children = { steps: children.map((child) => toExpectedStep(child)) };
    }
    return expected;
  };

  return createScorer({
    id: "llm-trajectory-accuracy-scorer",
    name: "Trajectory Accuracy (LLM)",
    description: staticExpectedTrajectory ? "Evaluates the trajectory against an expected trajectory using LLM analysis" : "Evaluates the quality and appropriateness of the trajectory using LLM analysis",
    judge: {
      model,
      instructions: TRAJECTORY_EVALUATION_INSTRUCTIONS
    },
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;

    let expectedSteps;
    if (staticExpectedTrajectory) {
      expectedSteps = Array.isArray(staticExpectedTrajectory)
        ? staticExpectedTrajectory
        : staticExpectedTrajectory.steps.map((step) => toExpectedStep(step));
    } else if (run.expectedTrajectory) {
      const expectation = run.expectedTrajectory;
      // An expectation without steps is treated as "no expected trajectory".
      expectedSteps = expectation.steps && expectation.steps.length > 0 ? expectation.steps : undefined;
    }

    return {
      actualTrajectory,
      actualTrajectoryFormatted: formatTrajectory(actualTrajectory),
      expectedTrajectoryFormatted: expectedSteps ? formatExpectedSteps(expectedSteps) : undefined,
      hasSteps: actualTrajectory.steps.length > 0
    };
  }).analyze({
    description: "Analyze the quality and appropriateness of the agent trajectory",
    outputSchema: analyzeOutputSchema6,
    createPrompt: ({ run, results }) => {
      const userInput = getUserMessageFromRunInput(run.input) ?? "";
      const agentResponse = getAssistantMessageFromRunOutput(run.output.rawOutput) ?? "";
      return createAnalyzePrompt6({
        userInput,
        agentResponse,
        // `||` rather than `??`: an empty trajectory formats to "", which must
        // also fall back to the explicit "No steps taken" marker.
        actualTrajectory: results.preprocessStepResult?.actualTrajectoryFormatted || "No steps taken",
        expectedTrajectory: results.preprocessStepResult?.expectedTrajectoryFormatted
      });
    }
  }).generateScore(({ results }) => {
    const analysis = results.analyzeStepResult;
    const stepEvaluations = analysis?.stepEvaluations || [];
    const missingSteps = analysis?.missingSteps || [];

    if (stepEvaluations.length === 0) {
      const extraSteps = analysis?.extraSteps || [];
      if (missingSteps.length > 0) return 0;
      if (extraSteps.length > 0) return 0.5;
      return 1;
    }

    const totalSteps = stepEvaluations.length;
    const necessityScore = stepEvaluations.filter((e) => e.wasNecessary).length / totalSteps;
    const orderScore = stepEvaluations.filter((e) => e.wasInOrder).length / totalSteps;
    const missingPenalty = missingSteps.length > 0 ? missingSteps.length / (totalSteps + missingSteps.length) : 0;

    // Completeness carries the remaining 10% of the weight. Expressing it as
    // (1 - penalty) instead of subtracting the penalty lets a flawless
    // trajectory reach 1 rather than being capped at 0.9.
    const completenessScore = 1 - missingPenalty;
    const score = necessityScore * 0.6 + orderScore * 0.3 + completenessScore * 0.1;
    return roundToTwoDecimals(Math.max(0, Math.min(1, score)));
  }).generateReason({
    description: "Generate human-readable explanation of trajectory evaluation",
    createPrompt: ({ run, results, score }) => {
      const userInput = getUserMessageFromRunInput(run.input) ?? "";
      const stepEvaluations = results.analyzeStepResult?.stepEvaluations || [];
      const missingSteps = results.analyzeStepResult?.missingSteps || [];
      const extraSteps = results.analyzeStepResult?.extraSteps || [];
      return createReasonPrompt7({
        userInput,
        score,
        stepEvaluations,
        missingSteps,
        extraSteps
      });
    }
  });
}
|
|
2603
2836
|
/**
 * Produces a comparison-friendly form of a string: the text is decomposed
 * (Unicode NFD), combining marks in U+0300-U+036F are removed, and the
 * result is lower-cased.
 *
 * @param {string} str - Text to normalize.
 * @returns {string} The accent-stripped, lower-cased text.
 */
function normalizeString(str) {
  const decomposed = str.normalize("NFD");
  const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, "");
  return withoutMarks.toLowerCase();
}
|
|
@@ -2986,7 +3219,339 @@ function createToolCallAccuracyScorerCode(options) {
|
|
|
2986
3219
|
});
|
|
2987
3220
|
});
|
|
2988
3221
|
}
|
|
3222
|
+
/**
 * Converts a recorded trajectory step into the expected-step shape.
 *
 * `durationMs` and `metadata` are dropped, every other field is copied, and a
 * non-empty `children` array is converted recursively and re-nested as
 * `children: { steps: [...] }`. A missing or empty `children` is omitted.
 *
 * @param {object} step - Recorded trajectory step (not modified).
 * @returns {object} A new expected-step object.
 */
function trajectoryStepToExpectedStep(step) {
  const { durationMs: _durationMs, metadata: _metadata, children, ...expected } = step;
  const hasChildren = Boolean(children) && children.length > 0;
  if (hasChildren) {
    expected.children = { steps: children.map(trajectoryStepToExpectedStep) };
  }
  return expected;
}
|
|
3232
|
+
/**
 * Extracts the expected steps from a trajectory expectation.
 *
 * @param {{ steps?: Array<object> }} expectation - Expectation object.
 * @returns {Array<object>|undefined} `expectation.steps` itself, or `undefined`
 *   when it is absent or has length 0.
 */
function expectationToExpectedSteps(expectation) {
  const { steps } = expectation;
  const hasSteps = Boolean(steps) && steps.length !== 0;
  return hasSteps ? steps : undefined;
}
|
|
3236
|
+
/**
 * Creates a deterministic (code-based) trajectory accuracy scorer.
 *
 * Expected steps come from `options.expectedTrajectory` when it resolves to a
 * step list, otherwise from the run's `expectedTrajectory`. The actual
 * trajectory (`run.output`) is compared against them with
 * `compareTrajectories`, and the comparison's `score` becomes the scorer's
 * score. When no expected steps can be resolved the score is 0 and the
 * preprocess result carries an `error` message.
 *
 * @param {object} [options]
 * @param {Array<object>|{steps: Array<object>}} [options.expectedTrajectory] -
 *   A non-empty list of expected steps (used as-is), or an object with `steps`
 *   (each converted with `trajectoryStepToExpectedStep`).
 * @param {object} [options.comparisonOptions]
 * @param {string} [options.comparisonOptions.ordering="relaxed"] - Default ordering mode.
 * @param {boolean} [options.comparisonOptions.allowRepeatedSteps=true] - Default repeat policy.
 * @returns {*} The scorer produced by the `createScorer(...)` chain.
 */
function createTrajectoryAccuracyScorerCode(options = {}) {
  const { expectedTrajectory: staticExpectedTrajectory, comparisonOptions = {} } = options;
  const { ordering = "relaxed", allowRepeatedSteps = true } = comparisonOptions;

  // Works out which static expected steps, if any, the options describe.
  const resolveStaticExpectedSteps = () => {
    if (!staticExpectedTrajectory) return undefined;
    const isExpectedStepList =
      Array.isArray(staticExpectedTrajectory) &&
      staticExpectedTrajectory.length > 0 &&
      !("steps" in staticExpectedTrajectory[0]);
    if (isExpectedStepList) return staticExpectedTrajectory;
    if ("steps" in staticExpectedTrajectory) {
      return staticExpectedTrajectory.steps.map(trajectoryStepToExpectedStep);
    }
    return undefined;
  };
  const staticExpectedSteps = resolveStaticExpectedSteps();

  const namesOf = (steps) => steps.map((s) => s.name);

  const description = staticExpectedSteps
    ? `Evaluates whether the trajectory matches the expected path: [${namesOf(staticExpectedSteps).join(" \u2192 ")}] (${ordering} ordering)`
    : `Evaluates trajectory accuracy against expected trajectory from dataset items (${ordering} ordering)`;

  return createScorer({
    id: "code-trajectory-accuracy-scorer",
    name: "Trajectory Accuracy Scorer",
    description,
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;

    // Static steps win; the per-item expectation is only a fallback for steps.
    let resolvedExpectedSteps = staticExpectedSteps;
    if (!resolvedExpectedSteps && run.expectedTrajectory) {
      resolvedExpectedSteps = expectationToExpectedSteps(run.expectedTrajectory);
    }

    if (!resolvedExpectedSteps || resolvedExpectedSteps.length === 0) {
      return {
        actualTrajectory,
        expectedTrajectory: undefined,
        comparison: undefined,
        actualStepNames: namesOf(actualTrajectory.steps),
        expectedStepNames: [],
        error: "No expected trajectory provided (pass via options or dataset item expectedTrajectory)"
      };
    }

    // Comparison settings on the item override the scorer-level defaults,
    // even when the steps themselves came from the static option.
    const itemExpectation = run.expectedTrajectory;
    const comparison = compareTrajectories(
      actualTrajectory,
      { steps: resolvedExpectedSteps },
      {
        ordering: itemExpectation?.ordering ?? ordering,
        allowRepeatedSteps: itemExpectation?.allowRepeatedSteps ?? allowRepeatedSteps
      }
    );

    return {
      actualTrajectory,
      expectedTrajectory: { steps: resolvedExpectedSteps },
      comparison,
      actualStepNames: namesOf(actualTrajectory.steps),
      expectedStepNames: namesOf(resolvedExpectedSteps)
    };
  }).generateScore(({ results }) => {
    const comparison = results.preprocessStepResult?.comparison;
    return comparison ? comparison.score : 0;
  });
}
|
|
3295
|
+
/**
 * Evaluates nested (child-level) expectations.
 *
 * For every expected step that carries a `children` config, the first
 * not-yet-claimed actual step with the same name (and step type, when the
 * expectation names one) is looked up. Its children are then checked against
 * the child config: accuracy (when expected child steps are given),
 * efficiency and blacklist (only when configured) and tool failures. Deeper
 * expectations are evaluated recursively.
 *
 * Result score per expected step:
 *  - 0 when no matching actual step with children exists;
 *  - 0 when the child blacklist check scores 0, or a directly nested result
 *    has a blacklist score of 0;
 *  - otherwise the weight-normalised average of the available dimension
 *    scores (1 when none apply), blended 70/30 with the average of nested
 *    results when there are any, rounded to two decimals.
 *
 * @param {Array<object>} expectedSteps - Expected steps at this level.
 * @param {Array<object>} actualSteps - Actual steps at this level.
 * @param {{accuracy: number, efficiency: number, toolFailures: number, blacklist: number}} [weights]
 *   Dimension weights; forwarded unchanged to nested levels.
 * @returns {Array<object>} One result per expected step that has `children`.
 */
function evaluateNestedExpectations(expectedSteps, actualSteps, weights = { accuracy: 0.4, efficiency: 0.3, toolFailures: 0.2, blacklist: 0.1 }) {
  const results = [];
  const claimedIndices = new Set();

  const hasEntries = (list) => Boolean(list) && list.length > 0;
  const violatesBlacklist = (check) => Boolean(check) && check.score === 0;
  // Weighted mean over the dimensions that were actually evaluated.
  const combine = (scores) => {
    if (scores.length === 0) return 1;
    const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
    if (!(totalWeight > 0)) return 1;
    return scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
  };

  for (const expectedStep of expectedSteps) {
    const childConfig = expectedStep.children;
    if (!childConfig) continue;

    // Each actual step may satisfy at most one expectation.
    const matchIndex = actualSteps.findIndex((candidate, index) => {
      if (claimedIndices.has(index)) return false;
      if (candidate.name !== expectedStep.name) return false;
      return !expectedStep.stepType || candidate.stepType === expectedStep.stepType;
    });
    const actualStep = matchIndex >= 0 ? actualSteps[matchIndex] : undefined;
    if (matchIndex >= 0) claimedIndices.add(matchIndex);

    const actualChildren = actualStep?.children;
    if (!actualChildren || actualChildren.length === 0) {
      // Nothing to compare against: every expected child step counts as missing.
      const expectedStepCount = childConfig.steps?.length ?? 0;
      results.push({
        stepName: expectedStep.name,
        score: 0,
        accuracy: expectedStepCount > 0 ? {
          score: 0,
          matchedSteps: 0,
          totalExpectedSteps: expectedStepCount,
          totalActualSteps: 0,
          missingSteps: childConfig.steps.map((s) => s.name),
          extraSteps: [],
          outOfOrderSteps: [],
          repeatedSteps: []
        } : undefined
      });
      continue;
    }

    const childTrajectory = {
      steps: actualChildren,
      totalDurationMs: actualStep.durationMs
    };

    const accuracy = hasEntries(childConfig.steps)
      ? compareTrajectories(
          childTrajectory,
          { steps: childConfig.steps },
          {
            ordering: childConfig.ordering ?? "relaxed",
            allowRepeatedSteps: childConfig.allowRepeatedSteps ?? true
          }
        )
      : undefined;

    const hasEfficiencyConfig = ["maxSteps", "maxTotalTokens", "maxTotalDurationMs", "noRedundantCalls"].some(
      (key) => childConfig[key] !== undefined
    );
    const efficiency = hasEfficiencyConfig
      ? checkTrajectoryEfficiency(childTrajectory, {
          maxSteps: childConfig.maxSteps,
          maxTotalTokens: childConfig.maxTotalTokens,
          maxTotalDurationMs: childConfig.maxTotalDurationMs,
          noRedundantCalls: childConfig.noRedundantCalls ?? true
        })
      : undefined;

    const hasBlacklistConfig = hasEntries(childConfig.blacklistedTools) || hasEntries(childConfig.blacklistedSequences);
    const blacklist = hasBlacklistConfig
      ? checkTrajectoryBlacklist(childTrajectory, {
          blacklistedTools: childConfig.blacklistedTools,
          blacklistedSequences: childConfig.blacklistedSequences
        })
      : undefined;

    const toolFailures = analyzeToolFailures(childTrajectory, {
      maxRetriesPerTool: childConfig.maxRetriesPerTool ?? 2
    });

    const nested = childConfig.steps ? evaluateNestedExpectations(childConfig.steps, actualChildren, weights) : [];

    // Result recorded when a blacklist violation forces the score to zero.
    const zeroResult = { stepName: expectedStep.name, score: 0, accuracy, efficiency, blacklist, toolFailures, nested };

    if (violatesBlacklist(blacklist)) {
      results.push(zeroResult);
      continue;
    }

    const scores = [];
    if (accuracy) scores.push({ weight: weights.accuracy, value: accuracy.score });
    if (efficiency) scores.push({ weight: weights.efficiency, value: efficiency.score });
    // Tool failures only count when at least one failure pattern was found.
    if (toolFailures && toolFailures.patterns.length > 0) {
      scores.push({ weight: weights.toolFailures, value: toolFailures.score });
    }
    if (blacklist) scores.push({ weight: weights.blacklist, value: blacklist.score });

    const levelScore = combine(scores);
    let finalScore = levelScore;
    if (nested.length > 0) {
      if (nested.some((r) => violatesBlacklist(r.blacklist))) {
        results.push(zeroResult);
        continue;
      }
      const nestedAvg = nested.reduce((sum, r) => sum + r.score, 0) / nested.length;
      finalScore = 0.7 * levelScore + 0.3 * nestedAvg;
    }

    results.push({
      stepName: expectedStep.name,
      score: Math.round(finalScore * 100) / 100,
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested: nested.length > 0 ? nested : undefined
    });
  }

  return results;
}
|
|
3394
|
+
/**
 * Creates the multi-dimensional, code-based trajectory scorer.
 *
 * Per run, the effective config is `options.defaults` overlaid with the run's
 * `expectedTrajectory`. The preprocess step evaluates up to four dimensions
 * of the actual trajectory (`run.output`):
 *  - accuracy     - only when expected steps are configured;
 *  - efficiency   - only when a step/token/duration budget or
 *                   `noRedundantCalls` is configured;
 *  - blacklist    - only when forbidden tools or sequences are configured;
 *  - toolFailures - always computed, but only scored when failure patterns
 *                   were found;
 * plus nested expectations via `evaluateNestedExpectations`.
 *
 * Score: 0 on any blacklist violation (top level or directly nested); 1 when
 * nothing was evaluated; otherwise the weight-normalised average of the
 * available dimensions, blended 70/30 with the nested average when nested
 * results exist, rounded to two decimals.
 *
 * @param {object} [options]
 * @param {object} [options.defaults] - Default expectation config.
 * @param {{accuracy?: number, efficiency?: number, toolFailures?: number, blacklist?: number}} [options.weights]
 *   Dimension weights (defaults 0.4 / 0.3 / 0.2 / 0.1); negative values are raised to 0.
 * @returns {*} The scorer produced by the `createScorer(...)` chain.
 */
function createTrajectoryScorerCode(options = {}) {
  const { defaults = {}, weights: userWeights = {} } = options;

  const clampWeight = (value, fallback) => Math.max(0, value ?? fallback);
  const w = {
    accuracy: clampWeight(userWeights.accuracy, 0.4),
    efficiency: clampWeight(userWeights.efficiency, 0.3),
    toolFailures: clampWeight(userWeights.toolFailures, 0.2),
    blacklist: clampWeight(userWeights.blacklist, 0.1)
  };

  const hasEntries = (list) => Boolean(list) && list.length > 0;
  const violatesBlacklist = (check) => Boolean(check) && check.score === 0;

  // Weighted mean over the dimensions that were actually evaluated.
  const combine = (scores) => {
    if (scores.length === 0) return 1;
    const totalWeight = scores.reduce((sum, s) => sum + s.weight, 0);
    if (!(totalWeight > 0)) return 1;
    return scores.reduce((sum, s) => sum + s.weight / totalWeight * s.value, 0);
  };

  const describeBlacklist = (blacklist) => {
    const violations = [];
    if (blacklist.violatedTools.length > 0) {
      violations.push(`forbidden tools used: ${blacklist.violatedTools.join(", ")}`);
    }
    if (blacklist.violatedSequences.length > 0) {
      violations.push(`forbidden sequences: ${blacklist.violatedSequences.map((s) => s.join(" \u2192 ")).join("; ")}`);
    }
    return `Blacklist violation: ${violations.join(". ")}.`;
  };

  const describeAccuracy = (accuracy) => {
    const details = [`${accuracy.matchedSteps}/${accuracy.totalExpectedSteps} expected steps matched`];
    if (accuracy.missingSteps.length > 0) details.push(`missing: ${accuracy.missingSteps.join(", ")}`);
    if (accuracy.extraSteps.length > 0) details.push(`extra: ${accuracy.extraSteps.join(", ")}`);
    if (accuracy.outOfOrderSteps.length > 0) details.push(`out of order: ${accuracy.outOfOrderSteps.join(", ")}`);
    return `Accuracy (${accuracy.score}): ${details.join(". ")}.`;
  };

  const describeEfficiency = (efficiency) => {
    const details = [];
    if (efficiency.overStepBudget) details.push(`over step budget (${efficiency.totalSteps} steps)`);
    if (efficiency.overTokenBudget) details.push(`over token budget (${efficiency.totalTokens} tokens)`);
    if (efficiency.overDurationBudget) details.push(`over duration budget (${efficiency.totalDurationMs}ms)`);
    if (efficiency.redundantCalls.length > 0) {
      details.push(`redundant calls: ${efficiency.redundantCalls.map((c) => c.name).join(", ")}`);
    }
    return details.length > 0
      ? `Efficiency (${efficiency.score}): ${details.join(". ")}.`
      : `Efficiency (${efficiency.score}): all budgets met, no redundant calls.`;
  };

  const describeToolFailures = (toolFailures) => {
    const details = [];
    if (toolFailures.totalRetries > 0) details.push(`${toolFailures.totalRetries} total retries`);
    if (toolFailures.excessiveRetryTools.length > 0) {
      details.push(`excessive retries: ${toolFailures.excessiveRetryTools.join(", ")}`);
    }
    return `Tool failures (${toolFailures.score}): ${details.join(". ")}.`;
  };

  return createScorer({
    id: "code-trajectory-scorer",
    name: "Trajectory Scorer",
    description: "Multi-dimensional trajectory evaluation: accuracy, efficiency, blacklist, and tool failures",
    type: "trajectory"
  }).preprocess(async ({ run }) => {
    const actualTrajectory = run.output;
    const itemExpectation = run.expectedTrajectory ?? {};

    // Per-item settings override the scorer-level defaults key by key.
    const config = { ...defaults, ...itemExpectation };
    // Carried over explicitly as well, in case `steps` is not an own
    // enumerable property that the spread above would pick up.
    if (itemExpectation.steps !== undefined) {
      config.steps = itemExpectation.steps;
    }

    const accuracy = hasEntries(config.steps)
      ? compareTrajectories(
          actualTrajectory,
          { steps: config.steps },
          {
            ordering: config.ordering ?? "relaxed",
            allowRepeatedSteps: config.allowRepeatedSteps ?? true
          }
        )
      : undefined;

    const hasEfficiencyConfig = ["maxSteps", "maxTotalTokens", "maxTotalDurationMs", "noRedundantCalls"].some(
      (key) => config[key] !== undefined
    );
    const efficiency = hasEfficiencyConfig
      ? checkTrajectoryEfficiency(actualTrajectory, {
          maxSteps: config.maxSteps,
          maxTotalTokens: config.maxTotalTokens,
          maxTotalDurationMs: config.maxTotalDurationMs,
          noRedundantCalls: config.noRedundantCalls ?? true
        })
      : undefined;

    const hasBlacklistConfig = hasEntries(config.blacklistedTools) || hasEntries(config.blacklistedSequences);
    const blacklist = hasBlacklistConfig
      ? checkTrajectoryBlacklist(actualTrajectory, {
          blacklistedTools: config.blacklistedTools,
          blacklistedSequences: config.blacklistedSequences
        })
      : undefined;

    const toolFailures = analyzeToolFailures(actualTrajectory, {
      maxRetriesPerTool: config.maxRetriesPerTool ?? 2
    });

    const nested = hasEntries(config.steps)
      ? evaluateNestedExpectations(config.steps, actualTrajectory.steps, w)
      : undefined;

    return {
      accuracy,
      efficiency,
      blacklist,
      toolFailures,
      nested: nested && nested.length > 0 ? nested : undefined,
      config
    };
  }).generateScore(({ results }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};

    // A blacklist violation is disqualifying regardless of other dimensions.
    if (violatesBlacklist(blacklist)) return 0;

    const scores = [];
    if (accuracy) scores.push({ weight: w.accuracy, value: accuracy.score });
    if (efficiency) scores.push({ weight: w.efficiency, value: efficiency.score });
    if (toolFailures && toolFailures.patterns.length > 0) {
      scores.push({ weight: w.toolFailures, value: toolFailures.score });
    }
    if (blacklist) scores.push({ weight: w.blacklist, value: blacklist.score });

    if (scores.length === 0 && !nested) return 1;

    let levelScore = combine(scores);
    if (nested && nested.length > 0) {
      if (nested.some((r) => violatesBlacklist(r.blacklist))) return 0;
      const nestedAvg = nested.reduce((sum, r) => sum + r.score, 0) / nested.length;
      levelScore = 0.7 * levelScore + 0.3 * nestedAvg;
    }
    return Math.round(levelScore * 100) / 100;
  }).generateReason(({ results, score }) => {
    const { accuracy, efficiency, blacklist, toolFailures, nested } = results.preprocessStepResult ?? {};
    const lines = [`Score: ${score}`];

    // Violations short-circuit the report: nothing else influenced the score.
    if (violatesBlacklist(blacklist)) {
      lines.push(describeBlacklist(blacklist));
      return lines.join("\n");
    }
    const violatingNested = nested ? nested.filter((r) => violatesBlacklist(r.blacklist)) : [];
    if (violatingNested.length > 0) {
      lines.push(`Nested blacklist violation in: ${violatingNested.map((r) => r.stepName).join(", ")}.`);
      return lines.join("\n");
    }

    if (accuracy) lines.push(describeAccuracy(accuracy));
    if (efficiency) lines.push(describeEfficiency(efficiency));
    if (toolFailures && toolFailures.patterns.length > 0) lines.push(describeToolFailures(toolFailures));
    if (nested && nested.length > 0) {
      lines.push(`Nested scores: ${nested.map((r) => `${r.stepName}: ${r.score}`).join(", ")}.`);
    }
    return lines.join("\n");
  });
}
|
|
2989
3554
|
|
|
2990
|
-
export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer };
|
|
3555
|
+
export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer, createTrajectoryAccuracyScorerCode, createTrajectoryAccuracyScorerLLM, createTrajectoryScorerCode };
|
|
2991
3556
|
//# sourceMappingURL=index.js.map
|
|
2992
3557
|
//# sourceMappingURL=index.js.map
|