@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/CHANGELOG.md +59 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
@@ -4,4 +4,5 @@ export * from './keyword-coverage/index.js';
4
4
  export * from './content-similarity/index.js';
5
5
  export * from './tone/index.js';
6
6
  export * from './tool-call-accuracy/index.js';
7
+ export * from './trajectory/index.js';
7
8
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/code/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,sBAAsB,CAAC;AACrC,cAAc,oBAAoB,CAAC;AACnC,cAAc,sBAAsB,CAAC;AACrC,cAAc,QAAQ,CAAC;AACvB,cAAc,sBAAsB,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/code/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,sBAAsB,CAAC;AACrC,cAAc,oBAAoB,CAAC;AACnC,cAAc,sBAAsB,CAAC;AACrC,cAAc,QAAQ,CAAC;AACvB,cAAc,sBAAsB,CAAC;AACrC,cAAc,cAAc,CAAC"}
@@ -0,0 +1,147 @@
1
+ import type { ExpectedStep, Trajectory, TrajectoryComparisonOptions, TrajectoryExpectation } from '@mastra/core/evals';
2
+ import type { TrajectoryComparisonResult, TrajectoryEfficiencyResult, TrajectoryBlacklistResult, ToolFailureAnalysisResult } from '../../utils.js';
3
+ interface TrajectoryAccuracyScorerCodeOptions {
4
+ /**
5
+ * The expected trajectory to compare against.
6
+ * Accepts a Trajectory (full trajectory steps) or ExpectedStep[] (lightweight matchers).
7
+ * If not provided, the scorer will use `run.expectedTrajectory` from the dataset item.
8
+ */
9
+ expectedTrajectory?: Trajectory | ExpectedStep[];
10
+ /** Comparison behavior options */
11
+ comparisonOptions?: TrajectoryComparisonOptions;
12
+ }
13
+ /**
14
+ * Creates a code-based trajectory accuracy scorer that compares the actual sequence
15
+ * of tool calls an agent made against an expected trajectory.
16
+ *
17
+ * This scorer extracts the agent's tool call trajectory from its output messages
18
+ * and compares it against a predefined expected trajectory. It supports strict,
19
+ * relaxed, and unordered comparison modes.
20
+ *
21
+ * @param options - Configuration for the trajectory scorer
22
+ * @returns A scorer that evaluates trajectory accuracy
23
+ *
24
+ * @example
25
+ * ```ts
26
+ * import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers';
27
+ *
28
+ * const scorer = createTrajectoryAccuracyScorerCode({
29
+ * expectedTrajectory: {
30
+ * steps: [
31
+ * { stepType: 'tool_call', name: 'search' },
32
+ * { stepType: 'tool_call', name: 'summarize' },
33
+ * ],
34
+ * },
35
+ * comparisonOptions: {
36
+ * ordering: 'relaxed',
37
+ * allowRepeatedSteps: true,
38
+ * },
39
+ * });
40
+ *
41
+ * const result = await scorer.run(agentRun);
42
+ * // result.score: 0.0 - 1.0
43
+ * // result.preprocessStepResult.comparison: detailed comparison results
44
+ * ```
45
+ */
46
+ export declare function createTrajectoryAccuracyScorerCode(options?: TrajectoryAccuracyScorerCodeOptions): import("@mastra/core/evals").MastraScorer<"code-trajectory-accuracy-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, Trajectory, Record<"preprocessStepResult", {
47
+ actualTrajectory: Trajectory;
48
+ expectedTrajectory: undefined;
49
+ comparison: undefined;
50
+ actualStepNames: string[];
51
+ expectedStepNames: never[];
52
+ error: string;
53
+ } | {
54
+ actualTrajectory: Trajectory;
55
+ expectedTrajectory: {
56
+ steps: ExpectedStep[];
57
+ };
58
+ comparison: TrajectoryComparisonResult;
59
+ actualStepNames: string[];
60
+ expectedStepNames: string[];
61
+ error?: undefined;
62
+ }> & Record<"generateScoreStepResult", number>>;
63
+ /**
64
+ * Result from evaluating a nested step's children against its TrajectoryExpectation.
65
+ */
66
+ export type NestedEvaluationResult = {
67
+ /** Name of the expected step that contained the nested config */
68
+ stepName: string;
69
+ /** Score for this nested evaluation (0.0 - 1.0) */
70
+ score: number;
71
+ /** Accuracy result for the children */
72
+ accuracy?: TrajectoryComparisonResult;
73
+ /** Efficiency result for the children */
74
+ efficiency?: TrajectoryEfficiencyResult;
75
+ /** Blacklist result for the children */
76
+ blacklist?: TrajectoryBlacklistResult;
77
+ /** Tool failure result for the children */
78
+ toolFailures?: ToolFailureAnalysisResult;
79
+ /** Further nested results from deeper levels */
80
+ nested?: NestedEvaluationResult[];
81
+ };
82
+ /**
83
+ * Multi-dimensional result from the unified trajectory scorer.
84
+ */
85
+ export type TrajectoryScoreResult = {
86
+ /** Overall score (0.0 - 1.0). Weighted combination of dimensions (0.0 if blacklist violation). */
87
+ score: number;
88
+ /** Accuracy sub-score (step matching). Only present if expected steps were provided. */
89
+ accuracy?: TrajectoryComparisonResult;
90
+ /** Efficiency sub-score (budgets + redundancy). */
91
+ efficiency?: TrajectoryEfficiencyResult;
92
+ /** Blacklist sub-score (forbidden tools/sequences). */
93
+ blacklist?: TrajectoryBlacklistResult;
94
+ /** Tool failure analysis. */
95
+ toolFailures?: ToolFailureAnalysisResult;
96
+ /** Results from evaluating nested step expectations. */
97
+ nested?: NestedEvaluationResult[];
98
+ };
99
+ interface TrajectoryScorerCodeOptions {
100
+ /**
101
+ * Default expectation config for all runs.
102
+ * Per-item `run.expectedTrajectory` values override these defaults.
103
+ */
104
+ defaults?: TrajectoryExpectation;
105
+ }
106
+ /**
107
+ * Creates a unified trajectory scorer that evaluates multiple dimensions:
108
+ * accuracy (step matching), efficiency (budgets, redundancy), blacklist (forbidden tools/sequences),
109
+ * and tool failure patterns.
110
+ *
111
+ * Configuration can be set at two levels:
112
+ * - **Constructor defaults** (`defaults`) — agent-level defaults for all dataset items
113
+ * - **Per-item overrides** (`run.expectedTrajectory`) — prompt-specific overrides from dataset items
114
+ *
115
+ * Per-item values override constructor defaults for all fields.
116
+ *
117
+ * @param options - Default trajectory expectations
118
+ * @returns A scorer with multi-dimensional trajectory evaluation
119
+ *
120
+ * @example
121
+ * ```ts
122
+ * import { createTrajectoryScorerCode } from '@mastra/evals/scorers';
123
+ *
124
+ * const scorer = createTrajectoryScorerCode({
125
+ * defaults: {
126
+ * steps: [
127
+ * { stepType: 'tool_call', name: 'search' },
128
+ * { stepType: 'tool_call', name: 'summarize' },
129
+ * ],
130
+ * ordering: 'relaxed',
131
+ * maxSteps: 5,
132
+ * noRedundantCalls: true,
133
+ * blacklistedTools: ['deleteAll'],
134
+ * },
135
+ * });
136
+ * ```
137
+ */
138
+ export declare function createTrajectoryScorerCode(options?: TrajectoryScorerCodeOptions): import("@mastra/core/evals").MastraScorer<"code-trajectory-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, Trajectory, Record<"preprocessStepResult", {
139
+ accuracy: TrajectoryComparisonResult | undefined;
140
+ efficiency: TrajectoryEfficiencyResult | undefined;
141
+ blacklist: TrajectoryBlacklistResult | undefined;
142
+ toolFailures: ToolFailureAnalysisResult;
143
+ nested: NestedEvaluationResult[] | undefined;
144
+ config: TrajectoryExpectation;
145
+ }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
146
+ export {};
147
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/scorers/code/trajectory/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,YAAY,EACZ,UAAU,EACV,2BAA2B,EAC3B,qBAAqB,EAEtB,MAAM,oBAAoB,CAAC;AAQ5B,OAAO,KAAK,EACV,0BAA0B,EAC1B,0BAA0B,EAC1B,yBAAyB,EACzB,yBAAyB,EAC1B,MAAM,aAAa,CAAC;AAErB,UAAU,mCAAmC;IAC3C;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,UAAU,GAAG,YAAY,EAAE,CAAC;IACjD,kCAAkC;IAClC,iBAAiB,CAAC,EAAE,2BAA2B,CAAC;CACjD;AAiCD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,wBAAgB,kCAAkC,CAAC,OAAO,GAAE,mCAAwC;;;;;;;;;;;;;;;;gDAuFnG;AAID;;GAEG;AACH,MAAM,MAAM,sBAAsB,GAAG;IACnC,iEAAiE;IACjE,QAAQ,EAAE,MAAM,CAAC;IACjB,mDAAmD;IACnD,KAAK,EAAE,MAAM,CAAC;IACd,uCAAuC;IACvC,QAAQ,CAAC,EAAE,0BAA0B,CAAC;IACtC,yCAAyC;IACzC,UAAU,CAAC,EAAE,0BAA0B,CAAC;IACxC,wCAAwC;IACxC,SAAS,CAAC,EAAE,yBAAyB,CAAC;IACtC,2CAA2C;IAC3C,YAAY,CAAC,EAAE,yBAAyB,CAAC;IACzC,gDAAgD;IAChD,MAAM,CAAC,EAAE,sBAAsB,EAAE,CAAC;CACnC,CAAC;AAwJF;;GAEG;AACH,MAAM,MAAM,qBAAqB,GAAG;IAClC,kGAAkG;IAClG,KAAK,EAAE,MAAM,CAAC;IACd,wFAAwF;IACxF,QAAQ,CAAC,EAAE,0BAA0B,CAAC;IACtC,mDAAmD;IACnD,UAAU,CAAC,EAAE,0BAA0B,CAAC;IACxC,uDAAuD;IACvD,SAAS,CAAC,EAAE,yBAAyB,CAAC;IACtC,6BAA6B;IAC7B,YAAY,CAAC,EAAE,yBAAyB,CAAC;IACzC,wDAAwD;IACxD,MAAM,CAAC,EAAE,sBAAsB,EAAE,CAAC;CACnC,CAAC;AAEF,UAAU,2BAA2B;IACnC;;;OAGG;IACH,QAAQ,CAAC,EAAE,qBAAqB,CAAC;CAClC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AACH,wBAAgB,0BAA0B,CAAC,OAAO,GAAE,2BAAgC;;;;;;;6FAqNnF"}
@@ -20,13 +20,13 @@ export declare function createAnswerSimilarityScorer({ model, options, }: {
20
20
  matches: {
21
21
  groundTruthUnit: string;
22
22
  outputUnit: string | null;
23
- matchType: "exact" | "semantic" | "partial" | "missing";
23
+ matchType: "exact" | "partial" | "semantic" | "missing";
24
24
  explanation: string;
25
25
  }[];
26
26
  extraInOutput: string[];
27
27
  contradictions: {
28
- groundTruthUnit: string;
29
28
  outputUnit: string;
29
+ groundTruthUnit: string;
30
30
  explanation: string;
31
31
  }[];
32
32
  }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
@@ -10,9 +10,9 @@ export declare function createContextPrecisionScorer({ model, options, }: {
10
10
  options: ContextPrecisionMetricOptions;
11
11
  }): import("@mastra/core/evals").MastraScorer<"context-precision-scorer", ScorerRunInputForAgent, ScorerRunOutputForAgent, Record<"analyzeStepResult", {
12
12
  verdicts: {
13
- reason: string;
14
- verdict: string;
15
13
  context_index: number;
14
+ verdict: string;
15
+ reason: string;
16
16
  }[];
17
17
  }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
18
18
  //# sourceMappingURL=index.d.ts.map
@@ -15,11 +15,11 @@ export declare function createContextRelevanceScorerLLM({ model, options, }: {
15
15
  options: ContextRelevanceOptions;
16
16
  }): import("@mastra/core/evals").MastraScorer<"context-relevance-scorer", ScorerRunInputForAgent, ScorerRunOutputForAgent, Record<"analyzeStepResult", {
17
17
  evaluations: {
18
- reasoning: string;
19
18
  context_index: number;
20
19
  contextPiece: string;
21
20
  relevanceLevel: "high" | "medium" | "low" | "none";
22
21
  wasUsed: boolean;
22
+ reasoning: string;
23
23
  }[];
24
24
  overallAssessment: string;
25
25
  missingContext?: string[] | undefined;
@@ -10,8 +10,8 @@ export declare function createFaithfulnessScorer({ model, options, }: {
10
10
  claims: string[];
11
11
  }> & Record<"analyzeStepResult", {
12
12
  verdicts: {
13
- reason: string;
14
13
  verdict: string;
14
+ reason: string;
15
15
  }[];
16
16
  }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
17
17
  //# sourceMappingURL=index.d.ts.map
@@ -27,9 +27,9 @@ export declare function createHallucinationScorer({ model, options, }: {
27
27
  claims: string[];
28
28
  }> & Record<"analyzeStepResult", {
29
29
  verdicts: {
30
- reason: string;
31
- verdict: string;
32
30
  statement: string;
31
+ verdict: string;
32
+ reason: string;
33
33
  }[];
34
34
  }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
35
35
  //# sourceMappingURL=index.d.ts.map
@@ -9,4 +9,5 @@ export * from './context-relevance/index.js';
9
9
  export * from './context-precision/index.js';
10
10
  export * from './noise-sensitivity/index.js';
11
11
  export * from './prompt-alignment/index.js';
12
+ export * from './trajectory/index.js';
12
13
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC;AACnC,cAAc,cAAc,CAAC"}
@@ -22,13 +22,13 @@ export declare function createNoiseSensitivityScorerLLM({ model, options, }: {
22
22
  model: MastraModelConfig;
23
23
  options: NoiseSensitivityOptions;
24
24
  }): import("@mastra/core/evals").MastraScorer<"noise-sensitivity-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, import("@mastra/core/evals").ScorerRunOutputForAgent, Record<"analyzeStepResult", {
25
- overallAssessment: string;
26
25
  dimensions: {
27
26
  dimension: string;
28
27
  impactLevel: "none" | "minimal" | "moderate" | "significant" | "severe";
29
28
  specificChanges: string;
30
29
  noiseInfluence: string;
31
30
  }[];
31
+ overallAssessment: string;
32
32
  robustnessScore: number;
33
33
  majorIssues?: string[] | undefined;
34
34
  }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
@@ -8,31 +8,31 @@ export declare function createPromptAlignmentScorerLLM({ model, options, }: {
8
8
  model: MastraModelConfig;
9
9
  options?: PromptAlignmentOptions;
10
10
  }): import("@mastra/core/evals").MastraScorer<string, ScorerRunInputForAgent, ScorerRunOutputForAgent, Record<"analyzeStepResult", {
11
- overallAssessment: string;
12
11
  intentAlignment: {
13
- reasoning: string;
14
12
  score: number;
15
13
  primaryIntent: string;
16
14
  isAddressed: boolean;
15
+ reasoning: string;
17
16
  };
18
17
  requirementsFulfillment: {
19
18
  requirements: {
20
- reasoning: string;
21
19
  requirement: string;
22
20
  isFulfilled: boolean;
21
+ reasoning: string;
23
22
  }[];
24
23
  overallScore: number;
25
24
  };
26
25
  completeness: {
27
- reasoning: string;
28
26
  score: number;
29
27
  missingElements: string[];
28
+ reasoning: string;
30
29
  };
31
30
  responseAppropriateness: {
32
- reasoning: string;
33
31
  score: number;
34
32
  formatAlignment: boolean;
35
33
  toneAlignment: boolean;
34
+ reasoning: string;
36
35
  };
36
+ overallAssessment: string;
37
37
  }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
38
38
  //# sourceMappingURL=index.d.ts.map
@@ -10,9 +10,9 @@ export declare function createToolCallAccuracyScorerLLM({ model, availableTools
10
10
  toolCallInfos: import("../..").ToolCallInfo[];
11
11
  }> & Record<"analyzeStepResult", {
12
12
  evaluations: {
13
- reasoning: string;
14
13
  toolCalled: string;
15
14
  wasAppropriate: boolean;
15
+ reasoning: string;
16
16
  }[];
17
17
  missingTools?: string[] | undefined;
18
18
  }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
@@ -7,8 +7,8 @@ export declare function createToxicityScorer({ model, options, }: {
7
7
  options?: ToxicityMetricOptions;
8
8
  }): import("@mastra/core/evals").MastraScorer<"toxicity-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, import("@mastra/core/evals").ScorerRunOutputForAgent, Record<"analyzeStepResult", {
9
9
  verdicts: {
10
- reason: string;
11
10
  verdict: string;
11
+ reason: string;
12
12
  }[];
13
13
  }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
14
14
  //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1,58 @@
1
+ import type { ExpectedStep, Trajectory } from '@mastra/core/evals';
2
+ import type { MastraModelConfig } from '@mastra/core/llm';
3
+ export interface TrajectoryAccuracyLLMOptions {
4
+ /** The LLM model to use as judge */
5
+ model: MastraModelConfig;
6
+ /** Optional expected trajectory to compare against */
7
+ expectedTrajectory?: Trajectory | ExpectedStep[];
8
+ }
9
+ /**
10
+ * Creates an LLM-based trajectory accuracy scorer that evaluates the quality
11
+ * of an agent's action sequence using an LLM judge.
12
+ *
13
+ * This scorer extracts the agent's tool call trajectory and asks an LLM to evaluate
14
+ * whether the trajectory was appropriate, efficient, and complete. When an expected
15
+ * trajectory is provided, it compares against it. Otherwise, it evaluates the trajectory
16
+ * based on the task requirements.
17
+ *
18
+ * @param options - Configuration for the trajectory scorer
19
+ * @returns A scorer that evaluates trajectory quality
20
+ *
21
+ * @example
22
+ * ```ts
23
+ * import { createTrajectoryAccuracyScorerLLM } from '@mastra/evals/scorers';
24
+ *
25
+ * // Without expected trajectory (evaluates quality based on task)
26
+ * const scorer = createTrajectoryAccuracyScorerLLM({
27
+ * model: { provider: 'openai', name: 'gpt-4o' },
28
+ * });
29
+ *
30
+ * // With expected trajectory
31
+ * const scorerWithExpected = createTrajectoryAccuracyScorerLLM({
32
+ * model: { provider: 'openai', name: 'gpt-4o' },
33
+ * expectedTrajectory: {
34
+ * steps: [
35
+ * { stepType: 'tool_call', name: 'search' },
36
+ * { stepType: 'tool_call', name: 'summarize' },
37
+ * ],
38
+ * },
39
+ * });
40
+ * ```
41
+ */
42
+ export declare function createTrajectoryAccuracyScorerLLM({ model, expectedTrajectory: staticExpectedTrajectory, }: TrajectoryAccuracyLLMOptions): import("@mastra/core/evals").MastraScorer<"llm-trajectory-accuracy-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, Trajectory, Record<"preprocessStepResult", {
43
+ actualTrajectory: Trajectory;
44
+ actualTrajectoryFormatted: string;
45
+ expectedTrajectoryFormatted: string | undefined;
46
+ hasSteps: boolean;
47
+ }> & Record<"analyzeStepResult", {
48
+ stepEvaluations: {
49
+ stepName: string;
50
+ wasNecessary: boolean;
51
+ wasInOrder: boolean;
52
+ reasoning: string;
53
+ }[];
54
+ overallAssessment: string;
55
+ missingSteps?: string[] | undefined;
56
+ extraSteps?: string[] | undefined;
57
+ }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
58
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/trajectory/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,UAAU,EAAyC,MAAM,oBAAoB,CAAC;AAE1G,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAK1D,MAAM,WAAW,4BAA4B;IAC3C,oCAAoC;IACpC,KAAK,EAAE,iBAAiB,CAAC;IACzB,sDAAsD;IACtD,kBAAkB,CAAC,EAAE,UAAU,GAAG,YAAY,EAAE,CAAC;CAClD;AA8DD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,wBAAgB,iCAAiC,CAAC,EAChD,KAAK,EACL,kBAAkB,EAAE,wBAAwB,GAC7C,EAAE,4BAA4B;;;;;;;;;;;;;;;6FA+G9B"}
@@ -0,0 +1,20 @@
1
+ export declare const TRAJECTORY_EVALUATION_INSTRUCTIONS = "\nYou are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.\n\nCORE RESPONSIBILITIES:\n- Analyze the full sequence of actions the agent took\n- Evaluate whether each step was necessary and well-ordered\n- Identify unnecessary, redundant, or missing steps\n- Assess the overall quality of the agent's action path\n\nEVALUATION PHILOSOPHY:\n- Consider both the individual steps AND the overall flow\n- A good trajectory is efficient, logical, and complete\n- Redundant steps reduce quality even if the final result is correct\n- Missing critical steps are a significant issue\n- Order matters: logical dependencies should be respected\n\nOUTPUT REQUIREMENTS:\n- Provide clear reasoning for your trajectory assessment\n- Use provided JSON schema exactly as specified\n- Be consistent in your evaluation standards\n";
2
+ export declare const createAnalyzePrompt: ({ userInput, agentResponse, actualTrajectory, expectedTrajectory, }: {
3
+ userInput: string;
4
+ agentResponse: string;
5
+ actualTrajectory: string;
6
+ expectedTrajectory?: string;
7
+ }) => string;
8
+ export declare const createReasonPrompt: ({ userInput, score, stepEvaluations, missingSteps, extraSteps, }: {
9
+ userInput: string;
10
+ score: number;
11
+ stepEvaluations: Array<{
12
+ stepName: string;
13
+ wasNecessary: boolean;
14
+ wasInOrder: boolean;
15
+ reasoning: string;
16
+ }>;
17
+ missingSteps: string[];
18
+ extraSteps: string[];
19
+ }) => string;
20
+ //# sourceMappingURL=prompts.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/trajectory/prompts.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,kCAAkC,88BAoB9C,CAAC;AAEF,eAAO,MAAM,mBAAmB,GAAI,qEAKjC;IACD,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;IACtB,gBAAgB,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B,KAAG,MA2CH,CAAC;AAEF,eAAO,MAAM,kBAAkB,GAAI,kEAMhC;IACD,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,eAAe,EAAE,KAAK,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,OAAO,CAAC;QAAC,UAAU,EAAE,OAAO,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC5G,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,UAAU,EAAE,MAAM,EAAE,CAAC;CACtB,KAAG,MAYH,CAAC"}