@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +59 -2
- package/LICENSE.md +15 -0
- package/dist/chunk-EVBNIL5M.js +606 -0
- package/dist/chunk-EVBNIL5M.js.map +1 -0
- package/dist/chunk-XRUR5PBK.cjs +632 -0
- package/dist/chunk-XRUR5PBK.cjs.map +1 -0
- package/dist/docs/SKILL.md +20 -19
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
- package/dist/docs/references/docs-evals-overview.md +11 -16
- package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
- package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
- package/dist/docs/references/reference-evals-bias.md +24 -24
- package/dist/docs/references/reference-evals-completeness.md +19 -20
- package/dist/docs/references/reference-evals-content-similarity.md +20 -20
- package/dist/docs/references/reference-evals-context-precision.md +36 -36
- package/dist/docs/references/reference-evals-context-relevance.md +136 -141
- package/dist/docs/references/reference-evals-faithfulness.md +24 -24
- package/dist/docs/references/reference-evals-hallucination.md +52 -69
- package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
- package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
- package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
- package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
- package/dist/docs/references/reference-evals-textual-difference.md +18 -18
- package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
- package/dist/docs/references/reference-evals-toxicity.md +21 -21
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
- package/dist/scorers/code/index.d.ts +1 -0
- package/dist/scorers/code/index.d.ts.map +1 -1
- package/dist/scorers/code/trajectory/index.d.ts +147 -0
- package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/context-precision/index.d.ts +2 -2
- package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +1 -1
- package/dist/scorers/llm/trajectory/index.d.ts +58 -0
- package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
- package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +638 -59
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +578 -2
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +41 -17
- package/dist/scorers/utils.d.ts +171 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +14 -11
- package/dist/chunk-OEOE7ZHN.js +0 -195
- package/dist/chunk-OEOE7ZHN.js.map +0 -1
- package/dist/chunk-W3U7MMDX.cjs +0 -212
- package/dist/chunk-W3U7MMDX.cjs.map +0 -1
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/code/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,sBAAsB,CAAC;AACrC,cAAc,oBAAoB,CAAC;AACnC,cAAc,sBAAsB,CAAC;AACrC,cAAc,QAAQ,CAAC;AACvB,cAAc,sBAAsB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/code/index.ts"],"names":[],"mappings":"AAAA,cAAc,gBAAgB,CAAC;AAC/B,cAAc,sBAAsB,CAAC;AACrC,cAAc,oBAAoB,CAAC;AACnC,cAAc,sBAAsB,CAAC;AACrC,cAAc,QAAQ,CAAC;AACvB,cAAc,sBAAsB,CAAC;AACrC,cAAc,cAAc,CAAC"}
|
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
import type { ExpectedStep, Trajectory, TrajectoryComparisonOptions, TrajectoryExpectation } from '@mastra/core/evals';
|
|
2
|
+
import type { TrajectoryComparisonResult, TrajectoryEfficiencyResult, TrajectoryBlacklistResult, ToolFailureAnalysisResult } from '../../utils.js';
|
|
3
|
+
interface TrajectoryAccuracyScorerCodeOptions {
|
|
4
|
+
/**
|
|
5
|
+
* The expected trajectory to compare against.
|
|
6
|
+
* Accepts a Trajectory (full trajectory steps) or ExpectedStep[] (lightweight matchers).
|
|
7
|
+
* If not provided, the scorer will use `run.expectedTrajectory` from the dataset item.
|
|
8
|
+
*/
|
|
9
|
+
expectedTrajectory?: Trajectory | ExpectedStep[];
|
|
10
|
+
/** Comparison behavior options */
|
|
11
|
+
comparisonOptions?: TrajectoryComparisonOptions;
|
|
12
|
+
}
|
|
13
|
+
/**
|
|
14
|
+
* Creates a code-based trajectory accuracy scorer that compares the actual sequence
|
|
15
|
+
* of tool calls an agent made against an expected trajectory.
|
|
16
|
+
*
|
|
17
|
+
* This scorer extracts the agent's tool call trajectory from its output messages
|
|
18
|
+
* and compares it against a predefined expected trajectory. It supports strict,
|
|
19
|
+
* relaxed, and unordered comparison modes.
|
|
20
|
+
*
|
|
21
|
+
* @param options - Configuration for the trajectory scorer
|
|
22
|
+
* @returns A scorer that evaluates trajectory accuracy
|
|
23
|
+
*
|
|
24
|
+
* @example
|
|
25
|
+
* ```ts
|
|
26
|
+
* import { createTrajectoryAccuracyScorerCode } from '@mastra/evals/scorers';
|
|
27
|
+
*
|
|
28
|
+
* const scorer = createTrajectoryAccuracyScorerCode({
|
|
29
|
+
* expectedTrajectory: {
|
|
30
|
+
* steps: [
|
|
31
|
+
* { stepType: 'tool_call', name: 'search' },
|
|
32
|
+
* { stepType: 'tool_call', name: 'summarize' },
|
|
33
|
+
* ],
|
|
34
|
+
* },
|
|
35
|
+
* comparisonOptions: {
|
|
36
|
+
* ordering: 'relaxed',
|
|
37
|
+
* allowRepeatedSteps: true,
|
|
38
|
+
* },
|
|
39
|
+
* });
|
|
40
|
+
*
|
|
41
|
+
* const result = await scorer.run(agentRun);
|
|
42
|
+
* // result.score: 0.0 - 1.0
|
|
43
|
+
* // result.preprocessStepResult.comparison: detailed comparison results
|
|
44
|
+
* ```
|
|
45
|
+
*/
|
|
46
|
+
export declare function createTrajectoryAccuracyScorerCode(options?: TrajectoryAccuracyScorerCodeOptions): import("@mastra/core/evals").MastraScorer<"code-trajectory-accuracy-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, Trajectory, Record<"preprocessStepResult", {
|
|
47
|
+
actualTrajectory: Trajectory;
|
|
48
|
+
expectedTrajectory: undefined;
|
|
49
|
+
comparison: undefined;
|
|
50
|
+
actualStepNames: string[];
|
|
51
|
+
expectedStepNames: never[];
|
|
52
|
+
error: string;
|
|
53
|
+
} | {
|
|
54
|
+
actualTrajectory: Trajectory;
|
|
55
|
+
expectedTrajectory: {
|
|
56
|
+
steps: ExpectedStep[];
|
|
57
|
+
};
|
|
58
|
+
comparison: TrajectoryComparisonResult;
|
|
59
|
+
actualStepNames: string[];
|
|
60
|
+
expectedStepNames: string[];
|
|
61
|
+
error?: undefined;
|
|
62
|
+
}> & Record<"generateScoreStepResult", number>>;
|
|
63
|
+
/**
|
|
64
|
+
* Result from evaluating a nested step's children against its TrajectoryExpectation.
|
|
65
|
+
*/
|
|
66
|
+
export type NestedEvaluationResult = {
|
|
67
|
+
/** Name of the expected step that contained the nested config */
|
|
68
|
+
stepName: string;
|
|
69
|
+
/** Score for this nested evaluation (0.0 - 1.0) */
|
|
70
|
+
score: number;
|
|
71
|
+
/** Accuracy result for the children */
|
|
72
|
+
accuracy?: TrajectoryComparisonResult;
|
|
73
|
+
/** Efficiency result for the children */
|
|
74
|
+
efficiency?: TrajectoryEfficiencyResult;
|
|
75
|
+
/** Blacklist result for the children */
|
|
76
|
+
blacklist?: TrajectoryBlacklistResult;
|
|
77
|
+
/** Tool failure result for the children */
|
|
78
|
+
toolFailures?: ToolFailureAnalysisResult;
|
|
79
|
+
/** Further nested results from deeper levels */
|
|
80
|
+
nested?: NestedEvaluationResult[];
|
|
81
|
+
};
|
|
82
|
+
/**
|
|
83
|
+
* Multi-dimensional result from the unified trajectory scorer.
|
|
84
|
+
*/
|
|
85
|
+
export type TrajectoryScoreResult = {
|
|
86
|
+
/** Overall score (0.0 - 1.0). Weighted combination of dimensions (0.0 if blacklist violation). */
|
|
87
|
+
score: number;
|
|
88
|
+
/** Accuracy sub-score (step matching). Only present if expected steps were provided. */
|
|
89
|
+
accuracy?: TrajectoryComparisonResult;
|
|
90
|
+
/** Efficiency sub-score (budgets + redundancy). */
|
|
91
|
+
efficiency?: TrajectoryEfficiencyResult;
|
|
92
|
+
/** Blacklist sub-score (forbidden tools/sequences). */
|
|
93
|
+
blacklist?: TrajectoryBlacklistResult;
|
|
94
|
+
/** Tool failure analysis. */
|
|
95
|
+
toolFailures?: ToolFailureAnalysisResult;
|
|
96
|
+
/** Results from evaluating nested step expectations. */
|
|
97
|
+
nested?: NestedEvaluationResult[];
|
|
98
|
+
};
|
|
99
|
+
interface TrajectoryScorerCodeOptions {
|
|
100
|
+
/**
|
|
101
|
+
* Default expectation config for all runs.
|
|
102
|
+
* Per-item `run.expectedTrajectory` values override these defaults.
|
|
103
|
+
*/
|
|
104
|
+
defaults?: TrajectoryExpectation;
|
|
105
|
+
}
|
|
106
|
+
/**
|
|
107
|
+
* Creates a unified trajectory scorer that evaluates multiple dimensions:
|
|
108
|
+
* accuracy (step matching), efficiency (budgets, redundancy), blacklist (forbidden tools/sequences),
|
|
109
|
+
* and tool failure patterns.
|
|
110
|
+
*
|
|
111
|
+
* Configuration can be set at two levels:
|
|
112
|
+
* - **Constructor defaults** (`defaults`) — agent-level defaults for all dataset items
|
|
113
|
+
* - **Per-item overrides** (`run.expectedTrajectory`) — prompt-specific overrides from dataset items
|
|
114
|
+
*
|
|
115
|
+
* Per-item values override constructor defaults for all fields.
|
|
116
|
+
*
|
|
117
|
+
* @param options - Default trajectory expectations
|
|
118
|
+
* @returns A scorer with multi-dimensional trajectory evaluation
|
|
119
|
+
*
|
|
120
|
+
* @example
|
|
121
|
+
* ```ts
|
|
122
|
+
* import { createTrajectoryScorerCode } from '@mastra/evals/scorers';
|
|
123
|
+
*
|
|
124
|
+
* const scorer = createTrajectoryScorerCode({
|
|
125
|
+
* defaults: {
|
|
126
|
+
* steps: [
|
|
127
|
+
* { stepType: 'tool_call', name: 'search' },
|
|
128
|
+
* { stepType: 'tool_call', name: 'summarize' },
|
|
129
|
+
* ],
|
|
130
|
+
* ordering: 'relaxed',
|
|
131
|
+
* maxSteps: 5,
|
|
132
|
+
* noRedundantCalls: true,
|
|
133
|
+
* blacklistedTools: ['deleteAll'],
|
|
134
|
+
* },
|
|
135
|
+
* });
|
|
136
|
+
* ```
|
|
137
|
+
*/
|
|
138
|
+
export declare function createTrajectoryScorerCode(options?: TrajectoryScorerCodeOptions): import("@mastra/core/evals").MastraScorer<"code-trajectory-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, Trajectory, Record<"preprocessStepResult", {
|
|
139
|
+
accuracy: TrajectoryComparisonResult | undefined;
|
|
140
|
+
efficiency: TrajectoryEfficiencyResult | undefined;
|
|
141
|
+
blacklist: TrajectoryBlacklistResult | undefined;
|
|
142
|
+
toolFailures: ToolFailureAnalysisResult;
|
|
143
|
+
nested: NestedEvaluationResult[] | undefined;
|
|
144
|
+
config: TrajectoryExpectation;
|
|
145
|
+
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
146
|
+
export {};
|
|
147
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/scorers/code/trajectory/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,YAAY,EACZ,UAAU,EACV,2BAA2B,EAC3B,qBAAqB,EAEtB,MAAM,oBAAoB,CAAC;AAQ5B,OAAO,KAAK,EACV,0BAA0B,EAC1B,0BAA0B,EAC1B,yBAAyB,EACzB,yBAAyB,EAC1B,MAAM,aAAa,CAAC;AAErB,UAAU,mCAAmC;IAC3C;;;;OAIG;IACH,kBAAkB,CAAC,EAAE,UAAU,GAAG,YAAY,EAAE,CAAC;IACjD,kCAAkC;IAClC,iBAAiB,CAAC,EAAE,2BAA2B,CAAC;CACjD;AAiCD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,wBAAgB,kCAAkC,CAAC,OAAO,GAAE,mCAAwC;;;;;;;;;;;;;;;;gDAuFnG;AAID;;GAEG;AACH,MAAM,MAAM,sBAAsB,GAAG;IACnC,iEAAiE;IACjE,QAAQ,EAAE,MAAM,CAAC;IACjB,mDAAmD;IACnD,KAAK,EAAE,MAAM,CAAC;IACd,uCAAuC;IACvC,QAAQ,CAAC,EAAE,0BAA0B,CAAC;IACtC,yCAAyC;IACzC,UAAU,CAAC,EAAE,0BAA0B,CAAC;IACxC,wCAAwC;IACxC,SAAS,CAAC,EAAE,yBAAyB,CAAC;IACtC,2CAA2C;IAC3C,YAAY,CAAC,EAAE,yBAAyB,CAAC;IACzC,gDAAgD;IAChD,MAAM,CAAC,EAAE,sBAAsB,EAAE,CAAC;CACnC,CAAC;AAwJF;;GAEG;AACH,MAAM,MAAM,qBAAqB,GAAG;IAClC,kGAAkG;IAClG,KAAK,EAAE,MAAM,CAAC;IACd,wFAAwF;IACxF,QAAQ,CAAC,EAAE,0BAA0B,CAAC;IACtC,mDAAmD;IACnD,UAAU,CAAC,EAAE,0BAA0B,CAAC;IACxC,uDAAuD;IACvD,SAAS,CAAC,EAAE,yBAAyB,CAAC;IACtC,6BAA6B;IAC7B,YAAY,CAAC,EAAE,yBAAyB,CAAC;IACzC,wDAAwD;IACxD,MAAM,CAAC,EAAE,sBAAsB,EAAE,CAAC;CACnC,CAAC;AAEF,UAAU,2BAA2B;IACnC;;;OAGG;IACH,QAAQ,CAAC,EAAE,qBAAqB,CAAC;CAClC;AAED;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA+BG;AACH,wBAAgB,0BAA0B,CAAC,OAAO,GAAE,2BAAgC;;;;;;;6FAqNnF"}
|
|
@@ -20,13 +20,13 @@ export declare function createAnswerSimilarityScorer({ model, options, }: {
|
|
|
20
20
|
matches: {
|
|
21
21
|
groundTruthUnit: string;
|
|
22
22
|
outputUnit: string | null;
|
|
23
|
-
matchType: "exact" | "
|
|
23
|
+
matchType: "exact" | "partial" | "semantic" | "missing";
|
|
24
24
|
explanation: string;
|
|
25
25
|
}[];
|
|
26
26
|
extraInOutput: string[];
|
|
27
27
|
contradictions: {
|
|
28
|
-
groundTruthUnit: string;
|
|
29
28
|
outputUnit: string;
|
|
29
|
+
groundTruthUnit: string;
|
|
30
30
|
explanation: string;
|
|
31
31
|
}[];
|
|
32
32
|
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
@@ -10,9 +10,9 @@ export declare function createContextPrecisionScorer({ model, options, }: {
|
|
|
10
10
|
options: ContextPrecisionMetricOptions;
|
|
11
11
|
}): import("@mastra/core/evals").MastraScorer<"context-precision-scorer", ScorerRunInputForAgent, ScorerRunOutputForAgent, Record<"analyzeStepResult", {
|
|
12
12
|
verdicts: {
|
|
13
|
-
reason: string;
|
|
14
|
-
verdict: string;
|
|
15
13
|
context_index: number;
|
|
14
|
+
verdict: string;
|
|
15
|
+
reason: string;
|
|
16
16
|
}[];
|
|
17
17
|
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
18
18
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -15,11 +15,11 @@ export declare function createContextRelevanceScorerLLM({ model, options, }: {
|
|
|
15
15
|
options: ContextRelevanceOptions;
|
|
16
16
|
}): import("@mastra/core/evals").MastraScorer<"context-relevance-scorer", ScorerRunInputForAgent, ScorerRunOutputForAgent, Record<"analyzeStepResult", {
|
|
17
17
|
evaluations: {
|
|
18
|
-
reasoning: string;
|
|
19
18
|
context_index: number;
|
|
20
19
|
contextPiece: string;
|
|
21
20
|
relevanceLevel: "high" | "medium" | "low" | "none";
|
|
22
21
|
wasUsed: boolean;
|
|
22
|
+
reasoning: string;
|
|
23
23
|
}[];
|
|
24
24
|
overallAssessment: string;
|
|
25
25
|
missingContext?: string[] | undefined;
|
|
@@ -10,8 +10,8 @@ export declare function createFaithfulnessScorer({ model, options, }: {
|
|
|
10
10
|
claims: string[];
|
|
11
11
|
}> & Record<"analyzeStepResult", {
|
|
12
12
|
verdicts: {
|
|
13
|
-
reason: string;
|
|
14
13
|
verdict: string;
|
|
14
|
+
reason: string;
|
|
15
15
|
}[];
|
|
16
16
|
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
17
17
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -27,9 +27,9 @@ export declare function createHallucinationScorer({ model, options, }: {
|
|
|
27
27
|
claims: string[];
|
|
28
28
|
}> & Record<"analyzeStepResult", {
|
|
29
29
|
verdicts: {
|
|
30
|
-
reason: string;
|
|
31
|
-
verdict: string;
|
|
32
30
|
statement: string;
|
|
31
|
+
verdict: string;
|
|
32
|
+
reason: string;
|
|
33
33
|
}[];
|
|
34
34
|
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
35
35
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -9,4 +9,5 @@ export * from './context-relevance/index.js';
|
|
|
9
9
|
export * from './context-precision/index.js';
|
|
10
10
|
export * from './noise-sensitivity/index.js';
|
|
11
11
|
export * from './prompt-alignment/index.js';
|
|
12
|
+
export * from './trajectory/index.js';
|
|
12
13
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC;AACnC,cAAc,cAAc,CAAC"}
|
|
@@ -22,13 +22,13 @@ export declare function createNoiseSensitivityScorerLLM({ model, options, }: {
|
|
|
22
22
|
model: MastraModelConfig;
|
|
23
23
|
options: NoiseSensitivityOptions;
|
|
24
24
|
}): import("@mastra/core/evals").MastraScorer<"noise-sensitivity-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, import("@mastra/core/evals").ScorerRunOutputForAgent, Record<"analyzeStepResult", {
|
|
25
|
-
overallAssessment: string;
|
|
26
25
|
dimensions: {
|
|
27
26
|
dimension: string;
|
|
28
27
|
impactLevel: "none" | "minimal" | "moderate" | "significant" | "severe";
|
|
29
28
|
specificChanges: string;
|
|
30
29
|
noiseInfluence: string;
|
|
31
30
|
}[];
|
|
31
|
+
overallAssessment: string;
|
|
32
32
|
robustnessScore: number;
|
|
33
33
|
majorIssues?: string[] | undefined;
|
|
34
34
|
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
@@ -8,31 +8,31 @@ export declare function createPromptAlignmentScorerLLM({ model, options, }: {
|
|
|
8
8
|
model: MastraModelConfig;
|
|
9
9
|
options?: PromptAlignmentOptions;
|
|
10
10
|
}): import("@mastra/core/evals").MastraScorer<string, ScorerRunInputForAgent, ScorerRunOutputForAgent, Record<"analyzeStepResult", {
|
|
11
|
-
overallAssessment: string;
|
|
12
11
|
intentAlignment: {
|
|
13
|
-
reasoning: string;
|
|
14
12
|
score: number;
|
|
15
13
|
primaryIntent: string;
|
|
16
14
|
isAddressed: boolean;
|
|
15
|
+
reasoning: string;
|
|
17
16
|
};
|
|
18
17
|
requirementsFulfillment: {
|
|
19
18
|
requirements: {
|
|
20
|
-
reasoning: string;
|
|
21
19
|
requirement: string;
|
|
22
20
|
isFulfilled: boolean;
|
|
21
|
+
reasoning: string;
|
|
23
22
|
}[];
|
|
24
23
|
overallScore: number;
|
|
25
24
|
};
|
|
26
25
|
completeness: {
|
|
27
|
-
reasoning: string;
|
|
28
26
|
score: number;
|
|
29
27
|
missingElements: string[];
|
|
28
|
+
reasoning: string;
|
|
30
29
|
};
|
|
31
30
|
responseAppropriateness: {
|
|
32
|
-
reasoning: string;
|
|
33
31
|
score: number;
|
|
34
32
|
formatAlignment: boolean;
|
|
35
33
|
toneAlignment: boolean;
|
|
34
|
+
reasoning: string;
|
|
36
35
|
};
|
|
36
|
+
overallAssessment: string;
|
|
37
37
|
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
38
38
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -10,9 +10,9 @@ export declare function createToolCallAccuracyScorerLLM({ model, availableTools
|
|
|
10
10
|
toolCallInfos: import("../..").ToolCallInfo[];
|
|
11
11
|
}> & Record<"analyzeStepResult", {
|
|
12
12
|
evaluations: {
|
|
13
|
-
reasoning: string;
|
|
14
13
|
toolCalled: string;
|
|
15
14
|
wasAppropriate: boolean;
|
|
15
|
+
reasoning: string;
|
|
16
16
|
}[];
|
|
17
17
|
missingTools?: string[] | undefined;
|
|
18
18
|
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
@@ -7,8 +7,8 @@ export declare function createToxicityScorer({ model, options, }: {
|
|
|
7
7
|
options?: ToxicityMetricOptions;
|
|
8
8
|
}): import("@mastra/core/evals").MastraScorer<"toxicity-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, import("@mastra/core/evals").ScorerRunOutputForAgent, Record<"analyzeStepResult", {
|
|
9
9
|
verdicts: {
|
|
10
|
-
reason: string;
|
|
11
10
|
verdict: string;
|
|
11
|
+
reason: string;
|
|
12
12
|
}[];
|
|
13
13
|
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
14
14
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
import type { ExpectedStep, Trajectory } from '@mastra/core/evals';
|
|
2
|
+
import type { MastraModelConfig } from '@mastra/core/llm';
|
|
3
|
+
export interface TrajectoryAccuracyLLMOptions {
|
|
4
|
+
/** The LLM model to use as judge */
|
|
5
|
+
model: MastraModelConfig;
|
|
6
|
+
/** Optional expected trajectory to compare against */
|
|
7
|
+
expectedTrajectory?: Trajectory | ExpectedStep[];
|
|
8
|
+
}
|
|
9
|
+
/**
|
|
10
|
+
* Creates an LLM-based trajectory accuracy scorer that evaluates the quality
|
|
11
|
+
* of an agent's action sequence using an LLM judge.
|
|
12
|
+
*
|
|
13
|
+
* This scorer extracts the agent's tool call trajectory and asks an LLM to evaluate
|
|
14
|
+
* whether the trajectory was appropriate, efficient, and complete. When an expected
|
|
15
|
+
* trajectory is provided, it compares against it. Otherwise, it evaluates the trajectory
|
|
16
|
+
* based on the task requirements.
|
|
17
|
+
*
|
|
18
|
+
* @param options - Configuration for the trajectory scorer
|
|
19
|
+
* @returns A scorer that evaluates trajectory quality
|
|
20
|
+
*
|
|
21
|
+
* @example
|
|
22
|
+
* ```ts
|
|
23
|
+
* import { createTrajectoryAccuracyScorerLLM } from '@mastra/evals/scorers';
|
|
24
|
+
*
|
|
25
|
+
* // Without expected trajectory (evaluates quality based on task)
|
|
26
|
+
* const scorer = createTrajectoryAccuracyScorerLLM({
|
|
27
|
+
* model: { provider: 'openai', name: 'gpt-4o' },
|
|
28
|
+
* });
|
|
29
|
+
*
|
|
30
|
+
* // With expected trajectory
|
|
31
|
+
* const scorerWithExpected = createTrajectoryAccuracyScorerLLM({
|
|
32
|
+
* model: { provider: 'openai', name: 'gpt-4o' },
|
|
33
|
+
* expectedTrajectory: {
|
|
34
|
+
* steps: [
|
|
35
|
+
* { stepType: 'tool_call', name: 'search' },
|
|
36
|
+
* { stepType: 'tool_call', name: 'summarize' },
|
|
37
|
+
* ],
|
|
38
|
+
* },
|
|
39
|
+
* });
|
|
40
|
+
* ```
|
|
41
|
+
*/
|
|
42
|
+
export declare function createTrajectoryAccuracyScorerLLM({ model, expectedTrajectory: staticExpectedTrajectory, }: TrajectoryAccuracyLLMOptions): import("@mastra/core/evals").MastraScorer<"llm-trajectory-accuracy-scorer", import("@mastra/core/evals").ScorerRunInputForAgent, Trajectory, Record<"preprocessStepResult", {
|
|
43
|
+
actualTrajectory: Trajectory;
|
|
44
|
+
actualTrajectoryFormatted: string;
|
|
45
|
+
expectedTrajectoryFormatted: string | undefined;
|
|
46
|
+
hasSteps: boolean;
|
|
47
|
+
}> & Record<"analyzeStepResult", {
|
|
48
|
+
stepEvaluations: {
|
|
49
|
+
stepName: string;
|
|
50
|
+
wasNecessary: boolean;
|
|
51
|
+
wasInOrder: boolean;
|
|
52
|
+
reasoning: string;
|
|
53
|
+
}[];
|
|
54
|
+
overallAssessment: string;
|
|
55
|
+
missingSteps?: string[] | undefined;
|
|
56
|
+
extraSteps?: string[] | undefined;
|
|
57
|
+
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
58
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/trajectory/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,YAAY,EAAE,UAAU,EAAyC,MAAM,oBAAoB,CAAC;AAE1G,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAK1D,MAAM,WAAW,4BAA4B;IAC3C,oCAAoC;IACpC,KAAK,EAAE,iBAAiB,CAAC;IACzB,sDAAsD;IACtD,kBAAkB,CAAC,EAAE,UAAU,GAAG,YAAY,EAAE,CAAC;CAClD;AA8DD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAgCG;AACH,wBAAgB,iCAAiC,CAAC,EAChD,KAAK,EACL,kBAAkB,EAAE,wBAAwB,GAC7C,EAAE,4BAA4B;;;;;;;;;;;;;;;6FA+G9B"}
|
|
@@ -0,0 +1,20 @@
|
|
|
1
|
+
export declare const TRAJECTORY_EVALUATION_INSTRUCTIONS = "\nYou are an expert evaluator specializing in AI agent trajectory analysis. Your role is to assess whether an agent took an appropriate sequence of actions (tool calls, reasoning steps) to accomplish a user's request.\n\nCORE RESPONSIBILITIES:\n- Analyze the full sequence of actions the agent took\n- Evaluate whether each step was necessary and well-ordered\n- Identify unnecessary, redundant, or missing steps\n- Assess the overall quality of the agent's action path\n\nEVALUATION PHILOSOPHY:\n- Consider both the individual steps AND the overall flow\n- A good trajectory is efficient, logical, and complete\n- Redundant steps reduce quality even if the final result is correct\n- Missing critical steps are a significant issue\n- Order matters: logical dependencies should be respected\n\nOUTPUT REQUIREMENTS:\n- Provide clear reasoning for your trajectory assessment\n- Use provided JSON schema exactly as specified\n- Be consistent in your evaluation standards\n";
|
|
2
|
+
export declare const createAnalyzePrompt: ({ userInput, agentResponse, actualTrajectory, expectedTrajectory, }: {
|
|
3
|
+
userInput: string;
|
|
4
|
+
agentResponse: string;
|
|
5
|
+
actualTrajectory: string;
|
|
6
|
+
expectedTrajectory?: string;
|
|
7
|
+
}) => string;
|
|
8
|
+
export declare const createReasonPrompt: ({ userInput, score, stepEvaluations, missingSteps, extraSteps, }: {
|
|
9
|
+
userInput: string;
|
|
10
|
+
score: number;
|
|
11
|
+
stepEvaluations: Array<{
|
|
12
|
+
stepName: string;
|
|
13
|
+
wasNecessary: boolean;
|
|
14
|
+
wasInOrder: boolean;
|
|
15
|
+
reasoning: string;
|
|
16
|
+
}>;
|
|
17
|
+
missingSteps: string[];
|
|
18
|
+
extraSteps: string[];
|
|
19
|
+
}) => string;
|
|
20
|
+
//# sourceMappingURL=prompts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/trajectory/prompts.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,kCAAkC,88BAoB9C,CAAC;AAEF,eAAO,MAAM,mBAAmB,GAAI,qEAKjC;IACD,SAAS,EAAE,MAAM,CAAC;IAClB,aAAa,EAAE,MAAM,CAAC;IACtB,gBAAgB,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B,KAAG,MA2CH,CAAC;AAEF,eAAO,MAAM,kBAAkB,GAAI,kEAMhC;IACD,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,eAAe,EAAE,KAAK,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,YAAY,EAAE,OAAO,CAAC;QAAC,UAAU,EAAE,OAAO,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE,CAAC,CAAC;IAC5G,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,UAAU,EAAE,MAAM,EAAE,CAAC;CACtB,KAAG,MAYH,CAAC"}
|