@mastra/evals 1.1.2-alpha.0 → 1.2.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/CHANGELOG.md +59 -2
  2. package/LICENSE.md +15 -0
  3. package/dist/chunk-EVBNIL5M.js +606 -0
  4. package/dist/chunk-EVBNIL5M.js.map +1 -0
  5. package/dist/chunk-XRUR5PBK.cjs +632 -0
  6. package/dist/chunk-XRUR5PBK.cjs.map +1 -0
  7. package/dist/docs/SKILL.md +20 -19
  8. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  9. package/dist/docs/references/docs-evals-built-in-scorers.md +2 -1
  10. package/dist/docs/references/docs-evals-overview.md +11 -16
  11. package/dist/docs/references/reference-evals-answer-relevancy.md +25 -25
  12. package/dist/docs/references/reference-evals-answer-similarity.md +33 -35
  13. package/dist/docs/references/reference-evals-bias.md +24 -24
  14. package/dist/docs/references/reference-evals-completeness.md +19 -20
  15. package/dist/docs/references/reference-evals-content-similarity.md +20 -20
  16. package/dist/docs/references/reference-evals-context-precision.md +36 -36
  17. package/dist/docs/references/reference-evals-context-relevance.md +136 -141
  18. package/dist/docs/references/reference-evals-faithfulness.md +24 -24
  19. package/dist/docs/references/reference-evals-hallucination.md +52 -69
  20. package/dist/docs/references/reference-evals-keyword-coverage.md +18 -18
  21. package/dist/docs/references/reference-evals-noise-sensitivity.md +167 -177
  22. package/dist/docs/references/reference-evals-prompt-alignment.md +111 -116
  23. package/dist/docs/references/reference-evals-scorer-utils.md +285 -105
  24. package/dist/docs/references/reference-evals-textual-difference.md +18 -18
  25. package/dist/docs/references/reference-evals-tone-consistency.md +19 -19
  26. package/dist/docs/references/reference-evals-tool-call-accuracy.md +165 -165
  27. package/dist/docs/references/reference-evals-toxicity.md +21 -21
  28. package/dist/docs/references/reference-evals-trajectory-accuracy.md +613 -0
  29. package/dist/scorers/code/index.d.ts +1 -0
  30. package/dist/scorers/code/index.d.ts.map +1 -1
  31. package/dist/scorers/code/trajectory/index.d.ts +147 -0
  32. package/dist/scorers/code/trajectory/index.d.ts.map +1 -0
  33. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  34. package/dist/scorers/llm/context-precision/index.d.ts +2 -2
  35. package/dist/scorers/llm/context-relevance/index.d.ts +1 -1
  36. package/dist/scorers/llm/faithfulness/index.d.ts +1 -1
  37. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  38. package/dist/scorers/llm/index.d.ts +1 -0
  39. package/dist/scorers/llm/index.d.ts.map +1 -1
  40. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  41. package/dist/scorers/llm/prompt-alignment/index.d.ts +5 -5
  42. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +1 -1
  43. package/dist/scorers/llm/toxicity/index.d.ts +1 -1
  44. package/dist/scorers/llm/trajectory/index.d.ts +58 -0
  45. package/dist/scorers/llm/trajectory/index.d.ts.map +1 -0
  46. package/dist/scorers/llm/trajectory/prompts.d.ts +20 -0
  47. package/dist/scorers/llm/trajectory/prompts.d.ts.map +1 -0
  48. package/dist/scorers/prebuilt/index.cjs +638 -59
  49. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  50. package/dist/scorers/prebuilt/index.js +578 -2
  51. package/dist/scorers/prebuilt/index.js.map +1 -1
  52. package/dist/scorers/utils.cjs +41 -17
  53. package/dist/scorers/utils.d.ts +171 -1
  54. package/dist/scorers/utils.d.ts.map +1 -1
  55. package/dist/scorers/utils.js +1 -1
  56. package/package.json +14 -11
  57. package/dist/chunk-OEOE7ZHN.js +0 -195
  58. package/dist/chunk-OEOE7ZHN.js.map +0 -1
  59. package/dist/chunk-W3U7MMDX.cjs +0 -212
  60. package/dist/chunk-W3U7MMDX.cjs.map +0 -1
package/dist/docs/references/reference-evals-hallucination.md

@@ -1,4 +1,4 @@
-# Hallucination Scorer
+# Hallucination scorer

 The `createHallucinationScorer()` function evaluates whether an LLM generates factually correct information by comparing its output against the provided context. This scorer measures hallucination by identifying direct contradictions between the context and the output.

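In concrete terms, the scorer judges pairings like the following. The values are illustrative, not from the package, and the score direction (higher meaning more hallucinated) is an assumption consistent with the 0-to-scale range documented below:

```typescript
// The context asserts a fact; this output contradicts it, so the
// scorer should push the score toward the top of its range.
const context = ['The first iPhone was announced on January 9, 2007.']
const contradictingOutput = 'The first iPhone was announced in 2008.'

// An output that sticks to the context should score near 0.
const faithfulOutput = 'Apple announced the first iPhone on January 9, 2007.'
```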
@@ -6,47 +6,37 @@ The `createHallucinationScorer()` function evaluates whether an LLM generates fa

 The `createHallucinationScorer()` function accepts a single options object with the following properties:

-**model:** (`LanguageModel`): Configuration for the model used to evaluate hallucination.
+**model** (`LanguageModel`): Configuration for the model used to evaluate hallucination.

-**options.scale:** (`number`): Maximum score value. (Default: `1`)
+**options** (`Options`): Configuration options.

-**options.context:** (`string[]`): Static context strings to use as ground truth for hallucination detection.
+**options.scale** (`number`): Maximum score value.

-**options.getContext:** (`(params: GetContextParams) => string[] | Promise<string[]>`): A hook to dynamically resolve context at runtime. Takes priority over static context. Useful for live scoring where context (like tool results) is only available when the scorer runs.
+**options.context** (`string[]`): Static context strings to use as ground truth for hallucination detection.

-This function returns an instance of the MastraScorer class. The `.run()` method accepts the same input as other scorers (see the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer)), but the return value includes LLM-specific fields as documented below.
-
-### GetContextParams
-
-The `getContext` hook receives the following parameters:
-
-**run:** (`GetContextRun`): The scorer run containing input, output, runId, requestContext, and tracingContext.
+**options.getContext** (`(params: GetContextParams) => string[] | Promise<string[]>`): A hook to dynamically resolve context at runtime. Takes priority over static context. Useful for live scoring where context (like tool results) is only available when the scorer runs.

-**results:** (`Record<string, any>`): Accumulated results from previous steps (e.g., preprocessStepResult with extracted claims).
-
-**score:** (`number`): The computed score. Only present when called from the generateReason step.
-
-**step:** (`'analyze' | 'generateReason'`): Which step is calling the hook. Useful for caching context between calls.
+This function returns an instance of the MastraScorer class. The `.run()` method accepts the same input as other scorers (see the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer)), but the return value includes LLM-specific fields as documented below.

-## .run() Returns
+## `.run()` returns

-**runId:** (`string`): The id of the run (optional).
+**runId** (`string`): The id of the run (optional).

-**preprocessStepResult:** (`object`): Object with extracted claims: { claims: string\[] }
+**preprocessStepResult** (`object`): Object with extracted claims: { claims: string\[] }

-**preprocessPrompt:** (`string`): The prompt sent to the LLM for the preprocess step (optional).
+**preprocessPrompt** (`string`): The prompt sent to the LLM for the preprocess step (optional).

-**analyzeStepResult:** (`object`): Object with verdicts: { verdicts: Array<{ statement: string, verdict: 'yes' | 'no', reason: string }> }
+**analyzeStepResult** (`object`): Object with verdicts: { verdicts: Array<{ statement: string, verdict: 'yes' | 'no', reason: string }> }

-**analyzePrompt:** (`string`): The prompt sent to the LLM for the analyze step (optional).
+**analyzePrompt** (`string`): The prompt sent to the LLM for the analyze step (optional).

-**score:** (`number`): Hallucination score (0 to scale, default 0-1).
+**score** (`number`): Hallucination score (0 to scale, default 0-1).

-**reason:** (`string`): Detailed explanation of the score and identified contradictions.
+**reason** (`string`): Detailed explanation of the score and identified contradictions.

-**generateReasonPrompt:** (`string`): The prompt sent to the LLM for the generateReason step (optional).
+**generateReasonPrompt** (`string`): The prompt sent to the LLM for the generateReason step (optional).

-## Scoring Details
+## Scoring details

 The scorer evaluates hallucination through contradiction detection and unsupported claim analysis.

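Given the verdict shape documented above, the score reduces to a ratio over verdicts. The sketch below is illustrative only (the real computation runs inside the scorer's steps) and assumes a `'yes'` verdict marks a claim contradicted by the context:

```typescript
type Verdict = { statement: string; verdict: 'yes' | 'no'; reason: string }

// Assumption: 'yes' means the claim contradicts the provided context.
function hallucinationScore(verdicts: Verdict[], scale = 1): number {
  if (verdicts.length === 0) return 0
  const contradicted = verdicts.filter(v => v.verdict === 'yes').length
  return (contradicted / verdicts.length) * scale
}

console.log(hallucinationScore([
  { statement: 'Announced in 2007', verdict: 'no', reason: 'Matches context' },
  { statement: 'Released in 2008', verdict: 'yes', reason: 'Contradicts context' },
])) // 0.5 on the default 0-1 scale
```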
@@ -111,40 +101,38 @@ A hallucination score between 0 and 1:

 Use static context when you have known ground truth to compare against:

 ```typescript
-import { createHallucinationScorer } from "@mastra/evals/scorers/prebuilt";
+import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'

 const scorer = createHallucinationScorer({
-  model: "openai/gpt-4o",
+  model: 'openai/gpt-5.4',
   options: {
     context: [
-      "The first iPhone was announced on January 9, 2007.",
-      "It was released on June 29, 2007.",
-      "Steve Jobs introduced it at Macworld.",
+      'The first iPhone was announced on January 9, 2007.',
+      'It was released on June 29, 2007.',
+      'Steve Jobs introduced it at Macworld.',
     ],
   },
-});
+})
 ```

-### Dynamic Context with getContext
+### Dynamic Context with `getContext`

 Use `getContext` for live scoring scenarios where context comes from tool results:

 ```typescript
-import { createHallucinationScorer } from "@mastra/evals/scorers/prebuilt";
-import { extractToolResults } from "@mastra/evals/scorers";
+import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
+import { extractToolResults } from '@mastra/evals/scorers'

 const scorer = createHallucinationScorer({
-  model: "openai/gpt-4o",
+  model: 'openai/gpt-5.4',
   options: {
     getContext: ({ run, step }) => {
       // Extract tool results as context
-      const toolResults = extractToolResults(run.output);
-      return toolResults.map((t) =>
-        JSON.stringify({ tool: t.toolName, result: t.result })
-      );
+      const toolResults = extractToolResults(run.output)
+      return toolResults.map(t => JSON.stringify({ tool: t.toolName, result: t.result }))
     },
   },
-});
+})
 ```

 ### Live Scoring with Agent
@@ -152,62 +140,57 @@ const scorer = createHallucinationScorer({

 Attach the scorer to an agent for live evaluation:

 ```typescript
-import { Agent } from "@mastra/core/agent";
-import { createHallucinationScorer } from "@mastra/evals/scorers/prebuilt";
-import { extractToolResults } from "@mastra/evals/scorers";
+import { Agent } from '@mastra/core/agent'
+import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
+import { extractToolResults } from '@mastra/evals/scorers'

 const hallucinationScorer = createHallucinationScorer({
-  model: "openai/gpt-4o",
+  model: 'openai/gpt-5.4',
   options: {
     getContext: ({ run }) => {
-      const toolResults = extractToolResults(run.output);
-      return toolResults.map((t) =>
-        JSON.stringify({ tool: t.toolName, result: t.result })
-      );
+      const toolResults = extractToolResults(run.output)
+      return toolResults.map(t => JSON.stringify({ tool: t.toolName, result: t.result }))
     },
   },
-});
+})

 const agent = new Agent({
-  name: "my-agent",
-  model: "openai/gpt-4o",
-  instructions: "You are a helpful assistant.",
+  name: 'my-agent',
+  model: 'openai/gpt-5.4',
+  instructions: 'You are a helpful assistant.',
   evals: {
     scorers: [hallucinationScorer],
   },
-});
+})
 ```

-### Batch Evaluation with runEvals
+### Batch Evaluation with `runEvals`

 ```typescript
-import { runEvals } from "@mastra/core/evals";
-import { createHallucinationScorer } from "@mastra/evals/scorers/prebuilt";
-import { myAgent } from "./agent";
+import { runEvals } from '@mastra/core/evals'
+import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'
+import { myAgent } from './agent'

 const scorer = createHallucinationScorer({
-  model: "openai/gpt-4o",
+  model: 'openai/gpt-5.4',
   options: {
-    context: ["Known fact 1", "Known fact 2"],
+    context: ['Known fact 1', 'Known fact 2'],
   },
-});
+})

 const result = await runEvals({
-  data: [
-    { input: "Tell me about topic A" },
-    { input: "Tell me about topic B" },
-  ],
+  data: [{ input: 'Tell me about topic A' }, { input: 'Tell me about topic B' }],
   scorers: [scorer],
   target: myAgent,
   onItemComplete: ({ scorerResults }) => {
     console.log({
       score: scorerResults[scorer.id].score,
       reason: scorerResults[scorer.id].reason,
-    });
+    })
   },
-});
+})

-console.log(result.scores);
+console.log(result.scores)
 ```

 For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
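One pattern the `({ run, step })` hook signature in the dynamic-context example enables: resolving context once and reusing it across scorer steps. A minimal sketch under that assumption; `fetchGroundTruth` is a hypothetical stand-in for a real context source, not part of @mastra/evals:

```typescript
import { createHallucinationScorer } from '@mastra/evals/scorers/prebuilt'

// Hypothetical helper standing in for a retrieval call or other
// context source.
async function fetchGroundTruth(): Promise<string[]> {
  return ['The first iPhone was announced on January 9, 2007.']
}

let cachedContext: string[] | undefined

const scorer = createHallucinationScorer({
  model: 'openai/gpt-5.4',
  options: {
    getContext: async ({ step }) => {
      // `step` identifies which scorer step is asking; caching means
      // the context is resolved once rather than once per step.
      if (!cachedContext) cachedContext = await fetchGroundTruth()
      return cachedContext
    },
  },
})
```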
package/dist/docs/references/reference-evals-keyword-coverage.md

@@ -1,22 +1,22 @@
-# Keyword Coverage Scorer
+# Keyword coverage scorer

 The `createKeywordCoverageScorer()` function evaluates how well an LLM's output covers the important keywords from the input. It analyzes keyword presence and matches while ignoring common words and stop words.

 ## Parameters

-The `createKeywordCoverageScorer()` function does not take any options.
+The `createKeywordCoverageScorer()` function doesn't take any options.

 This function returns an instance of the MastraScorer class. See the [MastraScorer reference](https://mastra.ai/reference/evals/mastra-scorer) for details on the `.run()` method and its input/output.

-## .run() Returns
+## `.run()` returns

-**runId:** (`string`): The id of the run (optional).
+**runId** (`string`): The id of the run (optional).

-**preprocessStepResult:** (`object`): Object with extracted keywords: { referenceKeywords: Set\<string>, responseKeywords: Set\<string> }
+**preprocessStepResult** (`object`): Object with extracted keywords: { referenceKeywords: Set\<string>, responseKeywords: Set\<string> }

-**analyzeStepResult:** (`object`): Object with keyword coverage: { totalKeywords: number, matchedKeywords: number }
+**analyzeStepResult** (`object`): Object with keyword coverage: { totalKeywords: number, matchedKeywords: number }

-**score:** (`number`): Coverage score (0-1) representing the proportion of matched keywords.
+**score** (`number`): Coverage score (0-1) representing the proportion of matched keywords.

 `.run()` returns a result in the following shape:

@@ -35,7 +35,7 @@ This function returns an instance of the MastraScor
 }
 ```

-## Scoring Details
+## Scoring details

 The scorer evaluates keyword coverage by matching keywords with the following features:

@@ -85,23 +85,23 @@ The scorer handles several special cases:
 Evaluate keyword coverage between input queries and agent responses:

 ```typescript
-import { runEvals } from "@mastra/core/evals";
-import { createKeywordCoverageScorer } from "@mastra/evals/scorers/prebuilt";
-import { myAgent } from "./agent";
+import { runEvals } from '@mastra/core/evals'
+import { createKeywordCoverageScorer } from '@mastra/evals/scorers/prebuilt'
+import { myAgent } from './agent'

-const scorer = createKeywordCoverageScorer();
+const scorer = createKeywordCoverageScorer()

 const result = await runEvals({
   data: [
     {
-      input: "JavaScript frameworks like React and Vue",
+      input: 'JavaScript frameworks like React and Vue',
     },
     {
-      input: "TypeScript offers interfaces, generics, and type inference",
+      input: 'TypeScript offers interfaces, generics, and type inference',
     },
     {
       input:
-        "Machine learning models require data preprocessing, feature engineering, and hyperparameter tuning",
+        'Machine learning models require data preprocessing, feature engineering, and hyperparameter tuning',
     },
   ],
   scorers: [scorer],
@@ -109,11 +109,11 @@ const result = await runEvals({
   onItemComplete: ({ scorerResults }) => {
     console.log({
       score: scorerResults[scorer.id].score,
-    });
+    })
   },
-});
+})

-console.log(result.scores);
+console.log(result.scores)
 ```

 For more details on `runEvals`, see the [runEvals reference](https://mastra.ai/reference/evals/run-evals).
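The coverage score in this file reduces to proportion arithmetic over the documented `analyzeStepResult` counts, as in the sketch below. The empty-input behavior is an assumption; the actual special-case handling lives in this file's "special cases" list, which this diff does not touch:

```typescript
// Coverage as matchedKeywords / totalKeywords, per the documented
// analyzeStepResult shape.
function coverageScore(totalKeywords: number, matchedKeywords: number): number {
  // Assumption: with no reference keywords, treat coverage as complete.
  if (totalKeywords === 0) return 1
  return matchedKeywords / totalKeywords
}

console.log(coverageScore(10, 8)) // 0.8
```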