@mastra/evals 1.2.4-alpha.0 → 1.3.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +41 -0
- package/dist/{chunk-XOXUFZEG.js → chunk-BE5F2OUQ.js} +5 -4
- package/dist/chunk-BE5F2OUQ.js.map +1 -0
- package/dist/{chunk-BULMCHKJ.cjs → chunk-UNQXHPOD.cjs} +5 -4
- package/dist/{chunk-XOXUFZEG.js.map → chunk-UNQXHPOD.cjs.map} +1 -1
- package/dist/docs/SKILL.md +2 -1
- package/dist/docs/assets/SOURCE_MAP.json +1 -1
- package/dist/docs/references/docs-evals-overview.md +2 -2
- package/dist/docs/references/reference-evals-answer-relevancy.md +1 -1
- package/dist/docs/references/reference-evals-answer-similarity.md +1 -1
- package/dist/docs/references/reference-evals-bias.md +1 -1
- package/dist/docs/references/reference-evals-context-precision.md +3 -3
- package/dist/docs/references/reference-evals-context-relevance.md +11 -11
- package/dist/docs/references/reference-evals-faithfulness.md +1 -1
- package/dist/docs/references/reference-evals-hallucination.md +5 -5
- package/dist/docs/references/reference-evals-noise-sensitivity.md +11 -11
- package/dist/docs/references/reference-evals-prompt-alignment.md +15 -15
- package/dist/docs/references/reference-evals-rubric.md +113 -0
- package/dist/docs/references/reference-evals-tool-call-accuracy.md +3 -3
- package/dist/docs/references/reference-evals-toxicity.md +1 -1
- package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/rubric/index.d.ts +71 -0
- package/dist/scorers/llm/rubric/index.d.ts.map +1 -0
- package/dist/scorers/llm/rubric/prompts.d.ts +37 -0
- package/dist/scorers/llm/rubric/prompts.d.ts.map +1 -0
- package/dist/scorers/prebuilt/index.cjs +276 -78
- package/dist/scorers/prebuilt/index.cjs.map +1 -1
- package/dist/scorers/prebuilt/index.js +203 -6
- package/dist/scorers/prebuilt/index.js.map +1 -1
- package/dist/scorers/utils.cjs +25 -25
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +9 -8
- package/dist/chunk-BULMCHKJ.cjs.map +0 -1
|
@@ -61,7 +61,7 @@ describe('Agent Noise Resistance Tests', () => {
|
|
|
61
61
|
|
|
62
62
|
// Step 4: Evaluate using noise sensitivity scorer
|
|
63
63
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
64
|
-
model: 'openai/gpt-5.
|
|
64
|
+
model: 'openai/gpt-5.5',
|
|
65
65
|
options: {
|
|
66
66
|
baselineResponse,
|
|
67
67
|
noisyQuery,
|
|
@@ -256,7 +256,7 @@ describe('Agent Noise Resistance CI Tests', () => {
|
|
|
256
256
|
|
|
257
257
|
// Evaluate using noise sensitivity scorer
|
|
258
258
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
259
|
-
model: 'openai/gpt-5.
|
|
259
|
+
model: 'openai/gpt-5.5',
|
|
260
260
|
options: {
|
|
261
261
|
baselineResponse: testCase.baselineResponse,
|
|
262
262
|
noisyQuery: testCase.noisyQuery,
|
|
@@ -291,7 +291,7 @@ This example shows an agent that completely resists misinformation in a test sce
|
|
|
291
291
|
import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
292
292
|
|
|
293
293
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
294
|
-
model: 'openai/gpt-5.
|
|
294
|
+
model: 'openai/gpt-5.5',
|
|
295
295
|
options: {
|
|
296
296
|
baselineResponse:
|
|
297
297
|
'Regular exercise improves cardiovascular health, strengthens muscles, and enhances mental wellbeing.',
|
|
@@ -337,7 +337,7 @@ This example shows an agent partially distracted by irrelevant requests:
|
|
|
337
337
|
import { createNoiseSensitivityScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
338
338
|
|
|
339
339
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
340
|
-
model: 'openai/gpt-5.
|
|
340
|
+
model: 'openai/gpt-5.5',
|
|
341
341
|
options: {
|
|
342
342
|
baselineResponse:
|
|
343
343
|
'To bake a cake: Mix flour, sugar, eggs, and butter. Bake at 350°F for 30 minutes.',
|
|
@@ -382,7 +382,7 @@ This example shows an agent that incorporates misinformation:
|
|
|
382
382
|
import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
383
383
|
|
|
384
384
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
385
|
-
model: 'openai/gpt-5.
|
|
385
|
+
model: 'openai/gpt-5.5',
|
|
386
386
|
options: {
|
|
387
387
|
baselineResponse: 'Climate change is caused by greenhouse gas emissions from human activities.',
|
|
388
388
|
noisyQuery:
|
|
@@ -428,7 +428,7 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
|
428
428
|
|
|
429
429
|
// Lenient scoring - more forgiving of minor issues
|
|
430
430
|
const lenientScorer = createNoiseSensitivityScorerLLM({
|
|
431
|
-
model: 'openai/gpt-5.
|
|
431
|
+
model: 'openai/gpt-5.5',
|
|
432
432
|
options: {
|
|
433
433
|
baselineResponse: 'Python is a high-level programming language.',
|
|
434
434
|
noisyQuery: 'What is Python? Also, snakes are dangerous!',
|
|
@@ -448,7 +448,7 @@ const lenientScorer = createNoiseSensitivityScorerLLM({
|
|
|
448
448
|
|
|
449
449
|
// Strict scoring - harsh on any deviation
|
|
450
450
|
const strictScorer = createNoiseSensitivityScorerLLM({
|
|
451
|
-
model: 'openai/gpt-5.
|
|
451
|
+
model: 'openai/gpt-5.5',
|
|
452
452
|
options: {
|
|
453
453
|
baselineResponse: 'Python is a high-level programming language.',
|
|
454
454
|
noisyQuery: 'What is Python? Also, snakes are dangerous!',
|
|
@@ -499,7 +499,7 @@ async function evaluateNoiseResistance(testCases) {
|
|
|
499
499
|
|
|
500
500
|
for (const testCase of testCases) {
|
|
501
501
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
502
|
-
model: 'openai/gpt-5.
|
|
502
|
+
model: 'openai/gpt-5.5',
|
|
503
503
|
options: {
|
|
504
504
|
baselineResponse: testCase.baseline,
|
|
505
505
|
noisyQuery: testCase.noisyQuery,
|
|
@@ -546,9 +546,9 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
|
546
546
|
|
|
547
547
|
async function compareModelRobustness() {
|
|
548
548
|
const models = [
|
|
549
|
-
{ name: 'GPT-5.4', model: 'openai/gpt-5.
|
|
549
|
+
{ name: 'GPT-5.5', model: 'openai/gpt-5.5' },
|
|
550
550
|
{ name: 'GPT-5-mini', model: 'openai/gpt-5-mini' },
|
|
551
|
-
{ name: 'Claude', model: 'anthropic/claude-opus-4-
|
|
551
|
+
{ name: 'Claude', model: 'anthropic/claude-opus-4-7' },
|
|
552
552
|
]
|
|
553
553
|
|
|
554
554
|
const testScenario = {
|
|
@@ -598,7 +598,7 @@ Include noise sensitivity tests in your security test suite to validate prompt i
|
|
|
598
598
|
import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
|
|
599
599
|
|
|
600
600
|
const scorer = createNoiseSensitivityScorerLLM({
|
|
601
|
-
model: 'openai/gpt-5.
|
|
601
|
+
model: 'openai/gpt-5.5',
|
|
602
602
|
options: {
|
|
603
603
|
baselineResponse: 'I can help you with programming questions.',
|
|
604
604
|
noisyQuery:
|
|
@@ -60,7 +60,7 @@ You can customize the Prompt Alignment Scorer by adjusting the scale parameter a
|
|
|
60
60
|
|
|
61
61
|
```typescript
|
|
62
62
|
const scorer = createPromptAlignmentScorerLLM({
|
|
63
|
-
model: 'openai/gpt-5.
|
|
63
|
+
model: 'openai/gpt-5.5',
|
|
64
64
|
options: {
|
|
65
65
|
scale: 10, // Score from 0-10 instead of 0-1
|
|
66
66
|
evaluationMode: 'both', // 'user', 'system', or 'both' (default)
|
|
@@ -221,24 +221,24 @@ Measure how well your AI agents follow user instructions:
|
|
|
221
221
|
const agent = new Agent({
|
|
222
222
|
name: 'CodingAssistant',
|
|
223
223
|
instructions: 'You are a helpful coding assistant. Always provide working code examples.',
|
|
224
|
-
model: 'openai/gpt-5.
|
|
224
|
+
model: 'openai/gpt-5.5',
|
|
225
225
|
})
|
|
226
226
|
|
|
227
227
|
// Evaluate comprehensive alignment (default)
|
|
228
228
|
const scorer = createPromptAlignmentScorerLLM({
|
|
229
|
-
model: 'openai/gpt-5.
|
|
229
|
+
model: 'openai/gpt-5.5',
|
|
230
230
|
options: { evaluationMode: 'both' }, // Evaluates both user intent and system guidelines
|
|
231
231
|
})
|
|
232
232
|
|
|
233
233
|
// Evaluate just user satisfaction
|
|
234
234
|
const userScorer = createPromptAlignmentScorerLLM({
|
|
235
|
-
model: 'openai/gpt-5.
|
|
235
|
+
model: 'openai/gpt-5.5',
|
|
236
236
|
options: { evaluationMode: 'user' }, // Focus only on user request fulfillment
|
|
237
237
|
})
|
|
238
238
|
|
|
239
239
|
// Evaluate system compliance
|
|
240
240
|
const systemScorer = createPromptAlignmentScorerLLM({
|
|
241
|
-
model: 'openai/gpt-5.
|
|
241
|
+
model: 'openai/gpt-5.5',
|
|
242
242
|
options: { evaluationMode: 'system' }, // Check adherence to system instructions
|
|
243
243
|
})
|
|
244
244
|
|
|
@@ -290,7 +290,7 @@ for (const agent of agents) {
|
|
|
290
290
|
import { createPromptAlignmentScorerLLM } from '@mastra/evals'
|
|
291
291
|
|
|
292
292
|
const scorer = createPromptAlignmentScorerLLM({
|
|
293
|
-
model: 'openai/gpt-5.
|
|
293
|
+
model: 'openai/gpt-5.5',
|
|
294
294
|
})
|
|
295
295
|
|
|
296
296
|
// Evaluate a code generation task
|
|
@@ -319,7 +319,7 @@ const result = await scorer.run({
|
|
|
319
319
|
```typescript
|
|
320
320
|
// Configure scale and evaluation mode
|
|
321
321
|
const scorer = createPromptAlignmentScorerLLM({
|
|
322
|
-
model: 'openai/gpt-5.
|
|
322
|
+
model: 'openai/gpt-5.5',
|
|
323
323
|
options: {
|
|
324
324
|
scale: 10, // Score from 0-10 instead of 0-1
|
|
325
325
|
evaluationMode: 'both', // 'user', 'system', or 'both' (default)
|
|
@@ -328,13 +328,13 @@ const scorer = createPromptAlignmentScorerLLM({
|
|
|
328
328
|
|
|
329
329
|
// User-only evaluation - focus on user satisfaction
|
|
330
330
|
const userScorer = createPromptAlignmentScorerLLM({
|
|
331
|
-
model: 'openai/gpt-5.
|
|
331
|
+
model: 'openai/gpt-5.5',
|
|
332
332
|
options: { evaluationMode: 'user' },
|
|
333
333
|
})
|
|
334
334
|
|
|
335
335
|
// System-only evaluation - focus on compliance
|
|
336
336
|
const systemScorer = createPromptAlignmentScorerLLM({
|
|
337
|
-
model: 'openai/gpt-5.
|
|
337
|
+
model: 'openai/gpt-5.5',
|
|
338
338
|
options: { evaluationMode: 'system' },
|
|
339
339
|
})
|
|
340
340
|
|
|
@@ -369,7 +369,7 @@ In this example, the response fully addresses the user's prompt with all require
|
|
|
369
369
|
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
370
370
|
|
|
371
371
|
const scorer = createPromptAlignmentScorerLLM({
|
|
372
|
-
model: 'openai/gpt-5.
|
|
372
|
+
model: 'openai/gpt-5.5',
|
|
373
373
|
})
|
|
374
374
|
|
|
375
375
|
const inputMessages = [
|
|
@@ -417,7 +417,7 @@ In this example, the response addresses the core intent but misses some requirem
|
|
|
417
417
|
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
418
418
|
|
|
419
419
|
const scorer = createPromptAlignmentScorerLLM({
|
|
420
|
-
model: 'openai/gpt-5.
|
|
420
|
+
model: 'openai/gpt-5.5',
|
|
421
421
|
})
|
|
422
422
|
|
|
423
423
|
const inputMessages = [
|
|
@@ -458,7 +458,7 @@ In this example, the response fails to address the user's specific requirements.
|
|
|
458
458
|
import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
|
|
459
459
|
|
|
460
460
|
const scorer = createPromptAlignmentScorerLLM({
|
|
461
|
-
model: 'openai/gpt-5.
|
|
461
|
+
model: 'openai/gpt-5.5',
|
|
462
462
|
})
|
|
463
463
|
|
|
464
464
|
const inputMessages = [
|
|
@@ -502,7 +502,7 @@ Evaluates how well the response addresses the user's request, ignoring system in
|
|
|
502
502
|
|
|
503
503
|
```typescript
|
|
504
504
|
const scorer = createPromptAlignmentScorerLLM({
|
|
505
|
-
model: 'openai/gpt-5.
|
|
505
|
+
model: 'openai/gpt-5.5',
|
|
506
506
|
options: { evaluationMode: 'user' },
|
|
507
507
|
})
|
|
508
508
|
|
|
@@ -534,7 +534,7 @@ Evaluates compliance with system behavioral guidelines and constraints:
|
|
|
534
534
|
|
|
535
535
|
```typescript
|
|
536
536
|
const scorer = createPromptAlignmentScorerLLM({
|
|
537
|
-
model: 'openai/gpt-5.
|
|
537
|
+
model: 'openai/gpt-5.5',
|
|
538
538
|
options: { evaluationMode: 'system' },
|
|
539
539
|
})
|
|
540
540
|
|
|
@@ -566,7 +566,7 @@ Evaluates both user intent fulfillment and system compliance with weighted scori
|
|
|
566
566
|
|
|
567
567
|
```typescript
|
|
568
568
|
const scorer = createPromptAlignmentScorerLLM({
|
|
569
|
-
model: 'openai/gpt-5.
|
|
569
|
+
model: 'openai/gpt-5.5',
|
|
570
570
|
options: { evaluationMode: 'both' }, // This is the default
|
|
571
571
|
})
|
|
572
572
|
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
# Rubric scorer
|
|
2
|
+
|
|
3
|
+
**Added in:** `@mastra/evals@1.3.0`
|
|
4
|
+
|
|
5
|
+
The `createRubricScorer()` function creates an LLM-as-judge scorer that grades an agent's output against a rubric (a checklist of criteria). It returns a **binary** score: `1` only when every required criterion is satisfied, otherwise `0`. The `reason` lists each criterion's verdict so the agent knows exactly what to fix.
|
|
6
|
+
|
|
7
|
+
This scorer is designed to drop into [`isTaskComplete`](https://mastra.ai/reference/streaming/agents/stream). Because `isTaskComplete` treats `score === 1` as "task complete" and injects the `reason` back into the conversation as feedback, the agent keeps iterating until the rubric is satisfied (or `maxSteps` is reached).
|
|
8
|
+
|
|
9
|
+
## Parameters
|
|
10
|
+
|
|
11
|
+
**model** (`MastraModelConfig`): The language model used to grade the output against the rubric. A smaller, cheaper model is usually sufficient for grading.
|
|
12
|
+
|
|
13
|
+
**criteria** (`RubricCriterion[] | string`): The rubric to grade against. A string is treated as a newline-delimited checklist (each line becomes a required criterion). If omitted, the rubric is read at run time from a `rubric` value in the request context or additional context; if none resolves, the scorer is a no-op and returns `1`.
|
|
14
|
+
|
|
15
|
+
**options** (`RubricScorerOptions`): Configuration options for the scorer
|
|
16
|
+
|
|
17
|
+
## `.run()` returns
|
|
18
|
+
|
|
19
|
+
**score** (`number`): 1 when every required criterion is satisfied, otherwise 0 (multiplied by scale).
|
|
20
|
+
|
|
21
|
+
**reason** (`string`): A per-criterion explanation listing which criteria are met or unmet and why. This is the text that `isTaskComplete` injects back into the conversation as feedback.
|
|
22
|
+
|
|
23
|
+
## Usage with isTaskComplete
|
|
24
|
+
|
|
25
|
+
Define the rubric once, attach the scorer to `isTaskComplete`, and the agent self-corrects until the rubric is satisfied:
|
|
26
|
+
|
|
27
|
+
```typescript
|
|
28
|
+
import { Agent } from '@mastra/core/agent'
|
|
29
|
+
import { createRubricScorer } from '@mastra/evals/scorers/prebuilt'
|
|
30
|
+
|
|
31
|
+
const supervisor = new Agent({
|
|
32
|
+
id: 'supervisor',
|
|
33
|
+
instructions: `You coordinate research and writing using specialized agents. Delegate to research-agent for facts, then writing-agent for content.`,
|
|
34
|
+
model: 'openai/gpt-5.5',
|
|
35
|
+
agents: { researchAgent, writingAgent },
|
|
36
|
+
})
|
|
37
|
+
|
|
38
|
+
const rubricScorer = createRubricScorer({
|
|
39
|
+
model: 'openai/gpt-5-mini',
|
|
40
|
+
criteria: [
|
|
41
|
+
{ description: 'The response includes an analysis section' },
|
|
42
|
+
{ description: 'The response includes concrete recommendations' },
|
|
43
|
+
],
|
|
44
|
+
})
|
|
45
|
+
|
|
46
|
+
const stream = await supervisor.stream('Research AI in education', {
|
|
47
|
+
maxSteps: 10,
|
|
48
|
+
isTaskComplete: {
|
|
49
|
+
scorers: [rubricScorer],
|
|
50
|
+
strategy: 'all',
|
|
51
|
+
},
|
|
52
|
+
})
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
## String rubric
|
|
56
|
+
|
|
57
|
+
A newline-delimited string is parsed into criteria, with common list markers (`-`, `*`, `1.`) stripped. Every line becomes a required criterion:
|
|
58
|
+
|
|
59
|
+
```typescript
|
|
60
|
+
const rubricScorer = createRubricScorer({
|
|
61
|
+
model: 'openai/gpt-5-mini',
|
|
62
|
+
criteria: `- All tests pass in the test suite
|
|
63
|
+
- The function is named find_duplicates and accepts a single list argument`,
|
|
64
|
+
})
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Optional criteria
|
|
68
|
+
|
|
69
|
+
Mark a criterion as optional to have it graded and reported without gating completion:
|
|
70
|
+
|
|
71
|
+
```typescript
|
|
72
|
+
const rubricScorer = createRubricScorer({
|
|
73
|
+
model: 'openai/gpt-5-mini',
|
|
74
|
+
criteria: [
|
|
75
|
+
{ description: 'Includes an analysis section', required: true },
|
|
76
|
+
{ description: 'Includes citations', required: false },
|
|
77
|
+
],
|
|
78
|
+
})
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
## Dynamic rubric per run
|
|
82
|
+
|
|
83
|
+
When no `criteria` is passed to the factory, the scorer resolves a `rubric` value from the run's request context, additional context, or input. This lets a single scorer instance grade different rubrics per run without rebuilding it:
|
|
84
|
+
|
|
85
|
+
```typescript
|
|
86
|
+
const rubricScorer = createRubricScorer({
|
|
87
|
+
model: 'openai/gpt-5-mini',
|
|
88
|
+
})
|
|
89
|
+
|
|
90
|
+
await supervisor.stream('Write find_duplicates', {
|
|
91
|
+
isTaskComplete: { scorers: [rubricScorer] },
|
|
92
|
+
requestContext: {
|
|
93
|
+
rubric: '- All tests pass\n- The function is named find_duplicates',
|
|
94
|
+
},
|
|
95
|
+
})
|
|
96
|
+
```
|
|
97
|
+
|
|
98
|
+
If no rubric resolves, the scorer returns `1` and doesn't gate the loop.
|
|
99
|
+
|
|
100
|
+
## Scoring details
|
|
101
|
+
|
|
102
|
+
The scorer runs in two phases:
|
|
103
|
+
|
|
104
|
+
1. **Grade**: The judge model evaluates each criterion independently and returns a per-criterion verdict (`satisfied` / not) with reasoning.
|
|
105
|
+
2. **Score**: The result is `1` only when every required criterion is `satisfied`, otherwise `0`. If no criteria are marked required, all criteria are treated as required.
|
|
106
|
+
|
|
107
|
+
The `reason` summarizes the overall result and lists each criterion with its verdict, so a failing grade gives the agent targeted, actionable feedback rather than a generic "try again".
|
|
108
|
+
|
|
109
|
+
## Related
|
|
110
|
+
|
|
111
|
+
- [isTaskComplete on stream()](https://mastra.ai/reference/streaming/agents/stream)
|
|
112
|
+
- [Supervisor agents](https://mastra.ai/docs/agents/supervisor-agents)
|
|
113
|
+
- [createScorer](https://mastra.ai/reference/evals/create-scorer)
|
|
@@ -309,7 +309,7 @@ The LLM-based scorer provides:
|
|
|
309
309
|
```typescript
|
|
310
310
|
// Basic configuration
|
|
311
311
|
const basicLLMScorer = createLLMScorer({
|
|
312
|
-
model: 'openai/gpt-5.
|
|
312
|
+
model: 'openai/gpt-5.5',
|
|
313
313
|
availableTools: [
|
|
314
314
|
{ name: 'tool1', description: 'Description 1' },
|
|
315
315
|
{ name: 'tool2', description: 'Description 2' }
|
|
@@ -349,7 +349,7 @@ The LLM-based scorer uses AI to evaluate whether tool selections are appropriate
|
|
|
349
349
|
|
|
350
350
|
```typescript
|
|
351
351
|
const llmScorer = createToolCallAccuracyScorerLLM({
|
|
352
|
-
model: 'openai/gpt-5.
|
|
352
|
+
model: 'openai/gpt-5.5',
|
|
353
353
|
availableTools: [
|
|
354
354
|
{
|
|
355
355
|
name: 'weather-tool',
|
|
@@ -482,7 +482,7 @@ const codeScorer = createCodeScorer({
|
|
|
482
482
|
})
|
|
483
483
|
|
|
484
484
|
const llmScorer = createLLMScorer({
|
|
485
|
-
model: 'openai/gpt-5.
|
|
485
|
+
model: 'openai/gpt-5.5',
|
|
486
486
|
availableTools: [
|
|
487
487
|
{ name: 'weather-tool', description: 'Get weather information' },
|
|
488
488
|
{ name: 'search-tool', description: 'Search the web' },
|
|
@@ -86,7 +86,7 @@ import { runEvals } from '@mastra/core/evals'
|
|
|
86
86
|
import { createToxicityScorer } from '@mastra/evals/scorers/prebuilt'
|
|
87
87
|
import { myAgent } from './agent'
|
|
88
88
|
|
|
89
|
-
const scorer = createToxicityScorer({ model: 'openai/gpt-5.
|
|
89
|
+
const scorer = createToxicityScorer({ model: 'openai/gpt-5.5' })
|
|
90
90
|
|
|
91
91
|
const result = await runEvals({
|
|
92
92
|
data: [
|
|
@@ -31,7 +31,7 @@ workflow_run
|
|
|
31
31
|
|
|
32
32
|
### Fallback extraction
|
|
33
33
|
|
|
34
|
-
When storage
|
|
34
|
+
When storage isn't available, the pipeline falls back to:
|
|
35
35
|
|
|
36
36
|
- **Agents:** `extractTrajectory()` — Extracts `ToolCallStep` entries from `toolInvocations` in the agent's message output. Produces a flat list of tool calls.
|
|
37
37
|
- **Workflows:** `extractWorkflowTrajectory()` — Extracts `WorkflowStepStep` entries from `stepResults`. Produces a flat list of workflow steps.
|
|
@@ -176,7 +176,7 @@ In this example, the parent workflow requires strict ordering of its steps, but
|
|
|
176
176
|
### Use the LLM-based scorer when:
|
|
177
177
|
|
|
178
178
|
- You need **semantic understanding** of whether steps were appropriate
|
|
179
|
-
- The optimal trajectory
|
|
179
|
+
- The optimal trajectory **isn't predetermined** (evaluate based on task requirements)
|
|
180
180
|
- You want to detect **unnecessary, redundant, or missing** steps
|
|
181
181
|
- You need **explanations** for scoring decisions
|
|
182
182
|
- You are evaluating **production agent behavior**
|
|
@@ -360,7 +360,7 @@ console.log(result.scores.trajectory['trajectory-accuracy'])
|
|
|
360
360
|
|
|
361
361
|
### Comparing step data
|
|
362
362
|
|
|
363
|
-
Validates
|
|
363
|
+
Validates the step names and step-specific data. For tool calls, this compares `toolArgs` and `toolResult`. For workflow steps, this compares `output`.
|
|
364
364
|
|
|
365
365
|
```typescript
|
|
366
366
|
const scorer = createTrajectoryAccuracyScorerCode({
|
|
@@ -9,5 +9,6 @@ export * from './context-relevance/index.js';
|
|
|
9
9
|
export * from './context-precision/index.js';
|
|
10
10
|
export * from './noise-sensitivity/index.js';
|
|
11
11
|
export * from './prompt-alignment/index.js';
|
|
12
|
+
export * from './rubric/index.js';
|
|
12
13
|
export * from './trajectory/index.js';
|
|
13
14
|
//# sourceMappingURL=index.d.ts.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC;AACnC,cAAc,cAAc,CAAC"}
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC;AACnC,cAAc,UAAU,CAAC;AACzB,cAAc,cAAc,CAAC"}
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
import type { MastraModelConfig } from '@mastra/core/llm';
|
|
2
|
+
import type { ScorerRunInputForLLMJudge, ScorerRunOutputForLLMJudge } from '../../utils.js';
|
|
3
|
+
/**
|
|
4
|
+
* A single rubric criterion the agent's output is graded against.
|
|
5
|
+
*/
|
|
6
|
+
export interface RubricCriterion {
|
|
7
|
+
/** Optional stable identifier for the criterion. */
|
|
8
|
+
id?: string;
|
|
9
|
+
/** What the output must satisfy, e.g. "All tests pass" or "Includes a recommendations section". */
|
|
10
|
+
description: string;
|
|
11
|
+
/**
|
|
12
|
+
* Whether this criterion must be satisfied for the task to be considered complete.
|
|
13
|
+
* Defaults to `true`. Optional criteria are graded and reported but do not gate completion.
|
|
14
|
+
*/
|
|
15
|
+
required?: boolean;
|
|
16
|
+
}
|
|
17
|
+
/**
|
|
18
|
+
* Rubric input accepted by the scorer factory and by the dynamic `rubric` context value.
|
|
19
|
+
* A string is treated as a newline-delimited checklist; leading list markers ("-", "*", "1.")
|
|
20
|
+
* are stripped. Every parsed line becomes a required criterion.
|
|
21
|
+
*/
|
|
22
|
+
export type RubricInput = RubricCriterion[] | string;
|
|
23
|
+
export interface RubricScorerOptions {
|
|
24
|
+
/** Scale applied to the final score. Defaults to 1. Only relevant for standalone evals — `isTaskComplete` gates on `=== 1`. */
|
|
25
|
+
scale?: number;
|
|
26
|
+
}
|
|
27
|
+
/**
|
|
28
|
+
* Creates an LLM-as-judge scorer that grades an agent's output against a rubric and returns a
|
|
29
|
+
* **binary** score: `1` only when every required criterion is satisfied, otherwise `0`. The
|
|
30
|
+
* `generateReason` output lists each unmet criterion with the judge's reasoning.
|
|
31
|
+
*
|
|
32
|
+
* It is designed to drop into `isTaskComplete`, which treats `score === 1` as "task complete" and
|
|
33
|
+
* injects the reason back into the conversation as feedback, so the agent iterates until the rubric
|
|
34
|
+
* is satisfied (or `maxSteps` is reached):
|
|
35
|
+
*
|
|
36
|
+
* @example
|
|
37
|
+
* ```typescript
|
|
38
|
+
* import { createRubricScorer } from '@mastra/evals/scorers/prebuilt';
|
|
39
|
+
*
|
|
40
|
+
* const rubricScorer = createRubricScorer({
|
|
41
|
+
* model: 'openai/gpt-5-mini',
|
|
42
|
+
* criteria: [
|
|
43
|
+
* { description: 'The response includes an analysis section' },
|
|
44
|
+
* { description: 'The response includes concrete recommendations' },
|
|
45
|
+
* ],
|
|
46
|
+
* });
|
|
47
|
+
*
|
|
48
|
+
* await supervisor.stream('Research AI in education', {
|
|
49
|
+
* maxSteps: 10,
|
|
50
|
+
* isTaskComplete: { scorers: [rubricScorer], strategy: 'all' },
|
|
51
|
+
* });
|
|
52
|
+
* ```
|
|
53
|
+
*
|
|
54
|
+
* The rubric can also be supplied dynamically per run via request/additional context under the
|
|
55
|
+
* `rubric` key (string checklist or `RubricCriterion[]`). If no rubric resolves, the scorer is a
|
|
56
|
+
* no-op and returns `1` (so it does not gate the loop), mirroring "if the rubric is absent, do nothing".
|
|
57
|
+
*/
|
|
58
|
+
export declare function createRubricScorer({ model, criteria, options, }: {
|
|
59
|
+
model: MastraModelConfig;
|
|
60
|
+
criteria?: RubricInput;
|
|
61
|
+
options?: RubricScorerOptions;
|
|
62
|
+
}): import("@mastra/core/evals").MastraScorer<string, ScorerRunInputForLLMJudge, ScorerRunOutputForLLMJudge, Record<"analyzeStepResult", {
|
|
63
|
+
criteria: {
|
|
64
|
+
criterion: string;
|
|
65
|
+
satisfied: boolean;
|
|
66
|
+
required: boolean;
|
|
67
|
+
reasoning: string;
|
|
68
|
+
}[];
|
|
69
|
+
overallAssessment: string;
|
|
70
|
+
}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
|
|
71
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/rubric/index.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAG1D,OAAO,KAAK,EAAE,yBAAyB,EAAE,0BAA0B,EAAE,MAAM,aAAa,CAAC;AAIzF;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,oDAAoD;IACpD,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,mGAAmG;IACnG,WAAW,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;;;GAIG;AACH,MAAM,MAAM,WAAW,GAAG,eAAe,EAAE,GAAG,MAAM,CAAC;AAErD,MAAM,WAAW,mBAAmB;IAClC,+HAA+H;IAC/H,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AA0GD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,wBAAgB,kBAAkB,CAAC,EACjC,KAAK,EACL,QAAQ,EACR,OAAO,GACR,EAAE;IACD,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,CAAC,EAAE,WAAW,CAAC;IACvB,OAAO,CAAC,EAAE,mBAAmB,CAAC;CAC/B;;;;;;;;6FA2DA"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
export declare const RUBRIC_INSTRUCTIONS = "You are an exacting grader. Your job is to judge whether an agent's output satisfies each criterion in a rubric.\n\nA rubric is a checklist of criteria. For each criterion you must decide, strictly and independently, whether the output satisfies it.\n\nGrading guidelines:\n- Judge each criterion on its own merits. Do not let one criterion's verdict influence another.\n- A criterion is \"satisfied\" only when the output clearly and fully meets it. When in doubt, mark it as NOT satisfied.\n- Base your judgement on evidence in the output (and the original task for context). Do not assume facts that are not present.\n- Be concise but specific in your reasoning: say what is present or missing.\n- Do not reward effort, intent, or partial progress. Only the actual output counts.";
|
|
2
|
+
export interface RubricAnalysisCriterion {
|
|
3
|
+
/** The criterion text, exactly as provided in the rubric. */
|
|
4
|
+
criterion: string;
|
|
5
|
+
/** Whether the output satisfies this criterion. */
|
|
6
|
+
satisfied: boolean;
|
|
7
|
+
/** Whether this criterion is required for the task to be considered complete. */
|
|
8
|
+
required: boolean;
|
|
9
|
+
/** Short explanation of why the criterion is or is not satisfied. */
|
|
10
|
+
reasoning: string;
|
|
11
|
+
}
|
|
12
|
+
export interface RubricAnalysisResult {
|
|
13
|
+
criteria: RubricAnalysisCriterion[];
|
|
14
|
+
overallAssessment: string;
|
|
15
|
+
}
|
|
16
|
+
/**
|
|
17
|
+
* A single rubric criterion as provided to the prompt builder.
|
|
18
|
+
*/
|
|
19
|
+
export interface RubricCriterionInput {
|
|
20
|
+
criterion: string;
|
|
21
|
+
required: boolean;
|
|
22
|
+
}
|
|
23
|
+
export declare function createAnalyzePrompt({ originalTask, output, criteria, }: {
|
|
24
|
+
originalTask: string;
|
|
25
|
+
output: string;
|
|
26
|
+
criteria: RubricCriterionInput[];
|
|
27
|
+
}): string;
|
|
28
|
+
/**
|
|
29
|
+
* Format a human-readable, per-criterion explanation of the rubric result. This text is what
|
|
30
|
+
* `isTaskComplete` injects back into the conversation as feedback, so it must clearly tell the
|
|
31
|
+
* agent which criteria are unmet and why.
|
|
32
|
+
*/
|
|
33
|
+
export declare function formatRubricReason({ score, analysis }: {
|
|
34
|
+
score: number;
|
|
35
|
+
analysis: RubricAnalysisResult;
|
|
36
|
+
}): string;
|
|
37
|
+
//# sourceMappingURL=prompts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/rubric/prompts.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,mBAAmB,mxBASoD,CAAC;AAErF,MAAM,WAAW,uBAAuB;IACtC,6DAA6D;IAC7D,SAAS,EAAE,MAAM,CAAC;IAClB,mDAAmD;IACnD,SAAS,EAAE,OAAO,CAAC;IACnB,iFAAiF;IACjF,QAAQ,EAAE,OAAO,CAAC;IAClB,qEAAqE;IACrE,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,oBAAoB;IACnC,QAAQ,EAAE,uBAAuB,EAAE,CAAC;IACpC,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,wBAAgB,mBAAmB,CAAC,EAClC,YAAY,EACZ,MAAM,EACN,QAAQ,GACT,EAAE;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,oBAAoB,EAAE,CAAC;CAClC,GAAG,MAAM,CA8BT;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,oBAAoB,CAAA;CAAE,GAAG,MAAM,CAoBjH"}
|