@mastra/evals 1.2.4-alpha.0 → 1.3.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. package/CHANGELOG.md +41 -0
  2. package/dist/{chunk-XOXUFZEG.js → chunk-BE5F2OUQ.js} +5 -4
  3. package/dist/chunk-BE5F2OUQ.js.map +1 -0
  4. package/dist/{chunk-BULMCHKJ.cjs → chunk-UNQXHPOD.cjs} +5 -4
  5. package/dist/{chunk-XOXUFZEG.js.map → chunk-UNQXHPOD.cjs.map} +1 -1
  6. package/dist/docs/SKILL.md +2 -1
  7. package/dist/docs/assets/SOURCE_MAP.json +1 -1
  8. package/dist/docs/references/docs-evals-overview.md +2 -2
  9. package/dist/docs/references/reference-evals-answer-relevancy.md +1 -1
  10. package/dist/docs/references/reference-evals-answer-similarity.md +1 -1
  11. package/dist/docs/references/reference-evals-bias.md +1 -1
  12. package/dist/docs/references/reference-evals-context-precision.md +3 -3
  13. package/dist/docs/references/reference-evals-context-relevance.md +11 -11
  14. package/dist/docs/references/reference-evals-faithfulness.md +1 -1
  15. package/dist/docs/references/reference-evals-hallucination.md +5 -5
  16. package/dist/docs/references/reference-evals-noise-sensitivity.md +11 -11
  17. package/dist/docs/references/reference-evals-prompt-alignment.md +15 -15
  18. package/dist/docs/references/reference-evals-rubric.md +113 -0
  19. package/dist/docs/references/reference-evals-tool-call-accuracy.md +3 -3
  20. package/dist/docs/references/reference-evals-toxicity.md +1 -1
  21. package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
  22. package/dist/scorers/llm/index.d.ts +1 -0
  23. package/dist/scorers/llm/index.d.ts.map +1 -1
  24. package/dist/scorers/llm/rubric/index.d.ts +71 -0
  25. package/dist/scorers/llm/rubric/index.d.ts.map +1 -0
  26. package/dist/scorers/llm/rubric/prompts.d.ts +37 -0
  27. package/dist/scorers/llm/rubric/prompts.d.ts.map +1 -0
  28. package/dist/scorers/prebuilt/index.cjs +276 -78
  29. package/dist/scorers/prebuilt/index.cjs.map +1 -1
  30. package/dist/scorers/prebuilt/index.js +203 -6
  31. package/dist/scorers/prebuilt/index.js.map +1 -1
  32. package/dist/scorers/utils.cjs +25 -25
  33. package/dist/scorers/utils.d.ts.map +1 -1
  34. package/dist/scorers/utils.js +1 -1
  35. package/package.json +9 -8
  36. package/dist/chunk-BULMCHKJ.cjs.map +0 -1
@@ -61,7 +61,7 @@ describe('Agent Noise Resistance Tests', () => {
61
61
 
62
62
  // Step 4: Evaluate using noise sensitivity scorer
63
63
  const scorer = createNoiseSensitivityScorerLLM({
64
- model: 'openai/gpt-5.4',
64
+ model: 'openai/gpt-5.5',
65
65
  options: {
66
66
  baselineResponse,
67
67
  noisyQuery,
@@ -256,7 +256,7 @@ describe('Agent Noise Resistance CI Tests', () => {
256
256
 
257
257
  // Evaluate using noise sensitivity scorer
258
258
  const scorer = createNoiseSensitivityScorerLLM({
259
- model: 'openai/gpt-5.4',
259
+ model: 'openai/gpt-5.5',
260
260
  options: {
261
261
  baselineResponse: testCase.baselineResponse,
262
262
  noisyQuery: testCase.noisyQuery,
@@ -291,7 +291,7 @@ This example shows an agent that completely resists misinformation in a test sce
291
291
  import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
292
292
 
293
293
  const scorer = createNoiseSensitivityScorerLLM({
294
- model: 'openai/gpt-5.4',
294
+ model: 'openai/gpt-5.5',
295
295
  options: {
296
296
  baselineResponse:
297
297
  'Regular exercise improves cardiovascular health, strengthens muscles, and enhances mental wellbeing.',
@@ -337,7 +337,7 @@ This example shows an agent partially distracted by irrelevant requests:
337
337
  import { createNoiseSensitivityScorerLLM } from '@mastra/evals/scorers/prebuilt'
338
338
 
339
339
  const scorer = createNoiseSensitivityScorerLLM({
340
- model: 'openai/gpt-5.4',
340
+ model: 'openai/gpt-5.5',
341
341
  options: {
342
342
  baselineResponse:
343
343
  'To bake a cake: Mix flour, sugar, eggs, and butter. Bake at 350°F for 30 minutes.',
@@ -382,7 +382,7 @@ This example shows an agent that incorporates misinformation:
382
382
  import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
383
383
 
384
384
  const scorer = createNoiseSensitivityScorerLLM({
385
- model: 'openai/gpt-5.4',
385
+ model: 'openai/gpt-5.5',
386
386
  options: {
387
387
  baselineResponse: 'Climate change is caused by greenhouse gas emissions from human activities.',
388
388
  noisyQuery:
@@ -428,7 +428,7 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
428
428
 
429
429
  // Lenient scoring - more forgiving of minor issues
430
430
  const lenientScorer = createNoiseSensitivityScorerLLM({
431
- model: 'openai/gpt-5.4',
431
+ model: 'openai/gpt-5.5',
432
432
  options: {
433
433
  baselineResponse: 'Python is a high-level programming language.',
434
434
  noisyQuery: 'What is Python? Also, snakes are dangerous!',
@@ -448,7 +448,7 @@ const lenientScorer = createNoiseSensitivityScorerLLM({
448
448
 
449
449
  // Strict scoring - harsh on any deviation
450
450
  const strictScorer = createNoiseSensitivityScorerLLM({
451
- model: 'openai/gpt-5.4',
451
+ model: 'openai/gpt-5.5',
452
452
  options: {
453
453
  baselineResponse: 'Python is a high-level programming language.',
454
454
  noisyQuery: 'What is Python? Also, snakes are dangerous!',
@@ -499,7 +499,7 @@ async function evaluateNoiseResistance(testCases) {
499
499
 
500
500
  for (const testCase of testCases) {
501
501
  const scorer = createNoiseSensitivityScorerLLM({
502
- model: 'openai/gpt-5.4',
502
+ model: 'openai/gpt-5.5',
503
503
  options: {
504
504
  baselineResponse: testCase.baseline,
505
505
  noisyQuery: testCase.noisyQuery,
@@ -546,9 +546,9 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
546
546
 
547
547
  async function compareModelRobustness() {
548
548
  const models = [
549
- { name: 'GPT-5.4', model: 'openai/gpt-5.4' },
549
+ { name: 'GPT-5.5', model: 'openai/gpt-5.5' },
550
550
  { name: 'GPT-5-mini', model: 'openai/gpt-5-mini' },
551
- { name: 'Claude', model: 'anthropic/claude-opus-4-6' },
551
+ { name: 'Claude', model: 'anthropic/claude-opus-4-7' },
552
552
  ]
553
553
 
554
554
  const testScenario = {
@@ -598,7 +598,7 @@ Include noise sensitivity tests in your security test suite to validate prompt i
598
598
  import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
599
599
 
600
600
  const scorer = createNoiseSensitivityScorerLLM({
601
- model: 'openai/gpt-5.4',
601
+ model: 'openai/gpt-5.5',
602
602
  options: {
603
603
  baselineResponse: 'I can help you with programming questions.',
604
604
  noisyQuery:
@@ -60,7 +60,7 @@ You can customize the Prompt Alignment Scorer by adjusting the scale parameter a
60
60
 
61
61
  ```typescript
62
62
  const scorer = createPromptAlignmentScorerLLM({
63
- model: 'openai/gpt-5.4',
63
+ model: 'openai/gpt-5.5',
64
64
  options: {
65
65
  scale: 10, // Score from 0-10 instead of 0-1
66
66
  evaluationMode: 'both', // 'user', 'system', or 'both' (default)
@@ -221,24 +221,24 @@ Measure how well your AI agents follow user instructions:
221
221
  const agent = new Agent({
222
222
  name: 'CodingAssistant',
223
223
  instructions: 'You are a helpful coding assistant. Always provide working code examples.',
224
- model: 'openai/gpt-5.4',
224
+ model: 'openai/gpt-5.5',
225
225
  })
226
226
 
227
227
  // Evaluate comprehensive alignment (default)
228
228
  const scorer = createPromptAlignmentScorerLLM({
229
- model: 'openai/gpt-5.4',
229
+ model: 'openai/gpt-5.5',
230
230
  options: { evaluationMode: 'both' }, // Evaluates both user intent and system guidelines
231
231
  })
232
232
 
233
233
  // Evaluate just user satisfaction
234
234
  const userScorer = createPromptAlignmentScorerLLM({
235
- model: 'openai/gpt-5.4',
235
+ model: 'openai/gpt-5.5',
236
236
  options: { evaluationMode: 'user' }, // Focus only on user request fulfillment
237
237
  })
238
238
 
239
239
  // Evaluate system compliance
240
240
  const systemScorer = createPromptAlignmentScorerLLM({
241
- model: 'openai/gpt-5.4',
241
+ model: 'openai/gpt-5.5',
242
242
  options: { evaluationMode: 'system' }, // Check adherence to system instructions
243
243
  })
244
244
 
@@ -290,7 +290,7 @@ for (const agent of agents) {
290
290
  import { createPromptAlignmentScorerLLM } from '@mastra/evals'
291
291
 
292
292
  const scorer = createPromptAlignmentScorerLLM({
293
- model: 'openai/gpt-5.4',
293
+ model: 'openai/gpt-5.5',
294
294
  })
295
295
 
296
296
  // Evaluate a code generation task
@@ -319,7 +319,7 @@ const result = await scorer.run({
319
319
  ```typescript
320
320
  // Configure scale and evaluation mode
321
321
  const scorer = createPromptAlignmentScorerLLM({
322
- model: 'openai/gpt-5.4',
322
+ model: 'openai/gpt-5.5',
323
323
  options: {
324
324
  scale: 10, // Score from 0-10 instead of 0-1
325
325
  evaluationMode: 'both', // 'user', 'system', or 'both' (default)
@@ -328,13 +328,13 @@ const scorer = createPromptAlignmentScorerLLM({
328
328
 
329
329
  // User-only evaluation - focus on user satisfaction
330
330
  const userScorer = createPromptAlignmentScorerLLM({
331
- model: 'openai/gpt-5.4',
331
+ model: 'openai/gpt-5.5',
332
332
  options: { evaluationMode: 'user' },
333
333
  })
334
334
 
335
335
  // System-only evaluation - focus on compliance
336
336
  const systemScorer = createPromptAlignmentScorerLLM({
337
- model: 'openai/gpt-5.4',
337
+ model: 'openai/gpt-5.5',
338
338
  options: { evaluationMode: 'system' },
339
339
  })
340
340
 
@@ -369,7 +369,7 @@ In this example, the response fully addresses the user's prompt with all require
369
369
  import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
370
370
 
371
371
  const scorer = createPromptAlignmentScorerLLM({
372
- model: 'openai/gpt-5.4',
372
+ model: 'openai/gpt-5.5',
373
373
  })
374
374
 
375
375
  const inputMessages = [
@@ -417,7 +417,7 @@ In this example, the response addresses the core intent but misses some requirem
417
417
  import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
418
418
 
419
419
  const scorer = createPromptAlignmentScorerLLM({
420
- model: 'openai/gpt-5.4',
420
+ model: 'openai/gpt-5.5',
421
421
  })
422
422
 
423
423
  const inputMessages = [
@@ -458,7 +458,7 @@ In this example, the response fails to address the user's specific requirements.
458
458
  import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
459
459
 
460
460
  const scorer = createPromptAlignmentScorerLLM({
461
- model: 'openai/gpt-5.4',
461
+ model: 'openai/gpt-5.5',
462
462
  })
463
463
 
464
464
  const inputMessages = [
@@ -502,7 +502,7 @@ Evaluates how well the response addresses the user's request, ignoring system in
502
502
 
503
503
  ```typescript
504
504
  const scorer = createPromptAlignmentScorerLLM({
505
- model: 'openai/gpt-5.4',
505
+ model: 'openai/gpt-5.5',
506
506
  options: { evaluationMode: 'user' },
507
507
  })
508
508
 
@@ -534,7 +534,7 @@ Evaluates compliance with system behavioral guidelines and constraints:
534
534
 
535
535
  ```typescript
536
536
  const scorer = createPromptAlignmentScorerLLM({
537
- model: 'openai/gpt-5.4',
537
+ model: 'openai/gpt-5.5',
538
538
  options: { evaluationMode: 'system' },
539
539
  })
540
540
 
@@ -566,7 +566,7 @@ Evaluates both user intent fulfillment and system compliance with weighted scori
566
566
 
567
567
  ```typescript
568
568
  const scorer = createPromptAlignmentScorerLLM({
569
- model: 'openai/gpt-5.4',
569
+ model: 'openai/gpt-5.5',
570
570
  options: { evaluationMode: 'both' }, // This is the default
571
571
  })
572
572
 
@@ -0,0 +1,113 @@
1
+ # Rubric scorer
2
+
3
+ **Added in:** `@mastra/evals@1.3.0`
4
+
5
+ The `createRubricScorer()` function creates an LLM-as-judge scorer that grades an agent's output against a rubric (a checklist of criteria). It returns a **binary** score: `1` only when every required criterion is satisfied, otherwise `0`. The `reason` lists each criterion's verdict so the agent knows exactly what to fix.
6
+
7
+ This scorer is designed to drop into [`isTaskComplete`](https://mastra.ai/reference/streaming/agents/stream). Because `isTaskComplete` treats `score === 1` as "task complete" and injects the `reason` back into the conversation as feedback, the agent keeps iterating until the rubric is satisfied (or `maxSteps` is reached).
8
+
9
+ ## Parameters
10
+
11
+ **model** (`MastraModelConfig`): The language model used to grade the output against the rubric. A smaller, cheaper model is usually sufficient for grading.
12
+
13
+ **criteria** (`RubricCriterion[] | string`): The rubric to grade against. A string is treated as a newline-delimited checklist (each line becomes a required criterion). If omitted, the rubric is read at run time from a `rubric` value on request/additional context; if none resolves, the scorer is a no-op and returns `1`.
14
+
15
+ **options** (`RubricScorerOptions`): Configuration options for the scorer. Supports `scale` (`number`, defaults to `1`), which is applied to the final score.
16
+
17
+ ## `.run()` returns
18
+
19
+ **score** (`number`): 1 when every required criterion is satisfied, otherwise 0 (multiplied by scale).
20
+
21
+ **reason** (`string`): A per-criterion explanation listing which criteria are met or unmet and why. This is the text that isTaskComplete injects back into the conversation as feedback.
22
+
23
+ ## Usage with isTaskComplete
24
+
25
+ Define the rubric once, attach the scorer to `isTaskComplete`, and the agent self-corrects until the rubric is satisfied:
26
+
27
+ ```typescript
28
+ import { Agent } from '@mastra/core/agent'
29
+ import { createRubricScorer } from '@mastra/evals/scorers/prebuilt'
30
+
31
+ const supervisor = new Agent({
32
+ id: 'supervisor',
33
+ instructions: `You coordinate research and writing using specialized agents. Delegate to research-agent for facts, then writing-agent for content.`,
34
+ model: 'openai/gpt-5.5',
35
+ agents: { researchAgent, writingAgent },
36
+ })
37
+
38
+ const rubricScorer = createRubricScorer({
39
+ model: 'openai/gpt-5-mini',
40
+ criteria: [
41
+ { description: 'The response includes an analysis section' },
42
+ { description: 'The response includes concrete recommendations' },
43
+ ],
44
+ })
45
+
46
+ const stream = await supervisor.stream('Research AI in education', {
47
+ maxSteps: 10,
48
+ isTaskComplete: {
49
+ scorers: [rubricScorer],
50
+ strategy: 'all',
51
+ },
52
+ })
53
+ ```
54
+
55
+ ## String rubric
56
+
57
+ A newline-delimited string is parsed into criteria, with common list markers (`-`, `*`, `1.`) stripped. Every line becomes a required criterion:
58
+
59
+ ```typescript
60
+ const rubricScorer = createRubricScorer({
61
+ model: 'openai/gpt-5-mini',
62
+ criteria: `- All tests pass in the test suite
63
+ - The function is named find_duplicates and accepts a single list argument`,
64
+ })
65
+ ```
66
+
67
+ ## Optional criteria
68
+
69
+ Mark a criterion as optional to have it graded and reported without gating completion:
70
+
71
+ ```typescript
72
+ const rubricScorer = createRubricScorer({
73
+ model: 'openai/gpt-5-mini',
74
+ criteria: [
75
+ { description: 'Includes an analysis section', required: true },
76
+ { description: 'Includes citations', required: false },
77
+ ],
78
+ })
79
+ ```
80
+
81
+ ## Dynamic rubric per run
82
+
83
+ When no `criteria` is passed to the factory, the scorer resolves a `rubric` value from the run's request context, additional context, or input. This lets a single scorer instance grade different rubrics per run without rebuilding it:
84
+
85
+ ```typescript
86
+ const rubricScorer = createRubricScorer({
87
+ model: 'openai/gpt-5-mini',
88
+ })
89
+
90
+ await supervisor.stream('Write find_duplicates', {
91
+ isTaskComplete: { scorers: [rubricScorer] },
92
+ requestContext: {
93
+ rubric: '- All tests pass\n- The function is named find_duplicates',
94
+ },
95
+ })
96
+ ```
97
+
98
+ If no rubric resolves, the scorer returns `1` and doesn't gate the loop.
99
+
100
+ ## Scoring details
101
+
102
+ The scorer runs in two phases:
103
+
104
+ 1. **Grade**: The judge model evaluates each criterion independently and returns a per-criterion verdict (`satisfied` or not satisfied) with reasoning.
105
+ 2. **Score**: The result is `1` only when every required criterion is `satisfied`, otherwise `0`. If no criteria are marked required, all criteria are treated as required.
106
+
107
+ The `reason` summarizes the overall result and lists each criterion with its verdict, so a failing grade gives the agent targeted, actionable feedback rather than a generic "try again".
108
+
109
+ ## Related
110
+
111
+ - [isTaskComplete on stream()](https://mastra.ai/reference/streaming/agents/stream)
112
+ - [Supervisor agents](https://mastra.ai/docs/agents/supervisor-agents)
113
+ - [createScorer](https://mastra.ai/reference/evals/create-scorer)
@@ -309,7 +309,7 @@ The LLM-based scorer provides:
309
309
  ```typescript
310
310
  // Basic configuration
311
311
  const basicLLMScorer = createLLMScorer({
312
- model: 'openai/gpt-5.4',
312
+ model: 'openai/gpt-5.5',
313
313
  availableTools: [
314
314
  { name: 'tool1', description: 'Description 1' },
315
315
  { name: 'tool2', description: 'Description 2' }
@@ -349,7 +349,7 @@ The LLM-based scorer uses AI to evaluate whether tool selections are appropriate
349
349
 
350
350
  ```typescript
351
351
  const llmScorer = createToolCallAccuracyScorerLLM({
352
- model: 'openai/gpt-5.4',
352
+ model: 'openai/gpt-5.5',
353
353
  availableTools: [
354
354
  {
355
355
  name: 'weather-tool',
@@ -482,7 +482,7 @@ const codeScorer = createCodeScorer({
482
482
  })
483
483
 
484
484
  const llmScorer = createLLMScorer({
485
- model: 'openai/gpt-5.4',
485
+ model: 'openai/gpt-5.5',
486
486
  availableTools: [
487
487
  { name: 'weather-tool', description: 'Get weather information' },
488
488
  { name: 'search-tool', description: 'Search the web' },
@@ -86,7 +86,7 @@ import { runEvals } from '@mastra/core/evals'
86
86
  import { createToxicityScorer } from '@mastra/evals/scorers/prebuilt'
87
87
  import { myAgent } from './agent'
88
88
 
89
- const scorer = createToxicityScorer({ model: 'openai/gpt-5.4' })
89
+ const scorer = createToxicityScorer({ model: 'openai/gpt-5.5' })
90
90
 
91
91
  const result = await runEvals({
92
92
  data: [
@@ -31,7 +31,7 @@ workflow_run
31
31
 
32
32
  ### Fallback extraction
33
33
 
34
- When storage is not available, the pipeline falls back to:
34
+ When storage isn't available, the pipeline falls back to:
35
35
 
36
36
  - **Agents:** `extractTrajectory()` — Extracts `ToolCallStep` entries from `toolInvocations` in the agent's message output. Produces a flat list of tool calls.
37
37
  - **Workflows:** `extractWorkflowTrajectory()` — Extracts `WorkflowStepStep` entries from `stepResults`. Produces a flat list of workflow steps.
@@ -176,7 +176,7 @@ In this example, the parent workflow requires strict ordering of its steps, but
176
176
  ### Use the LLM-based scorer when:
177
177
 
178
178
  - You need **semantic understanding** of whether steps were appropriate
179
- - The optimal trajectory is **not predetermined** (evaluate based on task requirements)
179
+ - The optimal trajectory **isn't predetermined** (evaluate based on task requirements)
180
180
  - You want to detect **unnecessary, redundant, or missing** steps
181
181
  - You need **explanations** for scoring decisions
182
182
  - You are evaluating **production agent behavior**
@@ -360,7 +360,7 @@ console.log(result.scores.trajectory['trajectory-accuracy'])
360
360
 
361
361
  ### Comparing step data
362
362
 
363
- Validates not just the step names but also step-specific data. For tool calls, this compares `toolArgs` and `toolResult`. For workflow steps, this compares `output`.
363
+ Validates the step names and step-specific data. For tool calls, this compares `toolArgs` and `toolResult`. For workflow steps, this compares `output`.
364
364
 
365
365
  ```typescript
366
366
  const scorer = createTrajectoryAccuracyScorerCode({
@@ -9,5 +9,6 @@ export * from './context-relevance/index.js';
9
9
  export * from './context-precision/index.js';
10
10
  export * from './noise-sensitivity/index.js';
11
11
  export * from './prompt-alignment/index.js';
12
+ export * from './rubric/index.js';
12
13
  export * from './trajectory/index.js';
13
14
  //# sourceMappingURL=index.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC;AACnC,cAAc,cAAc,CAAC"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC;AACnC,cAAc,UAAU,CAAC;AACzB,cAAc,cAAc,CAAC"}
@@ -0,0 +1,71 @@
1
+ import type { MastraModelConfig } from '@mastra/core/llm';
2
+ import type { ScorerRunInputForLLMJudge, ScorerRunOutputForLLMJudge } from '../../utils.js';
3
+ /**
4
+ * A single rubric criterion the agent's output is graded against.
5
+ */
6
+ export interface RubricCriterion {
7
+ /** Optional stable identifier for the criterion. */
8
+ id?: string;
9
+ /** What the output must satisfy, e.g. "All tests pass" or "Includes a recommendations section". */
10
+ description: string;
11
+ /**
12
+ * Whether this criterion must be satisfied for the task to be considered complete.
13
+ * Defaults to `true`. Optional criteria are graded and reported but do not gate completion.
14
+ */
15
+ required?: boolean;
16
+ }
17
+ /**
18
+ * Rubric input accepted by the scorer factory and by the dynamic `rubric` context value.
19
+ * A string is treated as a newline-delimited checklist; leading list markers ("-", "*", "1.")
20
+ * are stripped. Every parsed line becomes a required criterion.
21
+ */
22
+ export type RubricInput = RubricCriterion[] | string;
23
+ export interface RubricScorerOptions {
24
+ /** Scale applied to the final score. Defaults to 1. Only relevant for standalone evals — `isTaskComplete` gates on `=== 1`. */
25
+ scale?: number;
26
+ }
27
+ /**
28
+ * Creates an LLM-as-judge scorer that grades an agent's output against a rubric and returns a
29
+ * **binary** score: `1` only when every required criterion is satisfied, otherwise `0`. The
30
+ * `generateReason` output lists each unmet criterion with the judge's reasoning.
31
+ *
32
+ * It is designed to drop into `isTaskComplete`, which treats `score === 1` as "task complete" and
33
+ * injects the reason back into the conversation as feedback, so the agent iterates until the rubric
34
+ * is satisfied (or `maxSteps` is reached):
35
+ *
36
+ * @example
37
+ * ```typescript
38
+ * import { createRubricScorer } from '@mastra/evals/scorers/prebuilt';
39
+ *
40
+ * const rubricScorer = createRubricScorer({
41
+ * model: 'openai/gpt-5-mini',
42
+ * criteria: [
43
+ * { description: 'The response includes an analysis section' },
44
+ * { description: 'The response includes concrete recommendations' },
45
+ * ],
46
+ * });
47
+ *
48
+ * await supervisor.stream('Research AI in education', {
49
+ * maxSteps: 10,
50
+ * isTaskComplete: { scorers: [rubricScorer], strategy: 'all' },
51
+ * });
52
+ * ```
53
+ *
54
+ * The rubric can also be supplied dynamically per run via request/additional context under the
55
+ * `rubric` key (string checklist or `RubricCriterion[]`). If no rubric resolves, the scorer is a
56
+ * no-op and returns `1` (so it does not gate the loop), mirroring "if the rubric is absent, do nothing".
57
+ */
58
+ export declare function createRubricScorer({ model, criteria, options, }: {
59
+ model: MastraModelConfig;
60
+ criteria?: RubricInput;
61
+ options?: RubricScorerOptions;
62
+ }): import("@mastra/core/evals").MastraScorer<string, ScorerRunInputForLLMJudge, ScorerRunOutputForLLMJudge, Record<"analyzeStepResult", {
63
+ criteria: {
64
+ criterion: string;
65
+ satisfied: boolean;
66
+ required: boolean;
67
+ reasoning: string;
68
+ }[];
69
+ overallAssessment: string;
70
+ }> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
71
+ //# sourceMappingURL=index.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/rubric/index.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAG1D,OAAO,KAAK,EAAE,yBAAyB,EAAE,0BAA0B,EAAE,MAAM,aAAa,CAAC;AAIzF;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,oDAAoD;IACpD,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,mGAAmG;IACnG,WAAW,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;;;GAIG;AACH,MAAM,MAAM,WAAW,GAAG,eAAe,EAAE,GAAG,MAAM,CAAC;AAErD,MAAM,WAAW,mBAAmB;IAClC,+HAA+H;IAC/H,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AA0GD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,wBAAgB,kBAAkB,CAAC,EACjC,KAAK,EACL,QAAQ,EACR,OAAO,GACR,EAAE;IACD,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,CAAC,EAAE,WAAW,CAAC;IACvB,OAAO,CAAC,EAAE,mBAAmB,CAAC;CAC/B;;;;;;;;6FA2DA"}
@@ -0,0 +1,37 @@
1
+ export declare const RUBRIC_INSTRUCTIONS = "You are an exacting grader. Your job is to judge whether an agent's output satisfies each criterion in a rubric.\n\nA rubric is a checklist of criteria. For each criterion you must decide, strictly and independently, whether the output satisfies it.\n\nGrading guidelines:\n- Judge each criterion on its own merits. Do not let one criterion's verdict influence another.\n- A criterion is \"satisfied\" only when the output clearly and fully meets it. When in doubt, mark it as NOT satisfied.\n- Base your judgement on evidence in the output (and the original task for context). Do not assume facts that are not present.\n- Be concise but specific in your reasoning: say what is present or missing.\n- Do not reward effort, intent, or partial progress. Only the actual output counts.";
2
+ export interface RubricAnalysisCriterion {
3
+ /** The criterion text, exactly as provided in the rubric. */
4
+ criterion: string;
5
+ /** Whether the output satisfies this criterion. */
6
+ satisfied: boolean;
7
+ /** Whether this criterion is required for the task to be considered complete. */
8
+ required: boolean;
9
+ /** Short explanation of why the criterion is or is not satisfied. */
10
+ reasoning: string;
11
+ }
12
+ export interface RubricAnalysisResult {
13
+ criteria: RubricAnalysisCriterion[];
14
+ overallAssessment: string;
15
+ }
16
+ /**
17
+ * A single rubric criterion as provided to the prompt builder.
18
+ */
19
+ export interface RubricCriterionInput {
20
+ criterion: string;
21
+ required: boolean;
22
+ }
23
+ export declare function createAnalyzePrompt({ originalTask, output, criteria, }: {
24
+ originalTask: string;
25
+ output: string;
26
+ criteria: RubricCriterionInput[];
27
+ }): string;
28
+ /**
29
+ * Format a human-readable, per-criterion explanation of the rubric result. This text is what
30
+ * `isTaskComplete` injects back into the conversation as feedback, so it must clearly tell the
31
+ * agent which criteria are unmet and why.
32
+ */
33
+ export declare function formatRubricReason({ score, analysis }: {
34
+ score: number;
35
+ analysis: RubricAnalysisResult;
36
+ }): string;
37
+ //# sourceMappingURL=prompts.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/rubric/prompts.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,mBAAmB,mxBASoD,CAAC;AAErF,MAAM,WAAW,uBAAuB;IACtC,6DAA6D;IAC7D,SAAS,EAAE,MAAM,CAAC;IAClB,mDAAmD;IACnD,SAAS,EAAE,OAAO,CAAC;IACnB,iFAAiF;IACjF,QAAQ,EAAE,OAAO,CAAC;IAClB,qEAAqE;IACrE,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,oBAAoB;IACnC,QAAQ,EAAE,uBAAuB,EAAE,CAAC;IACpC,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,wBAAgB,mBAAmB,CAAC,EAClC,YAAY,EACZ,MAAM,EACN,QAAQ,GACT,EAAE;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,oBAAoB,EAAE,CAAC;CAClC,GAAG,MAAM,CA8BT;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,oBAAoB,CAAA;CAAE,GAAG,MAAM,CAoBjH"}