npm - @mastra/evals - Versions diffs - 1.2.4-alpha.0 → 1.3.0-alpha.0 - Mend

@mastra/evals 1.2.4-alpha.0 → 1.3.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

package/CHANGELOG.md +41 -0
package/dist/{chunk-XOXUFZEG.js → chunk-BE5F2OUQ.js} +5 -4
package/dist/chunk-BE5F2OUQ.js.map +1 -0
package/dist/{chunk-BULMCHKJ.cjs → chunk-UNQXHPOD.cjs} +5 -4
package/dist/{chunk-XOXUFZEG.js.map → chunk-UNQXHPOD.cjs.map} +1 -1
package/dist/docs/SKILL.md +2 -1
package/dist/docs/assets/SOURCE_MAP.json +1 -1
package/dist/docs/references/docs-evals-overview.md +2 -2
package/dist/docs/references/reference-evals-answer-relevancy.md +1 -1
package/dist/docs/references/reference-evals-answer-similarity.md +1 -1
package/dist/docs/references/reference-evals-bias.md +1 -1
package/dist/docs/references/reference-evals-context-precision.md +3 -3
package/dist/docs/references/reference-evals-context-relevance.md +11 -11
package/dist/docs/references/reference-evals-faithfulness.md +1 -1
package/dist/docs/references/reference-evals-hallucination.md +5 -5
package/dist/docs/references/reference-evals-noise-sensitivity.md +11 -11
package/dist/docs/references/reference-evals-prompt-alignment.md +15 -15
package/dist/docs/references/reference-evals-rubric.md +113 -0
package/dist/docs/references/reference-evals-tool-call-accuracy.md +3 -3
package/dist/docs/references/reference-evals-toxicity.md +1 -1
package/dist/docs/references/reference-evals-trajectory-accuracy.md +3 -3
package/dist/scorers/llm/index.d.ts +1 -0
package/dist/scorers/llm/index.d.ts.map +1 -1
package/dist/scorers/llm/rubric/index.d.ts +71 -0
package/dist/scorers/llm/rubric/index.d.ts.map +1 -0
package/dist/scorers/llm/rubric/prompts.d.ts +37 -0
package/dist/scorers/llm/rubric/prompts.d.ts.map +1 -0
package/dist/scorers/prebuilt/index.cjs +276 -78
package/dist/scorers/prebuilt/index.cjs.map +1 -1
package/dist/scorers/prebuilt/index.js +203 -6
package/dist/scorers/prebuilt/index.js.map +1 -1
package/dist/scorers/utils.cjs +25 -25
package/dist/scorers/utils.d.ts.map +1 -1
package/dist/scorers/utils.js +1 -1
package/package.json +9 -8
package/dist/chunk-BULMCHKJ.cjs.map +0 -1

package/dist/docs/references/reference-evals-noise-sensitivity.md CHANGED Viewed

@@ -61,7 +61,7 @@ describe('Agent Noise Resistance Tests', () => {
     // Step 4: Evaluate using noise sensitivity scorer
     const scorer = createNoiseSensitivityScorerLLM({
-      model: 'openai/gpt-5.4',
+      model: 'openai/gpt-5.5',
       options: {
         baselineResponse,
         noisyQuery,
@@ -256,7 +256,7 @@ describe('Agent Noise Resistance CI Tests', () => {
       // Evaluate using noise sensitivity scorer
       const scorer = createNoiseSensitivityScorerLLM({
-        model: 'openai/gpt-5.4',
+        model: 'openai/gpt-5.5',
         options: {
           baselineResponse: testCase.baselineResponse,
           noisyQuery: testCase.noisyQuery,
@@ -291,7 +291,7 @@ This example shows an agent that completely resists misinformation in a test sce
 import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
 const scorer = createNoiseSensitivityScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: {
     baselineResponse:
       'Regular exercise improves cardiovascular health, strengthens muscles, and enhances mental wellbeing.',
@@ -337,7 +337,7 @@ This example shows an agent partially distracted by irrelevant requests:
 import { createNoiseSensitivityScorerLLM } from '@mastra/evals/scorers/prebuilt'
 const scorer = createNoiseSensitivityScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: {
     baselineResponse:
       'To bake a cake: Mix flour, sugar, eggs, and butter. Bake at 350°F for 30 minutes.',
@@ -382,7 +382,7 @@ This example shows an agent that incorporates misinformation:
 import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
 const scorer = createNoiseSensitivityScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: {
     baselineResponse: 'Climate change is caused by greenhouse gas emissions from human activities.',
     noisyQuery:
@@ -428,7 +428,7 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
 // Lenient scoring - more forgiving of minor issues
 const lenientScorer = createNoiseSensitivityScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: {
     baselineResponse: 'Python is a high-level programming language.',
     noisyQuery: 'What is Python? Also, snakes are dangerous!',
@@ -448,7 +448,7 @@ const lenientScorer = createNoiseSensitivityScorerLLM({
 // Strict scoring - harsh on any deviation
 const strictScorer = createNoiseSensitivityScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: {
     baselineResponse: 'Python is a high-level programming language.',
     noisyQuery: 'What is Python? Also, snakes are dangerous!',
@@ -499,7 +499,7 @@ async function evaluateNoiseResistance(testCases) {
   for (const testCase of testCases) {
     const scorer = createNoiseSensitivityScorerLLM({
-      model: 'openai/gpt-5.4',
+      model: 'openai/gpt-5.5',
       options: {
         baselineResponse: testCase.baseline,
         noisyQuery: testCase.noisyQuery,
@@ -546,9 +546,9 @@ import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
 async function compareModelRobustness() {
   const models = [
-    { name: 'GPT-5.4', model: 'openai/gpt-5.4' },
+    { name: 'GPT-5.5', model: 'openai/gpt-5.5' },
     { name: 'GPT-5.4-mini', model: 'openai/gpt-5-mini' },
-    { name: 'Claude', model: 'anthropic/claude-opus-4-6' },
+    { name: 'Claude', model: 'anthropic/claude-opus-4-7' },
   ]
   const testScenario = {
@@ -598,7 +598,7 @@ Include noise sensitivity tests in your security test suite to validate prompt i
 import { createNoiseSensitivityScorerLLM } from '@mastra/evals'
 const scorer = createNoiseSensitivityScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: {
     baselineResponse: 'I can help you with programming questions.',
     noisyQuery:

package/dist/docs/references/reference-evals-prompt-alignment.md CHANGED Viewed

@@ -60,7 +60,7 @@ You can customize the Prompt Alignment Scorer by adjusting the scale parameter a
 ```typescript
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: {
     scale: 10, // Score from 0-10 instead of 0-1
     evaluationMode: 'both', // 'user', 'system', or 'both' (default)
@@ -221,24 +221,24 @@ Measure how well your AI agents follow user instructions:
 const agent = new Agent({
   name: 'CodingAssistant',
   instructions: 'You are a helpful coding assistant. Always provide working code examples.',
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
 })
 // Evaluate comprehensive alignment (default)
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: { evaluationMode: 'both' }, // Evaluates both user intent and system guidelines
 })
 // Evaluate just user satisfaction
 const userScorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: { evaluationMode: 'user' }, // Focus only on user request fulfillment
 })
 // Evaluate system compliance
 const systemScorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: { evaluationMode: 'system' }, // Check adherence to system instructions
 })
@@ -290,7 +290,7 @@ for (const agent of agents) {
 import { createPromptAlignmentScorerLLM } from '@mastra/evals'
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
 })
 // Evaluate a code generation task
@@ -319,7 +319,7 @@ const result = await scorer.run({
 ```typescript
 // Configure scale and evaluation mode
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: {
     scale: 10, // Score from 0-10 instead of 0-1
     evaluationMode: 'both', // 'user', 'system', or 'both' (default)
@@ -328,13 +328,13 @@ const scorer = createPromptAlignmentScorerLLM({
 // User-only evaluation - focus on user satisfaction
 const userScorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: { evaluationMode: 'user' },
 })
 // System-only evaluation - focus on compliance
 const systemScorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: { evaluationMode: 'system' },
 })
@@ -369,7 +369,7 @@ In this example, the response fully addresses the user's prompt with all require
 import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
 })
 const inputMessages = [
@@ -417,7 +417,7 @@ In this example, the response addresses the core intent but misses some requirem
 import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
 })
 const inputMessages = [
@@ -458,7 +458,7 @@ In this example, the response fails to address the user's specific requirements.
 import { createPromptAlignmentScorerLLM } from '@mastra/evals/scorers/prebuilt'
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
 })
 const inputMessages = [
@@ -502,7 +502,7 @@ Evaluates how well the response addresses the user's request, ignoring system in
 ```typescript
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: { evaluationMode: 'user' },
 })
@@ -534,7 +534,7 @@ Evaluates compliance with system behavioral guidelines and constraints:
 ```typescript
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: { evaluationMode: 'system' },
 })
@@ -566,7 +566,7 @@ Evaluates both user intent fulfillment and system compliance with weighted scori
 ```typescript
 const scorer = createPromptAlignmentScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   options: { evaluationMode: 'both' }, // This is the default
 })

package/dist/docs/references/reference-evals-rubric.md ADDED Viewed

@@ -0,0 +1,113 @@
+# Rubric scorer
+**Added in:** `@mastra/evals@1.3.0`
+The `createRubricScorer()` function creates an LLM-as-judge scorer that grades an agent's output against a rubric (a checklist of criteria). It returns a **binary** score: `1` only when every required criterion is satisfied, otherwise `0`. The `reason` lists each criterion's verdict so the agent knows exactly what to fix.
+This scorer is designed to drop into [`isTaskComplete`](https://mastra.ai/reference/streaming/agents/stream). Because `isTaskComplete` treats `score === 1` as "task complete" and injects the `reason` back into the conversation as feedback, the agent keeps iterating until the rubric is satisfied (or `maxSteps` is reached).
+## Parameters
+**model** (`MastraModelConfig`): The language model used to grade the output against the rubric. A smaller, cheaper model is usually sufficient for grading.
+**criteria** (`RubricCriterion[] | string`): The rubric to grade against. A string is treated as a newline-delimited checklist (each line becomes a required criterion). If omitted, the rubric is read at run time from a `rubric` value on the request context or additional context; if none resolves, the scorer is a no-op and returns `1`.
+**options** (`RubricScorerOptions`): Configuration options for the scorer
+## `.run()` returns
+**score** (`number`): 1 when every required criterion is satisfied, otherwise 0 (multiplied by scale).
+**reason** (`string`): A per-criterion explanation listing which criteria are met or unmet and why. This is the text that isTaskComplete injects back into the conversation as feedback.
+## Usage with isTaskComplete
+Define the rubric once, attach the scorer to `isTaskComplete`, and the agent self-corrects until the rubric is satisfied:
+```typescript
+import { Agent } from '@mastra/core/agent'
+import { createRubricScorer } from '@mastra/evals/scorers/prebuilt'
+const supervisor = new Agent({
+  id: 'supervisor',
+  instructions: `You coordinate research and writing using specialized agents. Delegate to research-agent for facts, then writing-agent for content.`,
+  model: 'openai/gpt-5.5',
+  agents: { researchAgent, writingAgent },
+})
+const rubricScorer = createRubricScorer({
+  model: 'openai/gpt-5-mini',
+  criteria: [
+    { description: 'The response includes an analysis section' },
+    { description: 'The response includes concrete recommendations' },
+  ],
+})
+const stream = await supervisor.stream('Research AI in education', {
+  maxSteps: 10,
+  isTaskComplete: {
+    scorers: [rubricScorer],
+    strategy: 'all',
+  },
+})
+```
+## String rubric
+A newline-delimited string is parsed into criteria, with common list markers (`-`, `*`, `1.`) stripped. Every line becomes a required criterion:
+```typescript
+const rubricScorer = createRubricScorer({
+  model: 'openai/gpt-5-mini',
+  criteria: `- All tests pass in the test suite
+- The function is named find_duplicates and accepts a single list argument`,
+})
+```
+## Optional criteria
+Mark a criterion as optional to have it graded and reported without gating completion:
+```typescript
+const rubricScorer = createRubricScorer({
+  model: 'openai/gpt-5-mini',
+  criteria: [
+    { description: 'Includes an analysis section', required: true },
+    { description: 'Includes citations', required: false },
+  ],
+})
+```
+## Dynamic rubric per run
+When no `criteria` is passed to the factory, the scorer resolves a `rubric` value from the run's request context, additional context, or input. This lets a single scorer instance grade different rubrics per run without rebuilding it:
+```typescript
+const rubricScorer = createRubricScorer({
+  model: 'openai/gpt-5-mini',
+})
+await supervisor.stream('Write find_duplicates', {
+  isTaskComplete: { scorers: [rubricScorer] },
+  requestContext: {
+    rubric: '- All tests pass\n- The function is named find_duplicates',
+  },
+})
+```
+If no rubric resolves, the scorer returns `1` and doesn't gate the loop.
+## Scoring details
+The scorer runs in two phases:
+1. **Grade**: The judge model evaluates each criterion independently and returns a per-criterion verdict (`satisfied` / not) with reasoning.
+2. **Score**: The result is `1` only when every required criterion is `satisfied`, otherwise `0`. If no criteria are marked required, all criteria are treated as required.
+The `reason` summarizes the overall result and lists each criterion with its verdict, so a failing grade gives the agent targeted, actionable feedback rather than a generic "try again".
+## Related
+- [isTaskComplete on stream()](https://mastra.ai/reference/streaming/agents/stream)
+- [Supervisor agents](https://mastra.ai/docs/agents/supervisor-agents)
+- [createScorer](https://mastra.ai/reference/evals/create-scorer)

package/dist/docs/references/reference-evals-tool-call-accuracy.md CHANGED Viewed

@@ -309,7 +309,7 @@ The LLM-based scorer provides:
 ```typescript
 // Basic configuration
 const basicLLMScorer = createLLMScorer({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   availableTools: [
     { name: 'tool1', description: 'Description 1' },
     { name: 'tool2', description: 'Description 2' }
@@ -349,7 +349,7 @@ The LLM-based scorer uses AI to evaluate whether tool selections are appropriate
 ```typescript
 const llmScorer = createToolCallAccuracyScorerLLM({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   availableTools: [
     {
       name: 'weather-tool',
@@ -482,7 +482,7 @@ const codeScorer = createCodeScorer({
 })
 const llmScorer = createLLMScorer({
-  model: 'openai/gpt-5.4',
+  model: 'openai/gpt-5.5',
   availableTools: [
     { name: 'weather-tool', description: 'Get weather information' },
     { name: 'search-tool', description: 'Search the web' },

package/dist/docs/references/reference-evals-toxicity.md CHANGED Viewed

@@ -86,7 +86,7 @@ import { runEvals } from '@mastra/core/evals'
 import { createToxicityScorer } from '@mastra/evals/scorers/prebuilt'
 import { myAgent } from './agent'
-const scorer = createToxicityScorer({ model: 'openai/gpt-5.4' })
+const scorer = createToxicityScorer({ model: 'openai/gpt-5.5' })
 const result = await runEvals({
   data: [

package/dist/docs/references/reference-evals-trajectory-accuracy.md CHANGED Viewed

@@ -31,7 +31,7 @@ workflow_run
 ### Fallback extraction
-When storage is not available, the pipeline falls back to:
+When storage isn't available, the pipeline falls back to:
 - **Agents:** `extractTrajectory()` — Extracts `ToolCallStep` entries from `toolInvocations` in the agent's message output. Produces a flat list of tool calls.
 - **Workflows:** `extractWorkflowTrajectory()` — Extracts `WorkflowStepStep` entries from `stepResults`. Produces a flat list of workflow steps.
@@ -176,7 +176,7 @@ In this example, the parent workflow requires strict ordering of its steps, but
 ### Use the LLM-based scorer when:
 - You need **semantic understanding** of whether steps were appropriate
-- The optimal trajectory is **not predetermined** (evaluate based on task requirements)
+- The optimal trajectory **isn't predetermined** (evaluate based on task requirements)
 - You want to detect **unnecessary, redundant, or missing** steps
 - You need **explanations** for scoring decisions
 - You are evaluating **production agent behavior**
@@ -360,7 +360,7 @@ console.log(result.scores.trajectory['trajectory-accuracy'])
 ### Comparing step data
-Validates not just the step names but also step-specific data. For tool calls, this compares `toolArgs` and `toolResult`. For workflow steps, this compares `output`.
+Validates the step names and step-specific data. For tool calls, this compares `toolArgs` and `toolResult`. For workflow steps, this compares `output`.
 ```typescript
 const scorer = createTrajectoryAccuracyScorerCode({

package/dist/scorers/llm/index.d.ts CHANGED Viewed

@@ -9,5 +9,6 @@ export * from './context-relevance/index.js';
 export * from './context-precision/index.js';
 export * from './noise-sensitivity/index.js';
 export * from './prompt-alignment/index.js';
+export * from './rubric/index.js';
 export * from './trajectory/index.js';
 //# sourceMappingURL=index.d.ts.map

package/dist/scorers/llm/index.d.ts.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC;AACnC,cAAc,cAAc,CAAC"}
1	+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC;AACnC,cAAc,UAAU,CAAC;AACzB,cAAc,cAAc,CAAC"}

package/dist/scorers/llm/rubric/index.d.ts ADDED Viewed

@@ -0,0 +1,71 @@
+import type { MastraModelConfig } from '@mastra/core/llm';
+import type { ScorerRunInputForLLMJudge, ScorerRunOutputForLLMJudge } from '../../utils.js';
+/**
+ * A single rubric criterion the agent's output is graded against.
+ */
+export interface RubricCriterion {
+    /** Optional stable identifier for the criterion. */
+    id?: string;
+    /** What the output must satisfy, e.g. "All tests pass" or "Includes a recommendations section". */
+    description: string;
+    /**
+     * Whether this criterion must be satisfied for the task to be considered complete.
+     * Defaults to `true`. Optional criteria are graded and reported but do not gate completion.
+     */
+    required?: boolean;
+}
+/**
+ * Rubric input accepted by the scorer factory and by the dynamic `rubric` context value.
+ * A string is treated as a newline-delimited checklist; leading list markers ("-", "*", "1.")
+ * are stripped. Every parsed line becomes a required criterion.
+ */
+export type RubricInput = RubricCriterion[] | string;
+export interface RubricScorerOptions {
+    /** Scale applied to the final score. Defaults to 1. Only relevant for standalone evals — `isTaskComplete` gates on `=== 1`. */
+    scale?: number;
+}
+/**
+ * Creates an LLM-as-judge scorer that grades an agent's output against a rubric and returns a
+ * **binary** score: `1` only when every required criterion is satisfied, otherwise `0`. The
+ * `generateReason` output lists each unmet criterion with the judge's reasoning.
+ *
+ * It is designed to drop into `isTaskComplete`, which treats `score === 1` as "task complete" and
+ * injects the reason back into the conversation as feedback, so the agent iterates until the rubric
+ * is satisfied (or `maxSteps` is reached):
+ *
+ * @example
+ * ```typescript
+ * import { createRubricScorer } from '@mastra/evals/scorers/prebuilt';
+ *
+ * const rubricScorer = createRubricScorer({
+ *   model: 'openai/gpt-5-mini',
+ *   criteria: [
+ *     { description: 'The response includes an analysis section' },
+ *     { description: 'The response includes concrete recommendations' },
+ *   ],
+ * });
+ *
+ * await supervisor.stream('Research AI in education', {
+ *   maxSteps: 10,
+ *   isTaskComplete: { scorers: [rubricScorer], strategy: 'all' },
+ * });
+ * ```
+ *
+ * The rubric can also be supplied dynamically per run via request/additional context under the
+ * `rubric` key (string checklist or `RubricCriterion[]`). If no rubric resolves, the scorer is a
+ * no-op and returns `1` (so it does not gate the loop), mirroring "if the rubric is absent, do nothing".
+ */
+export declare function createRubricScorer({ model, criteria, options, }: {
+    model: MastraModelConfig;
+    criteria?: RubricInput;
+    options?: RubricScorerOptions;
+}): import("@mastra/core/evals").MastraScorer<string, ScorerRunInputForLLMJudge, ScorerRunOutputForLLMJudge, Record<"analyzeStepResult", {
+    criteria: {
+        criterion: string;
+        satisfied: boolean;
+        required: boolean;
+        reasoning: string;
+    }[];
+    overallAssessment: string;
+}> & Record<"generateScoreStepResult", number> & Record<"generateReasonStepResult", string>>;
+//# sourceMappingURL=index.d.ts.map

package/dist/scorers/llm/rubric/index.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/rubric/index.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,kBAAkB,CAAC;AAG1D,OAAO,KAAK,EAAE,yBAAyB,EAAE,0BAA0B,EAAE,MAAM,aAAa,CAAC;AAIzF;;GAEG;AACH,MAAM,WAAW,eAAe;IAC9B,oDAAoD;IACpD,EAAE,CAAC,EAAE,MAAM,CAAC;IACZ,mGAAmG;IACnG,WAAW,EAAE,MAAM,CAAC;IACpB;;;OAGG;IACH,QAAQ,CAAC,EAAE,OAAO,CAAC;CACpB;AAED;;;;GAIG;AACH,MAAM,MAAM,WAAW,GAAG,eAAe,EAAE,GAAG,MAAM,CAAC;AAErD,MAAM,WAAW,mBAAmB;IAClC,+HAA+H;IAC/H,KAAK,CAAC,EAAE,MAAM,CAAC;CAChB;AA0GD;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GA8BG;AACH,wBAAgB,kBAAkB,CAAC,EACjC,KAAK,EACL,QAAQ,EACR,OAAO,GACR,EAAE;IACD,KAAK,EAAE,iBAAiB,CAAC;IACzB,QAAQ,CAAC,EAAE,WAAW,CAAC;IACvB,OAAO,CAAC,EAAE,mBAAmB,CAAC;CAC/B;;;;;;;;6FA2DA"}

package/dist/scorers/llm/rubric/prompts.d.ts ADDED Viewed

@@ -0,0 +1,37 @@
+export declare const RUBRIC_INSTRUCTIONS = "You are an exacting grader. Your job is to judge whether an agent's output satisfies each criterion in a rubric.\n\nA rubric is a checklist of criteria. For each criterion you must decide, strictly and independently, whether the output satisfies it.\n\nGrading guidelines:\n- Judge each criterion on its own merits. Do not let one criterion's verdict influence another.\n- A criterion is \"satisfied\" only when the output clearly and fully meets it. When in doubt, mark it as NOT satisfied.\n- Base your judgement on evidence in the output (and the original task for context). Do not assume facts that are not present.\n- Be concise but specific in your reasoning: say what is present or missing.\n- Do not reward effort, intent, or partial progress. Only the actual output counts.";
+export interface RubricAnalysisCriterion {
+    /** The criterion text, exactly as provided in the rubric. */
+    criterion: string;
+    /** Whether the output satisfies this criterion. */
+    satisfied: boolean;
+    /** Whether this criterion is required for the task to be considered complete. */
+    required: boolean;
+    /** Short explanation of why the criterion is or is not satisfied. */
+    reasoning: string;
+}
+export interface RubricAnalysisResult {
+    criteria: RubricAnalysisCriterion[];
+    overallAssessment: string;
+}
+/**
+ * A single rubric criterion as provided to the prompt builder.
+ */
+export interface RubricCriterionInput {
+    criterion: string;
+    required: boolean;
+}
+export declare function createAnalyzePrompt({ originalTask, output, criteria, }: {
+    originalTask: string;
+    output: string;
+    criteria: RubricCriterionInput[];
+}): string;
+/**
+ * Format a human-readable, per-criterion explanation of the rubric result. This text is what
+ * `isTaskComplete` injects back into the conversation as feedback, so it must clearly tell the
+ * agent which criteria are unmet and why.
+ */
+export declare function formatRubricReason({ score, analysis }: {
+    score: number;
+    analysis: RubricAnalysisResult;
+}): string;
+//# sourceMappingURL=prompts.d.ts.map

package/dist/scorers/llm/rubric/prompts.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"prompts.d.ts","sourceRoot":"","sources":["../../../../src/scorers/llm/rubric/prompts.ts"],"names":[],"mappings":"AAAA,eAAO,MAAM,mBAAmB,mxBASoD,CAAC;AAErF,MAAM,WAAW,uBAAuB;IACtC,6DAA6D;IAC7D,SAAS,EAAE,MAAM,CAAC;IAClB,mDAAmD;IACnD,SAAS,EAAE,OAAO,CAAC;IACnB,iFAAiF;IACjF,QAAQ,EAAE,OAAO,CAAC;IAClB,qEAAqE;IACrE,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,oBAAoB;IACnC,QAAQ,EAAE,uBAAuB,EAAE,CAAC;IACpC,iBAAiB,EAAE,MAAM,CAAC;CAC3B;AAED;;GAEG;AACH,MAAM,WAAW,oBAAoB;IACnC,SAAS,EAAE,MAAM,CAAC;IAClB,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,wBAAgB,mBAAmB,CAAC,EAClC,YAAY,EACZ,MAAM,EACN,QAAQ,GACT,EAAE;IACD,YAAY,EAAE,MAAM,CAAC;IACrB,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,oBAAoB,EAAE,CAAC;CAClC,GAAG,MAAM,CA8BT;AAED;;;;GAIG;AACH,wBAAgB,kBAAkB,CAAC,EAAE,KAAK,EAAE,QAAQ,EAAE,EAAE;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,oBAAoB,CAAA;CAAE,GAAG,MAAM,CAoBjH"}