npm - @mastra/evals - Versions diffs - 1.2.2 → 1.2.3 - Mend

@mastra/evals 1.2.2 → 1.2.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22)

package/CHANGELOG.md +18 -0
package/dist/docs/SKILL.md +1 -1
package/dist/docs/assets/SOURCE_MAP.json +1 -1
package/dist/docs/references/docs-evals-overview.md +2 -2
package/dist/docs/references/reference-evals-noise-sensitivity.md +1 -1
package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
package/dist/scorers/llm/bias/index.d.ts.map +1 -1
package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
package/dist/scorers/llm/trajectory/index.d.ts.map +1 -1
package/dist/scorers/prebuilt/index.cjs +656 -132
package/dist/scorers/prebuilt/index.cjs.map +1 -1
package/dist/scorers/prebuilt/index.js +656 -132
package/dist/scorers/prebuilt/index.js.map +1 -1
package/package.json +9 -10

package/dist/scorers/prebuilt/index.js (changed)

@@ -1,6 +1,5 @@
 import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage, compareTrajectories, checkTrajectoryEfficiency, checkTrajectoryBlacklist, analyzeToolFailures, isScorerRunInputForAgent, isScorerRunOutputForAgent } from '../../chunk-ZRHCSFKL.js';
 import { createScorer } from '@mastra/core/evals';
-import { z } from 'zod';
 import nlp from 'compromise';
 import keyword_extractor from 'keyword-extractor';
 import stringSimilarity from 'string-similarity';
@@ -210,9 +209,21 @@ var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = `
     5. Empty inputs or error messages should always be marked as "no"
     6. Responses that discuss the type of information being asked show partial relevance
 `;
-var extractOutputSchema = z.object({
-  statements: z.array(z.string())
-});
+var extractOutputSchema = {
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "statements": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    }
+  },
+  "required": [
+    "statements"
+  ]
+};
 function createAnswerRelevancyScorer({
   model,
   options = DEFAULT_OPTIONS
@@ -235,7 +246,33 @@ function createAnswerRelevancyScorer({
     }
   }).analyze({
     description: "Score the relevance of the statements to the input",
-    outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
+    outputSchema: {
+      "$schema": "https://json-schema.org/draft/2020-12/schema",
+      "type": "object",
+      "properties": {
+        "results": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "result": {
+                "type": "string"
+              },
+              "reason": {
+                "type": "string"
+              }
+            },
+            "required": [
+              "result",
+              "reason"
+            ]
+          }
+        }
+      },
+      "required": [
+        "results"
+      ]
+    },
     createPrompt: ({ run, results }) => {
       const input = getUserMessageFromRunInput(run.input) ?? "";
       return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
@@ -408,28 +445,106 @@ Key Principles:
 5. Provide actionable feedback for improving answer accuracy
 6. Be strict but fair - partial credit for partial matches
 `;
-var extractOutputSchema2 = z.object({
-  outputUnits: z.array(z.string()),
-  groundTruthUnits: z.array(z.string())
-});
-var analyzeOutputSchema = z.object({
-  matches: z.array(
-    z.object({
-      groundTruthUnit: z.string(),
-      outputUnit: z.string().nullable(),
-      matchType: z.enum(["exact", "semantic", "partial", "missing"]),
-      explanation: z.string()
-    })
-  ),
-  extraInOutput: z.array(z.string()),
-  contradictions: z.array(
-    z.object({
-      outputUnit: z.string(),
-      groundTruthUnit: z.string(),
-      explanation: z.string()
-    })
-  )
-});
+var extractOutputSchema2 = {
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "outputUnits": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "groundTruthUnits": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    }
+  },
+  "required": [
+    "outputUnits",
+    "groundTruthUnits"
+  ]
+};
+var analyzeOutputSchema = {
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "matches": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "groundTruthUnit": {
+            "type": "string"
+          },
+          "outputUnit": {
+            "anyOf": [
+              {
+                "type": "string"
+              },
+              {
+                "type": "null"
+              }
+            ]
+          },
+          "matchType": {
+            "type": "string",
+            "enum": [
+              "exact",
+              "semantic",
+              "partial",
+              "missing"
+            ]
+          },
+          "explanation": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "groundTruthUnit",
+          "outputUnit",
+          "matchType",
+          "explanation"
+        ]
+      }
+    },
+    "extraInOutput": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "contradictions": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "outputUnit": {
+            "type": "string"
+          },
+          "groundTruthUnit": {
+            "type": "string"
+          },
+          "explanation": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "outputUnit",
+          "groundTruthUnit",
+          "explanation"
+        ]
+      }
+    }
+  },
+  "required": [
+    "matches",
+    "extraInOutput",
+    "contradictions"
+  ]
+};
 function createAnswerSimilarityScorer({
   model,
   options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
@@ -708,16 +823,54 @@ function createFaithfulnessScorer({
     type: "agent"
   }).preprocess({
     description: "Extract relevant statements from the LLM output",
-    outputSchema: z.object({
-      claims: z.array(z.string())
-    }),
+    outputSchema: {
+      "$schema": "https://json-schema.org/draft/2020-12/schema",
+      "type": "object",
+      "properties": {
+        "claims": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        }
+      },
+      "required": [
+        "claims"
+      ]
+    },
     createPrompt: ({ run }) => {
       const prompt = createFaithfulnessExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
     description: "Score the relevance of the statements to the input",
-    outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
+    outputSchema: {
+      "$schema": "https://json-schema.org/draft/2020-12/schema",
+      "type": "object",
+      "properties": {
+        "verdicts": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "verdict": {
+                "type": "string"
+              },
+              "reason": {
+                "type": "string"
+              }
+            },
+            "required": [
+              "verdict",
+              "reason"
+            ]
+          }
+        }
+      },
+      "required": [
+        "verdicts"
+      ]
+    },
     createPrompt: ({ results, run }) => {
       const context = options?.context ?? getToolInvocationContext(run.output);
       const prompt = createFaithfulnessAnalyzePrompt({
@@ -869,13 +1022,51 @@ function createBiasScorer({ model, options }) {
     type: "agent"
   }).preprocess({
     description: "Extract relevant statements from the LLM output",
-    outputSchema: z.object({
-      opinions: z.array(z.string())
-    }),
+    outputSchema: {
+      "$schema": "https://json-schema.org/draft/2020-12/schema",
+      "type": "object",
+      "properties": {
+        "opinions": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        }
+      },
+      "required": [
+        "opinions"
+      ]
+    },
     createPrompt: ({ run }) => createBiasExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" })
   }).analyze({
     description: "Score the relevance of the statements to the input",
-    outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
+    outputSchema: {
+      "$schema": "https://json-schema.org/draft/2020-12/schema",
+      "type": "object",
+      "properties": {
+        "results": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "result": {
+                "type": "string"
+              },
+              "reason": {
+                "type": "string"
+              }
+            },
+            "required": [
+              "result",
+              "reason"
+            ]
+          }
+        }
+      },
+      "required": [
+        "results"
+      ]
+    },
     createPrompt: ({ run, results }) => {
       const prompt = createBiasAnalyzePrompt({
         output: getAssistantMessageFromRunOutput(run.output) ?? "",
@@ -1104,18 +1295,58 @@ function createHallucinationScorer({
     type: "agent"
   }).preprocess({
     description: "Extract all claims from the given output",
-    outputSchema: z.object({
-      claims: z.array(z.string())
-    }),
+    outputSchema: {
+      "$schema": "https://json-schema.org/draft/2020-12/schema",
+      "type": "object",
+      "properties": {
+        "claims": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        }
+      },
+      "required": [
+        "claims"
+      ]
+    },
     createPrompt: ({ run }) => {
       const prompt = createHallucinationExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
       return prompt;
     }
   }).analyze({
     description: "Score the relevance of the statements to the input",
-    outputSchema: z.object({
-      verdicts: z.array(z.object({ statement: z.string(), verdict: z.string(), reason: z.string() }))
-    }),
+    outputSchema: {
+      "$schema": "https://json-schema.org/draft/2020-12/schema",
+      "type": "object",
+      "properties": {
+        "verdicts": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "statement": {
+                "type": "string"
+              },
+              "verdict": {
+                "type": "string"
+              },
+              "reason": {
+                "type": "string"
+              }
+            },
+            "required": [
+              "statement",
+              "verdict",
+              "reason"
+            ]
+          }
+        }
+      },
+      "required": [
+        "verdicts"
+      ]
+    },
     createPrompt: async ({ run, results }) => {
       let context;
       if (options?.getContext) {
@@ -1259,7 +1490,33 @@ function createToxicityScorer({
     type: "agent"
   }).analyze({
     description: "Score the relevance of the statements to the input",
-    outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
+    outputSchema: {
+      "$schema": "https://json-schema.org/draft/2020-12/schema",
+      "type": "object",
+      "properties": {
+        "verdicts": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "verdict": {
+                "type": "string"
+              },
+              "reason": {
+                "type": "string"
+              }
+            },
+            "required": [
+              "verdict",
+              "reason"
+            ]
+          }
+        }
+      },
+      "required": [
+        "verdicts"
+      ]
+    },
     createPrompt: ({ run }) => {
       const prompt = createToxicityAnalyzePrompt({
         input: getUserMessageFromRunInput(run.input) ?? "",
@@ -1386,16 +1643,43 @@ Provide a single, concise sentence explaining why this score was given.
 };
 // src/scorers/llm/tool-call-accuracy/index.ts
-var analyzeOutputSchema2 = z.object({
-  evaluations: z.array(
-    z.object({
-      toolCalled: z.string(),
-      wasAppropriate: z.boolean(),
-      reasoning: z.string()
-    })
-  ),
-  missingTools: z.array(z.string()).optional()
-});
+var analyzeOutputSchema2 = {
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "evaluations": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "toolCalled": {
+            "type": "string"
+          },
+          "wasAppropriate": {
+            "type": "boolean"
+          },
+          "reasoning": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "toolCalled",
+          "wasAppropriate",
+          "reasoning"
+        ]
+      }
+    },
+    "missingTools": {
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    }
+  },
+  "required": [
+    "evaluations"
+  ]
+};
 function createToolCallAccuracyScorerLLM({ model, availableTools }) {
   const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
   return createScorer({
@@ -1606,19 +1890,62 @@ Example responses:
 }
 // src/scorers/llm/context-relevance/index.ts
-var analyzeOutputSchema3 = z.object({
-  evaluations: z.array(
-    z.object({
-      context_index: z.number(),
-      contextPiece: z.string(),
-      relevanceLevel: z.enum(["high", "medium", "low", "none"]),
-      wasUsed: z.boolean(),
-      reasoning: z.string()
-    })
-  ),
-  missingContext: z.array(z.string()).optional().default([]),
-  overallAssessment: z.string()
-});
+var analyzeOutputSchema3 = {
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "evaluations": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "context_index": {
+            "type": "number"
+          },
+          "contextPiece": {
+            "type": "string"
+          },
+          "relevanceLevel": {
+            "type": "string",
+            "enum": [
+              "high",
+              "medium",
+              "low",
+              "none"
+            ]
+          },
+          "wasUsed": {
+            "type": "boolean"
+          },
+          "reasoning": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "context_index",
+          "contextPiece",
+          "relevanceLevel",
+          "wasUsed",
+          "reasoning"
+        ]
+      }
+    },
+    "missingContext": {
+      "default": [],
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "overallAssessment": {
+      "type": "string"
+    }
+  },
+  "required": [
+    "evaluations",
+    "overallAssessment"
+  ]
+};
 var DEFAULT_PENALTIES = {
   UNUSED_HIGH_RELEVANCE_CONTEXT: 0.1,
   // 10% penalty per unused high-relevance context
@@ -1852,15 +2179,37 @@ Example responses:
 }
 // src/scorers/llm/context-precision/index.ts
-var contextRelevanceOutputSchema = z.object({
-  verdicts: z.array(
-    z.object({
-      context_index: z.number(),
-      verdict: z.string(),
-      reason: z.string()
-    })
-  )
-});
+var contextRelevanceOutputSchema = {
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "verdicts": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "context_index": {
+            "type": "number"
+          },
+          "verdict": {
+            "type": "string"
+          },
+          "reason": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "context_index",
+          "verdict",
+          "reason"
+        ]
+      }
+    }
+  },
+  "required": [
+    "verdicts"
+  ]
+};
 var getContext2 = ({
   input,
   output,
@@ -2139,20 +2488,63 @@ Example responses:
 }
 // src/scorers/llm/noise-sensitivity/index.ts
-var scoreSchema = z.number().refine((n) => n >= 0 && n <= 1, { message: "Score must be between 0 and 1" });
-var analyzeOutputSchema4 = z.object({
-  dimensions: z.array(
-    z.object({
-      dimension: z.string(),
-      impactLevel: z.enum(["none", "minimal", "moderate", "significant", "severe"]),
-      specificChanges: z.string(),
-      noiseInfluence: z.string()
-    })
-  ),
-  overallAssessment: z.string(),
-  majorIssues: z.array(z.string()).optional().default([]),
-  robustnessScore: scoreSchema
-});
+var analyzeOutputSchema4 = {
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "dimensions": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "dimension": {
+            "type": "string"
+          },
+          "impactLevel": {
+            "type": "string",
+            "enum": [
+              "none",
+              "minimal",
+              "moderate",
+              "significant",
+              "severe"
+            ]
+          },
+          "specificChanges": {
+            "type": "string"
+          },
+          "noiseInfluence": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "dimension",
+          "impactLevel",
+          "specificChanges",
+          "noiseInfluence"
+        ]
+      }
+    },
+    "overallAssessment": {
+      "type": "string"
+    },
+    "majorIssues": {
+      "default": [],
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "robustnessScore": {
+      "type": "number"
+    }
+  },
+  "required": [
+    "dimensions",
+    "overallAssessment",
+    "robustnessScore"
+  ]
+};
 var DEFAULT_IMPACT_WEIGHTS = {
   none: 1,
   minimal: 0.85,
@@ -2485,37 +2877,124 @@ Example responses:
 }
 // src/scorers/llm/prompt-alignment/index.ts
-var scoreSchema2 = z.number().refine((n) => n >= 0 && n <= 1, { message: "Score must be between 0 and 1" });
-var analyzeOutputSchema5 = z.object({
-  intentAlignment: z.object({
-    score: scoreSchema2,
-    primaryIntent: z.string(),
-    isAddressed: z.boolean(),
-    reasoning: z.string()
-  }),
-  requirementsFulfillment: z.object({
-    requirements: z.array(
-      z.object({
-        requirement: z.string(),
-        isFulfilled: z.boolean(),
-        reasoning: z.string()
-      })
-    ),
-    overallScore: scoreSchema2
-  }),
-  completeness: z.object({
-    score: scoreSchema2,
-    missingElements: z.array(z.string()),
-    reasoning: z.string()
-  }),
-  responseAppropriateness: z.object({
-    score: scoreSchema2,
-    formatAlignment: z.boolean(),
-    toneAlignment: z.boolean(),
-    reasoning: z.string()
-  }),
-  overallAssessment: z.string()
-});
+var analyzeOutputSchema5 = {
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "intentAlignment": {
+      "type": "object",
+      "properties": {
+        "score": {
+          "type": "number"
+        },
+        "primaryIntent": {
+          "type": "string"
+        },
+        "isAddressed": {
+          "type": "boolean"
+        },
+        "reasoning": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "score",
+        "primaryIntent",
+        "isAddressed",
+        "reasoning"
+      ]
+    },
+    "requirementsFulfillment": {
+      "type": "object",
+      "properties": {
+        "requirements": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "requirement": {
+                "type": "string"
+              },
+              "isFulfilled": {
+                "type": "boolean"
+              },
+              "reasoning": {
+                "type": "string"
+              }
+            },
+            "required": [
+              "requirement",
+              "isFulfilled",
+              "reasoning"
+            ]
+          }
+        },
+        "overallScore": {
+          "type": "number"
+        }
+      },
+      "required": [
+        "requirements",
+        "overallScore"
+      ]
+    },
+    "completeness": {
+      "type": "object",
+      "properties": {
+        "score": {
+          "type": "number"
+        },
+        "missingElements": {
+          "type": "array",
+          "items": {
+            "type": "string"
+          }
+        },
+        "reasoning": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "score",
+        "missingElements",
+        "reasoning"
+      ]
+    },
+    "responseAppropriateness": {
+      "type": "object",
+      "properties": {
+        "score": {
+          "type": "number"
+        },
+        "formatAlignment": {
+          "type": "boolean"
+        },
+        "toneAlignment": {
+          "type": "boolean"
+        },
+        "reasoning": {
+          "type": "string"
+        }
+      },
+      "required": [
+        "score",
+        "formatAlignment",
+        "toneAlignment",
+        "reasoning"
+      ]
+    },
+    "overallAssessment": {
+      "type": "string"
+    }
+  },
+  "required": [
+    "intentAlignment",
+    "requirementsFulfillment",
+    "completeness",
+    "responseAppropriateness",
+    "overallAssessment"
+  ]
+};
 var SCORING_WEIGHTS = {
   USER: {
     INTENT_ALIGNMENT: 0.4,
@@ -2710,19 +3189,64 @@ Provide a single, concise sentence explaining why this score was given.
 };
 // src/scorers/llm/trajectory/index.ts
-var analyzeOutputSchema6 = z.object({
-  stepEvaluations: z.array(
-    z.object({
-      stepName: z.string().describe("Name of the step (tool name or action)"),
-      wasNecessary: z.boolean().describe("Whether this step was necessary for the task"),
-      wasInOrder: z.boolean().describe("Whether this step was in a logical position in the sequence"),
-      reasoning: z.string().describe("Brief explanation of the evaluation")
-    })
-  ),
-  missingSteps: z.array(z.string()).optional().describe("Steps that should have been taken but were not"),
-  extraSteps: z.array(z.string()).optional().describe("Steps that were unnecessary or redundant"),
-  overallAssessment: z.string().describe("Brief overall assessment of the trajectory quality")
-});
+var analyzeOutputSchema6 = {
+  "$schema": "https://json-schema.org/draft/2020-12/schema",
+  "type": "object",
+  "properties": {
+    "stepEvaluations": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "stepName": {
+            "type": "string",
+            "description": "Name of the step (tool name or action)"
+          },
+          "wasNecessary": {
+            "type": "boolean",
+            "description": "Whether this step was necessary for the task"
+          },
+          "wasInOrder": {
+            "type": "boolean",
+            "description": "Whether this step was in a logical position in the sequence"
+          },
+          "reasoning": {
+            "type": "string",
+            "description": "Brief explanation of the evaluation"
+          }
+        },
+        "required": [
+          "stepName",
+          "wasNecessary",
+          "wasInOrder",
+          "reasoning"
+        ]
+      }
+    },
+    "missingSteps": {
+      "description": "Steps that should have been taken but were not",
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "extraSteps": {
+      "description": "Steps that were unnecessary or redundant",
+      "type": "array",
+      "items": {
+        "type": "string"
+      }
+    },
+    "overallAssessment": {
+      "type": "string",
+      "description": "Brief overall assessment of the trajectory quality"
+    }
+  },
+  "required": [
+    "stepEvaluations",
+    "overallAssessment"
+  ]
+};
 function formatStepDetails(step) {
   switch (step.stepType) {
     case "tool_call":