@mastra/evals 0.13.5 → 0.13.6
This diff shows the published contents of the two package versions as they appear in their public registry. It is provided for informational purposes only.
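The headline change in 0.13.6 is a new answer-similarity scorer, createAnswerSimilarityScorer, added to the LLM scorers bundle (see the dist/scorers/llm diffs below). A hypothetical usage sketch follows: the factory signature and option names come from the diffed code, while the import subpath and the model provider wiring are assumptions that may not match the package's actual exports.

```ts
// Hypothetical usage sketch: only createAnswerSimilarityScorer({ model, options })
// and the option names are taken from the diff; the import subpath and the model
// provider are assumptions.
import { createAnswerSimilarityScorer } from "@mastra/evals/scorers/llm";
import { openai } from "@ai-sdk/openai"; // assumed model provider

const scorer = createAnswerSimilarityScorer({
  model: openai("gpt-4o-mini"),
  options: {
    // Partial override; remaining fields fall back to ANSWER_SIMILARITY_DEFAULT_OPTIONS
    // (semanticThreshold 0.8, missingPenalty 0.15, contradictionPenalty 1, ...).
    exactMatchBonus: 0.1,
    extraInfoPenalty: 0.1,
  },
});

// Note: the run being scored must carry a groundTruth value (string or JSON-serializable);
// with requireGroundTruth left at its default of true, the preprocess step throws otherwise.
```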
- package/CHANGELOG.md +22 -0
- package/dist/{chunk-4LRZVFXR.js → chunk-KHEXN75Q.js} +72 -3
- package/dist/chunk-KHEXN75Q.js.map +1 -0
- package/dist/{chunk-EKSPLMYP.cjs → chunk-QKR2PMLZ.cjs} +79 -2
- package/dist/chunk-QKR2PMLZ.cjs.map +1 -0
- package/dist/{dist-QNM75ISG.cjs → dist-ALHZKHK6.cjs} +9 -9
- package/dist/{dist-QNM75ISG.cjs.map → dist-ALHZKHK6.cjs.map} +1 -1
- package/dist/{dist-KXHZV6E4.js → dist-HPW4UI62.js} +9 -9
- package/dist/{dist-KXHZV6E4.js.map → dist-HPW4UI62.js.map} +1 -1
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/scorers/code/index.cjs +2 -2
- package/dist/scorers/code/index.js +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts +34 -0
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-similarity/prompts.d.ts +29 -0
- package/dist/scorers/llm/answer-similarity/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/index.cjs +335 -68
- package/dist/scorers/llm/index.cjs.map +1 -1
- package/dist/scorers/llm/index.d.ts +1 -0
- package/dist/scorers/llm/index.d.ts.map +1 -1
- package/dist/scorers/llm/index.js +291 -27
- package/dist/scorers/llm/index.js.map +1 -1
- package/dist/scorers/utils.cjs +60 -0
- package/dist/scorers/utils.cjs.map +1 -0
- package/dist/scorers/utils.d.ts +1 -1
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +3 -0
- package/dist/scorers/utils.js.map +1 -0
- package/package.json +14 -4
- package/dist/chunk-4LRZVFXR.js.map +0 -1
- package/dist/chunk-EKSPLMYP.cjs.map +0 -1
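Before the hunks themselves, here is the arithmetic behind the new scorer's generateScore step (added in the scorers/llm/index.js hunk below) as a standalone TypeScript sketch. It mirrors the logic and the ANSWER_SIMILARITY_DEFAULT_OPTIONS values visible in the diff; the match counts in the example are hypothetical, and the rounding helper is an assumed equivalent of roundToTwoDecimals.

```ts
// Standalone sketch of the generateScore arithmetic added in the diff below;
// not the package's export. Defaults mirror ANSWER_SIMILARITY_DEFAULT_OPTIONS.
type MatchType = "exact" | "semantic" | "partial" | "missing";

interface SimilarityAnalysis {
  matches: { matchType: MatchType }[];
  extraInOutput: string[];
  contradictions: unknown[];
}

const DEFAULTS = {
  semanticThreshold: 0.8,
  exactMatchBonus: 0.2,
  missingPenalty: 0.15,
  contradictionPenalty: 1,
  extraInfoPenalty: 0.05,
  scale: 1,
};

function answerSimilarityScore(analysis: SimilarityAnalysis, opts = DEFAULTS): number {
  const totalUnits = analysis.matches.length;
  if (totalUnits === 0) return 0;

  let score = 0;
  for (const match of analysis.matches) {
    if (match.matchType === "exact") score += 1 + opts.exactMatchBonus;
    else if (match.matchType === "semantic") score += opts.semanticThreshold;
    else if (match.matchType === "partial") score += opts.semanticThreshold * 0.5;
    else score -= opts.missingPenalty; // "missing"
  }

  // Normalize against the best case (every unit an exact match), then apply penalties.
  score /= totalUnits * (1 + opts.exactMatchBonus);
  score -= analysis.contradictions.length * opts.contradictionPenalty;
  score -= Math.min(analysis.extraInOutput.length * opts.extraInfoPenalty, 0.2); // extra-info cap

  score = Math.max(0, Math.min(1, score));
  return Math.round(score * opts.scale * 100) / 100; // assumed equivalent of roundToTwoDecimals
}

// Hypothetical analysis: one exact, one semantic, one missing unit, plus one extra statement.
// (1.2 + 0.8 - 0.15) / 3.6 - 0.05 ≈ 0.46
console.log(
  answerSimilarityScore({
    matches: [{ matchType: "exact" }, { matchType: "semantic" }, { matchType: "missing" }],
    extraInOutput: ["unrequested detail"],
    contradictions: [],
  })
);
```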
package/dist/scorers/llm/index.d.ts.map
@@ -1 +1 @@
-{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC"}
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../../src/scorers/llm/index.ts"],"names":[],"mappings":"AAAA,cAAc,oBAAoB,CAAC;AACnC,cAAc,qBAAqB,CAAC;AACpC,cAAc,gBAAgB,CAAC;AAC/B,cAAc,QAAQ,CAAC;AACvB,cAAc,iBAAiB,CAAC;AAChC,cAAc,YAAY,CAAC;AAC3B,cAAc,sBAAsB,CAAC;AACrC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,qBAAqB,CAAC;AACpC,cAAc,oBAAoB,CAAC"}
package/dist/scorers/llm/index.js
@@ -1,5 +1,5 @@
 import { roundToTwoDecimals } from '../../chunk-QTWX6TKR.js';
-import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls, getCombinedSystemPrompt } from '../../chunk-
+import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls, getCombinedSystemPrompt } from '../../chunk-KHEXN75Q.js';
 import { createScorer } from '@mastra/core/scores';
 import { z } from 'zod';
 
@@ -264,6 +264,270 @@ function createAnswerRelevancyScorer({
   });
 }
 
+// src/scorers/llm/answer-similarity/prompts.ts
+var createExtractPrompt2 = ({ output, groundTruth }) => `
+Extract and normalize the semantic units (facts, claims, concepts) from both the agent output and the ground truth answer.
+
+Break down each text into its core semantic components while preserving meaning and relationships.
+Focus on extracting:
+- Key facts and claims
+- Important concepts and entities
+- Relationships between concepts
+- Quantitative information
+- Qualitative descriptions
+
+Guidelines:
+- Preserve the semantic meaning, not just keywords
+- Group related information together
+- Normalize different phrasings of the same concept
+- Keep numerical values and units together
+- Don't over-split compound concepts that belong together
+
+Return ONLY valid JSON with two arrays of semantic units. Do not include any text before or after the JSON.
+
+Agent Output:
+${output}
+
+Ground Truth:
+${groundTruth}
+
+Required JSON format (return valid JSON only):
+{
+"outputUnits": [],
+"groundTruthUnits": []
+}
+
+Important: Return valid JSON only, no additional text or explanations.
+`;
+var createAnalyzePrompt = ({
+  outputUnits,
+  groundTruthUnits
+}) => `
+Compare the semantic units from the agent output against the ground truth to evaluate answer similarity.
+
+Analyze each ground truth unit and determine:
+1. Whether it has a matching unit in the output (exact or semantic match)
+2. The quality of the match (exact, semantic, partial, missing)
+3. Whether there are contradictions
+
+Also identify:
+- Extra information in the output not present in ground truth
+- Any contradictory statements between output and ground truth
+
+Matching Guidelines:
+- "exact": The same information expressed identically or with minor wording differences
+- "semantic": The same concept or fact expressed differently but with equivalent meaning
+- "partial": Some overlap but missing important details or context
+- "missing": No corresponding information found in the output
+- "contradiction": Information that directly conflicts with the ground truth (wrong facts, incorrect names, false claims)
+
+CRITICAL: If the output contains factually incorrect information (wrong names, wrong facts, opposite claims), you MUST identify contradictions and mark relevant matches as "missing" while adding entries to the contradictions array.
+
+Return ONLY valid JSON with detailed analysis. Do not include any text before or after the JSON.
+
+Output Units:
+${JSON.stringify(outputUnits, null, 2)}
+
+Ground Truth Units:
+${JSON.stringify(groundTruthUnits, null, 2)}
+
+Required JSON format (copy this structure exactly):
+{
+"matches": [
+{
+"groundTruthUnit": "unit from ground truth",
+"outputUnit": "corresponding unit from output or null if missing",
+"matchType": "exact",
+"explanation": "brief explanation of the match quality"
+}
+],
+"extraInOutput": [],
+"contradictions": []
+}
+
+Important:
+- matchType must be exactly one of: "exact", "semantic", "partial", "missing"
+- outputUnit must be a string or null (not undefined)
+- All arrays must be present even if empty
+- Return valid JSON only, no additional text
+`;
+var createReasonPrompt2 = ({
+  output,
+  groundTruth,
+  score,
+  analysis,
+  scale
+}) => `
+Generate a clear, actionable explanation of the answer similarity score.
+
+Context:
+- Agent Output: ${output}
+- Ground Truth: ${groundTruth}
+- Score: ${score}/${scale}
+- Analysis: ${JSON.stringify(analysis, null, 2)}
+
+Provide a concise explanation that:
+1. States the overall similarity level (high/moderate/low)
+2. Highlights what the agent got right
+3. Identifies key missing or incorrect information
+4. Suggests specific improvements if score is not perfect
+
+Keep the explanation under 3 sentences and focus on actionable insights.
+
+Format: "The score is {score}/{scale} because {explanation}. {what matched well}. {what needs improvement or is perfect}."
+
+Example good responses:
+- "The score is 0.9/1 because the answer captures all key concepts with minor phrasing differences. The agent correctly identified the main facts and relationships. Only missing a minor detail about the specific date mentioned in the ground truth."
+- "The score is 0.5/1 because the answer is partially correct but missing crucial information. The agent correctly explained the basic concept. However, it missed the quantitative data and specific examples that were essential to the complete answer."
+- "The score is 1.0/1 because the answer perfectly matches the ground truth semantically. All key facts, relationships, and details are accurately represented. No improvements needed."
+`;
+
+// src/scorers/llm/answer-similarity/index.ts
+var ANSWER_SIMILARITY_DEFAULT_OPTIONS = {
+  requireGroundTruth: true,
+  semanticThreshold: 0.8,
+  exactMatchBonus: 0.2,
+  missingPenalty: 0.15,
+  contradictionPenalty: 1,
+  extraInfoPenalty: 0.05,
+  scale: 1
+};
+var ANSWER_SIMILARITY_INSTRUCTIONS = `
+You are a precise answer similarity evaluator for CI/CD testing. Your role is to compare agent outputs against ground truth answers to ensure consistency and accuracy in automated testing.
+
+Key Principles:
+1. Focus on semantic equivalence, not just string matching
+2. Recognize that different phrasings can convey the same information
+3. Identify missing critical information from the ground truth
+4. Detect contradictions between output and ground truth
+5. Provide actionable feedback for improving answer accuracy
+6. Be strict but fair - partial credit for partial matches
+`;
+var extractOutputSchema2 = z.object({
+  outputUnits: z.array(z.string()),
+  groundTruthUnits: z.array(z.string())
+});
+var analyzeOutputSchema = z.object({
+  matches: z.array(
+    z.object({
+      groundTruthUnit: z.string(),
+      outputUnit: z.string().nullable(),
+      matchType: z.enum(["exact", "semantic", "partial", "missing"]),
+      explanation: z.string()
+    })
+  ),
+  extraInOutput: z.array(z.string()),
+  contradictions: z.array(
+    z.object({
+      outputUnit: z.string(),
+      groundTruthUnit: z.string(),
+      explanation: z.string()
+    })
+  )
+});
+function createAnswerSimilarityScorer({
+  model,
+  options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
+}) {
+  const mergedOptions = { ...ANSWER_SIMILARITY_DEFAULT_OPTIONS, ...options };
+  return createScorer({
+    name: "Answer Similarity Scorer",
+    description: "Evaluates how similar an agent output is to a ground truth answer for CI/CD testing",
+    judge: {
+      model,
+      instructions: ANSWER_SIMILARITY_INSTRUCTIONS
+    }
+  }).preprocess({
+    description: "Extract semantic units from output and ground truth",
+    outputSchema: extractOutputSchema2,
+    createPrompt: ({ run }) => {
+      if (!run.groundTruth) {
+        if (mergedOptions.requireGroundTruth) {
+          throw new Error("Answer Similarity Scorer requires ground truth to be provided");
+        }
+        return createExtractPrompt2({
+          output: "",
+          groundTruth: ""
+        });
+      }
+      const output = getAssistantMessageFromRunOutput(run.output) ?? "";
+      const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+      return createExtractPrompt2({
+        output,
+        groundTruth
+      });
+    }
+  }).analyze({
+    description: "Compare semantic units between output and ground truth",
+    outputSchema: analyzeOutputSchema,
+    createPrompt: ({ results }) => {
+      const outputUnits = results.preprocessStepResult?.outputUnits || [];
+      const groundTruthUnits = results.preprocessStepResult?.groundTruthUnits || [];
+      return createAnalyzePrompt({
+        outputUnits,
+        groundTruthUnits
+      });
+    }
+  }).generateScore(({ run, results }) => {
+    if (!run.groundTruth) {
+      return 0;
+    }
+    const analysis = results.analyzeStepResult;
+    if (!analysis) {
+      return 0;
+    }
+    let score = 0;
+    const totalUnits = analysis.matches.length;
+    if (totalUnits === 0) {
+      return 0;
+    }
+    for (const match of analysis.matches) {
+      switch (match.matchType) {
+        case "exact":
+          score += 1 + mergedOptions.exactMatchBonus;
+          break;
+        case "semantic":
+          score += mergedOptions.semanticThreshold;
+          break;
+        case "partial":
+          score += mergedOptions.semanticThreshold * 0.5;
+          break;
+        case "missing":
+          score -= mergedOptions.missingPenalty;
+          break;
+      }
+    }
+    const maxPossibleScore = totalUnits * (1 + mergedOptions.exactMatchBonus);
+    score = score / maxPossibleScore;
+    const contradictionPenalty = analysis.contradictions.length * mergedOptions.contradictionPenalty;
+    score -= contradictionPenalty;
+    const extraInfoPenalty = Math.min(
+      analysis.extraInOutput.length * mergedOptions.extraInfoPenalty,
+      0.2
+      // Cap extra info penalty at 0.2
+    );
+    score -= extraInfoPenalty;
+    score = Math.max(0, Math.min(1, score));
+    return roundToTwoDecimals(score * mergedOptions.scale);
+  }).generateReason({
+    description: "Generate explanation of similarity score",
+    createPrompt: ({ run, results, score }) => {
+      if (!run.groundTruth) {
+        return "No ground truth was provided for comparison. Score is 0 by default.";
+      }
+      const output = getAssistantMessageFromRunOutput(run.output) ?? "";
+      const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
+      return createReasonPrompt2({
+        output,
+        groundTruth,
+        score,
+        analysis: results.analyzeStepResult,
+        scale: mergedOptions.scale
+      });
+    }
+  });
+}
+
 // src/scorers/llm/faithfulness/prompts.ts
 var FAITHFULNESS_AGENT_INSTRUCTIONS = `You are a precise and thorough faithfulness evaluator. Your job is to determine if LLM outputs are factually consistent with the provided context, focusing on claim verification.
 
@@ -1016,7 +1280,7 @@ OUTPUT REQUIREMENTS:
 
 You excel at identifying the difference between tools that directly serve the user's stated need versus tools that might be generally useful but weren't requested.
 `;
-var
+var createAnalyzePrompt2 = ({
 userInput,
 agentResponse,
 toolsCalled,
@@ -1067,7 +1331,7 @@ STRICT EVALUATION CRITERIA:
 Evaluate each tool that was called, or if no tools were called, evaluate whether that was the right decision.
 `;
 };
-var
+var createReasonPrompt3 = ({
 userInput,
 score,
 evaluations,
@@ -1086,7 +1350,7 @@ Provide a single, concise sentence explaining why this score was given.
 };
 
 // src/scorers/llm/tool-call-accuracy/index.ts
-var
+var analyzeOutputSchema2 = z.object({
 evaluations: z.array(
 z.object({
 toolCalled: z.string(),
@@ -1119,12 +1383,12 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 };
 }).analyze({
 description: "Analyze the appropriateness of tool selections",
-outputSchema:
+outputSchema: analyzeOutputSchema2,
 createPrompt: ({ run, results }) => {
 const userInput = getUserMessageFromRunInput(run.input) ?? "";
 const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
 const toolsCalled = results.preprocessStepResult?.actualTools || [];
-return
+return createAnalyzePrompt2({
 userInput,
 agentResponse,
 toolsCalled,
@@ -1146,7 +1410,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
 const userInput = getUserMessageFromRunInput(run.input) ?? "";
 const evaluations = results.analyzeStepResult?.evaluations || [];
 const missingTools = results.analyzeStepResult?.missingTools || [];
-return
+return createReasonPrompt3({
 userInput,
 score,
 evaluations,
@@ -1173,7 +1437,7 @@ Evaluation Guidelines:
 - Consider whether missing context might have led to a better response
 
 Be thorough and fair in your evaluation, considering both what context was provided and what might have been more useful.`;
-function
+function createAnalyzePrompt3({
 userQuery,
 agentResponse,
 providedContext
@@ -1258,7 +1522,7 @@ Context:
 "overallAssessment": "The context is mostly high-quality with 2 out of 3 pieces being highly relevant and used in the response"
 }`;
 }
-function
+function createReasonPrompt4({
 userQuery,
 score,
 evaluations,
@@ -1304,7 +1568,7 @@ Example responses:
 }
 
 // src/scorers/llm/context-relevance/index.ts
-var
+var analyzeOutputSchema3 = z.object({
 evaluations: z.array(
 z.object({
 context_index: z.number(),
@@ -1344,19 +1608,19 @@ function createContextRelevanceScorerLLM({
 }
 }).analyze({
 description: "Analyze the relevance and utility of provided context",
-outputSchema:
+outputSchema: analyzeOutputSchema3,
 createPrompt: ({ run }) => {
 const userQuery = getUserMessageFromRunInput(run.input) ?? "";
 const agentResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
 const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
 if (context.length === 0) {
-return
+return createAnalyzePrompt3({
 userQuery,
 agentResponse,
 providedContext: ["[No context was provided for evaluation]"]
 });
 }
-return
+return createAnalyzePrompt3({
 userQuery,
 agentResponse,
 providedContext: context
@@ -1406,7 +1670,7 @@ function createContextRelevanceScorerLLM({
 }
 const evaluations = results.analyzeStepResult?.evaluations || [];
 const missingContext = results.analyzeStepResult?.missingContext || [];
-return
+return createReasonPrompt4({
 userQuery,
 score,
 evaluations,
@@ -1639,7 +1903,7 @@ Noise Impact Assessment:
 - **Severe Impact (0.0-0.1)**: Response is substantially worse, incorrect, or completely derailed
 
 Be thorough in comparing both responses and identifying specific ways the noise affected the agent's performance.`;
-function
+function createAnalyzePrompt4({
 userQuery,
 baselineResponse,
 noisyQuery,
@@ -1758,7 +2022,7 @@ Noisy Response: "Regular exercise improves cardiovascular health and strengthens
 "robustnessScore": 0.85
 }`;
 }
-function
+function createReasonPrompt5({
 userQuery,
 score,
 dimensions,
@@ -1813,7 +2077,7 @@ Example responses:
 }
 
 // src/scorers/llm/noise-sensitivity/index.ts
-var
+var analyzeOutputSchema4 = z.object({
 dimensions: z.array(
 z.object({
 dimension: z.string(),
@@ -1857,14 +2121,14 @@ function createNoiseSensitivityScorerLLM({
 }
 }).analyze({
 description: "Analyze the impact of noise on agent response quality",
-outputSchema:
+outputSchema: analyzeOutputSchema4,
 createPrompt: ({ run }) => {
 const originalQuery = getUserMessageFromRunInput(run.input) ?? "";
 const noisyResponse = getAssistantMessageFromRunOutput(run.output) ?? "";
 if (!originalQuery || !noisyResponse) {
 throw new Error("Both original query and noisy response are required for evaluation");
 }
-return
+return createAnalyzePrompt4({
 userQuery: originalQuery,
 baselineResponse: options.baselineResponse,
 noisyQuery: options.noisyQuery,
@@ -1912,7 +2176,7 @@ function createNoiseSensitivityScorerLLM({
 if (!analysisResult) {
 throw new Error("Analysis step failed to produce results for reason generation");
 }
-return
+return createReasonPrompt5({
 userQuery: originalQuery,
 score,
 dimensions: analysisResult.dimensions || [],
@@ -1941,7 +2205,7 @@ Evaluation Guidelines:
 - Be objective and focus on alignment rather than response quality
 
 Score each dimension from 0.0 (completely misaligned) to 1.0 (perfectly aligned).`;
-function
+function createAnalyzePrompt5({
 userPrompt,
 systemPrompt,
 agentResponse,
@@ -2093,7 +2357,7 @@ Agent Response: "def factorial(n):
 "overallAssessment": "The response perfectly aligns with the prompt, providing a correct Python factorial function with the requested error handling for negative numbers"
 }`;
 }
-function
+function createReasonPrompt6({
 userPrompt,
 systemPrompt,
 score,
@@ -2156,7 +2420,7 @@ Example responses:
 }
 
 // src/scorers/llm/prompt-alignment/index.ts
-var
+var analyzeOutputSchema5 = z.object({
 intentAlignment: z.object({
 score: z.number().min(0).max(1),
 primaryIntent: z.string(),
@@ -2228,7 +2492,7 @@ function createPromptAlignmentScorerLLM({
 }
 }).analyze({
 description: "Analyze prompt-response alignment across multiple dimensions",
-outputSchema:
+outputSchema: analyzeOutputSchema5,
 createPrompt: ({ run }) => {
 const userPrompt = getUserMessageFromRunInput(run.input) ?? "";
 const systemPrompt = getCombinedSystemPrompt(run.input) ?? "";
@@ -2245,7 +2509,7 @@ function createPromptAlignmentScorerLLM({
 if (!agentResponse) {
 throw new Error("Agent response is required for prompt alignment scoring");
 }
-return
+return createAnalyzePrompt5({
 userPrompt,
 systemPrompt,
 agentResponse,
@@ -2278,7 +2542,7 @@ function createPromptAlignmentScorerLLM({
 if (!analysis) {
 return `Unable to analyze prompt alignment. Score: ${score}`;
 }
-return
+return createReasonPrompt6({
 userPrompt,
 systemPrompt,
 score,
@@ -2290,6 +2554,6 @@ function createPromptAlignmentScorerLLM({
 });
 }
 
-export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
+export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map