npm - @mastra/evals - Versions diffs - 0.10.5 → 0.10.6 - Mend

@mastra/evals 0.10.5 → 0.10.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23) hide show

package/LICENSE.md +11 -42
package/README.md +0 -7
package/dist/_tsup-dts-rollup.d.cts +217 -0
package/dist/_tsup-dts-rollup.d.ts +217 -0
package/dist/chunk-2JVD5IX6.cjs +8 -0
package/dist/chunk-UYXFD4VX.js +6 -0
package/dist/{dist-M6SH7RKY.js → dist-5JXLPLM2.js} +8 -8
package/dist/{dist-HYT46G4X.cjs → dist-IVAARSAW.cjs} +8 -8
package/dist/index.cjs +1 -1
package/dist/index.js +1 -1
package/dist/{magic-string.es-WF7K5PCM.cjs → magic-string.es-66FD77JZ.cjs} +7 -13
package/dist/{magic-string.es-2DLRP5BO.js → magic-string.es-LD4FLE5J.js} +7 -13
package/dist/metrics/llm/index.cjs +13 -17
package/dist/metrics/llm/index.js +2 -6
package/dist/scorers/code/index.cjs +220 -0
package/dist/scorers/code/index.d.cts +4 -0
package/dist/scorers/code/index.d.ts +4 -0
package/dist/scorers/code/index.js +209 -0
package/dist/scorers/llm/index.cjs +1036 -0
package/dist/scorers/llm/index.d.cts +11 -0
package/dist/scorers/llm/index.d.ts +11 -0
package/dist/scorers/llm/index.js +1028 -0
package/package.json +28 -8

package/dist/metrics/llm/index.cjs CHANGED Viewed

@@ -1,14 +1,10 @@
 'use strict';
 var chunkCOBCYVZ7_cjs = require('../../chunk-COBCYVZ7.cjs');
+var chunk2JVD5IX6_cjs = require('../../chunk-2JVD5IX6.cjs');
 var _eval = require('@mastra/core/eval');
 var zod = require('zod');
-// src/metrics/llm/utils.ts
-var roundToTwoDecimals = (num) => {
-  return Math.round((num + Number.EPSILON) * 100) / 100;
-};
 // src/metrics/llm/answer-relevancy/prompts.ts
 var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = `You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.
@@ -187,7 +183,7 @@ function generateReasonPrompt({
   output,
   scale
 }) {
-  return `Explain the irrelevancy score where 0 is the lowest and ${scale} is the highest for the LLM's response using this context:
+  return `Explain the relevancy score where 0 is the lowest and ${scale} is the highest for the LLM's response using this context:
     Context:
     Input: ${input}
     Output: ${output}
@@ -288,7 +284,7 @@ var AnswerRelevancyMetric = class extends _eval.Metric {
       }
     }
     const score = relevancyCount / numberOfVerdicts;
-    return roundToTwoDecimals(score * this.scale);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
   }
 };
@@ -492,7 +488,7 @@ var ContextPositionMetric = class extends _eval.Metric {
       return 0;
     }
     const finalScore = weightedSum / maxPossibleSum * this.scale;
-    return roundToTwoDecimals(finalScore);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(finalScore);
   }
 };
@@ -700,7 +696,7 @@ var ContextPrecisionMetric = class extends _eval.Metric {
       return 0;
     }
     const finalScore = weightedPrecisionSum / relevantCount;
-    return roundToTwoDecimals(finalScore * this.scale);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(finalScore * this.scale);
   }
 };
@@ -938,7 +934,7 @@ var FaithfulnessMetric = class extends _eval.Metric {
       return 0;
     }
     const score = supportedClaims / totalClaims * this.scale;
-    return roundToTwoDecimals(score);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(score);
   }
 };
@@ -1155,7 +1151,7 @@ var HallucinationMetric = class extends _eval.Metric {
       return 0;
     }
     const score = contradictedStatements / totalStatements * this.scale;
-    return roundToTwoDecimals(score);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(score);
   }
 };
@@ -1459,7 +1455,7 @@ var PromptAlignmentMetric = class extends _eval.Metric {
       },
       { naCount: 0, alignmentCount: 0, applicableCount: 0 }
     );
-    const score = counts.applicableCount > 0 ? roundToTwoDecimals(counts.alignmentCount / counts.applicableCount * this.scale) : 0;
+    const score = counts.applicableCount > 0 ? chunk2JVD5IX6_cjs.roundToTwoDecimals(counts.alignmentCount / counts.applicableCount * this.scale) : 0;
     return {
       score,
       totalInstructions,
@@ -1619,7 +1615,7 @@ var ToxicityMetric = class extends _eval.Metric {
       }
     }
     const score = toxicityCount / numberOfVerdicts;
-    return roundToTwoDecimals(score * this.scale);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
   }
 };
@@ -1795,7 +1791,7 @@ var ContextRelevancyMetric = class extends _eval.Metric {
     }
     const relevantVerdicts = verdicts.filter((v) => v.verdict.toLowerCase() === "yes");
     const score = relevantVerdicts.length / totalVerdicts;
-    return roundToTwoDecimals(score * this.scale);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
   }
 };
@@ -1941,7 +1937,7 @@ var ContextualRecallMetric = class extends _eval.Metric {
     }
     const justifiedVerdicts = verdicts.filter((v) => v.verdict === "yes");
     const score = justifiedVerdicts.length / totalVerdicts;
-    return roundToTwoDecimals(score * this.scale);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
   }
 };
@@ -2288,7 +2284,7 @@ var SummarizationMetric = class extends _eval.Metric {
       }
     }
     const score = positiveCount / numberOfVerdicts;
-    return roundToTwoDecimals(score * this.scale);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
   }
 };
@@ -2466,7 +2462,7 @@ var BiasMetric = class extends _eval.Metric {
     }
     const biasedVerdicts = evaluation.filter((v) => v.verdict.toLowerCase() === "yes");
     const score = biasedVerdicts.length / numberOfVerdicts;
-    return roundToTwoDecimals(score * this.scale);
+    return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
   }
 };

package/dist/metrics/llm/index.js CHANGED Viewed

@@ -1,12 +1,8 @@
 import { MastraAgentJudge } from '../../chunk-TXXJUIES.js';
+import { roundToTwoDecimals } from '../../chunk-UYXFD4VX.js';
 import { Metric } from '@mastra/core/eval';
 import { z } from 'zod';
-// src/metrics/llm/utils.ts
-var roundToTwoDecimals = (num) => {
-  return Math.round((num + Number.EPSILON) * 100) / 100;
-};
 // src/metrics/llm/answer-relevancy/prompts.ts
 var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = `You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.
@@ -185,7 +181,7 @@ function generateReasonPrompt({
   output,
   scale
 }) {
-  return `Explain the irrelevancy score where 0 is the lowest and ${scale} is the highest for the LLM's response using this context:
+  return `Explain the relevancy score where 0 is the lowest and ${scale} is the highest for the LLM's response using this context:
     Context:
     Input: ${input}
     Output: ${output}

package/dist/scorers/code/index.cjs ADDED Viewed

@@ -0,0 +1,220 @@
+'use strict';
+var scores = require('@mastra/core/scores');
+var nlp = require('compromise');
+var difflib = require('difflib');
+var keyword_extractor = require('keyword-extractor');
+var stringSimilarity = require('string-similarity');
+function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
+var nlp__default = /*#__PURE__*/_interopDefault(nlp);
+var keyword_extractor__default = /*#__PURE__*/_interopDefault(keyword_extractor);
+var stringSimilarity__default = /*#__PURE__*/_interopDefault(stringSimilarity);
+function normalizeString(str) {
+  return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
+}
+function extractElements(doc) {
+  const nouns = doc.nouns().out("array") || [];
+  const verbs = doc.verbs().toInfinitive().out("array") || [];
+  const topics = doc.topics().out("array") || [];
+  const terms = doc.terms().out("array") || [];
+  const cleanAndSplitTerm = (term) => {
+    const normalized = normalizeString(term);
+    return normalized.replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]+/g, " ").trim().split(/\s+/).filter((word) => word.length > 0);
+  };
+  const processedTerms = [
+    ...nouns.flatMap(cleanAndSplitTerm),
+    ...verbs.flatMap(cleanAndSplitTerm),
+    ...topics.flatMap(cleanAndSplitTerm),
+    ...terms.flatMap(cleanAndSplitTerm)
+  ];
+  return [...new Set(processedTerms)];
+}
+function calculateCoverage({ original, simplified }) {
+  if (original.length === 0) {
+    return simplified.length === 0 ? 1 : 0;
+  }
+  const covered = original.filter(
+    (element) => simplified.some((s) => {
+      const elem = normalizeString(element);
+      const simp = normalizeString(s);
+      if (elem.length <= 3) {
+        return elem === simp;
+      }
+      const longer = elem.length > simp.length ? elem : simp;
+      const shorter = elem.length > simp.length ? simp : elem;
+      if (longer.includes(shorter)) {
+        return shorter.length / longer.length > 0.6;
+      }
+      return false;
+    })
+  );
+  return covered.length / original.length;
+}
+function createCompletenessScorer() {
+  return scores.createScorer({
+    name: "Completeness",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    extract: async (run) => {
+      const isInputInvalid = !run.input || run.input.some((i) => i.content === null || i.content === void 0);
+      const isOutputInvalid = !run.output || run.output.text === null || run.output.text === void 0;
+      if (isInputInvalid || isOutputInvalid) {
+        throw new Error("Inputs cannot be null or undefined");
+      }
+      const input = run.input.map((i) => i.content).join(", ");
+      const output = run.output.text;
+      const inputToProcess = input;
+      const outputToProcess = output;
+      const inputDoc = nlp__default.default(inputToProcess.trim());
+      const outputDoc = nlp__default.default(outputToProcess.trim());
+      const inputElements = extractElements(inputDoc);
+      const outputElements = extractElements(outputDoc);
+      return {
+        result: {
+          inputElements,
+          outputElements,
+          missingElements: inputElements.filter((e) => !outputElements.includes(e)),
+          elementCounts: {
+            input: inputElements.length,
+            output: outputElements.length
+          }
+        }
+      };
+    },
+    analyze: async (run) => {
+      const inputElements = run.extractStepResult?.inputElements;
+      const outputElements = run.extractStepResult?.outputElements;
+      return {
+        score: calculateCoverage({
+          original: inputElements,
+          simplified: outputElements
+        })
+      };
+    }
+  });
+}
+function createTextualDifferenceScorer() {
+  return scores.createScorer({
+    name: "Completeness",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    analyze: async (run) => {
+      const input = run.input.map((i) => i.content).join(", ");
+      const output = run.output.text;
+      const matcher = new difflib.SequenceMatcher(null, input, output);
+      const ratio = matcher.ratio();
+      const ops = matcher.getOpcodes();
+      const changes = ops.filter(([op]) => op !== "equal").length;
+      const maxLength = Math.max(input.length, output.length);
+      const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
+      const confidence = 1 - lengthDiff;
+      return {
+        score: ratio,
+        result: {
+          confidence,
+          changes,
+          lengthDiff
+        }
+      };
+    }
+  });
+}
+function createKeywordCoverageScorer() {
+  return scores.createScorer({
+    name: "Completeness",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    extract: async (run) => {
+      const input = run.input.map((i) => i.content).join(", ");
+      const output = run.output.text;
+      if (!input && !output) {
+        return {
+          result: {
+            referenceKeywords: /* @__PURE__ */ new Set(),
+            responseKeywords: /* @__PURE__ */ new Set()
+          }
+        };
+      }
+      const extractKeywords = (text) => {
+        return keyword_extractor__default.default.extract(text, {
+          language: "english",
+          remove_digits: true,
+          return_changed_case: true,
+          remove_duplicates: true
+        });
+      };
+      const referenceKeywords = new Set(extractKeywords(input));
+      const responseKeywords = new Set(extractKeywords(output));
+      return {
+        result: {
+          referenceKeywords,
+          responseKeywords
+        }
+      };
+    },
+    analyze: async (run) => {
+      if (!run.extractStepResult?.referenceKeywords.size && !run.extractStepResult?.responseKeywords.size) {
+        return {
+          score: 1,
+          result: {
+            totalKeywords: 0,
+            matchedKeywords: 0
+          }
+        };
+      }
+      const matchedKeywords = [...run.extractStepResult?.referenceKeywords].filter(
+        (k) => run.extractStepResult?.responseKeywords.has(k)
+      );
+      const totalKeywords = run.extractStepResult?.referenceKeywords.size;
+      const coverage = totalKeywords > 0 ? matchedKeywords.length / totalKeywords : 0;
+      return {
+        score: coverage,
+        result: {
+          totalKeywords: run.extractStepResult?.referenceKeywords.size,
+          matchedKeywords: matchedKeywords.length
+        }
+      };
+    }
+  });
+}
+function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { ignoreCase: true, ignoreWhitespace: true }) {
+  return scores.createScorer({
+    name: "Completeness",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    extract: async (run) => {
+      let processedInput = run.input.map((i) => i.content).join(", ");
+      let processedOutput = run.output.text;
+      if (ignoreCase) {
+        processedInput = processedInput.toLowerCase();
+        processedOutput = processedOutput.toLowerCase();
+      }
+      if (ignoreWhitespace) {
+        processedInput = processedInput.replace(/\s+/g, " ").trim();
+        processedOutput = processedOutput.replace(/\s+/g, " ").trim();
+      }
+      return {
+        result: {
+          processedInput,
+          processedOutput
+        }
+      };
+    },
+    analyze: async (run) => {
+      const similarity = stringSimilarity__default.default.compareTwoStrings(
+        run.extractStepResult?.processedInput,
+        run.extractStepResult?.processedOutput
+      );
+      return {
+        score: similarity,
+        result: {
+          similarity
+        }
+      };
+    }
+  });
+}
+exports.createCompletenessScorer = createCompletenessScorer;
+exports.createContentSimilarityScorer = createContentSimilarityScorer;
+exports.createKeywordCoverageScorer = createKeywordCoverageScorer;
+exports.createTextualDifferenceScorer = createTextualDifferenceScorer;

package/dist/scorers/code/index.d.cts ADDED Viewed

@@ -0,0 +1,4 @@
+export { createCompletenessScorer_alias_1 as createCompletenessScorer } from '../../_tsup-dts-rollup.cjs';
+export { createTextualDifferenceScorer_alias_1 as createTextualDifferenceScorer } from '../../_tsup-dts-rollup.cjs';
+export { createKeywordCoverageScorer_alias_1 as createKeywordCoverageScorer } from '../../_tsup-dts-rollup.cjs';
+export { createContentSimilarityScorer_alias_1 as createContentSimilarityScorer } from '../../_tsup-dts-rollup.cjs';

package/dist/scorers/code/index.d.ts ADDED Viewed

@@ -0,0 +1,4 @@
+export { createCompletenessScorer_alias_1 as createCompletenessScorer } from '../../_tsup-dts-rollup.js';
+export { createTextualDifferenceScorer_alias_1 as createTextualDifferenceScorer } from '../../_tsup-dts-rollup.js';
+export { createKeywordCoverageScorer_alias_1 as createKeywordCoverageScorer } from '../../_tsup-dts-rollup.js';
+export { createContentSimilarityScorer_alias_1 as createContentSimilarityScorer } from '../../_tsup-dts-rollup.js';

package/dist/scorers/code/index.js ADDED Viewed

@@ -0,0 +1,209 @@
+import { createScorer } from '@mastra/core/scores';
+import nlp from 'compromise';
+import { SequenceMatcher } from 'difflib';
+import keyword_extractor from 'keyword-extractor';
+import stringSimilarity from 'string-similarity';
+function normalizeString(str) {
+  return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
+}
+function extractElements(doc) {
+  const nouns = doc.nouns().out("array") || [];
+  const verbs = doc.verbs().toInfinitive().out("array") || [];
+  const topics = doc.topics().out("array") || [];
+  const terms = doc.terms().out("array") || [];
+  const cleanAndSplitTerm = (term) => {
+    const normalized = normalizeString(term);
+    return normalized.replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]+/g, " ").trim().split(/\s+/).filter((word) => word.length > 0);
+  };
+  const processedTerms = [
+    ...nouns.flatMap(cleanAndSplitTerm),
+    ...verbs.flatMap(cleanAndSplitTerm),
+    ...topics.flatMap(cleanAndSplitTerm),
+    ...terms.flatMap(cleanAndSplitTerm)
+  ];
+  return [...new Set(processedTerms)];
+}
+function calculateCoverage({ original, simplified }) {
+  if (original.length === 0) {
+    return simplified.length === 0 ? 1 : 0;
+  }
+  const covered = original.filter(
+    (element) => simplified.some((s) => {
+      const elem = normalizeString(element);
+      const simp = normalizeString(s);
+      if (elem.length <= 3) {
+        return elem === simp;
+      }
+      const longer = elem.length > simp.length ? elem : simp;
+      const shorter = elem.length > simp.length ? simp : elem;
+      if (longer.includes(shorter)) {
+        return shorter.length / longer.length > 0.6;
+      }
+      return false;
+    })
+  );
+  return covered.length / original.length;
+}
+function createCompletenessScorer() {
+  return createScorer({
+    name: "Completeness",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    extract: async (run) => {
+      const isInputInvalid = !run.input || run.input.some((i) => i.content === null || i.content === void 0);
+      const isOutputInvalid = !run.output || run.output.text === null || run.output.text === void 0;
+      if (isInputInvalid || isOutputInvalid) {
+        throw new Error("Inputs cannot be null or undefined");
+      }
+      const input = run.input.map((i) => i.content).join(", ");
+      const output = run.output.text;
+      const inputToProcess = input;
+      const outputToProcess = output;
+      const inputDoc = nlp(inputToProcess.trim());
+      const outputDoc = nlp(outputToProcess.trim());
+      const inputElements = extractElements(inputDoc);
+      const outputElements = extractElements(outputDoc);
+      return {
+        result: {
+          inputElements,
+          outputElements,
+          missingElements: inputElements.filter((e) => !outputElements.includes(e)),
+          elementCounts: {
+            input: inputElements.length,
+            output: outputElements.length
+          }
+        }
+      };
+    },
+    analyze: async (run) => {
+      const inputElements = run.extractStepResult?.inputElements;
+      const outputElements = run.extractStepResult?.outputElements;
+      return {
+        score: calculateCoverage({
+          original: inputElements,
+          simplified: outputElements
+        })
+      };
+    }
+  });
+}
+function createTextualDifferenceScorer() {
+  return createScorer({
+    name: "Completeness",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    analyze: async (run) => {
+      const input = run.input.map((i) => i.content).join(", ");
+      const output = run.output.text;
+      const matcher = new SequenceMatcher(null, input, output);
+      const ratio = matcher.ratio();
+      const ops = matcher.getOpcodes();
+      const changes = ops.filter(([op]) => op !== "equal").length;
+      const maxLength = Math.max(input.length, output.length);
+      const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
+      const confidence = 1 - lengthDiff;
+      return {
+        score: ratio,
+        result: {
+          confidence,
+          changes,
+          lengthDiff
+        }
+      };
+    }
+  });
+}
+function createKeywordCoverageScorer() {
+  return createScorer({
+    name: "Completeness",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    extract: async (run) => {
+      const input = run.input.map((i) => i.content).join(", ");
+      const output = run.output.text;
+      if (!input && !output) {
+        return {
+          result: {
+            referenceKeywords: /* @__PURE__ */ new Set(),
+            responseKeywords: /* @__PURE__ */ new Set()
+          }
+        };
+      }
+      const extractKeywords = (text) => {
+        return keyword_extractor.extract(text, {
+          language: "english",
+          remove_digits: true,
+          return_changed_case: true,
+          remove_duplicates: true
+        });
+      };
+      const referenceKeywords = new Set(extractKeywords(input));
+      const responseKeywords = new Set(extractKeywords(output));
+      return {
+        result: {
+          referenceKeywords,
+          responseKeywords
+        }
+      };
+    },
+    analyze: async (run) => {
+      if (!run.extractStepResult?.referenceKeywords.size && !run.extractStepResult?.responseKeywords.size) {
+        return {
+          score: 1,
+          result: {
+            totalKeywords: 0,
+            matchedKeywords: 0
+          }
+        };
+      }
+      const matchedKeywords = [...run.extractStepResult?.referenceKeywords].filter(
+        (k) => run.extractStepResult?.responseKeywords.has(k)
+      );
+      const totalKeywords = run.extractStepResult?.referenceKeywords.size;
+      const coverage = totalKeywords > 0 ? matchedKeywords.length / totalKeywords : 0;
+      return {
+        score: coverage,
+        result: {
+          totalKeywords: run.extractStepResult?.referenceKeywords.size,
+          matchedKeywords: matchedKeywords.length
+        }
+      };
+    }
+  });
+}
+function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { ignoreCase: true, ignoreWhitespace: true }) {
+  return createScorer({
+    name: "Completeness",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    extract: async (run) => {
+      let processedInput = run.input.map((i) => i.content).join(", ");
+      let processedOutput = run.output.text;
+      if (ignoreCase) {
+        processedInput = processedInput.toLowerCase();
+        processedOutput = processedOutput.toLowerCase();
+      }
+      if (ignoreWhitespace) {
+        processedInput = processedInput.replace(/\s+/g, " ").trim();
+        processedOutput = processedOutput.replace(/\s+/g, " ").trim();
+      }
+      return {
+        result: {
+          processedInput,
+          processedOutput
+        }
+      };
+    },
+    analyze: async (run) => {
+      const similarity = stringSimilarity.compareTwoStrings(
+        run.extractStepResult?.processedInput,
+        run.extractStepResult?.processedOutput
+      );
+      return {
+        score: similarity,
+        result: {
+          similarity
+        }
+      };
+    }
+  });
+}
+export { createCompletenessScorer, createContentSimilarityScorer, createKeywordCoverageScorer, createTextualDifferenceScorer };