@mastra/evals 0.14.3-alpha.0 → 1.0.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189) hide show
  1. package/CHANGELOG.md +36 -9
  2. package/README.md +19 -159
  3. package/dist/{chunk-KHEXN75Q.js → chunk-CCLM7KPF.js} +45 -21
  4. package/dist/chunk-CCLM7KPF.js.map +1 -0
  5. package/dist/{chunk-QKR2PMLZ.cjs → chunk-TPQLLHZW.cjs} +46 -21
  6. package/dist/chunk-TPQLLHZW.cjs.map +1 -0
  7. package/dist/scorers/code/completeness/index.d.ts +1 -1
  8. package/dist/scorers/code/completeness/index.d.ts.map +1 -1
  9. package/dist/scorers/code/content-similarity/index.d.ts +1 -1
  10. package/dist/scorers/code/content-similarity/index.d.ts.map +1 -1
  11. package/dist/scorers/code/keyword-coverage/index.d.ts +1 -1
  12. package/dist/scorers/code/keyword-coverage/index.d.ts.map +1 -1
  13. package/dist/scorers/code/textual-difference/index.d.ts +1 -1
  14. package/dist/scorers/code/textual-difference/index.d.ts.map +1 -1
  15. package/dist/scorers/code/tone/index.d.ts +1 -1
  16. package/dist/scorers/code/tone/index.d.ts.map +1 -1
  17. package/dist/scorers/code/tool-call-accuracy/index.d.ts +1 -1
  18. package/dist/scorers/code/tool-call-accuracy/index.d.ts.map +1 -1
  19. package/dist/scorers/llm/answer-relevancy/index.d.ts +1 -1
  20. package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
  21. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  22. package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
  23. package/dist/scorers/llm/bias/index.d.ts +2 -2
  24. package/dist/scorers/llm/bias/index.d.ts.map +1 -1
  25. package/dist/scorers/llm/context-precision/index.d.ts +3 -3
  26. package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
  27. package/dist/scorers/llm/context-relevance/index.d.ts +3 -3
  28. package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
  29. package/dist/scorers/llm/faithfulness/index.d.ts +2 -2
  30. package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
  31. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  32. package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
  33. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  34. package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
  35. package/dist/scorers/llm/prompt-alignment/index.d.ts +2 -2
  36. package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
  37. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +2 -2
  38. package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
  39. package/dist/scorers/llm/toxicity/index.d.ts +2 -2
  40. package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
  41. package/dist/scorers/{llm → prebuilt}/index.cjs +479 -62
  42. package/dist/scorers/prebuilt/index.cjs.map +1 -0
  43. package/dist/scorers/prebuilt/index.d.ts +3 -0
  44. package/dist/scorers/prebuilt/index.d.ts.map +1 -0
  45. package/dist/scorers/{llm → prebuilt}/index.js +419 -15
  46. package/dist/scorers/prebuilt/index.js.map +1 -0
  47. package/dist/scorers/utils.cjs +21 -17
  48. package/dist/scorers/utils.d.ts +21 -11
  49. package/dist/scorers/utils.d.ts.map +1 -1
  50. package/dist/scorers/utils.js +1 -1
  51. package/package.json +12 -58
  52. package/dist/attachListeners.d.ts +0 -4
  53. package/dist/attachListeners.d.ts.map +0 -1
  54. package/dist/chunk-7QAUEU4L.cjs +0 -10
  55. package/dist/chunk-7QAUEU4L.cjs.map +0 -1
  56. package/dist/chunk-EMMSS5I5.cjs +0 -37
  57. package/dist/chunk-EMMSS5I5.cjs.map +0 -1
  58. package/dist/chunk-G3PMV62Z.js +0 -33
  59. package/dist/chunk-G3PMV62Z.js.map +0 -1
  60. package/dist/chunk-IUSAD2BW.cjs +0 -19
  61. package/dist/chunk-IUSAD2BW.cjs.map +0 -1
  62. package/dist/chunk-KHEXN75Q.js.map +0 -1
  63. package/dist/chunk-QKR2PMLZ.cjs.map +0 -1
  64. package/dist/chunk-QTWX6TKR.js +0 -8
  65. package/dist/chunk-QTWX6TKR.js.map +0 -1
  66. package/dist/chunk-YGTIO3J5.js +0 -17
  67. package/dist/chunk-YGTIO3J5.js.map +0 -1
  68. package/dist/dist-LDTK3TIP.cjs +0 -16759
  69. package/dist/dist-LDTK3TIP.cjs.map +0 -1
  70. package/dist/dist-OWYZEOJK.js +0 -16737
  71. package/dist/dist-OWYZEOJK.js.map +0 -1
  72. package/dist/evaluation.d.ts +0 -8
  73. package/dist/evaluation.d.ts.map +0 -1
  74. package/dist/index.cjs +0 -93
  75. package/dist/index.cjs.map +0 -1
  76. package/dist/index.d.ts +0 -3
  77. package/dist/index.d.ts.map +0 -1
  78. package/dist/index.js +0 -89
  79. package/dist/index.js.map +0 -1
  80. package/dist/magic-string.es-7ORA5OGR.js +0 -1305
  81. package/dist/magic-string.es-7ORA5OGR.js.map +0 -1
  82. package/dist/magic-string.es-NZ2XWFKN.cjs +0 -1311
  83. package/dist/magic-string.es-NZ2XWFKN.cjs.map +0 -1
  84. package/dist/metrics/index.d.ts +0 -4
  85. package/dist/metrics/index.d.ts.map +0 -1
  86. package/dist/metrics/judge/index.cjs +0 -12
  87. package/dist/metrics/judge/index.cjs.map +0 -1
  88. package/dist/metrics/judge/index.d.ts +0 -7
  89. package/dist/metrics/judge/index.d.ts.map +0 -1
  90. package/dist/metrics/judge/index.js +0 -3
  91. package/dist/metrics/judge/index.js.map +0 -1
  92. package/dist/metrics/llm/answer-relevancy/index.d.ts +0 -16
  93. package/dist/metrics/llm/answer-relevancy/index.d.ts.map +0 -1
  94. package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts +0 -20
  95. package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts.map +0 -1
  96. package/dist/metrics/llm/answer-relevancy/prompts.d.ts +0 -19
  97. package/dist/metrics/llm/answer-relevancy/prompts.d.ts.map +0 -1
  98. package/dist/metrics/llm/bias/index.d.ts +0 -14
  99. package/dist/metrics/llm/bias/index.d.ts.map +0 -1
  100. package/dist/metrics/llm/bias/metricJudge.d.ts +0 -14
  101. package/dist/metrics/llm/bias/metricJudge.d.ts.map +0 -1
  102. package/dist/metrics/llm/bias/prompts.d.ts +0 -14
  103. package/dist/metrics/llm/bias/prompts.d.ts.map +0 -1
  104. package/dist/metrics/llm/context-position/index.d.ts +0 -16
  105. package/dist/metrics/llm/context-position/index.d.ts.map +0 -1
  106. package/dist/metrics/llm/context-position/metricJudge.d.ts +0 -20
  107. package/dist/metrics/llm/context-position/metricJudge.d.ts.map +0 -1
  108. package/dist/metrics/llm/context-position/prompts.d.ts +0 -17
  109. package/dist/metrics/llm/context-position/prompts.d.ts.map +0 -1
  110. package/dist/metrics/llm/context-precision/index.d.ts +0 -16
  111. package/dist/metrics/llm/context-precision/index.d.ts.map +0 -1
  112. package/dist/metrics/llm/context-precision/metricJudge.d.ts +0 -20
  113. package/dist/metrics/llm/context-precision/metricJudge.d.ts.map +0 -1
  114. package/dist/metrics/llm/context-precision/prompts.d.ts +0 -17
  115. package/dist/metrics/llm/context-precision/prompts.d.ts.map +0 -1
  116. package/dist/metrics/llm/context-relevancy/index.d.ts +0 -16
  117. package/dist/metrics/llm/context-relevancy/index.d.ts.map +0 -1
  118. package/dist/metrics/llm/context-relevancy/metricJudge.d.ts +0 -16
  119. package/dist/metrics/llm/context-relevancy/metricJudge.d.ts.map +0 -1
  120. package/dist/metrics/llm/context-relevancy/prompts.d.ts +0 -13
  121. package/dist/metrics/llm/context-relevancy/prompts.d.ts.map +0 -1
  122. package/dist/metrics/llm/contextual-recall/index.d.ts +0 -16
  123. package/dist/metrics/llm/contextual-recall/index.d.ts.map +0 -1
  124. package/dist/metrics/llm/contextual-recall/metricJudge.d.ts +0 -16
  125. package/dist/metrics/llm/contextual-recall/metricJudge.d.ts.map +0 -1
  126. package/dist/metrics/llm/contextual-recall/prompts.d.ts +0 -13
  127. package/dist/metrics/llm/contextual-recall/prompts.d.ts.map +0 -1
  128. package/dist/metrics/llm/faithfulness/index.d.ts +0 -16
  129. package/dist/metrics/llm/faithfulness/index.d.ts.map +0 -1
  130. package/dist/metrics/llm/faithfulness/metricJudge.d.ts +0 -22
  131. package/dist/metrics/llm/faithfulness/metricJudge.d.ts.map +0 -1
  132. package/dist/metrics/llm/faithfulness/prompts.d.ts +0 -20
  133. package/dist/metrics/llm/faithfulness/prompts.d.ts.map +0 -1
  134. package/dist/metrics/llm/hallucination/index.d.ts +0 -16
  135. package/dist/metrics/llm/hallucination/index.d.ts.map +0 -1
  136. package/dist/metrics/llm/hallucination/metricJudge.d.ts +0 -22
  137. package/dist/metrics/llm/hallucination/metricJudge.d.ts.map +0 -1
  138. package/dist/metrics/llm/hallucination/prompts.d.ts +0 -17
  139. package/dist/metrics/llm/hallucination/prompts.d.ts.map +0 -1
  140. package/dist/metrics/llm/index.cjs +0 -2481
  141. package/dist/metrics/llm/index.cjs.map +0 -1
  142. package/dist/metrics/llm/index.d.ts +0 -12
  143. package/dist/metrics/llm/index.d.ts.map +0 -1
  144. package/dist/metrics/llm/index.js +0 -2469
  145. package/dist/metrics/llm/index.js.map +0 -1
  146. package/dist/metrics/llm/prompt-alignment/index.d.ts +0 -33
  147. package/dist/metrics/llm/prompt-alignment/index.d.ts.map +0 -1
  148. package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts +0 -20
  149. package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts.map +0 -1
  150. package/dist/metrics/llm/prompt-alignment/prompts.d.ts +0 -17
  151. package/dist/metrics/llm/prompt-alignment/prompts.d.ts.map +0 -1
  152. package/dist/metrics/llm/summarization/index.d.ts +0 -19
  153. package/dist/metrics/llm/summarization/index.d.ts.map +0 -1
  154. package/dist/metrics/llm/summarization/metricJudge.d.ts +0 -34
  155. package/dist/metrics/llm/summarization/metricJudge.d.ts.map +0 -1
  156. package/dist/metrics/llm/summarization/prompts.d.ts +0 -30
  157. package/dist/metrics/llm/summarization/prompts.d.ts.map +0 -1
  158. package/dist/metrics/llm/toxicity/index.d.ts +0 -14
  159. package/dist/metrics/llm/toxicity/index.d.ts.map +0 -1
  160. package/dist/metrics/llm/toxicity/metricJudge.d.ts +0 -14
  161. package/dist/metrics/llm/toxicity/metricJudge.d.ts.map +0 -1
  162. package/dist/metrics/llm/toxicity/prompts.d.ts +0 -10
  163. package/dist/metrics/llm/toxicity/prompts.d.ts.map +0 -1
  164. package/dist/metrics/llm/types.d.ts +0 -7
  165. package/dist/metrics/llm/types.d.ts.map +0 -1
  166. package/dist/metrics/llm/utils.d.ts +0 -14
  167. package/dist/metrics/llm/utils.d.ts.map +0 -1
  168. package/dist/metrics/nlp/completeness/index.d.ts +0 -21
  169. package/dist/metrics/nlp/completeness/index.d.ts.map +0 -1
  170. package/dist/metrics/nlp/content-similarity/index.d.ts +0 -18
  171. package/dist/metrics/nlp/content-similarity/index.d.ts.map +0 -1
  172. package/dist/metrics/nlp/index.cjs +0 -203
  173. package/dist/metrics/nlp/index.cjs.map +0 -1
  174. package/dist/metrics/nlp/index.d.ts +0 -6
  175. package/dist/metrics/nlp/index.d.ts.map +0 -1
  176. package/dist/metrics/nlp/index.js +0 -190
  177. package/dist/metrics/nlp/index.js.map +0 -1
  178. package/dist/metrics/nlp/keyword-coverage/index.d.ts +0 -13
  179. package/dist/metrics/nlp/keyword-coverage/index.d.ts.map +0 -1
  180. package/dist/metrics/nlp/textual-difference/index.d.ts +0 -15
  181. package/dist/metrics/nlp/textual-difference/index.d.ts.map +0 -1
  182. package/dist/metrics/nlp/tone/index.d.ts +0 -18
  183. package/dist/metrics/nlp/tone/index.d.ts.map +0 -1
  184. package/dist/scorers/code/index.cjs +0 -329
  185. package/dist/scorers/code/index.cjs.map +0 -1
  186. package/dist/scorers/code/index.js +0 -315
  187. package/dist/scorers/code/index.js.map +0 -1
  188. package/dist/scorers/llm/index.cjs.map +0 -1
  189. package/dist/scorers/llm/index.js.map +0 -1
@@ -1,7 +1,10 @@
1
- import { roundToTwoDecimals } from '../../chunk-QTWX6TKR.js';
2
- import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals as roundToTwoDecimals$1, extractToolCalls, getCombinedSystemPrompt } from '../../chunk-KHEXN75Q.js';
3
- import { createScorer } from '@mastra/core/scores';
1
+ import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage } from '../../chunk-CCLM7KPF.js';
2
+ import { createScorer } from '@mastra/core/evals';
4
3
  import { z } from 'zod';
4
+ import nlp from 'compromise';
5
+ import keyword_extractor from 'keyword-extractor';
6
+ import stringSimilarity from 'string-similarity';
7
+ import Sentiment from 'sentiment';
5
8
 
6
9
  // src/scorers/llm/answer-relevancy/prompts.ts
7
10
  var createExtractPrompt = (output) => `
@@ -215,6 +218,7 @@ function createAnswerRelevancyScorer({
215
218
  options = DEFAULT_OPTIONS
216
219
  }) {
217
220
  return createScorer({
221
+ id: "answer-relevancy-scorer",
218
222
  name: "Answer Relevancy Scorer",
219
223
  description: "A scorer that evaluates the relevancy of an LLM output to an input",
220
224
  judge: {
@@ -432,6 +436,7 @@ function createAnswerSimilarityScorer({
432
436
  }) {
433
437
  const mergedOptions = { ...ANSWER_SIMILARITY_DEFAULT_OPTIONS, ...options };
434
438
  return createScorer({
439
+ id: "answer-similarity-scorer",
435
440
  name: "Answer Similarity Scorer",
436
441
  description: "Evaluates how similar an agent output is to a ground truth answer for CI/CD testing",
437
442
  judge: {
@@ -689,6 +694,7 @@ function createFaithfulnessScorer({
689
694
  options
690
695
  }) {
691
696
  return createScorer({
697
+ id: "faithfulness-scorer",
692
698
  name: "Faithfulness Scorer",
693
699
  description: "A scorer that evaluates the faithfulness of an LLM output to an input",
694
700
  judge: {
@@ -707,7 +713,10 @@ function createFaithfulnessScorer({
707
713
  description: "Score the relevance of the statements to the input",
708
714
  outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
709
715
  createPrompt: ({ results, run }) => {
710
- const context = options?.context ?? run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : "") ?? [];
716
+ const assistantMessage = run.output.find(({ role }) => role === "assistant");
717
+ const context = options?.context ?? assistantMessage?.content?.toolInvocations?.map(
718
+ (toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : ""
719
+ ) ?? [];
711
720
  const prompt = createFaithfulnessAnalyzePrompt({
712
721
  claims: results.preprocessStepResult || [],
713
722
  context
@@ -721,14 +730,15 @@ function createFaithfulnessScorer({
721
730
  return 0;
722
731
  }
723
732
  const score = supportedClaims / totalClaims * (options?.scale || 1);
724
- return roundToTwoDecimals$1(score);
733
+ return roundToTwoDecimals(score);
725
734
  }).generateReason({
726
735
  description: "Reason about the results",
727
736
  createPrompt: ({ run, results, score }) => {
737
+ const assistantMessage = run.output.find(({ role }) => role === "assistant");
728
738
  const prompt = createFaithfulnessReasonPrompt({
729
739
  input: getUserMessageFromRunInput(run.input) ?? "",
730
740
  output: getAssistantMessageFromRunOutput(run.output) ?? "",
731
- context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
741
+ context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
732
742
  score,
733
743
  scale: options?.scale || 1,
734
744
  verdicts: results.analyzeStepResult?.verdicts || []
@@ -847,6 +857,7 @@ ${biases.join("\n")}
847
857
  // src/scorers/llm/bias/index.ts
848
858
  function createBiasScorer({ model, options }) {
849
859
  return createScorer({
860
+ id: "bias-scorer",
850
861
  name: "Bias Scorer",
851
862
  description: "A scorer that evaluates the bias of an LLM output to an input",
852
863
  judge: {
@@ -876,7 +887,7 @@ function createBiasScorer({ model, options }) {
876
887
  }
877
888
  const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
878
889
  const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
879
- return roundToTwoDecimals$1(score * (options?.scale || 1));
890
+ return roundToTwoDecimals(score * (options?.scale || 1));
880
891
  }).generateReason({
881
892
  description: "Reason about the results",
882
893
  createPrompt: ({ score, results }) => {
@@ -1081,6 +1092,7 @@ function createHallucinationScorer({
1081
1092
  options
1082
1093
  }) {
1083
1094
  return createScorer({
1095
+ id: "hallucination-scorer",
1084
1096
  name: "Hallucination Scorer",
1085
1097
  description: "A scorer that evaluates the hallucination of an LLM output to an input",
1086
1098
  judge: {
@@ -1116,7 +1128,7 @@ function createHallucinationScorer({
1116
1128
  return 0;
1117
1129
  }
1118
1130
  const score = contradictedStatements / totalStatements * (options?.scale || 1);
1119
- return roundToTwoDecimals$1(score);
1131
+ return roundToTwoDecimals(score);
1120
1132
  }).generateReason({
1121
1133
  description: "Reason about the results",
1122
1134
  createPrompt: ({ run, results, score }) => {
@@ -1223,6 +1235,7 @@ function createToxicityScorer({
1223
1235
  options
1224
1236
  }) {
1225
1237
  return createScorer({
1238
+ id: "toxicity-scorer",
1226
1239
  name: "Toxicity Scorer",
1227
1240
  description: "A scorer that evaluates the toxicity of an LLM output to an input",
1228
1241
  judge: {
@@ -1252,7 +1265,7 @@ function createToxicityScorer({
1252
1265
  }
1253
1266
  }
1254
1267
  const score = toxicityCount / numberOfVerdicts;
1255
- return roundToTwoDecimals$1(score * (options?.scale || 1));
1268
+ return roundToTwoDecimals(score * (options?.scale || 1));
1256
1269
  }).generateReason({
1257
1270
  description: "Reason about the results",
1258
1271
  createPrompt: ({ results, score }) => {
@@ -1372,6 +1385,7 @@ var analyzeOutputSchema2 = z.object({
1372
1385
  function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1373
1386
  const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
1374
1387
  return createScorer({
1388
+ id: "llm-tool-call-accuracy-scorer",
1375
1389
  name: "Tool Call Accuracy (LLM)",
1376
1390
  description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
1377
1391
  judge: {
@@ -1413,7 +1427,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
1413
1427
  }
1414
1428
  const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
1415
1429
  const totalToolCalls = evaluations.length;
1416
- return roundToTwoDecimals$1(appropriateToolCalls / totalToolCalls);
1430
+ return roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
1417
1431
  }).generateReason({
1418
1432
  description: "Generate human-readable explanation of tool selection evaluation",
1419
1433
  createPrompt: ({ run, results, score }) => {
@@ -1610,6 +1624,7 @@ function createContextRelevanceScorerLLM({
1610
1624
  throw new Error("Context array cannot be empty if provided");
1611
1625
  }
1612
1626
  return createScorer({
1627
+ id: "context-relevance-scorer",
1613
1628
  name: "Context Relevance (LLM)",
1614
1629
  description: "Evaluates how relevant and useful the provided context was for generating the agent response",
1615
1630
  judge: {
@@ -1670,7 +1685,7 @@ function createContextRelevanceScorerLLM({
1670
1685
  const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
1671
1686
  const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
1672
1687
  const scaledScore = finalScore * (options.scale || 1);
1673
- return roundToTwoDecimals$1(scaledScore);
1688
+ return roundToTwoDecimals(scaledScore);
1674
1689
  }).generateReason({
1675
1690
  description: "Generate human-readable explanation of context relevance evaluation",
1676
1691
  createPrompt: ({ run, results, score }) => {
@@ -1833,6 +1848,7 @@ function createContextPrecisionScorer({
1833
1848
  throw new Error("Context array cannot be empty if provided");
1834
1849
  }
1835
1850
  return createScorer({
1851
+ id: "context-precision-scorer",
1836
1852
  name: "Context Precision Scorer",
1837
1853
  description: "A scorer that evaluates the relevance and precision of retrieved context nodes for generating expected outputs",
1838
1854
  judge: {
@@ -1878,7 +1894,7 @@ function createContextPrecisionScorer({
1878
1894
  }
1879
1895
  const map = sumPrecision / relevantCount;
1880
1896
  const score = map * (options.scale || 1);
1881
- return roundToTwoDecimals$1(score);
1897
+ return roundToTwoDecimals(score);
1882
1898
  }).generateReason({
1883
1899
  description: "Reason about the context precision results",
1884
1900
  createPrompt: ({ run, results, score }) => {
@@ -2125,6 +2141,7 @@ function createNoiseSensitivityScorerLLM({
2125
2141
  throw new Error("Both baselineResponse and noisyQuery are required for Noise Sensitivity scoring");
2126
2142
  }
2127
2143
  return createScorer({
2144
+ id: "noise-sensitivity-scorer",
2128
2145
  name: "Noise Sensitivity (LLM)",
2129
2146
  description: "Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information",
2130
2147
  judge: {
@@ -2180,7 +2197,7 @@ function createNoiseSensitivityScorerLLM({
2180
2197
  const majorIssues = analysisResult.majorIssues || [];
2181
2198
  const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
2182
2199
  finalScore = Math.max(0, finalScore - issuesPenalty);
2183
- return roundToTwoDecimals$1(finalScore);
2200
+ return roundToTwoDecimals(finalScore);
2184
2201
  }).generateReason({
2185
2202
  description: "Generate human-readable explanation of noise sensitivity evaluation",
2186
2203
  createPrompt: ({ run, results, score }) => {
@@ -2497,6 +2514,7 @@ function createPromptAlignmentScorerLLM({
2497
2514
  const scale = options?.scale || 1;
2498
2515
  const evaluationMode = options?.evaluationMode || "both";
2499
2516
  return createScorer({
2517
+ id: "prompt-alignment-scorer",
2500
2518
  name: "Prompt Alignment (LLM)",
2501
2519
  description: "Evaluates how well the agent response aligns with the intent and requirements of the user prompt",
2502
2520
  judge: {
@@ -2545,7 +2563,7 @@ function createPromptAlignmentScorerLLM({
2545
2563
  weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
2546
2564
  }
2547
2565
  const finalScore = weightedScore * scale;
2548
- return roundToTwoDecimals$1(finalScore);
2566
+ return roundToTwoDecimals(finalScore);
2549
2567
  }).generateReason({
2550
2568
  description: "Generate human-readable explanation of prompt alignment evaluation",
2551
2569
  createPrompt: ({ run, results, score }) => {
@@ -2566,7 +2584,393 @@ function createPromptAlignmentScorerLLM({
2566
2584
  }
2567
2585
  });
2568
2586
  }
2587
+ function normalizeString(str) {
2588
+ return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
2589
+ }
2590
+ function extractElements(doc) {
2591
+ const nouns = doc.nouns().out("array") || [];
2592
+ const verbs = doc.verbs().toInfinitive().out("array") || [];
2593
+ const topics = doc.topics().out("array") || [];
2594
+ const terms = doc.terms().out("array") || [];
2595
+ const cleanAndSplitTerm = (term) => {
2596
+ const normalized = normalizeString(term);
2597
+ return normalized.replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]+/g, " ").trim().split(/\s+/).filter((word) => word.length > 0);
2598
+ };
2599
+ const processedTerms = [
2600
+ ...nouns.flatMap(cleanAndSplitTerm),
2601
+ ...verbs.flatMap(cleanAndSplitTerm),
2602
+ ...topics.flatMap(cleanAndSplitTerm),
2603
+ ...terms.flatMap(cleanAndSplitTerm)
2604
+ ];
2605
+ return [...new Set(processedTerms)];
2606
+ }
2607
+ function calculateCoverage({ original, simplified }) {
2608
+ if (original.length === 0) {
2609
+ return simplified.length === 0 ? 1 : 0;
2610
+ }
2611
+ const covered = original.filter(
2612
+ (element) => simplified.some((s) => {
2613
+ const elem = normalizeString(element);
2614
+ const simp = normalizeString(s);
2615
+ if (elem.length <= 3) {
2616
+ return elem === simp;
2617
+ }
2618
+ const longer = elem.length > simp.length ? elem : simp;
2619
+ const shorter = elem.length > simp.length ? simp : elem;
2620
+ if (longer.includes(shorter)) {
2621
+ return shorter.length / longer.length > 0.6;
2622
+ }
2623
+ return false;
2624
+ })
2625
+ );
2626
+ return covered.length / original.length;
2627
+ }
2628
+ function createCompletenessScorer() {
2629
+ return createScorer({
2630
+ id: "completeness-scorer",
2631
+ name: "Completeness Scorer",
2632
+ description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
2633
+ type: "agent"
2634
+ }).preprocess(async ({ run }) => {
2635
+ const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
2636
+ const content = getTextContentFromMastraDBMessage(i);
2637
+ return content === null || content === void 0;
2638
+ });
2639
+ const isOutputInvalid = !run.output || run.output.some((i) => {
2640
+ const content = getTextContentFromMastraDBMessage(i);
2641
+ return content === null || content === void 0;
2642
+ });
2643
+ if (isInputInvalid || isOutputInvalid) {
2644
+ throw new Error("Inputs cannot be null or undefined");
2645
+ }
2646
+ const input = run.input?.inputMessages.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
2647
+ const output = run.output?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
2648
+ const inputToProcess = input;
2649
+ const outputToProcess = output;
2650
+ const inputDoc = nlp(inputToProcess.trim());
2651
+ const outputDoc = nlp(outputToProcess.trim());
2652
+ const inputElements = extractElements(inputDoc);
2653
+ const outputElements = extractElements(outputDoc);
2654
+ return {
2655
+ inputElements,
2656
+ outputElements,
2657
+ missingElements: inputElements.filter((e) => !outputElements.includes(e)),
2658
+ elementCounts: {
2659
+ input: inputElements.length,
2660
+ output: outputElements.length
2661
+ }
2662
+ };
2663
+ }).generateScore(({ results }) => {
2664
+ const inputElements = results.preprocessStepResult?.inputElements;
2665
+ const outputElements = results.preprocessStepResult?.outputElements;
2666
+ return calculateCoverage({
2667
+ original: inputElements,
2668
+ simplified: outputElements
2669
+ });
2670
+ });
2671
+ }
2672
+ function calculateRatio(input, output) {
2673
+ if (input === output) {
2674
+ return 1;
2675
+ }
2676
+ if (input.length === 0 || output.length === 0) {
2677
+ return 0;
2678
+ }
2679
+ const matches = longestCommonSubsequence(input, output);
2680
+ const total = input.length + output.length;
2681
+ return total > 0 ? 2 * matches / total : 0;
2682
+ }
2683
+ function longestCommonSubsequence(str1, str2) {
2684
+ const m = str1.length;
2685
+ const n = str2.length;
2686
+ const dp = [];
2687
+ for (let i = 0; i <= m; i++) {
2688
+ dp[i] = [];
2689
+ for (let j = 0; j <= n; j++) {
2690
+ dp[i][j] = 0;
2691
+ }
2692
+ }
2693
+ for (let i = 1; i <= m; i++) {
2694
+ for (let j = 1; j <= n; j++) {
2695
+ if (str1[i - 1] === str2[j - 1]) {
2696
+ dp[i][j] = dp[i - 1][j - 1] + 1;
2697
+ } else {
2698
+ dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
2699
+ }
2700
+ }
2701
+ }
2702
+ return dp[m][n];
2703
+ }
2704
+ function countChanges(input, output) {
2705
+ const inputNormalized = input.replace(/\s+/g, " ").trim();
2706
+ const outputNormalized = output.replace(/\s+/g, " ").trim();
2707
+ if (inputNormalized === outputNormalized) {
2708
+ if (input !== output) {
2709
+ const inputWords2 = input.split(/\s+/).filter((w) => w.length > 0);
2710
+ const outputWords2 = output.split(/\s+/).filter((w) => w.length > 0);
2711
+ return Math.abs(inputWords2.length - outputWords2.length) || 1;
2712
+ }
2713
+ return 0;
2714
+ }
2715
+ const inputWords = inputNormalized.split(/\s+/).filter((w) => w.length > 0);
2716
+ const outputWords = outputNormalized.split(/\s+/).filter((w) => w.length > 0);
2717
+ if (inputWords.length === 0 && outputWords.length === 0) {
2718
+ return 0;
2719
+ }
2720
+ if (inputWords.length === 0) {
2721
+ return outputWords.length;
2722
+ }
2723
+ if (outputWords.length === 0) {
2724
+ return inputWords.length;
2725
+ }
2726
+ const matchingWords = findCommonWords(inputWords, outputWords);
2727
+ const maxLength = Math.max(inputWords.length, outputWords.length);
2728
+ const changes = maxLength - matchingWords;
2729
+ return changes;
2730
+ }
2731
+ function findCommonWords(arr1, arr2) {
2732
+ let matches = 0;
2733
+ const used = /* @__PURE__ */ new Set();
2734
+ for (let i = 0; i < arr1.length; i++) {
2735
+ for (let j = 0; j < arr2.length; j++) {
2736
+ if (!used.has(j) && arr1[i] === arr2[j]) {
2737
+ matches++;
2738
+ used.add(j);
2739
+ break;
2740
+ }
2741
+ }
2742
+ }
2743
+ return matches;
2744
+ }
2745
+ function createTextualDifferenceScorer() {
2746
+ return createScorer({
2747
+ id: "textual-difference-scorer",
2748
+ name: "Textual Difference Scorer",
2749
+ description: "Calculate textual difference between input and output using sequence matching algorithms.",
2750
+ type: "agent"
2751
+ }).preprocess(async ({ run }) => {
2752
+ const input = run.input?.inputMessages?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
2753
+ const output = run.output?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
2754
+ const ratio = calculateRatio(input, output);
2755
+ const changes = countChanges(input, output);
2756
+ const maxLength = Math.max(input.length, output.length);
2757
+ const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
2758
+ const confidence = 1 - lengthDiff;
2759
+ return {
2760
+ ratio,
2761
+ confidence,
2762
+ changes,
2763
+ lengthDiff
2764
+ };
2765
+ }).generateScore(({ results }) => {
2766
+ return results.preprocessStepResult?.ratio;
2767
+ });
2768
+ }
2769
/**
 * Creates a code-based scorer that measures keyword coverage: the fraction
 * of keywords extracted from the input messages that also appear in the
 * output messages.
 *
 * Keywords are extracted with the `keyword-extractor` package (English,
 * lowercased, digits and duplicates removed). The score is
 * matched / total reference keywords, or 1 when there is nothing to match.
 *
 * @returns an agent scorer built via createScorer.
 */
function createKeywordCoverageScorer() {
  return createScorer({
    id: "keyword-coverage-scorer",
    name: "Keyword Coverage Scorer",
    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
    type: "agent"
  }).preprocess(async ({ run }) => {
    const input = run.input?.inputMessages?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
    const output = run.output?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
    if (!input && !output) {
      // FIX: this branch previously wrapped the sets in an extra `result`
      // key, producing a different shape than the normal return below.
      // The analyze step then read `referenceKeywords` as undefined and the
      // score came out as 1 only by accident of optional chaining. Return
      // the same shape as the normal path instead (final score unchanged).
      return {
        referenceKeywords: /* @__PURE__ */ new Set(),
        responseKeywords: /* @__PURE__ */ new Set()
      };
    }
    const extractKeywords = (text) => {
      return keyword_extractor.extract(text, {
        language: "english",
        remove_digits: true,
        return_changed_case: true,
        remove_duplicates: true
      });
    };
    return {
      referenceKeywords: new Set(extractKeywords(input)),
      responseKeywords: new Set(extractKeywords(output))
    };
  }).analyze(async ({ results }) => {
    const reference = results.preprocessStepResult?.referenceKeywords;
    const response = results.preprocessStepResult?.responseKeywords;
    // Nothing extracted on either side: report zero totals so generateScore
    // treats this as a perfect (vacuous) match.
    if (!reference?.size && !response?.size) {
      return {
        totalKeywordsLength: 0,
        matchedKeywordsLength: 0
      };
    }
    // Count reference keywords that also occur in the response.
    const matchedKeywords = [...reference].filter((k) => response?.has(k));
    return {
      totalKeywordsLength: reference.size,
      matchedKeywordsLength: matchedKeywords.length
    };
  }).generateScore(({ results }) => {
    // No reference keywords means there is nothing to miss: perfect score.
    if (!results.analyzeStepResult?.totalKeywordsLength) {
      return 1;
    }
    const totalKeywords = results.analyzeStepResult.totalKeywordsLength;
    const matchedKeywords = results.analyzeStepResult.matchedKeywordsLength;
    return totalKeywords > 0 ? matchedKeywords / totalKeywords : 0;
  });
}
2823
/**
 * Creates a code-based scorer that compares the joined input messages to the
 * joined output messages with string-similarity's compareTwoStrings
 * (Dice coefficient), yielding a score in [0, 1].
 *
 * @param {{ ignoreCase?: boolean, ignoreWhitespace?: boolean }} [options]
 *   ignoreCase (default true): lowercase both sides before comparing.
 *   ignoreWhitespace (default true): collapse whitespace runs and trim.
 * @returns an agent scorer built via createScorer.
 */
function createContentSimilarityScorer({ ignoreCase = true, ignoreWhitespace = true } = {}) {
  // FIX: the previous signature defaulted the WHOLE options object
  // (`= { ignoreCase: true, ignoreWhitespace: true }`), so a partial call
  // such as createContentSimilarityScorer({ ignoreCase: false }) silently
  // left ignoreWhitespace undefined (falsy) instead of true. Per-property
  // defaults keep each flag independent and remain backward compatible.
  return createScorer({
    id: "content-similarity-scorer",
    name: "Content Similarity Scorer",
    description: "Calculates content similarity between input and output messages using string comparison algorithms.",
    type: "agent"
  }).preprocess(async ({ run }) => {
    let processedInput = run.input?.inputMessages.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
    let processedOutput = run.output.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
    if (ignoreCase) {
      processedInput = processedInput.toLowerCase();
      processedOutput = processedOutput.toLowerCase();
    }
    if (ignoreWhitespace) {
      processedInput = processedInput.replace(/\s+/g, " ").trim();
      processedOutput = processedOutput.replace(/\s+/g, " ").trim();
    }
    return {
      processedInput,
      processedOutput
    };
  }).generateScore(({ results }) => {
    const similarity = stringSimilarity.compareTwoStrings(
      results.preprocessStepResult?.processedInput,
      results.preprocessStepResult?.processedOutput
    );
    return similarity;
  });
}
2852
/**
 * Creates a code-based scorer that evaluates the tone of agent output via
 * the `sentiment` package. With a reference tone configured, the score is
 * 1 minus the absolute difference of comparative sentiment (floored at 0);
 * without one, the score reflects sentence-to-sentence sentiment stability
 * (1 minus variance, floored at 0).
 *
 * @param {{ referenceTone?: string }} [config]
 * @returns an agent scorer built via createScorer.
 */
function createToneScorer(config = {}) {
  const { referenceTone } = config;
  return createScorer({
    id: "tone-scorer",
    name: "Tone Scorer",
    description: "Analyzes the tone and sentiment of agent responses using sentiment analysis. Can compare against a reference tone or evaluate sentiment stability.",
    type: "agent"
  }).preprocess(async ({ run }) => {
    const analyzer = new Sentiment();
    const agentMessage = run.output?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
    const responseSentiment = analyzer.analyze(agentMessage);
    if (referenceTone) {
      // Reference mode: score closeness of comparative sentiment values.
      const referenceSentiment = analyzer.analyze(referenceTone);
      const difference = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
      return {
        score: Math.max(0, 1 - difference),
        responseSentiment: responseSentiment.comparative,
        referenceSentiment: referenceSentiment.comparative,
        difference
      };
    }
    // Stability mode: split into sentences (fall back to the whole message)
    // and score 1 minus the variance of per-sentence comparative sentiment.
    const sentences = agentMessage.match(/[^.!?]+[.!?]+/g) || [agentMessage];
    const perSentence = sentences.map((s) => analyzer.analyze(s).comparative);
    const mean = perSentence.reduce((total, value) => total + value, 0) / perSentence.length;
    const variance = perSentence.reduce((total, value) => total + (value - mean) ** 2, 0) / perSentence.length;
    return {
      score: Math.max(0, 1 - variance),
      avgSentiment: mean,
      sentimentVariance: variance
    };
  }).generateScore(({ results }) => results.preprocessStepResult?.score);
}
2888
/**
 * Checks whether the sequence of called tools satisfies an expected order.
 *
 * Strict mode requires an exact element-for-element match. Otherwise each
 * expected tool must be present (judged by its FIRST occurrence) and those
 * first occurrences must appear at strictly increasing positions; extra
 * tools may be interleaved anywhere.
 *
 * @param {string[]} actualTools - Tool names in the order they were called.
 * @param {string[]} expectedOrder - Required ordering of tool names.
 * @param {boolean} [strictMode=false] - Require an exact sequence match.
 * @returns {boolean} true when the order constraint is satisfied.
 */
function checkToolOrder(actualTools, expectedOrder, strictMode = false) {
  if (strictMode) {
    // Exact match: same length, same tool at every position.
    return actualTools.length === expectedOrder.length && actualTools.every((tool, i) => tool === expectedOrder[i]);
  }
  // Relaxed mode: walk the expected tools, tracking the position of each
  // one's first occurrence; positions must strictly increase.
  let previousIndex = -1;
  for (const tool of expectedOrder) {
    const position = actualTools.indexOf(tool);
    if (position === -1 || position <= previousIndex) {
      return false;
    }
    previousIndex = position;
  }
  return true;
}
2909
/**
 * Computes a binary accuracy score (0 or 1) for tool selection.
 *
 * Precedence: no tools were called → 0; an expected order is configured →
 * delegate to checkToolOrder; no expectedTool configured → 0; otherwise
 * check for the expected tool (strict mode requires it to be the ONLY tool
 * called).
 *
 * @param {{ expectedTool?: string, actualTools: string[], strictMode?: boolean, expectedToolOrder?: string[] }} params
 * @returns {0|1} accuracy score.
 */
function calculateAccuracy({
  expectedTool,
  actualTools,
  strictMode = false,
  expectedToolOrder
}) {
  if (actualTools.length === 0) {
    return 0;
  }
  // An expected order (when non-empty) takes precedence over expectedTool.
  if (expectedToolOrder && expectedToolOrder.length > 0) {
    return checkToolOrder(actualTools, expectedToolOrder, strictMode) ? 1 : 0;
  }
  if (!expectedTool) {
    return 0;
  }
  const matched = strictMode
    ? actualTools.length === 1 && actualTools[0] === expectedTool
    : actualTools.includes(expectedTool);
  return matched ? 1 : 0;
}
2929
/**
 * Creates a code-based scorer that checks whether an agent called the
 * expected tool, or called tools in the expected order.
 *
 * @param {{ expectedTool?: string, strictMode?: boolean, expectedToolOrder?: string[] }} options
 * @throws {Error} when neither expectedTool nor expectedToolOrder is given,
 *   or (at run time) when input/output messages are missing or empty.
 * @returns an agent scorer built via createScorer.
 */
function createToolCallAccuracyScorerCode(options) {
  const { expectedTool, strictMode = false, expectedToolOrder } = options;
  if (!expectedTool && !expectedToolOrder) {
    throw new Error("Either expectedTool or expectedToolOrder must be provided");
  }
  const description = expectedToolOrder
    ? `Evaluates whether the LLM called tools in the correct order: [${expectedToolOrder.join(", ")}]`
    : `Evaluates whether the LLM selected the correct tool (${expectedTool}) from the available tools`;
  return createScorer({
    id: "code-tool-call-accuracy-scorer",
    name: "Tool Call Accuracy Scorer",
    description,
    type: "agent"
  }).preprocess(async ({ run }) => {
    // Both input and output must be present and non-empty.
    if (!run.input || !run.input.inputMessages || run.input.inputMessages.length === 0 || !run.output || run.output.length === 0) {
      throw new Error("Input and output messages cannot be null or empty");
    }
    const { tools: actualTools, toolCallInfos } = extractToolCalls(run.output);
    let correctToolCalled = false;
    if (expectedTool) {
      correctToolCalled = strictMode
        ? actualTools.length === 1 && actualTools[0] === expectedTool
        : actualTools.includes(expectedTool);
    }
    return {
      expectedTool,
      actualTools,
      strictMode,
      expectedToolOrder,
      hasToolCalls: actualTools.length > 0,
      correctToolCalled,
      toolCallInfos,
      correctOrderCalled: expectedToolOrder ? checkToolOrder(actualTools, expectedToolOrder, strictMode) : null
    };
  }).generateScore(({ results }) => {
    const pre = results.preprocessStepResult;
    if (!pre) {
      return 0;
    }
    return calculateAccuracy({
      expectedTool: pre.expectedTool,
      actualTools: pre.actualTools,
      strictMode: pre.strictMode,
      expectedToolOrder: pre.expectedToolOrder
    });
  });
}
2569
2973
 
2570
- export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
2974
+ export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer };
2571
2975
  //# sourceMappingURL=index.js.map
2572
2976
  //# sourceMappingURL=index.js.map