@mastra/evals 0.14.3-alpha.0 → 1.0.0-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (189)
  1. package/CHANGELOG.md +36 -9
  2. package/README.md +19 -159
  3. package/dist/{chunk-KHEXN75Q.js → chunk-CCLM7KPF.js} +45 -21
  4. package/dist/chunk-CCLM7KPF.js.map +1 -0
  5. package/dist/{chunk-QKR2PMLZ.cjs → chunk-TPQLLHZW.cjs} +46 -21
  6. package/dist/chunk-TPQLLHZW.cjs.map +1 -0
  7. package/dist/scorers/code/completeness/index.d.ts +1 -1
  8. package/dist/scorers/code/completeness/index.d.ts.map +1 -1
  9. package/dist/scorers/code/content-similarity/index.d.ts +1 -1
  10. package/dist/scorers/code/content-similarity/index.d.ts.map +1 -1
  11. package/dist/scorers/code/keyword-coverage/index.d.ts +1 -1
  12. package/dist/scorers/code/keyword-coverage/index.d.ts.map +1 -1
  13. package/dist/scorers/code/textual-difference/index.d.ts +1 -1
  14. package/dist/scorers/code/textual-difference/index.d.ts.map +1 -1
  15. package/dist/scorers/code/tone/index.d.ts +1 -1
  16. package/dist/scorers/code/tone/index.d.ts.map +1 -1
  17. package/dist/scorers/code/tool-call-accuracy/index.d.ts +1 -1
  18. package/dist/scorers/code/tool-call-accuracy/index.d.ts.map +1 -1
  19. package/dist/scorers/llm/answer-relevancy/index.d.ts +1 -1
  20. package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
  21. package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
  22. package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
  23. package/dist/scorers/llm/bias/index.d.ts +2 -2
  24. package/dist/scorers/llm/bias/index.d.ts.map +1 -1
  25. package/dist/scorers/llm/context-precision/index.d.ts +3 -3
  26. package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
  27. package/dist/scorers/llm/context-relevance/index.d.ts +3 -3
  28. package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
  29. package/dist/scorers/llm/faithfulness/index.d.ts +2 -2
  30. package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
  31. package/dist/scorers/llm/hallucination/index.d.ts +2 -2
  32. package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
  33. package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
  34. package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
  35. package/dist/scorers/llm/prompt-alignment/index.d.ts +2 -2
  36. package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
  37. package/dist/scorers/llm/tool-call-accuracy/index.d.ts +2 -2
  38. package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
  39. package/dist/scorers/llm/toxicity/index.d.ts +2 -2
  40. package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
  41. package/dist/scorers/{llm → prebuilt}/index.cjs +479 -62
  42. package/dist/scorers/prebuilt/index.cjs.map +1 -0
  43. package/dist/scorers/prebuilt/index.d.ts +3 -0
  44. package/dist/scorers/prebuilt/index.d.ts.map +1 -0
  45. package/dist/scorers/{llm → prebuilt}/index.js +419 -15
  46. package/dist/scorers/prebuilt/index.js.map +1 -0
  47. package/dist/scorers/utils.cjs +21 -17
  48. package/dist/scorers/utils.d.ts +21 -11
  49. package/dist/scorers/utils.d.ts.map +1 -1
  50. package/dist/scorers/utils.js +1 -1
  51. package/package.json +12 -58
  52. package/dist/attachListeners.d.ts +0 -4
  53. package/dist/attachListeners.d.ts.map +0 -1
  54. package/dist/chunk-7QAUEU4L.cjs +0 -10
  55. package/dist/chunk-7QAUEU4L.cjs.map +0 -1
  56. package/dist/chunk-EMMSS5I5.cjs +0 -37
  57. package/dist/chunk-EMMSS5I5.cjs.map +0 -1
  58. package/dist/chunk-G3PMV62Z.js +0 -33
  59. package/dist/chunk-G3PMV62Z.js.map +0 -1
  60. package/dist/chunk-IUSAD2BW.cjs +0 -19
  61. package/dist/chunk-IUSAD2BW.cjs.map +0 -1
  62. package/dist/chunk-KHEXN75Q.js.map +0 -1
  63. package/dist/chunk-QKR2PMLZ.cjs.map +0 -1
  64. package/dist/chunk-QTWX6TKR.js +0 -8
  65. package/dist/chunk-QTWX6TKR.js.map +0 -1
  66. package/dist/chunk-YGTIO3J5.js +0 -17
  67. package/dist/chunk-YGTIO3J5.js.map +0 -1
  68. package/dist/dist-LDTK3TIP.cjs +0 -16759
  69. package/dist/dist-LDTK3TIP.cjs.map +0 -1
  70. package/dist/dist-OWYZEOJK.js +0 -16737
  71. package/dist/dist-OWYZEOJK.js.map +0 -1
  72. package/dist/evaluation.d.ts +0 -8
  73. package/dist/evaluation.d.ts.map +0 -1
  74. package/dist/index.cjs +0 -93
  75. package/dist/index.cjs.map +0 -1
  76. package/dist/index.d.ts +0 -3
  77. package/dist/index.d.ts.map +0 -1
  78. package/dist/index.js +0 -89
  79. package/dist/index.js.map +0 -1
  80. package/dist/magic-string.es-7ORA5OGR.js +0 -1305
  81. package/dist/magic-string.es-7ORA5OGR.js.map +0 -1
  82. package/dist/magic-string.es-NZ2XWFKN.cjs +0 -1311
  83. package/dist/magic-string.es-NZ2XWFKN.cjs.map +0 -1
  84. package/dist/metrics/index.d.ts +0 -4
  85. package/dist/metrics/index.d.ts.map +0 -1
  86. package/dist/metrics/judge/index.cjs +0 -12
  87. package/dist/metrics/judge/index.cjs.map +0 -1
  88. package/dist/metrics/judge/index.d.ts +0 -7
  89. package/dist/metrics/judge/index.d.ts.map +0 -1
  90. package/dist/metrics/judge/index.js +0 -3
  91. package/dist/metrics/judge/index.js.map +0 -1
  92. package/dist/metrics/llm/answer-relevancy/index.d.ts +0 -16
  93. package/dist/metrics/llm/answer-relevancy/index.d.ts.map +0 -1
  94. package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts +0 -20
  95. package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts.map +0 -1
  96. package/dist/metrics/llm/answer-relevancy/prompts.d.ts +0 -19
  97. package/dist/metrics/llm/answer-relevancy/prompts.d.ts.map +0 -1
  98. package/dist/metrics/llm/bias/index.d.ts +0 -14
  99. package/dist/metrics/llm/bias/index.d.ts.map +0 -1
  100. package/dist/metrics/llm/bias/metricJudge.d.ts +0 -14
  101. package/dist/metrics/llm/bias/metricJudge.d.ts.map +0 -1
  102. package/dist/metrics/llm/bias/prompts.d.ts +0 -14
  103. package/dist/metrics/llm/bias/prompts.d.ts.map +0 -1
  104. package/dist/metrics/llm/context-position/index.d.ts +0 -16
  105. package/dist/metrics/llm/context-position/index.d.ts.map +0 -1
  106. package/dist/metrics/llm/context-position/metricJudge.d.ts +0 -20
  107. package/dist/metrics/llm/context-position/metricJudge.d.ts.map +0 -1
  108. package/dist/metrics/llm/context-position/prompts.d.ts +0 -17
  109. package/dist/metrics/llm/context-position/prompts.d.ts.map +0 -1
  110. package/dist/metrics/llm/context-precision/index.d.ts +0 -16
  111. package/dist/metrics/llm/context-precision/index.d.ts.map +0 -1
  112. package/dist/metrics/llm/context-precision/metricJudge.d.ts +0 -20
  113. package/dist/metrics/llm/context-precision/metricJudge.d.ts.map +0 -1
  114. package/dist/metrics/llm/context-precision/prompts.d.ts +0 -17
  115. package/dist/metrics/llm/context-precision/prompts.d.ts.map +0 -1
  116. package/dist/metrics/llm/context-relevancy/index.d.ts +0 -16
  117. package/dist/metrics/llm/context-relevancy/index.d.ts.map +0 -1
  118. package/dist/metrics/llm/context-relevancy/metricJudge.d.ts +0 -16
  119. package/dist/metrics/llm/context-relevancy/metricJudge.d.ts.map +0 -1
  120. package/dist/metrics/llm/context-relevancy/prompts.d.ts +0 -13
  121. package/dist/metrics/llm/context-relevancy/prompts.d.ts.map +0 -1
  122. package/dist/metrics/llm/contextual-recall/index.d.ts +0 -16
  123. package/dist/metrics/llm/contextual-recall/index.d.ts.map +0 -1
  124. package/dist/metrics/llm/contextual-recall/metricJudge.d.ts +0 -16
  125. package/dist/metrics/llm/contextual-recall/metricJudge.d.ts.map +0 -1
  126. package/dist/metrics/llm/contextual-recall/prompts.d.ts +0 -13
  127. package/dist/metrics/llm/contextual-recall/prompts.d.ts.map +0 -1
  128. package/dist/metrics/llm/faithfulness/index.d.ts +0 -16
  129. package/dist/metrics/llm/faithfulness/index.d.ts.map +0 -1
  130. package/dist/metrics/llm/faithfulness/metricJudge.d.ts +0 -22
  131. package/dist/metrics/llm/faithfulness/metricJudge.d.ts.map +0 -1
  132. package/dist/metrics/llm/faithfulness/prompts.d.ts +0 -20
  133. package/dist/metrics/llm/faithfulness/prompts.d.ts.map +0 -1
  134. package/dist/metrics/llm/hallucination/index.d.ts +0 -16
  135. package/dist/metrics/llm/hallucination/index.d.ts.map +0 -1
  136. package/dist/metrics/llm/hallucination/metricJudge.d.ts +0 -22
  137. package/dist/metrics/llm/hallucination/metricJudge.d.ts.map +0 -1
  138. package/dist/metrics/llm/hallucination/prompts.d.ts +0 -17
  139. package/dist/metrics/llm/hallucination/prompts.d.ts.map +0 -1
  140. package/dist/metrics/llm/index.cjs +0 -2481
  141. package/dist/metrics/llm/index.cjs.map +0 -1
  142. package/dist/metrics/llm/index.d.ts +0 -12
  143. package/dist/metrics/llm/index.d.ts.map +0 -1
  144. package/dist/metrics/llm/index.js +0 -2469
  145. package/dist/metrics/llm/index.js.map +0 -1
  146. package/dist/metrics/llm/prompt-alignment/index.d.ts +0 -33
  147. package/dist/metrics/llm/prompt-alignment/index.d.ts.map +0 -1
  148. package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts +0 -20
  149. package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts.map +0 -1
  150. package/dist/metrics/llm/prompt-alignment/prompts.d.ts +0 -17
  151. package/dist/metrics/llm/prompt-alignment/prompts.d.ts.map +0 -1
  152. package/dist/metrics/llm/summarization/index.d.ts +0 -19
  153. package/dist/metrics/llm/summarization/index.d.ts.map +0 -1
  154. package/dist/metrics/llm/summarization/metricJudge.d.ts +0 -34
  155. package/dist/metrics/llm/summarization/metricJudge.d.ts.map +0 -1
  156. package/dist/metrics/llm/summarization/prompts.d.ts +0 -30
  157. package/dist/metrics/llm/summarization/prompts.d.ts.map +0 -1
  158. package/dist/metrics/llm/toxicity/index.d.ts +0 -14
  159. package/dist/metrics/llm/toxicity/index.d.ts.map +0 -1
  160. package/dist/metrics/llm/toxicity/metricJudge.d.ts +0 -14
  161. package/dist/metrics/llm/toxicity/metricJudge.d.ts.map +0 -1
  162. package/dist/metrics/llm/toxicity/prompts.d.ts +0 -10
  163. package/dist/metrics/llm/toxicity/prompts.d.ts.map +0 -1
  164. package/dist/metrics/llm/types.d.ts +0 -7
  165. package/dist/metrics/llm/types.d.ts.map +0 -1
  166. package/dist/metrics/llm/utils.d.ts +0 -14
  167. package/dist/metrics/llm/utils.d.ts.map +0 -1
  168. package/dist/metrics/nlp/completeness/index.d.ts +0 -21
  169. package/dist/metrics/nlp/completeness/index.d.ts.map +0 -1
  170. package/dist/metrics/nlp/content-similarity/index.d.ts +0 -18
  171. package/dist/metrics/nlp/content-similarity/index.d.ts.map +0 -1
  172. package/dist/metrics/nlp/index.cjs +0 -203
  173. package/dist/metrics/nlp/index.cjs.map +0 -1
  174. package/dist/metrics/nlp/index.d.ts +0 -6
  175. package/dist/metrics/nlp/index.d.ts.map +0 -1
  176. package/dist/metrics/nlp/index.js +0 -190
  177. package/dist/metrics/nlp/index.js.map +0 -1
  178. package/dist/metrics/nlp/keyword-coverage/index.d.ts +0 -13
  179. package/dist/metrics/nlp/keyword-coverage/index.d.ts.map +0 -1
  180. package/dist/metrics/nlp/textual-difference/index.d.ts +0 -15
  181. package/dist/metrics/nlp/textual-difference/index.d.ts.map +0 -1
  182. package/dist/metrics/nlp/tone/index.d.ts +0 -18
  183. package/dist/metrics/nlp/tone/index.d.ts.map +0 -1
  184. package/dist/scorers/code/index.cjs +0 -329
  185. package/dist/scorers/code/index.cjs.map +0 -1
  186. package/dist/scorers/code/index.js +0 -315
  187. package/dist/scorers/code/index.js.map +0 -1
  188. package/dist/scorers/llm/index.cjs.map +0 -1
  189. package/dist/scorers/llm/index.js.map +0 -1
--- package/dist/scorers/llm/index.cjs
+++ package/dist/scorers/prebuilt/index.cjs
@@ -1,9 +1,19 @@
  'use strict';

- var chunk7QAUEU4L_cjs = require('../../chunk-7QAUEU4L.cjs');
- var chunkQKR2PMLZ_cjs = require('../../chunk-QKR2PMLZ.cjs');
- var scores = require('@mastra/core/scores');
+ var chunkTPQLLHZW_cjs = require('../../chunk-TPQLLHZW.cjs');
+ var evals = require('@mastra/core/evals');
  var zod = require('zod');
+ var nlp = require('compromise');
+ var keyword_extractor = require('keyword-extractor');
+ var stringSimilarity = require('string-similarity');
+ var Sentiment = require('sentiment');
+
+ function _interopDefault (e) { return e && e.__esModule ? e : { default: e }; }
+
+ var nlp__default = /*#__PURE__*/_interopDefault(nlp);
+ var keyword_extractor__default = /*#__PURE__*/_interopDefault(keyword_extractor);
+ var stringSimilarity__default = /*#__PURE__*/_interopDefault(stringSimilarity);
+ var Sentiment__default = /*#__PURE__*/_interopDefault(Sentiment);

  // src/scorers/llm/answer-relevancy/prompts.ts
  var createExtractPrompt = (output) => `
@@ -216,7 +226,8 @@ function createAnswerRelevancyScorer({
  model,
  options = DEFAULT_OPTIONS
  }) {
- return scores.createScorer({
+ return evals.createScorer({
+ id: "answer-relevancy-scorer",
  name: "Answer Relevancy Scorer",
  description: "A scorer that evaluates the relevancy of an LLM output to an input",
  judge: {
@@ -228,14 +239,14 @@ function createAnswerRelevancyScorer({
  description: "Extract relevant statements from the LLM output",
  outputSchema: extractOutputSchema,
  createPrompt: ({ run }) => {
- const assistantMessage = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const assistantMessage = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  return createExtractPrompt(assistantMessage);
  }
  }).analyze({
  description: "Score the relevance of the statements to the input",
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run, results }) => {
- const input = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const input = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
  return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
  }
  }).generateScore(({ results }) => {
@@ -252,13 +263,13 @@ function createAnswerRelevancyScorer({
  }
  }
  const score = relevancyCount / numberOfResults;
- return chunk7QAUEU4L_cjs.roundToTwoDecimals(score * options.scale);
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(score * options.scale);
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  return createReasonPrompt({
- input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
- output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  score,
  results: results.analyzeStepResult.results,
  scale: options.scale
@@ -433,7 +444,8 @@ function createAnswerSimilarityScorer({
  options = ANSWER_SIMILARITY_DEFAULT_OPTIONS
  }) {
  const mergedOptions = { ...ANSWER_SIMILARITY_DEFAULT_OPTIONS, ...options };
- return scores.createScorer({
+ return evals.createScorer({
+ id: "answer-similarity-scorer",
  name: "Answer Similarity Scorer",
  description: "Evaluates how similar an agent output is to a ground truth answer for CI/CD testing",
  judge: {
@@ -454,7 +466,7 @@ function createAnswerSimilarityScorer({
  groundTruth: ""
  });
  }
- const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const output = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
  return createExtractPrompt2({
  output,
@@ -512,14 +524,14 @@ function createAnswerSimilarityScorer({
  );
  score -= extraInfoPenalty;
  score = Math.max(0, Math.min(1, score));
- return chunk7QAUEU4L_cjs.roundToTwoDecimals(score * mergedOptions.scale);
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(score * mergedOptions.scale);
  }).generateReason({
  description: "Generate explanation of similarity score",
  createPrompt: ({ run, results, score }) => {
  if (!run.groundTruth) {
  return "No ground truth was provided for comparison. Score is 0 by default.";
  }
- const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const output = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const groundTruth = typeof run.groundTruth === "string" ? run.groundTruth : JSON.stringify(run.groundTruth);
  return createReasonPrompt2({
  output,
@@ -690,7 +702,8 @@ function createFaithfulnessScorer({
  model,
  options
  }) {
- return scores.createScorer({
+ return evals.createScorer({
+ id: "faithfulness-scorer",
  name: "Faithfulness Scorer",
  description: "A scorer that evaluates the faithfulness of an LLM output to an input",
  judge: {
@@ -702,14 +715,17 @@ function createFaithfulnessScorer({
  description: "Extract relevant statements from the LLM output",
  outputSchema: zod.z.array(zod.z.string()),
  createPrompt: ({ run }) => {
- const prompt = createFaithfulnessExtractPrompt({ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+ const prompt = createFaithfulnessExtractPrompt({ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
  return prompt;
  }
  }).analyze({
  description: "Score the relevance of the statements to the input",
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ results, run }) => {
- const context = options?.context ?? run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : "") ?? [];
+ const assistantMessage = run.output.find(({ role }) => role === "assistant");
+ const context = options?.context ?? assistantMessage?.content?.toolInvocations?.map(
+ (toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : ""
+ ) ?? [];
  const prompt = createFaithfulnessAnalyzePrompt({
  claims: results.preprocessStepResult || [],
  context
@@ -723,14 +739,15 @@ function createFaithfulnessScorer({
  return 0;
  }
  const score = supportedClaims / totalClaims * (options?.scale || 1);
- return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score);
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
+ const assistantMessage = run.output.find(({ role }) => role === "assistant");
  const prompt = createFaithfulnessReasonPrompt({
- input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
- output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
- context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
+ input: chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+ context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
  score,
  scale: options?.scale || 1,
  verdicts: results.analyzeStepResult?.verdicts || []
@@ -848,7 +865,8 @@ ${biases.join("\n")}

  // src/scorers/llm/bias/index.ts
  function createBiasScorer({ model, options }) {
- return scores.createScorer({
+ return evals.createScorer({
+ id: "bias-scorer",
  name: "Bias Scorer",
  description: "A scorer that evaluates the bias of an LLM output to an input",
  judge: {
@@ -861,13 +879,13 @@ function createBiasScorer({ model, options }) {
  outputSchema: zod.z.object({
  opinions: zod.z.array(zod.z.string())
  }),
- createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
+ createPrompt: ({ run }) => createBiasExtractPrompt({ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" })
  }).analyze({
  description: "Score the relevance of the statements to the input",
  outputSchema: zod.z.object({ results: zod.z.array(zod.z.object({ result: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run, results }) => {
  const prompt = createBiasAnalyzePrompt({
- output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  opinions: results.preprocessStepResult?.opinions || []
  });
  return prompt;
@@ -878,7 +896,7 @@ function createBiasScorer({ model, options }) {
  }
  const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
  const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
- return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(score * (options?.scale || 1));
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ score, results }) => {
@@ -1082,7 +1100,8 @@ function createHallucinationScorer({
  model,
  options
  }) {
- return scores.createScorer({
+ return evals.createScorer({
+ id: "hallucination-scorer",
  name: "Hallucination Scorer",
  description: "A scorer that evaluates the hallucination of an LLM output to an input",
  judge: {
@@ -1096,7 +1115,7 @@ function createHallucinationScorer({
  claims: zod.z.array(zod.z.string())
  }),
  createPrompt: ({ run }) => {
- const prompt = createHallucinationExtractPrompt({ output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
+ const prompt = createHallucinationExtractPrompt({ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "" });
  return prompt;
  }
  }).analyze({
@@ -1118,13 +1137,13 @@ function createHallucinationScorer({
  return 0;
  }
  const score = contradictedStatements / totalStatements * (options?.scale || 1);
- return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score);
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ run, results, score }) => {
  const prompt = createHallucinationReasonPrompt({
- input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
- output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
+ input: chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "",
  context: options?.context || [],
  score,
  scale: options?.scale || 1,
@@ -1224,7 +1243,8 @@ function createToxicityScorer({
  model,
  options
  }) {
- return scores.createScorer({
+ return evals.createScorer({
+ id: "toxicity-scorer",
  name: "Toxicity Scorer",
  description: "A scorer that evaluates the toxicity of an LLM output to an input",
  judge: {
@@ -1237,8 +1257,8 @@ function createToxicityScorer({
  outputSchema: zod.z.object({ verdicts: zod.z.array(zod.z.object({ verdict: zod.z.string(), reason: zod.z.string() })) }),
  createPrompt: ({ run }) => {
  const prompt = createToxicityAnalyzePrompt({
- input: chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "",
- output: chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
+ input: chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "",
+ output: chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? ""
  });
  return prompt;
  }
@@ -1254,7 +1274,7 @@ function createToxicityScorer({
  }
  }
  const score = toxicityCount / numberOfVerdicts;
- return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score * (options?.scale || 1));
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(score * (options?.scale || 1));
  }).generateReason({
  description: "Reason about the results",
  createPrompt: ({ results, score }) => {
@@ -1373,7 +1393,8 @@ var analyzeOutputSchema2 = zod.z.object({
  });
  function createToolCallAccuracyScorerLLM({ model, availableTools }) {
  const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
- return scores.createScorer({
+ return evals.createScorer({
+ id: "llm-tool-call-accuracy-scorer",
  name: "Tool Call Accuracy (LLM)",
  description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
  judge: {
@@ -1387,7 +1408,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
  if (isInputInvalid || isOutputInvalid) {
  throw new Error("Input and output messages cannot be null or empty");
  }
- const { tools: actualTools, toolCallInfos } = chunkQKR2PMLZ_cjs.extractToolCalls(run.output);
+ const { tools: actualTools, toolCallInfos } = chunkTPQLLHZW_cjs.extractToolCalls(run.output);
  return {
  actualTools,
  hasToolCalls: actualTools.length > 0,
@@ -1397,8 +1418,8 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
  description: "Analyze the appropriateness of tool selections",
  outputSchema: analyzeOutputSchema2,
  createPrompt: ({ run, results }) => {
- const userInput = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const agentResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const userInput = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const agentResponse = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const toolsCalled = results.preprocessStepResult?.actualTools || [];
  return createAnalyzePrompt2({
  userInput,
@@ -1415,11 +1436,11 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
  }
  const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
  const totalToolCalls = evaluations.length;
- return chunkQKR2PMLZ_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
  }).generateReason({
  description: "Generate human-readable explanation of tool selection evaluation",
  createPrompt: ({ run, results, score }) => {
- const userInput = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const userInput = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
  const evaluations = results.analyzeStepResult?.evaluations || [];
  const missingTools = results.analyzeStepResult?.missingTools || [];
  return createReasonPrompt3({
@@ -1611,7 +1632,8 @@ function createContextRelevanceScorerLLM({
  if (options.context && options.context.length === 0) {
  throw new Error("Context array cannot be empty if provided");
  }
- return scores.createScorer({
+ return evals.createScorer({
+ id: "context-relevance-scorer",
  name: "Context Relevance (LLM)",
  description: "Evaluates how relevant and useful the provided context was for generating the agent response",
  judge: {
@@ -1623,8 +1645,8 @@ function createContextRelevanceScorerLLM({
  description: "Analyze the relevance and utility of provided context",
  outputSchema: analyzeOutputSchema3,
  createPrompt: ({ run }) => {
- const userQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const agentResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const userQuery = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const agentResponse = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
  if (context.length === 0) {
  return createAnalyzePrompt3({
@@ -1672,11 +1694,11 @@ function createContextRelevanceScorerLLM({
  const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
  const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
  const scaledScore = finalScore * (options.scale || 1);
- return chunkQKR2PMLZ_cjs.roundToTwoDecimals(scaledScore);
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(scaledScore);
  }).generateReason({
  description: "Generate human-readable explanation of context relevance evaluation",
  createPrompt: ({ run, results, score }) => {
- const userQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const userQuery = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
  if (context.length === 0) {
  return `No context was available for evaluation. The agent response was generated without any supporting context. Score: ${score}`;
@@ -1834,7 +1856,8 @@ function createContextPrecisionScorer({
  if (options.context && options.context.length === 0) {
  throw new Error("Context array cannot be empty if provided");
  }
- return scores.createScorer({
+ return evals.createScorer({
+ id: "context-precision-scorer",
  name: "Context Precision Scorer",
  description: "A scorer that evaluates the relevance and precision of retrieved context nodes for generating expected outputs",
  judge: {
@@ -1846,8 +1869,8 @@ function createContextPrecisionScorer({
  description: "Evaluate the relevance of each context piece for generating the expected output",
  outputSchema: contextRelevanceOutputSchema,
  createPrompt: ({ run }) => {
- const input = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const input = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const output = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
  if (context.length === 0) {
  throw new Error("No context available for evaluation");
@@ -1880,12 +1903,12 @@ function createContextPrecisionScorer({
  }
  const map = sumPrecision / relevantCount;
  const score = map * (options.scale || 1);
- return chunkQKR2PMLZ_cjs.roundToTwoDecimals(score);
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(score);
  }).generateReason({
  description: "Reason about the context precision results",
  createPrompt: ({ run, results, score }) => {
- const input = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const output = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const input = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const output = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  const context = options.contextExtractor ? options.contextExtractor(run.input, run.output) : options.context;
  return createContextPrecisionReasonPrompt({
  input,
@@ -2126,7 +2149,8 @@ function createNoiseSensitivityScorerLLM({
  if (!options.baselineResponse || !options.noisyQuery) {
  throw new Error("Both baselineResponse and noisyQuery are required for Noise Sensitivity scoring");
  }
- return scores.createScorer({
+ return evals.createScorer({
+ id: "noise-sensitivity-scorer",
  name: "Noise Sensitivity (LLM)",
  description: "Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information",
  judge: {
@@ -2138,8 +2162,8 @@ function createNoiseSensitivityScorerLLM({
  description: "Analyze the impact of noise on agent response quality",
  outputSchema: analyzeOutputSchema4,
  createPrompt: ({ run }) => {
- const originalQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const noisyResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const originalQuery = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const noisyResponse = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  if (!originalQuery || !noisyResponse) {
  throw new Error("Both original query and noisy response are required for evaluation");
  }
@@ -2182,11 +2206,11 @@ function createNoiseSensitivityScorerLLM({
  const majorIssues = analysisResult.majorIssues || [];
  const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
  finalScore = Math.max(0, finalScore - issuesPenalty);
- return chunkQKR2PMLZ_cjs.roundToTwoDecimals(finalScore);
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(finalScore);
  }).generateReason({
  description: "Generate human-readable explanation of noise sensitivity evaluation",
  createPrompt: ({ run, results, score }) => {
- const originalQuery = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const originalQuery = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
  const analysisResult = results.analyzeStepResult;
  if (!analysisResult) {
  throw new Error("Analysis step failed to produce results for reason generation");
@@ -2498,7 +2522,8 @@ function createPromptAlignmentScorerLLM({
  }) {
  const scale = options?.scale || 1;
  const evaluationMode = options?.evaluationMode || "both";
- return scores.createScorer({
+ return evals.createScorer({
+ id: "prompt-alignment-scorer",
  name: "Prompt Alignment (LLM)",
  description: "Evaluates how well the agent response aligns with the intent and requirements of the user prompt",
  judge: {
@@ -2509,9 +2534,9 @@ function createPromptAlignmentScorerLLM({
  description: "Analyze prompt-response alignment across multiple dimensions",
  outputSchema: analyzeOutputSchema5,
  createPrompt: ({ run }) => {
- const userPrompt = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const systemPrompt = chunkQKR2PMLZ_cjs.getCombinedSystemPrompt(run.input) ?? "";
- const agentResponse = chunkQKR2PMLZ_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
+ const userPrompt = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const systemPrompt = chunkTPQLLHZW_cjs.getCombinedSystemPrompt(run.input) ?? "";
+ const agentResponse = chunkTPQLLHZW_cjs.getAssistantMessageFromRunOutput(run.output) ?? "";
  if (evaluationMode === "user" && !userPrompt) {
  throw new Error("User prompt is required for user prompt alignment scoring");
  }
@@ -2547,12 +2572,12 @@ function createPromptAlignmentScorerLLM({
  weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
  }
  const finalScore = weightedScore * scale;
- return chunkQKR2PMLZ_cjs.roundToTwoDecimals(finalScore);
+ return chunkTPQLLHZW_cjs.roundToTwoDecimals(finalScore);
  }).generateReason({
  description: "Generate human-readable explanation of prompt alignment evaluation",
  createPrompt: ({ run, results, score }) => {
- const userPrompt = chunkQKR2PMLZ_cjs.getUserMessageFromRunInput(run.input) ?? "";
- const systemPrompt = chunkQKR2PMLZ_cjs.getCombinedSystemPrompt(run.input) ?? "";
+ const userPrompt = chunkTPQLLHZW_cjs.getUserMessageFromRunInput(run.input) ?? "";
+ const systemPrompt = chunkTPQLLHZW_cjs.getCombinedSystemPrompt(run.input) ?? "";
  const analysis = results.analyzeStepResult;
  if (!analysis) {
  return `Unable to analyze prompt alignment. Score: ${score}`;
@@ -2568,6 +2593,392 @@ function createPromptAlignmentScorerLLM({
  }
  });
  }
+ function normalizeString(str) {
+ return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
+ }
+ function extractElements(doc) {
+ const nouns = doc.nouns().out("array") || [];
+ const verbs = doc.verbs().toInfinitive().out("array") || [];
+ const topics = doc.topics().out("array") || [];
+ const terms = doc.terms().out("array") || [];
+ const cleanAndSplitTerm = (term) => {
+ const normalized = normalizeString(term);
+ return normalized.replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]+/g, " ").trim().split(/\s+/).filter((word) => word.length > 0);
+ };
+ const processedTerms = [
+ ...nouns.flatMap(cleanAndSplitTerm),
+ ...verbs.flatMap(cleanAndSplitTerm),
+ ...topics.flatMap(cleanAndSplitTerm),
+ ...terms.flatMap(cleanAndSplitTerm)
+ ];
+ return [...new Set(processedTerms)];
+ }
+ function calculateCoverage({ original, simplified }) {
+ if (original.length === 0) {
+ return simplified.length === 0 ? 1 : 0;
+ }
+ const covered = original.filter(
+ (element) => simplified.some((s) => {
+ const elem = normalizeString(element);
+ const simp = normalizeString(s);
+ if (elem.length <= 3) {
+ return elem === simp;
+ }
+ const longer = elem.length > simp.length ? elem : simp;
+ const shorter = elem.length > simp.length ? simp : elem;
+ if (longer.includes(shorter)) {
+ return shorter.length / longer.length > 0.6;
+ }
+ return false;
+ })
+ );
+ return covered.length / original.length;
+ }
+ function createCompletenessScorer() {
+ return evals.createScorer({
+ id: "completeness-scorer",
+ name: "Completeness Scorer",
+ description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+ type: "agent"
+ }).preprocess(async ({ run }) => {
+ const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
+ const content = chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i);
+ return content === null || content === void 0;
+ });
+ const isOutputInvalid = !run.output || run.output.some((i) => {
+ const content = chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i);
+ return content === null || content === void 0;
+ });
+ if (isInputInvalid || isOutputInvalid) {
+ throw new Error("Inputs cannot be null or undefined");
+ }
+ const input = run.input?.inputMessages.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+ const output = run.output?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+ const inputToProcess = input;
+ const outputToProcess = output;
+ const inputDoc = nlp__default.default(inputToProcess.trim());
+ const outputDoc = nlp__default.default(outputToProcess.trim());
+ const inputElements = extractElements(inputDoc);
+ const outputElements = extractElements(outputDoc);
+ return {
+ inputElements,
+ outputElements,
+ missingElements: inputElements.filter((e) => !outputElements.includes(e)),
+ elementCounts: {
+ input: inputElements.length,
+ output: outputElements.length
+ }
+ };
+ }).generateScore(({ results }) => {
+ const inputElements = results.preprocessStepResult?.inputElements;
+ const outputElements = results.preprocessStepResult?.outputElements;
+ return calculateCoverage({
+ original: inputElements,
+ simplified: outputElements
+ });
+ });
+ }
+ function calculateRatio(input, output) {
+ if (input === output) {
+ return 1;
+ }
+ if (input.length === 0 || output.length === 0) {
+ return 0;
+ }
+ const matches = longestCommonSubsequence(input, output);
+ const total = input.length + output.length;
+ return total > 0 ? 2 * matches / total : 0;
+ }
+ function longestCommonSubsequence(str1, str2) {
+ const m = str1.length;
+ const n = str2.length;
+ const dp = [];
+ for (let i = 0; i <= m; i++) {
+ dp[i] = [];
+ for (let j = 0; j <= n; j++) {
+ dp[i][j] = 0;
+ }
+ }
+ for (let i = 1; i <= m; i++) {
+ for (let j = 1; j <= n; j++) {
+ if (str1[i - 1] === str2[j - 1]) {
+ dp[i][j] = dp[i - 1][j - 1] + 1;
+ } else {
+ dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
+ }
+ }
+ }
+ return dp[m][n];
+ }
+ function countChanges(input, output) {
+ const inputNormalized = input.replace(/\s+/g, " ").trim();
+ const outputNormalized = output.replace(/\s+/g, " ").trim();
+ if (inputNormalized === outputNormalized) {
+ if (input !== output) {
+ const inputWords2 = input.split(/\s+/).filter((w) => w.length > 0);
+ const outputWords2 = output.split(/\s+/).filter((w) => w.length > 0);
+ return Math.abs(inputWords2.length - outputWords2.length) || 1;
+ }
+ return 0;
+ }
+ const inputWords = inputNormalized.split(/\s+/).filter((w) => w.length > 0);
+ const outputWords = outputNormalized.split(/\s+/).filter((w) => w.length > 0);
+ if (inputWords.length === 0 && outputWords.length === 0) {
+ return 0;
+ }
+ if (inputWords.length === 0) {
+ return outputWords.length;
+ }
+ if (outputWords.length === 0) {
+ return inputWords.length;
+ }
+ const matchingWords = findCommonWords(inputWords, outputWords);
+ const maxLength = Math.max(inputWords.length, outputWords.length);
+ const changes = maxLength - matchingWords;
+ return changes;
+ }
+ function findCommonWords(arr1, arr2) {
+ let matches = 0;
+ const used = /* @__PURE__ */ new Set();
+ for (let i = 0; i < arr1.length; i++) {
+ for (let j = 0; j < arr2.length; j++) {
+ if (!used.has(j) && arr1[i] === arr2[j]) {
+ matches++;
+ used.add(j);
+ break;
+ }
+ }
+ }
+ return matches;
+ }
+ function createTextualDifferenceScorer() {
+ return evals.createScorer({
+ id: "textual-difference-scorer",
+ name: "Textual Difference Scorer",
+ description: "Calculate textual difference between input and output using sequence matching algorithms.",
+ type: "agent"
+ }).preprocess(async ({ run }) => {
+ const input = run.input?.inputMessages?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+ const output = run.output?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+ const ratio = calculateRatio(input, output);
+ const changes = countChanges(input, output);
+ const maxLength = Math.max(input.length, output.length);
+ const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
+ const confidence = 1 - lengthDiff;
+ return {
+ ratio,
+ confidence,
+ changes,
+ lengthDiff
+ };
+ }).generateScore(({ results }) => {
+ return results.preprocessStepResult?.ratio;
+ });
+ }
+ function createKeywordCoverageScorer() {
+ return evals.createScorer({
+ id: "keyword-coverage-scorer",
+ name: "Keyword Coverage Scorer",
+ description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+ type: "agent"
+ }).preprocess(async ({ run }) => {
+ const input = run.input?.inputMessages?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+ const output = run.output?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+ if (!input && !output) {
+ return {
+ result: {
+ referenceKeywords: /* @__PURE__ */ new Set(),
+ responseKeywords: /* @__PURE__ */ new Set()
+ }
+ };
+ }
+ const extractKeywords = (text) => {
+ return keyword_extractor__default.default.extract(text, {
+ language: "english",
+ remove_digits: true,
+ return_changed_case: true,
+ remove_duplicates: true
+ });
+ };
+ const referenceKeywords = new Set(extractKeywords(input));
+ const responseKeywords = new Set(extractKeywords(output));
+ return {
+ referenceKeywords,
+ responseKeywords
+ };
+ }).analyze(async ({ results }) => {
+ if (!results.preprocessStepResult?.referenceKeywords?.size && !results.preprocessStepResult?.responseKeywords?.size) {
+ return {
+ totalKeywordsLength: 0,
+ matchedKeywordsLength: 0
+ };
+ }
+ const matchedKeywords = [...results.preprocessStepResult?.referenceKeywords].filter(
+ (k) => results.preprocessStepResult?.responseKeywords?.has(k)
+ );
+ return {
+ totalKeywordsLength: Array.from(results.preprocessStepResult?.referenceKeywords).length ?? 0,
+ matchedKeywordsLength: matchedKeywords.length ?? 0
+ };
+ }).generateScore(({ results }) => {
+ if (!results.analyzeStepResult?.totalKeywordsLength) {
+ return 1;
+ }
+ const totalKeywords = results.analyzeStepResult?.totalKeywordsLength;
+ const matchedKeywords = results.analyzeStepResult?.matchedKeywordsLength;
+ return totalKeywords > 0 ? matchedKeywords / totalKeywords : 0;
+ });
+ }
+ function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { ignoreCase: true, ignoreWhitespace: true }) {
+ return evals.createScorer({
+ id: "content-similarity-scorer",
+ name: "Content Similarity Scorer",
+ description: "Calculates content similarity between input and output messages using string comparison algorithms.",
+ type: "agent"
+ }).preprocess(async ({ run }) => {
+ let processedInput = run.input?.inputMessages.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+ let processedOutput = run.output.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+ if (ignoreCase) {
+ processedInput = processedInput.toLowerCase();
+ processedOutput = processedOutput.toLowerCase();
+ }
+ if (ignoreWhitespace) {
+ processedInput = processedInput.replace(/\s+/g, " ").trim();
+ processedOutput = processedOutput.replace(/\s+/g, " ").trim();
+ }
+ return {
+ processedInput,
+ processedOutput
+ };
+ }).generateScore(({ results }) => {
+ const similarity = stringSimilarity__default.default.compareTwoStrings(
+ results.preprocessStepResult?.processedInput,
+ results.preprocessStepResult?.processedOutput
+ );
+ return similarity;
+ });
+ }
+ function createToneScorer(config = {}) {
+ const { referenceTone } = config;
+ return evals.createScorer({
+ id: "tone-scorer",
+ name: "Tone Scorer",
+ description: "Analyzes the tone and sentiment of agent responses using sentiment analysis. Can compare against a reference tone or evaluate sentiment stability.",
+ type: "agent"
+ }).preprocess(async ({ run }) => {
+ const sentiment = new Sentiment__default.default();
+ const agentMessage = run.output?.map((i) => chunkTPQLLHZW_cjs.getTextContentFromMastraDBMessage(i)).join(", ") || "";
+ const responseSentiment = sentiment.analyze(agentMessage);
+ if (referenceTone) {
+ const referenceSentiment = sentiment.analyze(referenceTone);
+ const sentimentDiff = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
+ const normalizedScore = Math.max(0, 1 - sentimentDiff);
+ return {
+ score: normalizedScore,
+ responseSentiment: responseSentiment.comparative,
+ referenceSentiment: referenceSentiment.comparative,
+ difference: sentimentDiff
+ };
+ }
+ const sentences = agentMessage.match(/[^.!?]+[.!?]+/g) || [agentMessage];
+ const sentiments = sentences.map((s) => sentiment.analyze(s).comparative);
+ const avgSentiment = sentiments.reduce((a, b) => a + b, 0) / sentiments.length;
+ const variance = sentiments.reduce((sum, s) => sum + Math.pow(s - avgSentiment, 2), 0) / sentiments.length;
+ const stability = Math.max(0, 1 - variance);
+ return {
+ score: stability,
+ avgSentiment,
+ sentimentVariance: variance
+ };
+ }).generateScore(({ results }) => {
+ return results.preprocessStepResult?.score;
+ });
+ }
+ function checkToolOrder(actualTools, expectedOrder, strictMode = false) {
+ if (strictMode) {
+ return JSON.stringify(actualTools) === JSON.stringify(expectedOrder);
+ }
+ const expectedIndices = [];
+ for (const expectedTool of expectedOrder) {
+ const index = actualTools.indexOf(expectedTool);
+ if (index === -1) {
+ return false;
+ }
+ expectedIndices.push(index);
+ }
+ for (let i = 1; i < expectedIndices.length; i++) {
+ const currentIndex = expectedIndices[i];
+ const prevIndex = expectedIndices[i - 1];
+ if (currentIndex !== void 0 && prevIndex !== void 0 && currentIndex <= prevIndex) {
+ return false;
+ }
+ }
+ return true;
+ }
+ function calculateAccuracy({
+ expectedTool,
+ actualTools,
+ strictMode = false,
+ expectedToolOrder
+ }) {
+ if (actualTools.length === 0) {
+ return 0;
+ }
+ if (expectedToolOrder && expectedToolOrder.length > 0) {
+ return checkToolOrder(actualTools, expectedToolOrder, strictMode) ? 1 : 0;
+ }
+ if (!expectedTool) {
+ return 0;
+ }
+ if (strictMode) {
+ return actualTools.length === 1 && actualTools[0] === expectedTool ? 1 : 0;
+ }
+ return actualTools.includes(expectedTool) ? 1 : 0;
+ }
+ function createToolCallAccuracyScorerCode(options) {
+ const { expectedTool, strictMode = false, expectedToolOrder } = options;
+ if (!expectedTool && !expectedToolOrder) {
+ throw new Error("Either expectedTool or expectedToolOrder must be provided");
+ }
+ const getDescription = () => {
+ return expectedToolOrder ? `Evaluates whether the LLM called tools in the correct order: [${expectedToolOrder.join(", ")}]` : `Evaluates whether the LLM selected the correct tool (${expectedTool}) from the available tools`;
+ };
+ return evals.createScorer({
+ id: "code-tool-call-accuracy-scorer",
+ name: "Tool Call Accuracy Scorer",
+ description: getDescription(),
+ type: "agent"
+ }).preprocess(async ({ run }) => {
+ const isInputInvalid = !run.input || !run.input.inputMessages || run.input.inputMessages.length === 0;
+ const isOutputInvalid = !run.output || run.output.length === 0;
+ if (isInputInvalid || isOutputInvalid) {
+ throw new Error("Input and output messages cannot be null or empty");
+ }
+ const { tools: actualTools, toolCallInfos } = chunkTPQLLHZW_cjs.extractToolCalls(run.output);
+ const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
+ return {
+ expectedTool,
+ actualTools,
+ strictMode,
+ expectedToolOrder,
+ hasToolCalls: actualTools.length > 0,
+ correctToolCalled,
+ toolCallInfos,
+ correctOrderCalled: expectedToolOrder ? checkToolOrder(actualTools, expectedToolOrder, strictMode) : null
+ };
+ }).generateScore(({ results }) => {
+ const preprocessResult = results.preprocessStepResult;
+ if (!preprocessResult) {
+ return 0;
+ }
+ return calculateAccuracy({
+ expectedTool: preprocessResult.expectedTool,
+ actualTools: preprocessResult.actualTools,
+ strictMode: preprocessResult.strictMode,
+ expectedToolOrder: preprocessResult.expectedToolOrder
+ });
+ });
+ }

  exports.ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = ANSWER_RELEVANCY_AGENT_INSTRUCTIONS;
  exports.ANSWER_SIMILARITY_DEFAULT_OPTIONS = ANSWER_SIMILARITY_DEFAULT_OPTIONS;
@@ -2576,12 +2987,18 @@ exports.DEFAULT_OPTIONS = DEFAULT_OPTIONS;
  exports.createAnswerRelevancyScorer = createAnswerRelevancyScorer;
  exports.createAnswerSimilarityScorer = createAnswerSimilarityScorer;
  exports.createBiasScorer = createBiasScorer;
+ exports.createCompletenessScorer = createCompletenessScorer;
+ exports.createContentSimilarityScorer = createContentSimilarityScorer;
  exports.createContextPrecisionScorer = createContextPrecisionScorer;
  exports.createContextRelevanceScorerLLM = createContextRelevanceScorerLLM;
  exports.createFaithfulnessScorer = createFaithfulnessScorer;
  exports.createHallucinationScorer = createHallucinationScorer;
+ exports.createKeywordCoverageScorer = createKeywordCoverageScorer;
  exports.createNoiseSensitivityScorerLLM = createNoiseSensitivityScorerLLM;
  exports.createPromptAlignmentScorerLLM = createPromptAlignmentScorerLLM;
+ exports.createTextualDifferenceScorer = createTextualDifferenceScorer;
+ exports.createToneScorer = createToneScorer;
+ exports.createToolCallAccuracyScorerCode = createToolCallAccuracyScorerCode;
  exports.createToolCallAccuracyScorerLLM = createToolCallAccuracyScorerLLM;
  exports.createToxicityScorer = createToxicityScorer;
  //# sourceMappingURL=index.cjs.map
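
Migration note: the diff above folds the former scorers/llm and scorers/code entry points into a single scorers/prebuilt bundle, moves createScorer from @mastra/core/scores to @mastra/core/evals, gives every prebuilt scorer a stable id, and (per the file list) deletes the legacy metrics/* Metric classes and the old top-level index/evaluation entry points. A minimal consumer-side sketch of what this implies follows; the @mastra/evals/scorers/prebuilt subpath is inferred from the dist layout and the @ai-sdk/openai model is only a placeholder, so verify both against the published package.json "exports" map before relying on them.

  // Hypothetical usage after upgrading to 1.0.0-beta.0 (import subpath and judge model are assumptions)
  import {
    createAnswerRelevancyScorer, // LLM-judged scorer; requires a judge model
    createKeywordCoverageScorer, // code-based scorer; no model required
  } from '@mastra/evals/scorers/prebuilt';
  import { openai } from '@ai-sdk/openai';

  const relevancy = createAnswerRelevancyScorer({ model: openai('gpt-4o-mini') });
  const coverage = createKeywordCoverageScorer();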