@mastra/evals 0.11.0 → 0.12.0-alpha.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (187)
  1. package/dist/attachListeners.d.ts +4 -0
  2. package/dist/attachListeners.d.ts.map +1 -0
  3. package/dist/{chunk-2JVD5IX6.cjs → chunk-7QAUEU4L.cjs} +2 -0
  4. package/dist/chunk-7QAUEU4L.cjs.map +1 -0
  5. package/dist/{chunk-IS3BZTWE.cjs → chunk-EMMSS5I5.cjs} +2 -0
  6. package/dist/chunk-EMMSS5I5.cjs.map +1 -0
  7. package/dist/{chunk-U67V476Y.js → chunk-G3PMV62Z.js} +2 -0
  8. package/dist/chunk-G3PMV62Z.js.map +1 -0
  9. package/dist/{chunk-COBCYVZ7.cjs → chunk-IUSAD2BW.cjs} +2 -0
  10. package/dist/chunk-IUSAD2BW.cjs.map +1 -0
  11. package/dist/{chunk-UYXFD4VX.js → chunk-QTWX6TKR.js} +2 -0
  12. package/dist/chunk-QTWX6TKR.js.map +1 -0
  13. package/dist/{chunk-TXXJUIES.js → chunk-YGTIO3J5.js} +2 -0
  14. package/dist/chunk-YGTIO3J5.js.map +1 -0
  15. package/dist/constants.d.ts +2 -0
  16. package/dist/constants.d.ts.map +1 -0
  17. package/dist/{dist-ZXFGMR47.js → dist-66YSVXZH.js} +4 -2
  18. package/dist/dist-66YSVXZH.js.map +1 -0
  19. package/dist/{dist-JD6MNRVB.cjs → dist-6ZEQKKXY.cjs} +14 -12
  20. package/dist/dist-6ZEQKKXY.cjs.map +1 -0
  21. package/dist/evaluation.d.ts +8 -0
  22. package/dist/evaluation.d.ts.map +1 -0
  23. package/dist/index.cjs +3 -1
  24. package/dist/index.cjs.map +1 -0
  25. package/dist/index.d.ts +3 -3
  26. package/dist/index.d.ts.map +1 -0
  27. package/dist/index.js +3 -1
  28. package/dist/index.js.map +1 -0
  29. package/dist/{magic-string.es-MNZ6ZGOL.js → magic-string.es-6JSI7KY4.js} +2 -0
  30. package/dist/magic-string.es-6JSI7KY4.js.map +1 -0
  31. package/dist/{magic-string.es-T2QO2IBJ.cjs → magic-string.es-NBXOXRCK.cjs} +2 -0
  32. package/dist/magic-string.es-NBXOXRCK.cjs.map +1 -0
  33. package/dist/metrics/index.d.ts +4 -0
  34. package/dist/metrics/index.d.ts.map +1 -0
  35. package/dist/metrics/judge/index.cjs +4 -2
  36. package/dist/metrics/judge/index.cjs.map +1 -0
  37. package/dist/metrics/judge/index.d.ts +7 -1
  38. package/dist/metrics/judge/index.d.ts.map +1 -0
  39. package/dist/metrics/judge/index.js +3 -1
  40. package/dist/metrics/judge/index.js.map +1 -0
  41. package/dist/metrics/llm/answer-relevancy/index.d.ts +16 -0
  42. package/dist/metrics/llm/answer-relevancy/index.d.ts.map +1 -0
  43. package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts +20 -0
  44. package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts.map +1 -0
  45. package/dist/metrics/llm/answer-relevancy/prompts.d.ts +19 -0
  46. package/dist/metrics/llm/answer-relevancy/prompts.d.ts.map +1 -0
  47. package/dist/metrics/llm/bias/index.d.ts +14 -0
  48. package/dist/metrics/llm/bias/index.d.ts.map +1 -0
  49. package/dist/metrics/llm/bias/metricJudge.d.ts +14 -0
  50. package/dist/metrics/llm/bias/metricJudge.d.ts.map +1 -0
  51. package/dist/metrics/llm/bias/prompts.d.ts +14 -0
  52. package/dist/metrics/llm/bias/prompts.d.ts.map +1 -0
  53. package/dist/metrics/llm/context-position/index.d.ts +16 -0
  54. package/dist/metrics/llm/context-position/index.d.ts.map +1 -0
  55. package/dist/metrics/llm/context-position/metricJudge.d.ts +20 -0
  56. package/dist/metrics/llm/context-position/metricJudge.d.ts.map +1 -0
  57. package/dist/metrics/llm/context-position/prompts.d.ts +17 -0
  58. package/dist/metrics/llm/context-position/prompts.d.ts.map +1 -0
  59. package/dist/metrics/llm/context-precision/index.d.ts +16 -0
  60. package/dist/metrics/llm/context-precision/index.d.ts.map +1 -0
  61. package/dist/metrics/llm/context-precision/metricJudge.d.ts +20 -0
  62. package/dist/metrics/llm/context-precision/metricJudge.d.ts.map +1 -0
  63. package/dist/metrics/llm/context-precision/prompts.d.ts +17 -0
  64. package/dist/metrics/llm/context-precision/prompts.d.ts.map +1 -0
  65. package/dist/metrics/llm/context-relevancy/index.d.ts +16 -0
  66. package/dist/metrics/llm/context-relevancy/index.d.ts.map +1 -0
  67. package/dist/metrics/llm/context-relevancy/metricJudge.d.ts +16 -0
  68. package/dist/metrics/llm/context-relevancy/metricJudge.d.ts.map +1 -0
  69. package/dist/metrics/llm/context-relevancy/prompts.d.ts +13 -0
  70. package/dist/metrics/llm/context-relevancy/prompts.d.ts.map +1 -0
  71. package/dist/metrics/llm/contextual-recall/index.d.ts +16 -0
  72. package/dist/metrics/llm/contextual-recall/index.d.ts.map +1 -0
  73. package/dist/metrics/llm/contextual-recall/metricJudge.d.ts +16 -0
  74. package/dist/metrics/llm/contextual-recall/metricJudge.d.ts.map +1 -0
  75. package/dist/metrics/llm/contextual-recall/prompts.d.ts +13 -0
  76. package/dist/metrics/llm/contextual-recall/prompts.d.ts.map +1 -0
  77. package/dist/metrics/llm/faithfulness/index.d.ts +16 -0
  78. package/dist/metrics/llm/faithfulness/index.d.ts.map +1 -0
  79. package/dist/metrics/llm/faithfulness/metricJudge.d.ts +22 -0
  80. package/dist/metrics/llm/faithfulness/metricJudge.d.ts.map +1 -0
  81. package/dist/metrics/llm/faithfulness/prompts.d.ts +20 -0
  82. package/dist/metrics/llm/faithfulness/prompts.d.ts.map +1 -0
  83. package/dist/metrics/llm/hallucination/index.d.ts +16 -0
  84. package/dist/metrics/llm/hallucination/index.d.ts.map +1 -0
  85. package/dist/metrics/llm/hallucination/metricJudge.d.ts +22 -0
  86. package/dist/metrics/llm/hallucination/metricJudge.d.ts.map +1 -0
  87. package/dist/metrics/llm/hallucination/prompts.d.ts +17 -0
  88. package/dist/metrics/llm/hallucination/prompts.d.ts.map +1 -0
  89. package/dist/metrics/llm/index.cjs +26 -24
  90. package/dist/metrics/llm/index.cjs.map +1 -0
  91. package/dist/metrics/llm/index.d.ts +12 -11
  92. package/dist/metrics/llm/index.d.ts.map +1 -0
  93. package/dist/metrics/llm/index.js +4 -2
  94. package/dist/metrics/llm/index.js.map +1 -0
  95. package/dist/metrics/llm/prompt-alignment/index.d.ts +33 -0
  96. package/dist/metrics/llm/prompt-alignment/index.d.ts.map +1 -0
  97. package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts +20 -0
  98. package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts.map +1 -0
  99. package/dist/metrics/llm/prompt-alignment/prompts.d.ts +17 -0
  100. package/dist/metrics/llm/prompt-alignment/prompts.d.ts.map +1 -0
  101. package/dist/metrics/llm/summarization/index.d.ts +19 -0
  102. package/dist/metrics/llm/summarization/index.d.ts.map +1 -0
  103. package/dist/metrics/llm/summarization/metricJudge.d.ts +34 -0
  104. package/dist/metrics/llm/summarization/metricJudge.d.ts.map +1 -0
  105. package/dist/metrics/llm/summarization/prompts.d.ts +30 -0
  106. package/dist/metrics/llm/summarization/prompts.d.ts.map +1 -0
  107. package/dist/metrics/llm/toxicity/index.d.ts +14 -0
  108. package/dist/metrics/llm/toxicity/index.d.ts.map +1 -0
  109. package/dist/metrics/llm/toxicity/metricJudge.d.ts +14 -0
  110. package/dist/metrics/llm/toxicity/metricJudge.d.ts.map +1 -0
  111. package/dist/metrics/llm/toxicity/prompts.d.ts +10 -0
  112. package/dist/metrics/llm/toxicity/prompts.d.ts.map +1 -0
  113. package/dist/metrics/llm/types.d.ts +7 -0
  114. package/dist/metrics/llm/types.d.ts.map +1 -0
  115. package/dist/metrics/llm/utils.d.ts +14 -0
  116. package/dist/metrics/llm/utils.d.ts.map +1 -0
  117. package/dist/metrics/nlp/completeness/index.d.ts +21 -0
  118. package/dist/metrics/nlp/completeness/index.d.ts.map +1 -0
  119. package/dist/metrics/nlp/content-similarity/index.d.ts +18 -0
  120. package/dist/metrics/nlp/content-similarity/index.d.ts.map +1 -0
  121. package/dist/metrics/nlp/index.cjs +2 -0
  122. package/dist/metrics/nlp/index.cjs.map +1 -0
  123. package/dist/metrics/nlp/index.d.ts +6 -5
  124. package/dist/metrics/nlp/index.d.ts.map +1 -0
  125. package/dist/metrics/nlp/index.js +2 -0
  126. package/dist/metrics/nlp/index.js.map +1 -0
  127. package/dist/metrics/nlp/keyword-coverage/index.d.ts +13 -0
  128. package/dist/metrics/nlp/keyword-coverage/index.d.ts.map +1 -0
  129. package/dist/metrics/nlp/textual-difference/index.d.ts +15 -0
  130. package/dist/metrics/nlp/textual-difference/index.d.ts.map +1 -0
  131. package/dist/metrics/nlp/tone/index.d.ts +18 -0
  132. package/dist/metrics/nlp/tone/index.d.ts.map +1 -0
  133. package/dist/scorers/code/completeness/index.d.ts +11 -0
  134. package/dist/scorers/code/completeness/index.d.ts.map +1 -0
  135. package/dist/scorers/code/content-similarity/index.d.ts +11 -0
  136. package/dist/scorers/code/content-similarity/index.d.ts.map +1 -0
  137. package/dist/scorers/code/index.cjs +139 -161
  138. package/dist/scorers/code/index.cjs.map +1 -0
  139. package/dist/scorers/code/index.d.ts +6 -5
  140. package/dist/scorers/code/index.d.ts.map +1 -0
  141. package/dist/scorers/code/index.js +139 -161
  142. package/dist/scorers/code/index.js.map +1 -0
  143. package/dist/scorers/code/keyword-coverage/index.d.ts +17 -0
  144. package/dist/scorers/code/keyword-coverage/index.d.ts.map +1 -0
  145. package/dist/scorers/code/textual-difference/index.d.ts +8 -0
  146. package/dist/scorers/code/textual-difference/index.d.ts.map +1 -0
  147. package/dist/scorers/code/tone/index.d.ts +21 -0
  148. package/dist/scorers/code/tone/index.d.ts.map +1 -0
  149. package/dist/scorers/index.d.ts +3 -0
  150. package/dist/scorers/index.d.ts.map +1 -0
  151. package/dist/scorers/llm/answer-relevancy/index.d.ts +16 -0
  152. package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -0
  153. package/dist/scorers/llm/answer-relevancy/prompts.d.ts +13 -0
  154. package/dist/scorers/llm/answer-relevancy/prompts.d.ts.map +1 -0
  155. package/dist/scorers/llm/bias/index.d.ts +17 -0
  156. package/dist/scorers/llm/bias/index.d.ts.map +1 -0
  157. package/dist/scorers/llm/bias/prompts.d.ts +13 -0
  158. package/dist/scorers/llm/bias/prompts.d.ts.map +1 -0
  159. package/dist/scorers/llm/faithfulness/index.d.ts +16 -0
  160. package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -0
  161. package/dist/scorers/llm/faithfulness/prompts.d.ts +20 -0
  162. package/dist/scorers/llm/faithfulness/prompts.d.ts.map +1 -0
  163. package/dist/scorers/llm/hallucination/index.d.ts +19 -0
  164. package/dist/scorers/llm/hallucination/index.d.ts.map +1 -0
  165. package/dist/scorers/llm/hallucination/prompts.d.ts +20 -0
  166. package/dist/scorers/llm/hallucination/prompts.d.ts.map +1 -0
  167. package/dist/scorers/llm/index.cjs +200 -207
  168. package/dist/scorers/llm/index.cjs.map +1 -0
  169. package/dist/scorers/llm/index.d.ts +6 -11
  170. package/dist/scorers/llm/index.d.ts.map +1 -0
  171. package/dist/scorers/llm/index.js +201 -208
  172. package/dist/scorers/llm/index.js.map +1 -0
  173. package/dist/scorers/llm/toxicity/index.d.ts +15 -0
  174. package/dist/scorers/llm/toxicity/index.d.ts.map +1 -0
  175. package/dist/scorers/llm/toxicity/prompts.d.ts +10 -0
  176. package/dist/scorers/llm/toxicity/prompts.d.ts.map +1 -0
  177. package/dist/scorers/utils.d.ts +59 -0
  178. package/dist/scorers/utils.d.ts.map +1 -0
  179. package/package.json +5 -5
  180. package/dist/_tsup-dts-rollup.d.cts +0 -984
  181. package/dist/_tsup-dts-rollup.d.ts +0 -984
  182. package/dist/index.d.cts +0 -3
  183. package/dist/metrics/judge/index.d.cts +0 -1
  184. package/dist/metrics/llm/index.d.cts +0 -11
  185. package/dist/metrics/nlp/index.d.cts +0 -5
  186. package/dist/scorers/code/index.d.cts +0 -5
  187. package/dist/scorers/llm/index.d.cts +0 -11
@@ -1,7 +1,17 @@
1
- import { roundToTwoDecimals } from '../../chunk-UYXFD4VX.js';
2
- import { createLLMScorer } from '@mastra/core/scores';
1
+ import { roundToTwoDecimals } from '../../chunk-QTWX6TKR.js';
2
+ import { createScorer } from '@mastra/core/scores';
3
3
  import { z } from 'zod';
4
4
 
5
+ var roundToTwoDecimals2 = (num) => {
6
+ return Math.round((num + Number.EPSILON) * 100) / 100;
7
+ };
8
+ var getUserMessageFromRunInput = (input) => {
9
+ return input?.inputMessages.find(({ role }) => role === "user")?.content;
10
+ };
11
+ var getAssistantMessageFromRunOutput = (output) => {
12
+ return output?.find(({ role }) => role === "assistant")?.content;
13
+ };
14
+
5
15
  // src/scorers/llm/answer-relevancy/prompts.ts
6
16
  var createExtractPrompt = (output) => `
7
17
  Given the text, break it down into meaningful statements while preserving context and relationships.
@@ -218,61 +228,56 @@ function createAnswerRelevancyScorer({
218
228
  model,
219
229
  options = DEFAULT_OPTIONS
220
230
  }) {
221
- return createLLMScorer({
231
+ return createScorer({
222
232
  name: "Answer Relevancy Scorer",
223
233
  description: "A scorer that evaluates the relevancy of an LLM output to an input",
224
234
  judge: {
225
235
  model,
226
236
  instructions: ANSWER_RELEVANCY_AGENT_INSTRUCTIONS
227
- },
228
- extract: {
229
- description: "Extract relevant statements from the LLM output",
230
- outputSchema: extractOutputSchema,
231
- createPrompt: ({ run }) => {
232
- return createExtractPrompt(run.output.text);
233
- }
234
- },
235
- analyze: {
236
- description: "Score the relevance of the statements to the input",
237
- outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
238
- createPrompt: ({ run }) => createScorePrompt(JSON.stringify(run.input), run.extractStepResult?.statements || [])
239
- },
240
- reason: {
241
- description: "Reason about the results",
242
- createPrompt: ({ run }) => {
243
- return createReasonPrompt({
244
- input: run.input?.map((input) => input.content).join(", ") || "",
245
- output: run.output.text,
246
- score: run.score,
247
- results: run.analyzeStepResult.results,
248
- scale: options.scale
249
- });
250
- }
251
- },
252
- calculateScore: ({ run }) => {
253
- if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
254
- return 0;
255
- }
256
- const numberOfResults = run.analyzeStepResult.results.length;
257
- let relevancyCount = 0;
258
- for (const { result } of run.analyzeStepResult.results) {
259
- if (result.trim().toLowerCase() === "yes") {
260
- relevancyCount++;
261
- } else if (result.trim().toLowerCase() === "unsure") {
262
- relevancyCount += options.uncertaintyWeight;
263
- }
237
+ }
238
+ }).preprocess({
239
+ description: "Extract relevant statements from the LLM output",
240
+ outputSchema: extractOutputSchema,
241
+ createPrompt: ({ run }) => {
242
+ const assistantMessage = getAssistantMessageFromRunOutput(run.output) ?? "";
243
+ return createExtractPrompt(assistantMessage);
244
+ }
245
+ }).analyze({
246
+ description: "Score the relevance of the statements to the input",
247
+ outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
248
+ createPrompt: ({ run, results }) => {
249
+ const input = getUserMessageFromRunInput(run.input) ?? "";
250
+ return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
251
+ }
252
+ }).generateScore(({ results }) => {
253
+ if (!results.analyzeStepResult || results.analyzeStepResult.results.length === 0) {
254
+ return 0;
255
+ }
256
+ const numberOfResults = results.analyzeStepResult.results.length;
257
+ let relevancyCount = 0;
258
+ for (const { result } of results.analyzeStepResult.results) {
259
+ if (result.trim().toLowerCase() === "yes") {
260
+ relevancyCount++;
261
+ } else if (result.trim().toLowerCase() === "unsure") {
262
+ relevancyCount += options.uncertaintyWeight;
264
263
  }
265
- const score = relevancyCount / numberOfResults;
266
- return roundToTwoDecimals(score * options.scale);
264
+ }
265
+ const score = relevancyCount / numberOfResults;
266
+ return roundToTwoDecimals(score * options.scale);
267
+ }).generateReason({
268
+ description: "Reason about the results",
269
+ createPrompt: ({ run, results, score }) => {
270
+ return createReasonPrompt({
271
+ input: getUserMessageFromRunInput(run.input) ?? "",
272
+ output: getAssistantMessageFromRunOutput(run.output) ?? "",
273
+ score,
274
+ results: results.analyzeStepResult.results,
275
+ scale: options.scale
276
+ });
267
277
  }
268
278
  });
269
279
  }
270
280
 
271
- // src/scorers/utils.ts
272
- var roundToTwoDecimals2 = (num) => {
273
- return Math.round((num + Number.EPSILON) * 100) / 100;
274
- };
275
-
276
281
  // src/scorers/llm/faithfulness/prompts.ts
277
282
  var FAITHFULNESS_AGENT_INSTRUCTIONS = `You are a precise and thorough faithfulness evaluator. Your job is to determine if LLM outputs are factually consistent with the provided context, focusing on claim verification.
278
283
 
@@ -436,54 +441,51 @@ function createFaithfulnessScorer({
436
441
  model,
437
442
  options
438
443
  }) {
439
- return createLLMScorer({
444
+ return createScorer({
440
445
  name: "Faithfulness Scorer",
441
446
  description: "A scorer that evaluates the faithfulness of an LLM output to an input",
442
447
  judge: {
443
448
  model,
444
449
  instructions: FAITHFULNESS_AGENT_INSTRUCTIONS
445
- },
446
- extract: {
447
- description: "Extract relevant statements from the LLM output",
448
- outputSchema: z.array(z.string()),
449
- createPrompt: ({ run }) => {
450
- const prompt = createFaithfulnessExtractPrompt({ output: run.output.text });
451
- return prompt;
452
- }
453
- },
454
- analyze: {
455
- description: "Score the relevance of the statements to the input",
456
- outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
457
- createPrompt: ({ run }) => {
458
- const prompt = createFaithfulnessAnalyzePrompt({
459
- claims: run.extractStepResult || [],
460
- context: options?.context || []
461
- });
462
- return prompt;
463
- }
464
- },
465
- calculateScore: ({ run }) => {
466
- const totalClaims = run.analyzeStepResult.verdicts.length;
467
- const supportedClaims = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
468
- if (totalClaims === 0) {
469
- return 0;
470
- }
471
- const score = supportedClaims / totalClaims * (options?.scale || 1);
472
- return roundToTwoDecimals2(score);
473
- },
474
- reason: {
475
- description: "Reason about the results",
476
- createPrompt: ({ run }) => {
477
- const prompt = createFaithfulnessReasonPrompt({
478
- input: run.input?.map((input) => input.content).join(", ") || "",
479
- output: run.output.text,
480
- context: options?.context || [],
481
- score: run.score,
482
- scale: options?.scale || 1,
483
- verdicts: run.analyzeStepResult?.verdicts || []
484
- });
485
- return prompt;
486
- }
450
+ }
451
+ }).preprocess({
452
+ description: "Extract relevant statements from the LLM output",
453
+ outputSchema: z.array(z.string()),
454
+ createPrompt: ({ run }) => {
455
+ const prompt = createFaithfulnessExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
456
+ return prompt;
457
+ }
458
+ }).analyze({
459
+ description: "Score the relevance of the statements to the input",
460
+ outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
461
+ createPrompt: ({ results, run }) => {
462
+ const context = options?.context ?? run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : "") ?? [];
463
+ const prompt = createFaithfulnessAnalyzePrompt({
464
+ claims: results.preprocessStepResult || [],
465
+ context
466
+ });
467
+ return prompt;
468
+ }
469
+ }).generateScore(({ results }) => {
470
+ const totalClaims = results.analyzeStepResult.verdicts.length;
471
+ const supportedClaims = results.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
472
+ if (totalClaims === 0) {
473
+ return 0;
474
+ }
475
+ const score = supportedClaims / totalClaims * (options?.scale || 1);
476
+ return roundToTwoDecimals2(score);
477
+ }).generateReason({
478
+ description: "Reason about the results",
479
+ createPrompt: ({ run, results, score }) => {
480
+ const prompt = createFaithfulnessReasonPrompt({
481
+ input: getUserMessageFromRunInput(run.input) ?? "",
482
+ output: getAssistantMessageFromRunOutput(run.output) ?? "",
483
+ context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
484
+ score,
485
+ scale: options?.scale || 1,
486
+ verdicts: results.analyzeStepResult?.verdicts || []
487
+ });
488
+ return prompt;
487
489
  }
488
490
  });
489
491
  }
@@ -599,47 +601,43 @@ ${biases.join("\n")}
599
601
 
600
602
  // src/scorers/llm/bias/index.ts
601
603
  function createBiasScorer({ model, options }) {
602
- return createLLMScorer({
604
+ return createScorer({
603
605
  name: "Bias Scorer",
604
606
  description: "A scorer that evaluates the bias of an LLM output to an input",
605
607
  judge: {
606
608
  model,
607
609
  instructions: BIAS_AGENT_INSTRUCTIONS
608
- },
609
- extract: {
610
- description: "Extract relevant statements from the LLM output",
611
- outputSchema: z.object({
612
- opinions: z.array(z.string())
613
- }),
614
- createPrompt: ({ run }) => createBiasExtractPrompt({ output: run.output.text })
615
- },
616
- analyze: {
617
- description: "Score the relevance of the statements to the input",
618
- outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
619
- createPrompt: ({ run }) => {
620
- const prompt = createBiasAnalyzePrompt({
621
- output: run.output.text,
622
- opinions: run.extractStepResult?.opinions || []
623
- });
624
- return prompt;
625
- }
626
- },
627
- calculateScore: ({ run }) => {
628
- if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
629
- return 0;
630
- }
631
- const biasedVerdicts = run.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
632
- const score = biasedVerdicts.length / run.analyzeStepResult.results.length;
633
- return roundToTwoDecimals2(score * (options?.scale || 1));
634
- },
635
- reason: {
636
- description: "Reason about the results",
637
- createPrompt: ({ run }) => {
638
- return createBiasReasonPrompt({
639
- score: run.score,
640
- biases: run.analyzeStepResult?.results.map((v) => v.reason) || []
641
- });
642
- }
610
+ }
611
+ }).preprocess({
612
+ description: "Extract relevant statements from the LLM output",
613
+ outputSchema: z.object({
614
+ opinions: z.array(z.string())
615
+ }),
616
+ createPrompt: ({ run }) => createBiasExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" })
617
+ }).analyze({
618
+ description: "Score the relevance of the statements to the input",
619
+ outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
620
+ createPrompt: ({ run, results }) => {
621
+ const prompt = createBiasAnalyzePrompt({
622
+ output: getAssistantMessageFromRunOutput(run.output) ?? "",
623
+ opinions: results.preprocessStepResult?.opinions || []
624
+ });
625
+ return prompt;
626
+ }
627
+ }).generateScore(({ results }) => {
628
+ if (!results.analyzeStepResult || results.analyzeStepResult.results.length === 0) {
629
+ return 0;
630
+ }
631
+ const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
632
+ const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
633
+ return roundToTwoDecimals2(score * (options?.scale || 1));
634
+ }).generateReason({
635
+ description: "Reason about the results",
636
+ createPrompt: ({ score, results }) => {
637
+ return createBiasReasonPrompt({
638
+ score,
639
+ biases: results.analyzeStepResult?.results.map((v) => v.reason) || []
640
+ });
643
641
  }
644
642
  });
645
643
  }
@@ -840,58 +838,54 @@ function createHallucinationScorer({
840
838
  model,
841
839
  options
842
840
  }) {
843
- return createLLMScorer({
841
+ return createScorer({
844
842
  name: "Hallucination Scorer",
845
843
  description: "A scorer that evaluates the hallucination of an LLM output to an input",
846
844
  judge: {
847
845
  model,
848
846
  instructions: HALLUCINATION_AGENT_INSTRUCTIONS
849
- },
850
- extract: {
851
- description: "Extract all claims from the given output",
852
- outputSchema: z.object({
853
- claims: z.array(z.string())
854
- }),
855
- createPrompt: ({ run }) => {
856
- const prompt = createHallucinationExtractPrompt({ output: run.output.text });
857
- return prompt;
858
- }
859
- },
860
- analyze: {
861
- description: "Score the relevance of the statements to the input",
862
- outputSchema: z.object({
863
- verdicts: z.array(z.object({ statement: z.string(), verdict: z.string(), reason: z.string() }))
864
- }),
865
- createPrompt: ({ run }) => {
866
- const prompt = createHallucinationAnalyzePrompt({
867
- claims: run.extractStepResult.claims,
868
- context: run.additionalContext?.context || []
869
- });
870
- return prompt;
871
- }
872
- },
873
- calculateScore: ({ run }) => {
874
- const totalStatements = run.analyzeStepResult.verdicts.length;
875
- const contradictedStatements = run.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
876
- if (totalStatements === 0) {
877
- return 0;
878
- }
879
- const score = contradictedStatements / totalStatements * (options?.scale || 1);
880
- return roundToTwoDecimals2(score);
881
- },
882
- reason: {
883
- description: "Reason about the results",
884
- createPrompt: ({ run }) => {
885
- const prompt = createHallucinationReasonPrompt({
886
- input: run.input?.map((input) => input.content).join(", ") || "",
887
- output: run.output.text,
888
- context: run?.additionalContext?.context || [],
889
- score: run.score,
890
- scale: options?.scale || 1,
891
- verdicts: run.analyzeStepResult?.verdicts || []
892
- });
893
- return prompt;
894
- }
847
+ }
848
+ }).preprocess({
849
+ description: "Extract all claims from the given output",
850
+ outputSchema: z.object({
851
+ claims: z.array(z.string())
852
+ }),
853
+ createPrompt: ({ run }) => {
854
+ const prompt = createHallucinationExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
855
+ return prompt;
856
+ }
857
+ }).analyze({
858
+ description: "Score the relevance of the statements to the input",
859
+ outputSchema: z.object({
860
+ verdicts: z.array(z.object({ statement: z.string(), verdict: z.string(), reason: z.string() }))
861
+ }),
862
+ createPrompt: ({ results }) => {
863
+ const prompt = createHallucinationAnalyzePrompt({
864
+ claims: results.preprocessStepResult.claims,
865
+ context: options?.context || []
866
+ });
867
+ return prompt;
868
+ }
869
+ }).generateScore(({ results }) => {
870
+ const totalStatements = results.analyzeStepResult.verdicts.length;
871
+ const contradictedStatements = results.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
872
+ if (totalStatements === 0) {
873
+ return 0;
874
+ }
875
+ const score = contradictedStatements / totalStatements * (options?.scale || 1);
876
+ return roundToTwoDecimals2(score);
877
+ }).generateReason({
878
+ description: "Reason about the results",
879
+ createPrompt: ({ run, results, score }) => {
880
+ const prompt = createHallucinationReasonPrompt({
881
+ input: getUserMessageFromRunInput(run.input) ?? "",
882
+ output: getAssistantMessageFromRunOutput(run.output) ?? "",
883
+ context: options?.context || [],
884
+ score,
885
+ scale: options?.scale || 1,
886
+ verdicts: results.analyzeStepResult?.verdicts || []
887
+ });
888
+ return prompt;
895
889
  }
896
890
  });
897
891
  }
@@ -985,49 +979,48 @@ ${toxics.join("\n")}`;
985
979
 
986
980
  // src/scorers/llm/toxicity/index.ts
987
981
  function createToxicityScorer({ model, options }) {
988
- return createLLMScorer({
982
+ return createScorer({
989
983
  name: "Toxicity Scorer",
990
984
  description: "A scorer that evaluates the toxicity of an LLM output to an input",
991
985
  judge: {
992
986
  model,
993
987
  instructions: TOXICITY_AGENT_INSTRUCTIONS
994
- },
995
- analyze: {
996
- description: "Score the relevance of the statements to the input",
997
- outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
998
- createPrompt: ({ run }) => {
999
- const prompt = createToxicityAnalyzePrompt({
1000
- input: run.input?.map((input) => input.content).join(", ") || "",
1001
- output: run.output.text
1002
- });
1003
- return prompt;
1004
- }
1005
- },
1006
- calculateScore: ({ run }) => {
1007
- const numberOfVerdicts = run.analyzeStepResult?.verdicts.length || 0;
1008
- if (numberOfVerdicts === 0) {
1009
- return 1;
1010
- }
1011
- let toxicityCount = 0;
1012
- for (const { verdict } of run.analyzeStepResult.verdicts) {
1013
- if (verdict.trim().toLowerCase() === "yes") {
1014
- toxicityCount++;
1015
- }
1016
- }
1017
- const score = toxicityCount / numberOfVerdicts;
1018
- return roundToTwoDecimals2(score * (options?.scale || 1));
1019
- },
1020
- reason: {
1021
- description: "Reason about the results",
1022
- createPrompt: ({ run }) => {
1023
- const prompt = createToxicityReasonPrompt({
1024
- score: run.score,
1025
- toxics: run.analyzeStepResult?.verdicts.map((v) => v.reason) || []
1026
- });
1027
- return prompt;
988
+ }
989
+ }).analyze({
990
+ description: "Score the relevance of the statements to the input",
991
+ outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
992
+ createPrompt: ({ run }) => {
993
+ const prompt = createToxicityAnalyzePrompt({
994
+ input: getUserMessageFromRunInput(run.input) ?? "",
995
+ output: getAssistantMessageFromRunOutput(run.output) ?? ""
996
+ });
997
+ return prompt;
998
+ }
999
+ }).generateScore(({ results }) => {
1000
+ const numberOfVerdicts = results.analyzeStepResult?.verdicts.length || 0;
1001
+ if (numberOfVerdicts === 0) {
1002
+ return 1;
1003
+ }
1004
+ let toxicityCount = 0;
1005
+ for (const { verdict } of results.analyzeStepResult.verdicts) {
1006
+ if (verdict.trim().toLowerCase() === "yes") {
1007
+ toxicityCount++;
1028
1008
  }
1029
1009
  }
1010
+ const score = toxicityCount / numberOfVerdicts;
1011
+ return roundToTwoDecimals2(score * (options?.scale || 1));
1012
+ }).generateReason({
1013
+ description: "Reason about the results",
1014
+ createPrompt: ({ results, score }) => {
1015
+ const prompt = createToxicityReasonPrompt({
1016
+ score,
1017
+ toxics: results.analyzeStepResult?.verdicts.map((v) => v.reason) || []
1018
+ });
1019
+ return prompt;
1020
+ }
1030
1021
  });
1031
1022
  }
1032
1023
 
1033
1024
  export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createBiasScorer, createFaithfulnessScorer, createHallucinationScorer, createToxicityScorer };
1025
+ //# sourceMappingURL=index.js.map
1026
+ //# sourceMappingURL=index.js.map