@mastra/evals 0.11.0 → 0.12.0-alpha.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/attachListeners.d.ts +4 -0
- package/dist/attachListeners.d.ts.map +1 -0
- package/dist/{chunk-2JVD5IX6.cjs → chunk-7QAUEU4L.cjs} +2 -0
- package/dist/chunk-7QAUEU4L.cjs.map +1 -0
- package/dist/{chunk-IS3BZTWE.cjs → chunk-EMMSS5I5.cjs} +2 -0
- package/dist/chunk-EMMSS5I5.cjs.map +1 -0
- package/dist/{chunk-U67V476Y.js → chunk-G3PMV62Z.js} +2 -0
- package/dist/chunk-G3PMV62Z.js.map +1 -0
- package/dist/{chunk-COBCYVZ7.cjs → chunk-IUSAD2BW.cjs} +2 -0
- package/dist/chunk-IUSAD2BW.cjs.map +1 -0
- package/dist/{chunk-UYXFD4VX.js → chunk-QTWX6TKR.js} +2 -0
- package/dist/chunk-QTWX6TKR.js.map +1 -0
- package/dist/{chunk-TXXJUIES.js → chunk-YGTIO3J5.js} +2 -0
- package/dist/chunk-YGTIO3J5.js.map +1 -0
- package/dist/constants.d.ts +2 -0
- package/dist/constants.d.ts.map +1 -0
- package/dist/{dist-ZXFGMR47.js → dist-66YSVXZH.js} +4 -2
- package/dist/dist-66YSVXZH.js.map +1 -0
- package/dist/{dist-JD6MNRVB.cjs → dist-6ZEQKKXY.cjs} +14 -12
- package/dist/dist-6ZEQKKXY.cjs.map +1 -0
- package/dist/evaluation.d.ts +8 -0
- package/dist/evaluation.d.ts.map +1 -0
- package/dist/index.cjs +3 -1
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.ts +3 -3
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +3 -1
- package/dist/index.js.map +1 -0
- package/dist/{magic-string.es-MNZ6ZGOL.js → magic-string.es-6JSI7KY4.js} +2 -0
- package/dist/magic-string.es-6JSI7KY4.js.map +1 -0
- package/dist/{magic-string.es-T2QO2IBJ.cjs → magic-string.es-NBXOXRCK.cjs} +2 -0
- package/dist/magic-string.es-NBXOXRCK.cjs.map +1 -0
- package/dist/metrics/index.d.ts +4 -0
- package/dist/metrics/index.d.ts.map +1 -0
- package/dist/metrics/judge/index.cjs +4 -2
- package/dist/metrics/judge/index.cjs.map +1 -0
- package/dist/metrics/judge/index.d.ts +7 -1
- package/dist/metrics/judge/index.d.ts.map +1 -0
- package/dist/metrics/judge/index.js +3 -1
- package/dist/metrics/judge/index.js.map +1 -0
- package/dist/metrics/llm/answer-relevancy/index.d.ts +16 -0
- package/dist/metrics/llm/answer-relevancy/index.d.ts.map +1 -0
- package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts +20 -0
- package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/answer-relevancy/prompts.d.ts +19 -0
- package/dist/metrics/llm/answer-relevancy/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/bias/index.d.ts +14 -0
- package/dist/metrics/llm/bias/index.d.ts.map +1 -0
- package/dist/metrics/llm/bias/metricJudge.d.ts +14 -0
- package/dist/metrics/llm/bias/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/bias/prompts.d.ts +14 -0
- package/dist/metrics/llm/bias/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/context-position/index.d.ts +16 -0
- package/dist/metrics/llm/context-position/index.d.ts.map +1 -0
- package/dist/metrics/llm/context-position/metricJudge.d.ts +20 -0
- package/dist/metrics/llm/context-position/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/context-position/prompts.d.ts +17 -0
- package/dist/metrics/llm/context-position/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/context-precision/index.d.ts +16 -0
- package/dist/metrics/llm/context-precision/index.d.ts.map +1 -0
- package/dist/metrics/llm/context-precision/metricJudge.d.ts +20 -0
- package/dist/metrics/llm/context-precision/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/context-precision/prompts.d.ts +17 -0
- package/dist/metrics/llm/context-precision/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/context-relevancy/index.d.ts +16 -0
- package/dist/metrics/llm/context-relevancy/index.d.ts.map +1 -0
- package/dist/metrics/llm/context-relevancy/metricJudge.d.ts +16 -0
- package/dist/metrics/llm/context-relevancy/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/context-relevancy/prompts.d.ts +13 -0
- package/dist/metrics/llm/context-relevancy/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/contextual-recall/index.d.ts +16 -0
- package/dist/metrics/llm/contextual-recall/index.d.ts.map +1 -0
- package/dist/metrics/llm/contextual-recall/metricJudge.d.ts +16 -0
- package/dist/metrics/llm/contextual-recall/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/contextual-recall/prompts.d.ts +13 -0
- package/dist/metrics/llm/contextual-recall/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/faithfulness/index.d.ts +16 -0
- package/dist/metrics/llm/faithfulness/index.d.ts.map +1 -0
- package/dist/metrics/llm/faithfulness/metricJudge.d.ts +22 -0
- package/dist/metrics/llm/faithfulness/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/faithfulness/prompts.d.ts +20 -0
- package/dist/metrics/llm/faithfulness/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/hallucination/index.d.ts +16 -0
- package/dist/metrics/llm/hallucination/index.d.ts.map +1 -0
- package/dist/metrics/llm/hallucination/metricJudge.d.ts +22 -0
- package/dist/metrics/llm/hallucination/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/hallucination/prompts.d.ts +17 -0
- package/dist/metrics/llm/hallucination/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/index.cjs +26 -24
- package/dist/metrics/llm/index.cjs.map +1 -0
- package/dist/metrics/llm/index.d.ts +12 -11
- package/dist/metrics/llm/index.d.ts.map +1 -0
- package/dist/metrics/llm/index.js +4 -2
- package/dist/metrics/llm/index.js.map +1 -0
- package/dist/metrics/llm/prompt-alignment/index.d.ts +33 -0
- package/dist/metrics/llm/prompt-alignment/index.d.ts.map +1 -0
- package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts +20 -0
- package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/prompt-alignment/prompts.d.ts +17 -0
- package/dist/metrics/llm/prompt-alignment/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/summarization/index.d.ts +19 -0
- package/dist/metrics/llm/summarization/index.d.ts.map +1 -0
- package/dist/metrics/llm/summarization/metricJudge.d.ts +34 -0
- package/dist/metrics/llm/summarization/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/summarization/prompts.d.ts +30 -0
- package/dist/metrics/llm/summarization/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/toxicity/index.d.ts +14 -0
- package/dist/metrics/llm/toxicity/index.d.ts.map +1 -0
- package/dist/metrics/llm/toxicity/metricJudge.d.ts +14 -0
- package/dist/metrics/llm/toxicity/metricJudge.d.ts.map +1 -0
- package/dist/metrics/llm/toxicity/prompts.d.ts +10 -0
- package/dist/metrics/llm/toxicity/prompts.d.ts.map +1 -0
- package/dist/metrics/llm/types.d.ts +7 -0
- package/dist/metrics/llm/types.d.ts.map +1 -0
- package/dist/metrics/llm/utils.d.ts +14 -0
- package/dist/metrics/llm/utils.d.ts.map +1 -0
- package/dist/metrics/nlp/completeness/index.d.ts +21 -0
- package/dist/metrics/nlp/completeness/index.d.ts.map +1 -0
- package/dist/metrics/nlp/content-similarity/index.d.ts +18 -0
- package/dist/metrics/nlp/content-similarity/index.d.ts.map +1 -0
- package/dist/metrics/nlp/index.cjs +2 -0
- package/dist/metrics/nlp/index.cjs.map +1 -0
- package/dist/metrics/nlp/index.d.ts +6 -5
- package/dist/metrics/nlp/index.d.ts.map +1 -0
- package/dist/metrics/nlp/index.js +2 -0
- package/dist/metrics/nlp/index.js.map +1 -0
- package/dist/metrics/nlp/keyword-coverage/index.d.ts +13 -0
- package/dist/metrics/nlp/keyword-coverage/index.d.ts.map +1 -0
- package/dist/metrics/nlp/textual-difference/index.d.ts +15 -0
- package/dist/metrics/nlp/textual-difference/index.d.ts.map +1 -0
- package/dist/metrics/nlp/tone/index.d.ts +18 -0
- package/dist/metrics/nlp/tone/index.d.ts.map +1 -0
- package/dist/scorers/code/completeness/index.d.ts +11 -0
- package/dist/scorers/code/completeness/index.d.ts.map +1 -0
- package/dist/scorers/code/content-similarity/index.d.ts +11 -0
- package/dist/scorers/code/content-similarity/index.d.ts.map +1 -0
- package/dist/scorers/code/index.cjs +139 -161
- package/dist/scorers/code/index.cjs.map +1 -0
- package/dist/scorers/code/index.d.ts +6 -5
- package/dist/scorers/code/index.d.ts.map +1 -0
- package/dist/scorers/code/index.js +139 -161
- package/dist/scorers/code/index.js.map +1 -0
- package/dist/scorers/code/keyword-coverage/index.d.ts +17 -0
- package/dist/scorers/code/keyword-coverage/index.d.ts.map +1 -0
- package/dist/scorers/code/textual-difference/index.d.ts +8 -0
- package/dist/scorers/code/textual-difference/index.d.ts.map +1 -0
- package/dist/scorers/code/tone/index.d.ts +21 -0
- package/dist/scorers/code/tone/index.d.ts.map +1 -0
- package/dist/scorers/index.d.ts +3 -0
- package/dist/scorers/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-relevancy/index.d.ts +16 -0
- package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -0
- package/dist/scorers/llm/answer-relevancy/prompts.d.ts +13 -0
- package/dist/scorers/llm/answer-relevancy/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/bias/index.d.ts +17 -0
- package/dist/scorers/llm/bias/index.d.ts.map +1 -0
- package/dist/scorers/llm/bias/prompts.d.ts +13 -0
- package/dist/scorers/llm/bias/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/faithfulness/index.d.ts +16 -0
- package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -0
- package/dist/scorers/llm/faithfulness/prompts.d.ts +20 -0
- package/dist/scorers/llm/faithfulness/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/hallucination/index.d.ts +19 -0
- package/dist/scorers/llm/hallucination/index.d.ts.map +1 -0
- package/dist/scorers/llm/hallucination/prompts.d.ts +20 -0
- package/dist/scorers/llm/hallucination/prompts.d.ts.map +1 -0
- package/dist/scorers/llm/index.cjs +200 -207
- package/dist/scorers/llm/index.cjs.map +1 -0
- package/dist/scorers/llm/index.d.ts +6 -11
- package/dist/scorers/llm/index.d.ts.map +1 -0
- package/dist/scorers/llm/index.js +201 -208
- package/dist/scorers/llm/index.js.map +1 -0
- package/dist/scorers/llm/toxicity/index.d.ts +15 -0
- package/dist/scorers/llm/toxicity/index.d.ts.map +1 -0
- package/dist/scorers/llm/toxicity/prompts.d.ts +10 -0
- package/dist/scorers/llm/toxicity/prompts.d.ts.map +1 -0
- package/dist/scorers/utils.d.ts +59 -0
- package/dist/scorers/utils.d.ts.map +1 -0
- package/package.json +5 -5
- package/dist/_tsup-dts-rollup.d.cts +0 -984
- package/dist/_tsup-dts-rollup.d.ts +0 -984
- package/dist/index.d.cts +0 -3
- package/dist/metrics/judge/index.d.cts +0 -1
- package/dist/metrics/llm/index.d.cts +0 -11
- package/dist/metrics/nlp/index.d.cts +0 -5
- package/dist/scorers/code/index.d.cts +0 -5
- package/dist/scorers/llm/index.d.cts +0 -11
package/dist/scorers/llm/index.js
@@ -1,7 +1,17 @@
-import { roundToTwoDecimals } from '../../chunk-
-import {
+import { roundToTwoDecimals } from '../../chunk-QTWX6TKR.js';
+import { createScorer } from '@mastra/core/scores';
 import { z } from 'zod';
 
+var roundToTwoDecimals2 = (num) => {
+  return Math.round((num + Number.EPSILON) * 100) / 100;
+};
+var getUserMessageFromRunInput = (input) => {
+  return input?.inputMessages.find(({ role }) => role === "user")?.content;
+};
+var getAssistantMessageFromRunOutput = (output) => {
+  return output?.find(({ role }) => role === "assistant")?.content;
+};
+
 // src/scorers/llm/answer-relevancy/prompts.ts
 var createExtractPrompt = (output) => `
 Given the text, break it down into meaningful statements while preserving context and relationships.
@@ -218,61 +228,56 @@ function createAnswerRelevancyScorer({
   model,
   options = DEFAULT_OPTIONS
 }) {
-  return
+  return createScorer({
     name: "Answer Relevancy Scorer",
     description: "A scorer that evaluates the relevancy of an LLM output to an input",
     judge: {
       model,
       instructions: ANSWER_RELEVANCY_AGENT_INSTRUCTIONS
-    }
-
-
-
-
-
-
-    }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      if (!run.analyzeStepResult || run.analyzeStepResult.results.length === 0) {
-        return 0;
-      }
-      const numberOfResults = run.analyzeStepResult.results.length;
-      let relevancyCount = 0;
-      for (const { result } of run.analyzeStepResult.results) {
-        if (result.trim().toLowerCase() === "yes") {
-          relevancyCount++;
-        } else if (result.trim().toLowerCase() === "unsure") {
-          relevancyCount += options.uncertaintyWeight;
-        }
+    }
+  }).preprocess({
+    description: "Extract relevant statements from the LLM output",
+    outputSchema: extractOutputSchema,
+    createPrompt: ({ run }) => {
+      const assistantMessage = getAssistantMessageFromRunOutput(run.output) ?? "";
+      return createExtractPrompt(assistantMessage);
+    }
+  }).analyze({
+    description: "Score the relevance of the statements to the input",
+    outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
+    createPrompt: ({ run, results }) => {
+      const input = getUserMessageFromRunInput(run.input) ?? "";
+      return createScorePrompt(JSON.stringify(input), results.preprocessStepResult?.statements || []);
+    }
+  }).generateScore(({ results }) => {
+    if (!results.analyzeStepResult || results.analyzeStepResult.results.length === 0) {
+      return 0;
+    }
+    const numberOfResults = results.analyzeStepResult.results.length;
+    let relevancyCount = 0;
+    for (const { result } of results.analyzeStepResult.results) {
+      if (result.trim().toLowerCase() === "yes") {
+        relevancyCount++;
+      } else if (result.trim().toLowerCase() === "unsure") {
+        relevancyCount += options.uncertaintyWeight;
       }
-
-
+    }
+    const score = relevancyCount / numberOfResults;
+    return roundToTwoDecimals(score * options.scale);
+  }).generateReason({
+    description: "Reason about the results",
+    createPrompt: ({ run, results, score }) => {
+      return createReasonPrompt({
+        input: getUserMessageFromRunInput(run.input) ?? "",
+        output: getAssistantMessageFromRunOutput(run.output) ?? "",
+        score,
+        results: results.analyzeStepResult.results,
+        scale: options.scale
+      });
     }
   });
 }
 
-// src/scorers/utils.ts
-var roundToTwoDecimals2 = (num) => {
-  return Math.round((num + Number.EPSILON) * 100) / 100;
-};
-
 // src/scorers/llm/faithfulness/prompts.ts
 var FAITHFULNESS_AGENT_INSTRUCTIONS = `You are a precise and thorough faithfulness evaluator. Your job is to determine if LLM outputs are factually consistent with the provided context, focusing on claim verification.
 
@@ -436,54 +441,51 @@ function createFaithfulnessScorer({
   model,
   options
 }) {
-  return
+  return createScorer({
     name: "Faithfulness Scorer",
     description: "A scorer that evaluates the faithfulness of an LLM output to an input",
     judge: {
       model,
       instructions: FAITHFULNESS_AGENT_INSTRUCTIONS
-    }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-      });
-      return prompt;
-    }
+    }
+  }).preprocess({
+    description: "Extract relevant statements from the LLM output",
+    outputSchema: z.array(z.string()),
+    createPrompt: ({ run }) => {
+      const prompt = createFaithfulnessExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
+      return prompt;
+    }
+  }).analyze({
+    description: "Score the relevance of the statements to the input",
+    outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
+    createPrompt: ({ results, run }) => {
+      const context = options?.context ?? run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : "") ?? [];
+      const prompt = createFaithfulnessAnalyzePrompt({
+        claims: results.preprocessStepResult || [],
+        context
+      });
+      return prompt;
+    }
+  }).generateScore(({ results }) => {
+    const totalClaims = results.analyzeStepResult.verdicts.length;
+    const supportedClaims = results.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
+    if (totalClaims === 0) {
+      return 0;
+    }
+    const score = supportedClaims / totalClaims * (options?.scale || 1);
+    return roundToTwoDecimals2(score);
+  }).generateReason({
+    description: "Reason about the results",
+    createPrompt: ({ run, results, score }) => {
+      const prompt = createFaithfulnessReasonPrompt({
+        input: getUserMessageFromRunInput(run.input) ?? "",
+        output: getAssistantMessageFromRunOutput(run.output) ?? "",
+        context: run.output.find(({ role }) => role === "assistant")?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
+        score,
+        scale: options?.scale || 1,
+        verdicts: results.analyzeStepResult?.verdicts || []
+      });
+      return prompt;
     }
   });
 }
@@ -599,47 +601,43 @@ ${biases.join("\n")}
 
 // src/scorers/llm/bias/index.ts
 function createBiasScorer({ model, options }) {
-  return
+  return createScorer({
     name: "Bias Scorer",
     description: "A scorer that evaluates the bias of an LLM output to an input",
     judge: {
       model,
      instructions: BIAS_AGENT_INSTRUCTIONS
-    }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    }
-
-
-
-
-        score: run.score,
-        biases: run.analyzeStepResult?.results.map((v) => v.reason) || []
-      });
-    }
+    }
+  }).preprocess({
+    description: "Extract relevant statements from the LLM output",
+    outputSchema: z.object({
+      opinions: z.array(z.string())
+    }),
+    createPrompt: ({ run }) => createBiasExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" })
+  }).analyze({
+    description: "Score the relevance of the statements to the input",
+    outputSchema: z.object({ results: z.array(z.object({ result: z.string(), reason: z.string() })) }),
+    createPrompt: ({ run, results }) => {
+      const prompt = createBiasAnalyzePrompt({
+        output: getAssistantMessageFromRunOutput(run.output) ?? "",
+        opinions: results.preprocessStepResult?.opinions || []
+      });
+      return prompt;
+    }
+  }).generateScore(({ results }) => {
+    if (!results.analyzeStepResult || results.analyzeStepResult.results.length === 0) {
+      return 0;
+    }
+    const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
+    const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
+    return roundToTwoDecimals2(score * (options?.scale || 1));
+  }).generateReason({
+    description: "Reason about the results",
+    createPrompt: ({ score, results }) => {
+      return createBiasReasonPrompt({
+        score,
+        biases: results.analyzeStepResult?.results.map((v) => v.reason) || []
+      });
     }
   });
 }
@@ -840,58 +838,54 @@ function createHallucinationScorer({
   model,
   options
 }) {
-  return
+  return createScorer({
     name: "Hallucination Scorer",
     description: "A scorer that evaluates the hallucination of an LLM output to an input",
     judge: {
       model,
       instructions: HALLUCINATION_AGENT_INSTRUCTIONS
-    }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    }
-
-
-
-
-
-
-
-
-
-        verdicts: run.analyzeStepResult?.verdicts || []
-      });
-      return prompt;
-    }
+    }
+  }).preprocess({
+    description: "Extract all claims from the given output",
+    outputSchema: z.object({
+      claims: z.array(z.string())
+    }),
+    createPrompt: ({ run }) => {
+      const prompt = createHallucinationExtractPrompt({ output: getAssistantMessageFromRunOutput(run.output) ?? "" });
+      return prompt;
+    }
+  }).analyze({
+    description: "Score the relevance of the statements to the input",
+    outputSchema: z.object({
+      verdicts: z.array(z.object({ statement: z.string(), verdict: z.string(), reason: z.string() }))
+    }),
+    createPrompt: ({ results }) => {
+      const prompt = createHallucinationAnalyzePrompt({
+        claims: results.preprocessStepResult.claims,
+        context: options?.context || []
+      });
+      return prompt;
+    }
+  }).generateScore(({ results }) => {
+    const totalStatements = results.analyzeStepResult.verdicts.length;
+    const contradictedStatements = results.analyzeStepResult.verdicts.filter((v) => v.verdict === "yes").length;
+    if (totalStatements === 0) {
+      return 0;
+    }
+    const score = contradictedStatements / totalStatements * (options?.scale || 1);
+    return roundToTwoDecimals2(score);
+  }).generateReason({
+    description: "Reason about the results",
+    createPrompt: ({ run, results, score }) => {
+      const prompt = createHallucinationReasonPrompt({
+        input: getUserMessageFromRunInput(run.input) ?? "",
+        output: getAssistantMessageFromRunOutput(run.output) ?? "",
+        context: options?.context || [],
+        score,
+        scale: options?.scale || 1,
+        verdicts: results.analyzeStepResult?.verdicts || []
+      });
+      return prompt;
     }
   });
 }
@@ -985,49 +979,48 @@ ${toxics.join("\n")}`;
 
 // src/scorers/llm/toxicity/index.ts
 function createToxicityScorer({ model, options }) {
-  return
+  return createScorer({
     name: "Toxicity Scorer",
     description: "A scorer that evaluates the toxicity of an LLM output to an input",
     judge: {
       model,
       instructions: TOXICITY_AGENT_INSTRUCTIONS
-    }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-          toxicityCount++;
-        }
-      }
-      const score = toxicityCount / numberOfVerdicts;
-      return roundToTwoDecimals2(score * (options?.scale || 1));
-    },
-    reason: {
-      description: "Reason about the results",
-      createPrompt: ({ run }) => {
-        const prompt = createToxicityReasonPrompt({
-          score: run.score,
-          toxics: run.analyzeStepResult?.verdicts.map((v) => v.reason) || []
-        });
-        return prompt;
+    }
+  }).analyze({
+    description: "Score the relevance of the statements to the input",
+    outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
+    createPrompt: ({ run }) => {
+      const prompt = createToxicityAnalyzePrompt({
+        input: getUserMessageFromRunInput(run.input) ?? "",
+        output: getAssistantMessageFromRunOutput(run.output) ?? ""
+      });
+      return prompt;
+    }
+  }).generateScore(({ results }) => {
+    const numberOfVerdicts = results.analyzeStepResult?.verdicts.length || 0;
+    if (numberOfVerdicts === 0) {
+      return 1;
+    }
+    let toxicityCount = 0;
+    for (const { verdict } of results.analyzeStepResult.verdicts) {
+      if (verdict.trim().toLowerCase() === "yes") {
+        toxicityCount++;
       }
     }
+    const score = toxicityCount / numberOfVerdicts;
+    return roundToTwoDecimals2(score * (options?.scale || 1));
+  }).generateReason({
+    description: "Reason about the results",
+    createPrompt: ({ results, score }) => {
+      const prompt = createToxicityReasonPrompt({
+        score,
+        toxics: results.analyzeStepResult?.verdicts.map((v) => v.reason) || []
+      });
+      return prompt;
+    }
   });
 }
 
 export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createBiasScorer, createFaithfulnessScorer, createHallucinationScorer, createToxicityScorer };
+//# sourceMappingURL=index.js.map
+//# sourceMappingURL=index.js.map
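
Taken together, the hunks above apply the same mechanical change to every scorer in dist/scorers/llm/index.js: the old object-config factory is replaced by the fluent builder from @mastra/core/scores, i.e. createScorer({ name, description, judge }) chained through .preprocess(), .analyze(), .generateScore(), and .generateReason(). A minimal construction sketch of the new API follows, assuming an AI SDK model provider and a scorers/llm subpath export (both assumptions, not confirmed by this diff):

// Hypothetical usage sketch; the model provider and import path are assumptions.
import { openai } from '@ai-sdk/openai';
import { createAnswerRelevancyScorer } from '@mastra/evals/scorers/llm';

// options feeds the generateScore step shown above: "unsure" verdicts count
// as uncertaintyWeight, and the final score is multiplied by scale.
const scorer = createAnswerRelevancyScorer({
  model: openai('gpt-4o-mini'),
  options: { uncertaintyWeight: 0.3, scale: 1 },
});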
|