@mastra/evals 0.14.3-alpha.0 → 1.0.0-beta.0
This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
- package/CHANGELOG.md +36 -9
- package/README.md +19 -159
- package/dist/{chunk-KHEXN75Q.js → chunk-CCLM7KPF.js} +45 -21
- package/dist/chunk-CCLM7KPF.js.map +1 -0
- package/dist/{chunk-QKR2PMLZ.cjs → chunk-TPQLLHZW.cjs} +46 -21
- package/dist/chunk-TPQLLHZW.cjs.map +1 -0
- package/dist/scorers/code/completeness/index.d.ts +1 -1
- package/dist/scorers/code/completeness/index.d.ts.map +1 -1
- package/dist/scorers/code/content-similarity/index.d.ts +1 -1
- package/dist/scorers/code/content-similarity/index.d.ts.map +1 -1
- package/dist/scorers/code/keyword-coverage/index.d.ts +1 -1
- package/dist/scorers/code/keyword-coverage/index.d.ts.map +1 -1
- package/dist/scorers/code/textual-difference/index.d.ts +1 -1
- package/dist/scorers/code/textual-difference/index.d.ts.map +1 -1
- package/dist/scorers/code/tone/index.d.ts +1 -1
- package/dist/scorers/code/tone/index.d.ts.map +1 -1
- package/dist/scorers/code/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/code/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts +1 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
- package/dist/scorers/llm/bias/index.d.ts +2 -2
- package/dist/scorers/llm/bias/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-precision/index.d.ts +3 -3
- package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-relevance/index.d.ts +3 -3
- package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +2 -2
- package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +2 -2
- package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +2 -2
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +2 -2
- package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
- package/dist/scorers/{llm → prebuilt}/index.cjs +479 -62
- package/dist/scorers/prebuilt/index.cjs.map +1 -0
- package/dist/scorers/prebuilt/index.d.ts +3 -0
- package/dist/scorers/prebuilt/index.d.ts.map +1 -0
- package/dist/scorers/{llm → prebuilt}/index.js +419 -15
- package/dist/scorers/prebuilt/index.js.map +1 -0
- package/dist/scorers/utils.cjs +21 -17
- package/dist/scorers/utils.d.ts +21 -11
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +12 -58
- package/dist/attachListeners.d.ts +0 -4
- package/dist/attachListeners.d.ts.map +0 -1
- package/dist/chunk-7QAUEU4L.cjs +0 -10
- package/dist/chunk-7QAUEU4L.cjs.map +0 -1
- package/dist/chunk-EMMSS5I5.cjs +0 -37
- package/dist/chunk-EMMSS5I5.cjs.map +0 -1
- package/dist/chunk-G3PMV62Z.js +0 -33
- package/dist/chunk-G3PMV62Z.js.map +0 -1
- package/dist/chunk-IUSAD2BW.cjs +0 -19
- package/dist/chunk-IUSAD2BW.cjs.map +0 -1
- package/dist/chunk-KHEXN75Q.js.map +0 -1
- package/dist/chunk-QKR2PMLZ.cjs.map +0 -1
- package/dist/chunk-QTWX6TKR.js +0 -8
- package/dist/chunk-QTWX6TKR.js.map +0 -1
- package/dist/chunk-YGTIO3J5.js +0 -17
- package/dist/chunk-YGTIO3J5.js.map +0 -1
- package/dist/dist-LDTK3TIP.cjs +0 -16759
- package/dist/dist-LDTK3TIP.cjs.map +0 -1
- package/dist/dist-OWYZEOJK.js +0 -16737
- package/dist/dist-OWYZEOJK.js.map +0 -1
- package/dist/evaluation.d.ts +0 -8
- package/dist/evaluation.d.ts.map +0 -1
- package/dist/index.cjs +0 -93
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.ts +0 -3
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js +0 -89
- package/dist/index.js.map +0 -1
- package/dist/magic-string.es-7ORA5OGR.js +0 -1305
- package/dist/magic-string.es-7ORA5OGR.js.map +0 -1
- package/dist/magic-string.es-NZ2XWFKN.cjs +0 -1311
- package/dist/magic-string.es-NZ2XWFKN.cjs.map +0 -1
- package/dist/metrics/index.d.ts +0 -4
- package/dist/metrics/index.d.ts.map +0 -1
- package/dist/metrics/judge/index.cjs +0 -12
- package/dist/metrics/judge/index.cjs.map +0 -1
- package/dist/metrics/judge/index.d.ts +0 -7
- package/dist/metrics/judge/index.d.ts.map +0 -1
- package/dist/metrics/judge/index.js +0 -3
- package/dist/metrics/judge/index.js.map +0 -1
- package/dist/metrics/llm/answer-relevancy/index.d.ts +0 -16
- package/dist/metrics/llm/answer-relevancy/index.d.ts.map +0 -1
- package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/answer-relevancy/prompts.d.ts +0 -19
- package/dist/metrics/llm/answer-relevancy/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/bias/index.d.ts +0 -14
- package/dist/metrics/llm/bias/index.d.ts.map +0 -1
- package/dist/metrics/llm/bias/metricJudge.d.ts +0 -14
- package/dist/metrics/llm/bias/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/bias/prompts.d.ts +0 -14
- package/dist/metrics/llm/bias/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/context-position/index.d.ts +0 -16
- package/dist/metrics/llm/context-position/index.d.ts.map +0 -1
- package/dist/metrics/llm/context-position/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/context-position/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/context-position/prompts.d.ts +0 -17
- package/dist/metrics/llm/context-position/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/context-precision/index.d.ts +0 -16
- package/dist/metrics/llm/context-precision/index.d.ts.map +0 -1
- package/dist/metrics/llm/context-precision/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/context-precision/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/context-precision/prompts.d.ts +0 -17
- package/dist/metrics/llm/context-precision/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/context-relevancy/index.d.ts +0 -16
- package/dist/metrics/llm/context-relevancy/index.d.ts.map +0 -1
- package/dist/metrics/llm/context-relevancy/metricJudge.d.ts +0 -16
- package/dist/metrics/llm/context-relevancy/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/context-relevancy/prompts.d.ts +0 -13
- package/dist/metrics/llm/context-relevancy/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/contextual-recall/index.d.ts +0 -16
- package/dist/metrics/llm/contextual-recall/index.d.ts.map +0 -1
- package/dist/metrics/llm/contextual-recall/metricJudge.d.ts +0 -16
- package/dist/metrics/llm/contextual-recall/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/contextual-recall/prompts.d.ts +0 -13
- package/dist/metrics/llm/contextual-recall/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/faithfulness/index.d.ts +0 -16
- package/dist/metrics/llm/faithfulness/index.d.ts.map +0 -1
- package/dist/metrics/llm/faithfulness/metricJudge.d.ts +0 -22
- package/dist/metrics/llm/faithfulness/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/faithfulness/prompts.d.ts +0 -20
- package/dist/metrics/llm/faithfulness/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/hallucination/index.d.ts +0 -16
- package/dist/metrics/llm/hallucination/index.d.ts.map +0 -1
- package/dist/metrics/llm/hallucination/metricJudge.d.ts +0 -22
- package/dist/metrics/llm/hallucination/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/hallucination/prompts.d.ts +0 -17
- package/dist/metrics/llm/hallucination/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/index.cjs +0 -2481
- package/dist/metrics/llm/index.cjs.map +0 -1
- package/dist/metrics/llm/index.d.ts +0 -12
- package/dist/metrics/llm/index.d.ts.map +0 -1
- package/dist/metrics/llm/index.js +0 -2469
- package/dist/metrics/llm/index.js.map +0 -1
- package/dist/metrics/llm/prompt-alignment/index.d.ts +0 -33
- package/dist/metrics/llm/prompt-alignment/index.d.ts.map +0 -1
- package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/prompt-alignment/prompts.d.ts +0 -17
- package/dist/metrics/llm/prompt-alignment/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/summarization/index.d.ts +0 -19
- package/dist/metrics/llm/summarization/index.d.ts.map +0 -1
- package/dist/metrics/llm/summarization/metricJudge.d.ts +0 -34
- package/dist/metrics/llm/summarization/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/summarization/prompts.d.ts +0 -30
- package/dist/metrics/llm/summarization/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/toxicity/index.d.ts +0 -14
- package/dist/metrics/llm/toxicity/index.d.ts.map +0 -1
- package/dist/metrics/llm/toxicity/metricJudge.d.ts +0 -14
- package/dist/metrics/llm/toxicity/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/toxicity/prompts.d.ts +0 -10
- package/dist/metrics/llm/toxicity/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/types.d.ts +0 -7
- package/dist/metrics/llm/types.d.ts.map +0 -1
- package/dist/metrics/llm/utils.d.ts +0 -14
- package/dist/metrics/llm/utils.d.ts.map +0 -1
- package/dist/metrics/nlp/completeness/index.d.ts +0 -21
- package/dist/metrics/nlp/completeness/index.d.ts.map +0 -1
- package/dist/metrics/nlp/content-similarity/index.d.ts +0 -18
- package/dist/metrics/nlp/content-similarity/index.d.ts.map +0 -1
- package/dist/metrics/nlp/index.cjs +0 -203
- package/dist/metrics/nlp/index.cjs.map +0 -1
- package/dist/metrics/nlp/index.d.ts +0 -6
- package/dist/metrics/nlp/index.d.ts.map +0 -1
- package/dist/metrics/nlp/index.js +0 -190
- package/dist/metrics/nlp/index.js.map +0 -1
- package/dist/metrics/nlp/keyword-coverage/index.d.ts +0 -13
- package/dist/metrics/nlp/keyword-coverage/index.d.ts.map +0 -1
- package/dist/metrics/nlp/textual-difference/index.d.ts +0 -15
- package/dist/metrics/nlp/textual-difference/index.d.ts.map +0 -1
- package/dist/metrics/nlp/tone/index.d.ts +0 -18
- package/dist/metrics/nlp/tone/index.d.ts.map +0 -1
- package/dist/scorers/code/index.cjs +0 -329
- package/dist/scorers/code/index.cjs.map +0 -1
- package/dist/scorers/code/index.js +0 -315
- package/dist/scorers/code/index.js.map +0 -1
- package/dist/scorers/llm/index.cjs.map +0 -1
- package/dist/scorers/llm/index.js.map +0 -1
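The manifest above reflects the 1.0.0-beta.0 restructuring: the legacy metrics/* bundles and the evaluate/attachListeners entry points are removed, and the former scorers/llm and scorers/code bundles are consolidated into a single scorers/prebuilt bundle. A minimal migration sketch, assuming the package's exports map mirrors the dist layout shown here (the subpath names are inferred from this manifest, not confirmed by the diff):

// Before (0.14.3-alpha.0): separate LLM and code entry points (inferred subpaths)
import { createAnswerRelevancyScorer } from '@mastra/evals/scorers/llm';
import { createToneScorer } from '@mastra/evals/scorers/code';

// After (1.0.0-beta.0): one consolidated entry point
import { createAnswerRelevancyScorer, createToneScorer } from '@mastra/evals/scorers/prebuilt';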
package/dist/scorers/{llm → prebuilt}/index.js

@@ -1,7 +1,10 @@
-import { roundToTwoDecimals } from '../../chunk-KHEXN75Q.js';
-import {
-import { createScorer } from '@mastra/core/scores';
+import { getAssistantMessageFromRunOutput, getUserMessageFromRunInput, roundToTwoDecimals, extractToolCalls, getCombinedSystemPrompt, getTextContentFromMastraDBMessage } from '../../chunk-CCLM7KPF.js';
+import { createScorer } from '@mastra/core/evals';
 import { z } from 'zod';
+import nlp from 'compromise';
+import keyword_extractor from 'keyword-extractor';
+import stringSimilarity from 'string-similarity';
+import Sentiment from 'sentiment';
 
 // src/scorers/llm/answer-relevancy/prompts.ts
 var createExtractPrompt = (output) => `
@@ -215,6 +218,7 @@ function createAnswerRelevancyScorer({
   options = DEFAULT_OPTIONS
 }) {
   return createScorer({
+    id: "answer-relevancy-scorer",
     name: "Answer Relevancy Scorer",
     description: "A scorer that evaluates the relevancy of an LLM output to an input",
     judge: {
@@ -432,6 +436,7 @@ function createAnswerSimilarityScorer({
 }) {
   const mergedOptions = { ...ANSWER_SIMILARITY_DEFAULT_OPTIONS, ...options };
   return createScorer({
+    id: "answer-similarity-scorer",
     name: "Answer Similarity Scorer",
     description: "Evaluates how similar an agent output is to a ground truth answer for CI/CD testing",
     judge: {
@@ -689,6 +694,7 @@ function createFaithfulnessScorer({
   options
 }) {
   return createScorer({
+    id: "faithfulness-scorer",
     name: "Faithfulness Scorer",
     description: "A scorer that evaluates the faithfulness of an LLM output to an input",
     judge: {
@@ -707,7 +713,10 @@ function createFaithfulnessScorer({
     description: "Score the relevance of the statements to the input",
     outputSchema: z.object({ verdicts: z.array(z.object({ verdict: z.string(), reason: z.string() })) }),
     createPrompt: ({ results, run }) => {
-      const
+      const assistantMessage = run.output.find(({ role }) => role === "assistant");
+      const context = options?.context ?? assistantMessage?.content?.toolInvocations?.map(
+        (toolCall) => toolCall.state === "result" ? JSON.stringify(toolCall.result) : ""
+      ) ?? [];
       const prompt = createFaithfulnessAnalyzePrompt({
         claims: results.preprocessStepResult || [],
         context
@@ -721,14 +730,15 @@ function createFaithfulnessScorer({
       return 0;
     }
     const score = supportedClaims / totalClaims * (options?.scale || 1);
-    return roundToTwoDecimals
+    return roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
+      const assistantMessage = run.output.find(({ role }) => role === "assistant");
       const prompt = createFaithfulnessReasonPrompt({
         input: getUserMessageFromRunInput(run.input) ?? "",
         output: getAssistantMessageFromRunOutput(run.output) ?? "",
-        context:
+        context: assistantMessage?.content?.toolInvocations?.map((toolCall) => JSON.stringify(toolCall)) || [],
         score,
         scale: options?.scale || 1,
         verdicts: results.analyzeStepResult?.verdicts || []
@@ -847,6 +857,7 @@ ${biases.join("\n")}
 // src/scorers/llm/bias/index.ts
 function createBiasScorer({ model, options }) {
   return createScorer({
+    id: "bias-scorer",
     name: "Bias Scorer",
     description: "A scorer that evaluates the bias of an LLM output to an input",
     judge: {
@@ -876,7 +887,7 @@ function createBiasScorer({ model, options }) {
     }
     const biasedVerdicts = results.analyzeStepResult.results.filter((v) => v.result.toLowerCase() === "yes");
     const score = biasedVerdicts.length / results.analyzeStepResult.results.length;
-    return roundToTwoDecimals
+    return roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ score, results }) => {
@@ -1081,6 +1092,7 @@ function createHallucinationScorer({
   options
 }) {
   return createScorer({
+    id: "hallucination-scorer",
     name: "Hallucination Scorer",
     description: "A scorer that evaluates the hallucination of an LLM output to an input",
     judge: {
@@ -1116,7 +1128,7 @@ function createHallucinationScorer({
       return 0;
     }
     const score = contradictedStatements / totalStatements * (options?.scale || 1);
-    return roundToTwoDecimals
+    return roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ run, results, score }) => {
@@ -1223,6 +1235,7 @@ function createToxicityScorer({
   options
 }) {
   return createScorer({
+    id: "toxicity-scorer",
     name: "Toxicity Scorer",
     description: "A scorer that evaluates the toxicity of an LLM output to an input",
     judge: {
@@ -1252,7 +1265,7 @@ function createToxicityScorer({
       }
     }
     const score = toxicityCount / numberOfVerdicts;
-    return roundToTwoDecimals
+    return roundToTwoDecimals(score * (options?.scale || 1));
   }).generateReason({
     description: "Reason about the results",
     createPrompt: ({ results, score }) => {
@@ -1372,6 +1385,7 @@ var analyzeOutputSchema2 = z.object({
 function createToolCallAccuracyScorerLLM({ model, availableTools }) {
   const toolDefinitions = availableTools.map((tool) => `${tool.id}: ${tool.description}`).join("\n");
   return createScorer({
+    id: "llm-tool-call-accuracy-scorer",
     name: "Tool Call Accuracy (LLM)",
     description: "Evaluates whether an agent selected appropriate tools for the given task using LLM analysis",
     judge: {
@@ -1413,7 +1427,7 @@ function createToolCallAccuracyScorerLLM({ model, availableTools }) {
     }
     const appropriateToolCalls = evaluations.filter((e) => e.wasAppropriate).length;
     const totalToolCalls = evaluations.length;
-    return roundToTwoDecimals
+    return roundToTwoDecimals(appropriateToolCalls / totalToolCalls);
   }).generateReason({
     description: "Generate human-readable explanation of tool selection evaluation",
     createPrompt: ({ run, results, score }) => {
@@ -1610,6 +1624,7 @@ function createContextRelevanceScorerLLM({
     throw new Error("Context array cannot be empty if provided");
   }
   return createScorer({
+    id: "context-relevance-scorer",
     name: "Context Relevance (LLM)",
     description: "Evaluates how relevant and useful the provided context was for generating the agent response",
     judge: {
@@ -1670,7 +1685,7 @@ function createContextRelevanceScorerLLM({
     const missingContextPenalty = Math.min(missingContext.length * missingPenaltyRate, maxMissingPenalty);
     const finalScore = Math.max(0, relevanceScore - usagePenalty - missingContextPenalty);
     const scaledScore = finalScore * (options.scale || 1);
-    return roundToTwoDecimals
+    return roundToTwoDecimals(scaledScore);
   }).generateReason({
     description: "Generate human-readable explanation of context relevance evaluation",
     createPrompt: ({ run, results, score }) => {
@@ -1833,6 +1848,7 @@ function createContextPrecisionScorer({
     throw new Error("Context array cannot be empty if provided");
   }
   return createScorer({
+    id: "context-precision-scorer",
     name: "Context Precision Scorer",
     description: "A scorer that evaluates the relevance and precision of retrieved context nodes for generating expected outputs",
     judge: {
@@ -1878,7 +1894,7 @@ function createContextPrecisionScorer({
     }
     const map = sumPrecision / relevantCount;
     const score = map * (options.scale || 1);
-    return roundToTwoDecimals
+    return roundToTwoDecimals(score);
   }).generateReason({
     description: "Reason about the context precision results",
     createPrompt: ({ run, results, score }) => {
@@ -2125,6 +2141,7 @@ function createNoiseSensitivityScorerLLM({
     throw new Error("Both baselineResponse and noisyQuery are required for Noise Sensitivity scoring");
   }
   return createScorer({
+    id: "noise-sensitivity-scorer",
     name: "Noise Sensitivity (LLM)",
     description: "Evaluates how robust an agent is when exposed to irrelevant, distracting, or misleading information",
     judge: {
@@ -2180,7 +2197,7 @@ function createNoiseSensitivityScorerLLM({
     const majorIssues = analysisResult.majorIssues || [];
     const issuesPenalty = Math.min(majorIssues.length * majorIssuePenaltyRate, maxMajorIssuePenalty);
     finalScore = Math.max(0, finalScore - issuesPenalty);
-    return roundToTwoDecimals
+    return roundToTwoDecimals(finalScore);
   }).generateReason({
     description: "Generate human-readable explanation of noise sensitivity evaluation",
     createPrompt: ({ run, results, score }) => {
@@ -2497,6 +2514,7 @@ function createPromptAlignmentScorerLLM({
   const scale = options?.scale || 1;
   const evaluationMode = options?.evaluationMode || "both";
   return createScorer({
+    id: "prompt-alignment-scorer",
     name: "Prompt Alignment (LLM)",
     description: "Evaluates how well the agent response aligns with the intent and requirements of the user prompt",
     judge: {
@@ -2545,7 +2563,7 @@ function createPromptAlignmentScorerLLM({
       weightedScore = userScore * SCORING_WEIGHTS.BOTH.USER_WEIGHT + systemScore * SCORING_WEIGHTS.BOTH.SYSTEM_WEIGHT;
     }
     const finalScore = weightedScore * scale;
-    return roundToTwoDecimals
+    return roundToTwoDecimals(finalScore);
   }).generateReason({
     description: "Generate human-readable explanation of prompt alignment evaluation",
     createPrompt: ({ run, results, score }) => {
@@ -2566,7 +2584,393 @@ function createPromptAlignmentScorerLLM({
     }
   });
 }
+function normalizeString(str) {
+  return str.normalize("NFD").replace(/[\u0300-\u036f]/g, "").toLowerCase();
+}
+function extractElements(doc) {
+  const nouns = doc.nouns().out("array") || [];
+  const verbs = doc.verbs().toInfinitive().out("array") || [];
+  const topics = doc.topics().out("array") || [];
+  const terms = doc.terms().out("array") || [];
+  const cleanAndSplitTerm = (term) => {
+    const normalized = normalizeString(term);
+    return normalized.replace(/([a-z])([A-Z])/g, "$1 $2").replace(/[^a-z0-9]+/g, " ").trim().split(/\s+/).filter((word) => word.length > 0);
+  };
+  const processedTerms = [
+    ...nouns.flatMap(cleanAndSplitTerm),
+    ...verbs.flatMap(cleanAndSplitTerm),
+    ...topics.flatMap(cleanAndSplitTerm),
+    ...terms.flatMap(cleanAndSplitTerm)
+  ];
+  return [...new Set(processedTerms)];
+}
+function calculateCoverage({ original, simplified }) {
+  if (original.length === 0) {
+    return simplified.length === 0 ? 1 : 0;
+  }
+  const covered = original.filter(
+    (element) => simplified.some((s) => {
+      const elem = normalizeString(element);
+      const simp = normalizeString(s);
+      if (elem.length <= 3) {
+        return elem === simp;
+      }
+      const longer = elem.length > simp.length ? elem : simp;
+      const shorter = elem.length > simp.length ? simp : elem;
+      if (longer.includes(shorter)) {
+        return shorter.length / longer.length > 0.6;
+      }
+      return false;
+    })
+  );
+  return covered.length / original.length;
+}
+function createCompletenessScorer() {
+  return createScorer({
+    id: "completeness-scorer",
+    name: "Completeness Scorer",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const isInputInvalid = !run.input || run.input.inputMessages.some((i) => {
+      const content = getTextContentFromMastraDBMessage(i);
+      return content === null || content === void 0;
+    });
+    const isOutputInvalid = !run.output || run.output.some((i) => {
+      const content = getTextContentFromMastraDBMessage(i);
+      return content === null || content === void 0;
+    });
+    if (isInputInvalid || isOutputInvalid) {
+      throw new Error("Inputs cannot be null or undefined");
+    }
+    const input = run.input?.inputMessages.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const inputToProcess = input;
+    const outputToProcess = output;
+    const inputDoc = nlp(inputToProcess.trim());
+    const outputDoc = nlp(outputToProcess.trim());
+    const inputElements = extractElements(inputDoc);
+    const outputElements = extractElements(outputDoc);
+    return {
+      inputElements,
+      outputElements,
+      missingElements: inputElements.filter((e) => !outputElements.includes(e)),
+      elementCounts: {
+        input: inputElements.length,
+        output: outputElements.length
+      }
+    };
+  }).generateScore(({ results }) => {
+    const inputElements = results.preprocessStepResult?.inputElements;
+    const outputElements = results.preprocessStepResult?.outputElements;
+    return calculateCoverage({
+      original: inputElements,
+      simplified: outputElements
+    });
+  });
+}
+function calculateRatio(input, output) {
+  if (input === output) {
+    return 1;
+  }
+  if (input.length === 0 || output.length === 0) {
+    return 0;
+  }
+  const matches = longestCommonSubsequence(input, output);
+  const total = input.length + output.length;
+  return total > 0 ? 2 * matches / total : 0;
+}
+function longestCommonSubsequence(str1, str2) {
+  const m = str1.length;
+  const n = str2.length;
+  const dp = [];
+  for (let i = 0; i <= m; i++) {
+    dp[i] = [];
+    for (let j = 0; j <= n; j++) {
+      dp[i][j] = 0;
+    }
+  }
+  for (let i = 1; i <= m; i++) {
+    for (let j = 1; j <= n; j++) {
+      if (str1[i - 1] === str2[j - 1]) {
+        dp[i][j] = dp[i - 1][j - 1] + 1;
+      } else {
+        dp[i][j] = Math.max(dp[i - 1][j], dp[i][j - 1]);
+      }
+    }
+  }
+  return dp[m][n];
+}
+function countChanges(input, output) {
+  const inputNormalized = input.replace(/\s+/g, " ").trim();
+  const outputNormalized = output.replace(/\s+/g, " ").trim();
+  if (inputNormalized === outputNormalized) {
+    if (input !== output) {
+      const inputWords2 = input.split(/\s+/).filter((w) => w.length > 0);
+      const outputWords2 = output.split(/\s+/).filter((w) => w.length > 0);
+      return Math.abs(inputWords2.length - outputWords2.length) || 1;
+    }
+    return 0;
+  }
+  const inputWords = inputNormalized.split(/\s+/).filter((w) => w.length > 0);
+  const outputWords = outputNormalized.split(/\s+/).filter((w) => w.length > 0);
+  if (inputWords.length === 0 && outputWords.length === 0) {
+    return 0;
+  }
+  if (inputWords.length === 0) {
+    return outputWords.length;
+  }
+  if (outputWords.length === 0) {
+    return inputWords.length;
+  }
+  const matchingWords = findCommonWords(inputWords, outputWords);
+  const maxLength = Math.max(inputWords.length, outputWords.length);
+  const changes = maxLength - matchingWords;
+  return changes;
+}
+function findCommonWords(arr1, arr2) {
+  let matches = 0;
+  const used = /* @__PURE__ */ new Set();
+  for (let i = 0; i < arr1.length; i++) {
+    for (let j = 0; j < arr2.length; j++) {
+      if (!used.has(j) && arr1[i] === arr2[j]) {
+        matches++;
+        used.add(j);
+        break;
+      }
+    }
+  }
+  return matches;
+}
+function createTextualDifferenceScorer() {
+  return createScorer({
+    id: "textual-difference-scorer",
+    name: "Textual Difference Scorer",
+    description: "Calculate textual difference between input and output using sequence matching algorithms.",
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const input = run.input?.inputMessages?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const ratio = calculateRatio(input, output);
+    const changes = countChanges(input, output);
+    const maxLength = Math.max(input.length, output.length);
+    const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
+    const confidence = 1 - lengthDiff;
+    return {
+      ratio,
+      confidence,
+      changes,
+      lengthDiff
+    };
+  }).generateScore(({ results }) => {
+    return results.preprocessStepResult?.ratio;
+  });
+}
+function createKeywordCoverageScorer() {
+  return createScorer({
+    id: "keyword-coverage-scorer",
+    name: "Keyword Coverage Scorer",
+    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const input = run.input?.inputMessages?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const output = run.output?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    if (!input && !output) {
+      return {
+        result: {
+          referenceKeywords: /* @__PURE__ */ new Set(),
+          responseKeywords: /* @__PURE__ */ new Set()
+        }
+      };
+    }
+    const extractKeywords = (text) => {
+      return keyword_extractor.extract(text, {
+        language: "english",
+        remove_digits: true,
+        return_changed_case: true,
+        remove_duplicates: true
+      });
+    };
+    const referenceKeywords = new Set(extractKeywords(input));
+    const responseKeywords = new Set(extractKeywords(output));
+    return {
+      referenceKeywords,
+      responseKeywords
+    };
+  }).analyze(async ({ results }) => {
+    if (!results.preprocessStepResult?.referenceKeywords?.size && !results.preprocessStepResult?.responseKeywords?.size) {
+      return {
+        totalKeywordsLength: 0,
+        matchedKeywordsLength: 0
+      };
+    }
+    const matchedKeywords = [...results.preprocessStepResult?.referenceKeywords].filter(
+      (k) => results.preprocessStepResult?.responseKeywords?.has(k)
+    );
+    return {
+      totalKeywordsLength: Array.from(results.preprocessStepResult?.referenceKeywords).length ?? 0,
+      matchedKeywordsLength: matchedKeywords.length ?? 0
+    };
+  }).generateScore(({ results }) => {
+    if (!results.analyzeStepResult?.totalKeywordsLength) {
+      return 1;
+    }
+    const totalKeywords = results.analyzeStepResult?.totalKeywordsLength;
+    const matchedKeywords = results.analyzeStepResult?.matchedKeywordsLength;
+    return totalKeywords > 0 ? matchedKeywords / totalKeywords : 0;
+  });
+}
+function createContentSimilarityScorer({ ignoreCase, ignoreWhitespace } = { ignoreCase: true, ignoreWhitespace: true }) {
+  return createScorer({
+    id: "content-similarity-scorer",
+    name: "Content Similarity Scorer",
+    description: "Calculates content similarity between input and output messages using string comparison algorithms.",
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    let processedInput = run.input?.inputMessages.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    let processedOutput = run.output.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    if (ignoreCase) {
+      processedInput = processedInput.toLowerCase();
+      processedOutput = processedOutput.toLowerCase();
+    }
+    if (ignoreWhitespace) {
+      processedInput = processedInput.replace(/\s+/g, " ").trim();
+      processedOutput = processedOutput.replace(/\s+/g, " ").trim();
+    }
+    return {
+      processedInput,
+      processedOutput
+    };
+  }).generateScore(({ results }) => {
+    const similarity = stringSimilarity.compareTwoStrings(
+      results.preprocessStepResult?.processedInput,
+      results.preprocessStepResult?.processedOutput
+    );
+    return similarity;
+  });
+}
+function createToneScorer(config = {}) {
+  const { referenceTone } = config;
+  return createScorer({
+    id: "tone-scorer",
+    name: "Tone Scorer",
+    description: "Analyzes the tone and sentiment of agent responses using sentiment analysis. Can compare against a reference tone or evaluate sentiment stability.",
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const sentiment = new Sentiment();
+    const agentMessage = run.output?.map((i) => getTextContentFromMastraDBMessage(i)).join(", ") || "";
+    const responseSentiment = sentiment.analyze(agentMessage);
+    if (referenceTone) {
+      const referenceSentiment = sentiment.analyze(referenceTone);
+      const sentimentDiff = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
+      const normalizedScore = Math.max(0, 1 - sentimentDiff);
+      return {
+        score: normalizedScore,
+        responseSentiment: responseSentiment.comparative,
+        referenceSentiment: referenceSentiment.comparative,
+        difference: sentimentDiff
+      };
+    }
+    const sentences = agentMessage.match(/[^.!?]+[.!?]+/g) || [agentMessage];
+    const sentiments = sentences.map((s) => sentiment.analyze(s).comparative);
+    const avgSentiment = sentiments.reduce((a, b) => a + b, 0) / sentiments.length;
+    const variance = sentiments.reduce((sum, s) => sum + Math.pow(s - avgSentiment, 2), 0) / sentiments.length;
+    const stability = Math.max(0, 1 - variance);
+    return {
+      score: stability,
+      avgSentiment,
+      sentimentVariance: variance
+    };
+  }).generateScore(({ results }) => {
+    return results.preprocessStepResult?.score;
+  });
+}
+function checkToolOrder(actualTools, expectedOrder, strictMode = false) {
+  if (strictMode) {
+    return JSON.stringify(actualTools) === JSON.stringify(expectedOrder);
+  }
+  const expectedIndices = [];
+  for (const expectedTool of expectedOrder) {
+    const index = actualTools.indexOf(expectedTool);
+    if (index === -1) {
+      return false;
+    }
+    expectedIndices.push(index);
+  }
+  for (let i = 1; i < expectedIndices.length; i++) {
+    const currentIndex = expectedIndices[i];
+    const prevIndex = expectedIndices[i - 1];
+    if (currentIndex !== void 0 && prevIndex !== void 0 && currentIndex <= prevIndex) {
+      return false;
+    }
+  }
+  return true;
+}
+function calculateAccuracy({
+  expectedTool,
+  actualTools,
+  strictMode = false,
+  expectedToolOrder
+}) {
+  if (actualTools.length === 0) {
+    return 0;
+  }
+  if (expectedToolOrder && expectedToolOrder.length > 0) {
+    return checkToolOrder(actualTools, expectedToolOrder, strictMode) ? 1 : 0;
+  }
+  if (!expectedTool) {
+    return 0;
+  }
+  if (strictMode) {
+    return actualTools.length === 1 && actualTools[0] === expectedTool ? 1 : 0;
+  }
+  return actualTools.includes(expectedTool) ? 1 : 0;
+}
+function createToolCallAccuracyScorerCode(options) {
+  const { expectedTool, strictMode = false, expectedToolOrder } = options;
+  if (!expectedTool && !expectedToolOrder) {
+    throw new Error("Either expectedTool or expectedToolOrder must be provided");
+  }
+  const getDescription = () => {
+    return expectedToolOrder ? `Evaluates whether the LLM called tools in the correct order: [${expectedToolOrder.join(", ")}]` : `Evaluates whether the LLM selected the correct tool (${expectedTool}) from the available tools`;
+  };
+  return createScorer({
+    id: "code-tool-call-accuracy-scorer",
+    name: "Tool Call Accuracy Scorer",
+    description: getDescription(),
+    type: "agent"
+  }).preprocess(async ({ run }) => {
+    const isInputInvalid = !run.input || !run.input.inputMessages || run.input.inputMessages.length === 0;
+    const isOutputInvalid = !run.output || run.output.length === 0;
+    if (isInputInvalid || isOutputInvalid) {
+      throw new Error("Input and output messages cannot be null or empty");
+    }
+    const { tools: actualTools, toolCallInfos } = extractToolCalls(run.output);
+    const correctToolCalled = expectedTool ? strictMode ? actualTools.length === 1 && actualTools[0] === expectedTool : actualTools.includes(expectedTool) : false;
+    return {
+      expectedTool,
+      actualTools,
+      strictMode,
+      expectedToolOrder,
+      hasToolCalls: actualTools.length > 0,
+      correctToolCalled,
+      toolCallInfos,
+      correctOrderCalled: expectedToolOrder ? checkToolOrder(actualTools, expectedToolOrder, strictMode) : null
+    };
+  }).generateScore(({ results }) => {
+    const preprocessResult = results.preprocessStepResult;
+    if (!preprocessResult) {
+      return 0;
+    }
+    return calculateAccuracy({
+      expectedTool: preprocessResult.expectedTool,
+      actualTools: preprocessResult.actualTools,
+      strictMode: preprocessResult.strictMode,
+      expectedToolOrder: preprocessResult.expectedToolOrder
+    });
+  });
+}
 
-export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createToolCallAccuracyScorerLLM, createToxicityScorer };
+export { ANSWER_RELEVANCY_AGENT_INSTRUCTIONS, ANSWER_SIMILARITY_DEFAULT_OPTIONS, ANSWER_SIMILARITY_INSTRUCTIONS, DEFAULT_OPTIONS, createAnswerRelevancyScorer, createAnswerSimilarityScorer, createBiasScorer, createCompletenessScorer, createContentSimilarityScorer, createContextPrecisionScorer, createContextRelevanceScorerLLM, createFaithfulnessScorer, createHallucinationScorer, createKeywordCoverageScorer, createNoiseSensitivityScorerLLM, createPromptAlignmentScorerLLM, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode, createToolCallAccuracyScorerLLM, createToxicityScorer };
 //# sourceMappingURL=index.js.map
 //# sourceMappingURL=index.js.map
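Two breaking changes recur throughout the diff above: createScorer is now imported from '@mastra/core/evals' rather than '@mastra/core/scores', and every prebuilt factory now registers a stable id (for example "answer-relevancy-scorer"). A short usage sketch of the consolidated factories, assuming the scorers/prebuilt subpath is exported as inferred earlier; the tool ids below are hypothetical:

import {
  createCompletenessScorer,
  createToneScorer,
  createToolCallAccuracyScorerCode,
} from '@mastra/evals/scorers/prebuilt';

// Heuristic scorers take no model; they run entirely in code.
const completeness = createCompletenessScorer(); // registers id "completeness-scorer"

// With referenceTone set, the tone scorer compares sentiment against it;
// without it, it scores sentiment stability across sentences.
const tone = createToneScorer({ referenceTone: 'friendly and concise' });

// The factory throws unless expectedTool or expectedToolOrder is provided.
const toolAccuracy = createToolCallAccuracyScorerCode({
  expectedToolOrder: ['searchTool', 'summarizeTool'], // hypothetical tool ids
  strictMode: false,
});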