@mastra/evals 0.14.4 → 1.0.0-beta.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +34 -25
- package/README.md +19 -159
- package/dist/{chunk-KHEXN75Q.js → chunk-CCLM7KPF.js} +45 -21
- package/dist/chunk-CCLM7KPF.js.map +1 -0
- package/dist/{chunk-QKR2PMLZ.cjs → chunk-TPQLLHZW.cjs} +46 -21
- package/dist/chunk-TPQLLHZW.cjs.map +1 -0
- package/dist/scorers/code/completeness/index.d.ts +1 -1
- package/dist/scorers/code/completeness/index.d.ts.map +1 -1
- package/dist/scorers/code/content-similarity/index.d.ts +1 -1
- package/dist/scorers/code/content-similarity/index.d.ts.map +1 -1
- package/dist/scorers/code/keyword-coverage/index.d.ts +1 -1
- package/dist/scorers/code/keyword-coverage/index.d.ts.map +1 -1
- package/dist/scorers/code/textual-difference/index.d.ts +1 -1
- package/dist/scorers/code/textual-difference/index.d.ts.map +1 -1
- package/dist/scorers/code/tone/index.d.ts +1 -1
- package/dist/scorers/code/tone/index.d.ts.map +1 -1
- package/dist/scorers/code/tool-call-accuracy/index.d.ts +1 -1
- package/dist/scorers/code/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts +1 -1
- package/dist/scorers/llm/answer-relevancy/index.d.ts.map +1 -1
- package/dist/scorers/llm/answer-similarity/index.d.ts +2 -2
- package/dist/scorers/llm/answer-similarity/index.d.ts.map +1 -1
- package/dist/scorers/llm/bias/index.d.ts +2 -2
- package/dist/scorers/llm/bias/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-precision/index.d.ts +3 -3
- package/dist/scorers/llm/context-precision/index.d.ts.map +1 -1
- package/dist/scorers/llm/context-relevance/index.d.ts +3 -3
- package/dist/scorers/llm/context-relevance/index.d.ts.map +1 -1
- package/dist/scorers/llm/faithfulness/index.d.ts +2 -2
- package/dist/scorers/llm/faithfulness/index.d.ts.map +1 -1
- package/dist/scorers/llm/hallucination/index.d.ts +2 -2
- package/dist/scorers/llm/hallucination/index.d.ts.map +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts +1 -1
- package/dist/scorers/llm/noise-sensitivity/index.d.ts.map +1 -1
- package/dist/scorers/llm/prompt-alignment/index.d.ts +2 -2
- package/dist/scorers/llm/prompt-alignment/index.d.ts.map +1 -1
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts +2 -2
- package/dist/scorers/llm/tool-call-accuracy/index.d.ts.map +1 -1
- package/dist/scorers/llm/toxicity/index.d.ts +2 -2
- package/dist/scorers/llm/toxicity/index.d.ts.map +1 -1
- package/dist/scorers/{llm → prebuilt}/index.cjs +479 -62
- package/dist/scorers/prebuilt/index.cjs.map +1 -0
- package/dist/scorers/prebuilt/index.d.ts +3 -0
- package/dist/scorers/prebuilt/index.d.ts.map +1 -0
- package/dist/scorers/{llm → prebuilt}/index.js +419 -15
- package/dist/scorers/prebuilt/index.js.map +1 -0
- package/dist/scorers/utils.cjs +21 -17
- package/dist/scorers/utils.d.ts +21 -11
- package/dist/scorers/utils.d.ts.map +1 -1
- package/dist/scorers/utils.js +1 -1
- package/package.json +15 -59
- package/dist/attachListeners.d.ts +0 -4
- package/dist/attachListeners.d.ts.map +0 -1
- package/dist/chunk-44PMY5ES.js +0 -78
- package/dist/chunk-44PMY5ES.js.map +0 -1
- package/dist/chunk-7QAUEU4L.cjs +0 -10
- package/dist/chunk-7QAUEU4L.cjs.map +0 -1
- package/dist/chunk-EMMSS5I5.cjs +0 -37
- package/dist/chunk-EMMSS5I5.cjs.map +0 -1
- package/dist/chunk-G3PMV62Z.js +0 -33
- package/dist/chunk-G3PMV62Z.js.map +0 -1
- package/dist/chunk-IUSAD2BW.cjs +0 -19
- package/dist/chunk-IUSAD2BW.cjs.map +0 -1
- package/dist/chunk-KHEXN75Q.js.map +0 -1
- package/dist/chunk-PWGOG6ML.cjs +0 -81
- package/dist/chunk-PWGOG6ML.cjs.map +0 -1
- package/dist/chunk-QKR2PMLZ.cjs.map +0 -1
- package/dist/chunk-QTWX6TKR.js +0 -8
- package/dist/chunk-QTWX6TKR.js.map +0 -1
- package/dist/chunk-YGTIO3J5.js +0 -17
- package/dist/chunk-YGTIO3J5.js.map +0 -1
- package/dist/dist-LDTK3TIP.cjs +0 -16759
- package/dist/dist-LDTK3TIP.cjs.map +0 -1
- package/dist/dist-OWYZEOJK.js +0 -16737
- package/dist/dist-OWYZEOJK.js.map +0 -1
- package/dist/evaluation.d.ts +0 -8
- package/dist/evaluation.d.ts.map +0 -1
- package/dist/index.cjs +0 -93
- package/dist/index.cjs.map +0 -1
- package/dist/index.d.ts +0 -3
- package/dist/index.d.ts.map +0 -1
- package/dist/index.js +0 -89
- package/dist/index.js.map +0 -1
- package/dist/magic-string.es-7ORA5OGR.js +0 -1305
- package/dist/magic-string.es-7ORA5OGR.js.map +0 -1
- package/dist/magic-string.es-NZ2XWFKN.cjs +0 -1311
- package/dist/magic-string.es-NZ2XWFKN.cjs.map +0 -1
- package/dist/metrics/index.d.ts +0 -4
- package/dist/metrics/index.d.ts.map +0 -1
- package/dist/metrics/judge/index.cjs +0 -12
- package/dist/metrics/judge/index.cjs.map +0 -1
- package/dist/metrics/judge/index.d.ts +0 -7
- package/dist/metrics/judge/index.d.ts.map +0 -1
- package/dist/metrics/judge/index.js +0 -3
- package/dist/metrics/judge/index.js.map +0 -1
- package/dist/metrics/llm/answer-relevancy/index.d.ts +0 -16
- package/dist/metrics/llm/answer-relevancy/index.d.ts.map +0 -1
- package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/answer-relevancy/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/answer-relevancy/prompts.d.ts +0 -19
- package/dist/metrics/llm/answer-relevancy/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/bias/index.d.ts +0 -14
- package/dist/metrics/llm/bias/index.d.ts.map +0 -1
- package/dist/metrics/llm/bias/metricJudge.d.ts +0 -14
- package/dist/metrics/llm/bias/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/bias/prompts.d.ts +0 -14
- package/dist/metrics/llm/bias/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/context-position/index.d.ts +0 -16
- package/dist/metrics/llm/context-position/index.d.ts.map +0 -1
- package/dist/metrics/llm/context-position/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/context-position/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/context-position/prompts.d.ts +0 -17
- package/dist/metrics/llm/context-position/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/context-precision/index.d.ts +0 -16
- package/dist/metrics/llm/context-precision/index.d.ts.map +0 -1
- package/dist/metrics/llm/context-precision/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/context-precision/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/context-precision/prompts.d.ts +0 -17
- package/dist/metrics/llm/context-precision/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/context-relevancy/index.d.ts +0 -16
- package/dist/metrics/llm/context-relevancy/index.d.ts.map +0 -1
- package/dist/metrics/llm/context-relevancy/metricJudge.d.ts +0 -16
- package/dist/metrics/llm/context-relevancy/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/context-relevancy/prompts.d.ts +0 -13
- package/dist/metrics/llm/context-relevancy/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/contextual-recall/index.d.ts +0 -16
- package/dist/metrics/llm/contextual-recall/index.d.ts.map +0 -1
- package/dist/metrics/llm/contextual-recall/metricJudge.d.ts +0 -16
- package/dist/metrics/llm/contextual-recall/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/contextual-recall/prompts.d.ts +0 -13
- package/dist/metrics/llm/contextual-recall/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/faithfulness/index.d.ts +0 -16
- package/dist/metrics/llm/faithfulness/index.d.ts.map +0 -1
- package/dist/metrics/llm/faithfulness/metricJudge.d.ts +0 -22
- package/dist/metrics/llm/faithfulness/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/faithfulness/prompts.d.ts +0 -20
- package/dist/metrics/llm/faithfulness/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/hallucination/index.d.ts +0 -16
- package/dist/metrics/llm/hallucination/index.d.ts.map +0 -1
- package/dist/metrics/llm/hallucination/metricJudge.d.ts +0 -22
- package/dist/metrics/llm/hallucination/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/hallucination/prompts.d.ts +0 -17
- package/dist/metrics/llm/hallucination/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/index.cjs +0 -2481
- package/dist/metrics/llm/index.cjs.map +0 -1
- package/dist/metrics/llm/index.d.ts +0 -12
- package/dist/metrics/llm/index.d.ts.map +0 -1
- package/dist/metrics/llm/index.js +0 -2469
- package/dist/metrics/llm/index.js.map +0 -1
- package/dist/metrics/llm/prompt-alignment/index.d.ts +0 -33
- package/dist/metrics/llm/prompt-alignment/index.d.ts.map +0 -1
- package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts +0 -20
- package/dist/metrics/llm/prompt-alignment/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/prompt-alignment/prompts.d.ts +0 -17
- package/dist/metrics/llm/prompt-alignment/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/summarization/index.d.ts +0 -19
- package/dist/metrics/llm/summarization/index.d.ts.map +0 -1
- package/dist/metrics/llm/summarization/metricJudge.d.ts +0 -34
- package/dist/metrics/llm/summarization/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/summarization/prompts.d.ts +0 -30
- package/dist/metrics/llm/summarization/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/toxicity/index.d.ts +0 -14
- package/dist/metrics/llm/toxicity/index.d.ts.map +0 -1
- package/dist/metrics/llm/toxicity/metricJudge.d.ts +0 -14
- package/dist/metrics/llm/toxicity/metricJudge.d.ts.map +0 -1
- package/dist/metrics/llm/toxicity/prompts.d.ts +0 -10
- package/dist/metrics/llm/toxicity/prompts.d.ts.map +0 -1
- package/dist/metrics/llm/types.d.ts +0 -7
- package/dist/metrics/llm/types.d.ts.map +0 -1
- package/dist/metrics/llm/utils.d.ts +0 -14
- package/dist/metrics/llm/utils.d.ts.map +0 -1
- package/dist/metrics/nlp/completeness/index.d.ts +0 -21
- package/dist/metrics/nlp/completeness/index.d.ts.map +0 -1
- package/dist/metrics/nlp/content-similarity/index.d.ts +0 -18
- package/dist/metrics/nlp/content-similarity/index.d.ts.map +0 -1
- package/dist/metrics/nlp/index.cjs +0 -201
- package/dist/metrics/nlp/index.cjs.map +0 -1
- package/dist/metrics/nlp/index.d.ts +0 -6
- package/dist/metrics/nlp/index.d.ts.map +0 -1
- package/dist/metrics/nlp/index.js +0 -188
- package/dist/metrics/nlp/index.js.map +0 -1
- package/dist/metrics/nlp/keyword-coverage/index.d.ts +0 -13
- package/dist/metrics/nlp/keyword-coverage/index.d.ts.map +0 -1
- package/dist/metrics/nlp/textual-difference/index.d.ts +0 -15
- package/dist/metrics/nlp/textual-difference/index.d.ts.map +0 -1
- package/dist/metrics/nlp/tone/index.d.ts +0 -18
- package/dist/metrics/nlp/tone/index.d.ts.map +0 -1
- package/dist/ratio.d.ts +0 -13
- package/dist/ratio.d.ts.map +0 -1
- package/dist/scorers/code/index.cjs +0 -327
- package/dist/scorers/code/index.cjs.map +0 -1
- package/dist/scorers/code/index.js +0 -313
- package/dist/scorers/code/index.js.map +0 -1
- package/dist/scorers/llm/index.cjs.map +0 -1
- package/dist/scorers/llm/index.js.map +0 -1
|
@@ -1,313 +0,0 @@
|
|
|
1
|
-
import { calculateRatio, countChanges } from '../../chunk-44PMY5ES.js';
|
|
2
|
-
import { extractToolCalls } from '../../chunk-KHEXN75Q.js';
|
|
3
|
-
import { createScorer } from '@mastra/core/scores';
|
|
4
|
-
import nlp from 'compromise';
|
|
5
|
-
import keyword_extractor from 'keyword-extractor';
|
|
6
|
-
import stringSimilarity from 'string-similarity';
|
|
7
|
-
import Sentiment from 'sentiment';
|
|
8
|
-
|
|
9
|
-
/**
 * Produces a comparison-friendly form of a string: decomposes it to NFD,
 * removes combining marks in the U+0300–U+036F range, then lowercases it.
 *
 * @param {string} str - Text to normalize.
 * @returns {string} Lowercased text with those combining marks removed.
 */
function normalizeString(str) {
  const decomposed = str.normalize("NFD");
  const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, "");
  return withoutMarks.toLowerCase();
}
|
|
12
|
-
/**
 * Collects the distinct lowercase word tokens found in a parsed document.
 *
 * Four views of the document are read — nouns, verbs (converted to their
 * infinitive), topics and terms. Every entry is split into words and the
 * de-duplicated union is returned in first-seen order.
 *
 * @param doc - Parsed document exposing `nouns()`, `verbs().toInfinitive()`,
 *   `topics()` and `terms()`, each offering `out("array")`.
 * @returns {string[]} Unique tokens made only of `a-z` and `0-9`.
 */
function extractElements(doc) {
  const nouns = doc.nouns().out("array") || [];
  const verbs = doc.verbs().toInfinitive().out("array") || [];
  const topics = doc.topics().out("array") || [];
  const terms = doc.terms().out("array") || [];

  const cleanAndSplitTerm = (term) => {
    // Strip combining marks first so accented letters take part in the
    // camelCase boundary test below.
    const withoutMarks = term.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
    return withoutMarks
      // Split camelCase BEFORE lowercasing; once the text is lowercased the
      // lower/upper boundary no longer exists and the pattern cannot match.
      .replace(/([a-z])([A-Z])/g, "$1 $2")
      .toLowerCase()
      // Any run of non-alphanumeric characters acts as a word separator.
      .replace(/[^a-z0-9]+/g, " ")
      .trim()
      .split(/\s+/)
      .filter((word) => word.length > 0);
  };

  const processedTerms = [
    ...nouns.flatMap(cleanAndSplitTerm),
    ...verbs.flatMap(cleanAndSplitTerm),
    ...topics.flatMap(cleanAndSplitTerm),
    ...terms.flatMap(cleanAndSplitTerm)
  ];
  // A Set keeps insertion order, so duplicates collapse onto their first occurrence.
  return [...new Set(processedTerms)];
}
|
|
29
|
-
/**
 * Returns the fraction of `original` elements that have a counterpart in
 * `simplified`.
 *
 * Matching is done on normalized strings:
 *  - an original element of three characters or fewer must match exactly;
 *  - otherwise one string must contain the other and the shorter one must be
 *    more than 60% of the longer one's length.
 *
 * @param {{ original: string[], simplified: string[] }} param0
 * @returns {number} Value between 0 and 1. When `original` is empty the result
 *   is 1 if `simplified` is empty too, otherwise 0.
 */
function calculateCoverage({ original, simplified }) {
  if (original.length === 0) {
    // Nothing to cover: only an equally empty counterpart counts as complete.
    return simplified.length === 0 ? 1 : 0;
  }

  const isCounterpart = (element, candidate) => {
    const elem = normalizeString(element);
    const simp = normalizeString(candidate);
    if (elem.length <= 3) {
      // Short words are too ambiguous for substring matching.
      return elem === simp;
    }
    const [shorter, longer] = elem.length > simp.length ? [simp, elem] : [elem, simp];
    return longer.includes(shorter) && shorter.length / longer.length > 0.6;
  };

  let coveredCount = 0;
  for (const element of original) {
    if (simplified.some((candidate) => isCounterpart(element, candidate))) {
      coveredCount += 1;
    }
  }
  return coveredCount / original.length;
}
|
|
50
|
-
/**
 * Builds the "Completeness Scorer".
 *
 * The preprocess step joins the run's input and output message contents with
 * ", ", parses both texts with `nlp`, and extracts their elements via
 * `extractElements`. The score step passes those two element lists to
 * `calculateCoverage`.
 *
 * @returns The scorer produced by the `createScorer(...).preprocess(...).generateScore(...)` chain.
 * @throws {Error} From the preprocess step when the run has no input/output or
 *   any message content is null or undefined.
 */
function createCompletenessScorer() {
  const hasMissingContent = (messages) => messages.some((message) => message.content == null);
  const joinContents = (messages) => messages.map((message) => message.content).join(", ") || "";

  const scorer = createScorer({
    name: "Completeness Scorer",
    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
    type: "agent"
  });

  const withPreprocess = scorer.preprocess(async ({ run }) => {
    const inputInvalid = !run.input || hasMissingContent(run.input.inputMessages);
    const outputInvalid = !run.output || hasMissingContent(run.output);
    if (inputInvalid || outputInvalid) {
      throw new Error("Inputs cannot be null or undefined");
    }

    // Past the guard both run.input and run.output are known to be present.
    const inputDoc = nlp(joinContents(run.input.inputMessages).trim());
    const outputDoc = nlp(joinContents(run.output).trim());
    const inputElements = extractElements(inputDoc);
    const outputElements = extractElements(outputDoc);

    const producedElements = new Set(outputElements);
    return {
      inputElements,
      outputElements,
      missingElements: inputElements.filter((element) => !producedElements.has(element)),
      elementCounts: {
        input: inputElements.length,
        output: outputElements.length
      }
    };
  });

  return withPreprocess.generateScore(({ results }) => {
    const extracted = results.preprocessStepResult;
    return calculateCoverage({
      original: extracted?.inputElements,
      simplified: extracted?.outputElements
    });
  });
}
|
|
87
|
-
/**
 * Builds the "Textual Difference Scorer".
 *
 * The preprocess step joins the input and output message contents with ", "
 * and records:
 *  - `ratio`      — result of `calculateRatio(input, output)`;
 *  - `changes`    — result of `countChanges(input, output)`;
 *  - `lengthDiff` — absolute length difference divided by the longer length
 *                   (0 when both texts are empty);
 *  - `confidence` — `1 - lengthDiff`.
 * The score is the recorded `ratio`.
 *
 * @returns The scorer produced by the `createScorer` chain.
 */
function createTextualDifferenceScorer() {
  // Missing message lists collapse to an empty string.
  const joinContents = (messages) => messages?.map((message) => message.content).join(", ") || "";

  const scorer = createScorer({
    name: "Textual Difference Scorer",
    description: "Calculate textual difference between input and output using sequence matching algorithms.",
    type: "agent"
  });

  return scorer.preprocess(async ({ run }) => {
    const input = joinContents(run.input?.inputMessages);
    const output = joinContents(run.output);

    const ratio = calculateRatio(input, output);
    const changes = countChanges(input, output);

    const longest = Math.max(input.length, output.length);
    let lengthDiff = 0;
    if (longest > 0) {
      lengthDiff = Math.abs(input.length - output.length) / longest;
    }

    return {
      ratio,
      confidence: 1 - lengthDiff,
      changes,
      lengthDiff
    };
  }).generateScore(({ results }) => results.preprocessStepResult?.ratio);
}
|
|
110
|
-
/**
 * Builds the "Keyword Coverage Scorer".
 *
 * Steps:
 *  1. preprocess — joins the input and output message contents with ", " and
 *     extracts a keyword set from each text with `keyword_extractor`.
 *  2. analyze — counts the reference (input) keywords and how many of them
 *     also occur among the response (output) keywords.
 *  3. generateScore — matched / total, or 1 when there are no reference
 *     keywords to cover.
 *
 * The preprocess step returns the same `{ referenceKeywords, responseKeywords }`
 * shape on every path, including when both texts are empty.
 *
 * @returns The scorer produced by the `createScorer` chain.
 */
function createKeywordCoverageScorer() {
  const joinContents = (messages) => messages?.map((message) => message.content).join(", ") || "";
  const extractKeywords = (text) => keyword_extractor.extract(text, {
    language: "english",
    remove_digits: true,
    return_changed_case: true,
    remove_duplicates: true
  });

  return createScorer({
    name: "Keyword Coverage Scorer",
    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
    type: "agent"
  }).preprocess(async ({ run }) => {
    const input = joinContents(run.input?.inputMessages);
    const output = joinContents(run.output);

    if (!input && !output) {
      // Nothing to extract; return empty sets in the same shape as the normal
      // path so later steps never have to special-case it.
      return {
        referenceKeywords: new Set(),
        responseKeywords: new Set()
      };
    }

    return {
      referenceKeywords: new Set(extractKeywords(input)),
      responseKeywords: new Set(extractKeywords(output))
    };
  }).analyze(async ({ results }) => {
    const referenceKeywords = results.preprocessStepResult?.referenceKeywords;
    const responseKeywords = results.preprocessStepResult?.responseKeywords;

    // Without reference keywords nothing can match; this also guards against
    // a missing preprocess result instead of spreading `undefined`.
    if (!referenceKeywords?.size) {
      return {
        totalKeywordsLength: 0,
        matchedKeywordsLength: 0
      };
    }

    let matchedKeywordsLength = 0;
    for (const keyword of referenceKeywords) {
      if (responseKeywords?.has(keyword)) {
        matchedKeywordsLength += 1;
      }
    }

    return {
      totalKeywordsLength: referenceKeywords.size,
      matchedKeywordsLength
    };
  }).generateScore(({ results }) => {
    const totalKeywords = results.analyzeStepResult?.totalKeywordsLength;
    if (!totalKeywords) {
      // No reference keywords to cover counts as full coverage.
      return 1;
    }
    return results.analyzeStepResult.matchedKeywordsLength / totalKeywords;
  });
}
|
|
163
|
-
/**
 * Builds the "Content Similarity Scorer".
 *
 * The preprocess step joins the input and output message contents with ", ",
 * optionally lowercases them and collapses whitespace runs, and the score step
 * compares the two strings with `stringSimilarity.compareTwoStrings`.
 *
 * Each option defaults to `true` independently, so passing only one of them
 * (for example `{ ignoreCase: false }`) leaves the other at its default
 * rather than silently switching it off.
 *
 * @param {{ ignoreCase?: boolean, ignoreWhitespace?: boolean }} [options]
 * @param {boolean} [options.ignoreCase=true] - Lowercase both texts before comparing.
 * @param {boolean} [options.ignoreWhitespace=true] - Collapse whitespace runs to one
 *   space and trim both texts before comparing.
 * @returns The scorer produced by the `createScorer` chain.
 */
function createContentSimilarityScorer({ ignoreCase = true, ignoreWhitespace = true } = {}) {
  return createScorer({
    name: "Content Similarity Scorer",
    description: "Calculates content similarity between input and output messages using string comparison algorithms.",
    type: "agent"
  }).preprocess(async ({ run }) => {
    // Optional chaining on every hop: a run without input messages or output
    // is treated as empty text instead of raising a TypeError.
    let processedInput = run.input?.inputMessages?.map((i) => i.content).join(", ") || "";
    let processedOutput = run.output?.map((i) => i.content).join(", ") || "";

    if (ignoreCase) {
      processedInput = processedInput.toLowerCase();
      processedOutput = processedOutput.toLowerCase();
    }
    if (ignoreWhitespace) {
      processedInput = processedInput.replace(/\s+/g, " ").trim();
      processedOutput = processedOutput.replace(/\s+/g, " ").trim();
    }

    return {
      processedInput,
      processedOutput
    };
  }).generateScore(({ results }) => {
    return stringSimilarity.compareTwoStrings(
      results.preprocessStepResult?.processedInput,
      results.preprocessStepResult?.processedOutput
    );
  });
}
|
|
191
|
-
/**
 * Builds the "Tone Scorer".
 *
 * The preprocess step joins the output message contents with ", " and analyses
 * the text with `Sentiment`:
 *  - with `config.referenceTone`: the score is `max(0, 1 - |response - reference|)`
 *    using the two `comparative` sentiment values;
 *  - without it: the text is split into sentences ending in `.`, `!` or `?`
 *    (the whole text when no such sentence is found) and the score is
 *    `max(0, 1 - variance)` of the per-sentence `comparative` values.
 * The score step returns the `score` recorded by preprocess.
 *
 * @param {{ referenceTone?: string }} [config]
 * @returns The scorer produced by the `createScorer` chain.
 */
function createToneScorer(config = {}) {
  const referenceTone = config.referenceTone;

  const scorer = createScorer({
    name: "Tone Scorer",
    description: "Analyzes the tone and sentiment of agent responses using sentiment analysis. Can compare against a reference tone or evaluate sentiment stability.",
    type: "agent"
  });

  return scorer.preprocess(async ({ run }) => {
    const analyzer = new Sentiment();
    const agentMessage = run.output?.map((message) => message.content).join(", ") || "";
    const responseSentiment = analyzer.analyze(agentMessage);

    if (referenceTone) {
      const referenceSentiment = analyzer.analyze(referenceTone);
      const difference = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);
      return {
        score: Math.max(0, 1 - difference),
        responseSentiment: responseSentiment.comparative,
        referenceSentiment: referenceSentiment.comparative,
        difference
      };
    }

    // No reference: measure how stable the sentiment is across sentences.
    const sentences = agentMessage.match(/[^.!?]+[.!?]+/g) || [agentMessage];
    const perSentence = [];
    for (const sentence of sentences) {
      perSentence.push(analyzer.analyze(sentence).comparative);
    }

    let total = 0;
    for (const value of perSentence) {
      total += value;
    }
    const avgSentiment = total / perSentence.length;

    let squaredDeviationSum = 0;
    for (const value of perSentence) {
      squaredDeviationSum += Math.pow(value - avgSentiment, 2);
    }
    const sentimentVariance = squaredDeviationSum / perSentence.length;

    return {
      score: Math.max(0, 1 - sentimentVariance),
      avgSentiment,
      sentimentVariance
    };
  }).generateScore(({ results }) => results.preprocessStepResult?.score);
}
|
|
226
|
-
/**
 * Checks whether the tools that were called respect an expected order.
 *
 * - Strict mode: `actualTools` must equal `expectedOrder` element for element
 *   (same length, same tools, same positions).
 * - Default mode: `expectedOrder` must occur as a subsequence of
 *   `actualTools`; other calls may be interleaved. Each expected tool is
 *   searched for only after the position where the previous one matched, so
 *   repeated calls to the same tool are handled correctly (for example actual
 *   `[search, fetch, search]` satisfies expected `[fetch, search]`).
 *
 * @param {string[]} actualTools - Tool names in the order they were called.
 * @param {string[]} expectedOrder - Tool names in the required order.
 * @param {boolean} [strictMode=false] - Require an exact sequence match.
 * @returns {boolean} `true` when the order requirement is met.
 */
function checkToolOrder(actualTools, expectedOrder, strictMode = false) {
  if (strictMode) {
    return actualTools.length === expectedOrder.length && actualTools.every((tool, position) => tool === expectedOrder[position]);
  }

  // Greedy subsequence scan: taking the earliest possible match for each
  // expected tool leaves the most room for the ones that follow.
  let searchFrom = 0;
  for (const expectedTool of expectedOrder) {
    const foundAt = actualTools.indexOf(expectedTool, searchFrom);
    if (foundAt === -1) {
      return false;
    }
    searchFrom = foundAt + 1;
  }
  return true;
}
|
|
247
|
-
/**
 * Turns a tool-call observation into a binary accuracy score.
 *
 * Decision order:
 *  1. no tool was called                        -> 0
 *  2. a non-empty `expectedToolOrder` is given  -> result of `checkToolOrder`
 *  3. no `expectedTool` is given                -> 0
 *  4. strict mode                               -> exactly one call, and it is `expectedTool`
 *  5. otherwise                                 -> `expectedTool` is among the calls
 *
 * @param {object} param0
 * @param {string} [param0.expectedTool] - Tool that should have been called.
 * @param {string[]} param0.actualTools - Tools that were called, in call order.
 * @param {boolean} [param0.strictMode=false] - Apply the strict variant of the check.
 * @param {string[]} [param0.expectedToolOrder] - Required call order; takes
 *   precedence over `expectedTool` when non-empty.
 * @returns {number} 1 when the expectation is met, otherwise 0.
 */
function calculateAccuracy({
  expectedTool,
  actualTools,
  strictMode = false,
  expectedToolOrder
}) {
  if (actualTools.length === 0) {
    return 0;
  }

  let satisfied;
  if (expectedToolOrder && expectedToolOrder.length > 0) {
    satisfied = checkToolOrder(actualTools, expectedToolOrder, strictMode);
  } else if (!expectedTool) {
    satisfied = false;
  } else if (strictMode) {
    satisfied = actualTools.length === 1 && actualTools[0] === expectedTool;
  } else {
    satisfied = actualTools.includes(expectedTool);
  }
  return satisfied ? 1 : 0;
}
|
|
267
|
-
/**
 * Builds the code-based "Tool Call Accuracy Scorer".
 *
 * The preprocess step validates the run, pulls the called tools out of the
 * output with `extractToolCalls`, and records whether the expected tool and/or
 * expected order was observed. The score step feeds the recorded values to
 * `calculateAccuracy`, or returns 0 when there is no preprocess result.
 *
 * @param {object} options
 * @param {string} [options.expectedTool] - Tool that should be called.
 * @param {boolean} [options.strictMode=false] - Apply the strict variant of the checks.
 * @param {string[]} [options.expectedToolOrder] - Required call order.
 * @returns The scorer produced by the `createScorer` chain.
 * @throws {Error} Immediately when neither `expectedTool` nor
 *   `expectedToolOrder` is provided; from the preprocess step when the run has
 *   no input messages or no output.
 */
function createToolCallAccuracyScorerCode(options) {
  const { expectedTool, strictMode = false, expectedToolOrder } = options;
  if (!expectedTool && !expectedToolOrder) {
    throw new Error("Either expectedTool or expectedToolOrder must be provided");
  }

  let description;
  if (expectedToolOrder) {
    description = `Evaluates whether the LLM called tools in the correct order: [${expectedToolOrder.join(", ")}]`;
  } else {
    description = `Evaluates whether the LLM selected the correct tool (${expectedTool}) from the available tools`;
  }

  const scorer = createScorer({
    name: "Tool Call Accuracy Scorer",
    description,
    type: "agent"
  });

  return scorer.preprocess(async ({ run }) => {
    const inputMessages = run.input && run.input.inputMessages;
    const inputMissing = !inputMessages || inputMessages.length === 0;
    const outputMissing = !run.output || run.output.length === 0;
    if (inputMissing || outputMissing) {
      throw new Error("Input and output messages cannot be null or empty");
    }

    const extracted = extractToolCalls(run.output);
    const actualTools = extracted.tools;
    const toolCallInfos = extracted.toolCallInfos;

    let correctToolCalled = false;
    if (expectedTool) {
      if (strictMode) {
        correctToolCalled = actualTools.length === 1 && actualTools[0] === expectedTool;
      } else {
        correctToolCalled = actualTools.includes(expectedTool);
      }
    }

    // null (rather than false) marks "no order expectation was configured".
    let correctOrderCalled = null;
    if (expectedToolOrder) {
      correctOrderCalled = checkToolOrder(actualTools, expectedToolOrder, strictMode);
    }

    return {
      expectedTool,
      actualTools,
      strictMode,
      expectedToolOrder,
      hasToolCalls: actualTools.length > 0,
      correctToolCalled,
      toolCallInfos,
      correctOrderCalled
    };
  }).generateScore(({ results }) => {
    const recorded = results.preprocessStepResult;
    if (!recorded) {
      return 0;
    }
    return calculateAccuracy({
      expectedTool: recorded.expectedTool,
      actualTools: recorded.actualTools,
      strictMode: recorded.strictMode,
      expectedToolOrder: recorded.expectedToolOrder
    });
  });
}
|
|
310
|
-
|
|
311
|
-
export { createCompletenessScorer, createContentSimilarityScorer, createKeywordCoverageScorer, createTextualDifferenceScorer, createToneScorer, createToolCallAccuracyScorerCode };
|
|
312
|
-
//# sourceMappingURL=index.js.map
|
|
313
|
-
//# sourceMappingURL=index.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../../../src/scorers/code/completeness/index.ts","../../../src/scorers/code/textual-difference/index.ts","../../../src/scorers/code/keyword-coverage/index.ts","../../../src/scorers/code/content-similarity/index.ts","../../../src/scorers/code/tone/index.ts","../../../src/scorers/code/tool-call-accuracy/index.ts"],"names":["createScorer"],"mappings":";;;;;;;;AAGA,SAAS,gBAAgB,GAAA,EAAqB;AAE5C,EAAA,OAAO,GAAA,CACJ,UAAU,KAAK,CAAA,CACf,QAAQ,kBAAA,EAAoB,EAAE,EAC9B,WAAA,EAAY;AACjB;AAEA,SAAS,gBAAgB,GAAA,EAAoB;AAE3C,EAAA,MAAM,QAAQ,GAAA,CAAI,KAAA,GAAQ,GAAA,CAAI,OAAO,KAAK,EAAC;AAC3C,EAAA,MAAM,KAAA,GAAQ,IAAI,KAAA,EAAM,CAAE,cAAa,CAAE,GAAA,CAAI,OAAO,CAAA,IAAK,EAAC;AAC1D,EAAA,MAAM,SAAS,GAAA,CAAI,MAAA,GAAS,GAAA,CAAI,OAAO,KAAK,EAAC;AAC7C,EAAA,MAAM,QAAQ,GAAA,CAAI,KAAA,GAAQ,GAAA,CAAI,OAAO,KAAK,EAAC;AAG3C,EAAA,MAAM,iBAAA,GAAoB,CAAC,IAAA,KAA2B;AAEpD,IAAA,MAAM,UAAA,GAAa,gBAAgB,IAAI,CAAA;AAGvC,IAAA,OAAO,WACJ,OAAA,CAAQ,iBAAA,EAAmB,OAAO,CAAA,CAClC,OAAA,CAAQ,eAAe,GAAG,CAAA,CAC1B,IAAA,EAAK,CACL,MAAM,KAAK,CAAA,CACX,OAAO,CAAA,IAAA,KAAQ,IAAA,CAAK,SAAS,CAAC,CAAA;AAAA,EACnC,CAAA;AAGA,EAAA,MAAM,cAAA,GAAiB;AAAA,IACrB,GAAG,KAAA,CAAM,OAAA,CAAQ,iBAAiB,CAAA;AAAA,IAClC,GAAG,KAAA,CAAM,OAAA,CAAQ,iBAAiB,CAAA;AAAA,IAClC,GAAG,MAAA,CAAO,OAAA,CAAQ,iBAAiB,CAAA;AAAA,IACnC,GAAG,KAAA,CAAM,OAAA,CAAQ,iBAAiB;AAAA,GACpC;AAGA,EAAA,OAAO,CAAC,GAAG,IAAI,GAAA,CAAI,cAAc,CAAC,CAAA;AACpC;AAEA,SAAS,iBAAA,CAAkB,EAAE,QAAA,EAAU,UAAA,EAAW,EAAyD;AACzG,EAAA,IAAI,QAAA,CAAS,WAAW,CAAA,EAAG;AACzB,IAAA,OAAO,UAAA,CAAW,MAAA,KAAW,CAAA,GAAI,CAAA,GAAI,CAAA;AAAA,EACvC;AAGA,EAAA,MAAM,UAAU,QAAA,CAAS,MAAA;AAAA,IAAO,CAAA,OAAA,KAC9B,UAAA,CAAW,IAAA,CAAK,CAAA,CAAA,KAAK;AACnB,MAAA,MAAM,IAAA,GAAO,gBAAgB,OAAO,CAAA;AACpC,MAAA,MAAM,IAAA,GAAO,gBAAgB,CAAC,CAAA;AAG9B,MAAA,IAAI,IAAA,CAAK,UAAU,CAAA,EAAG;AACpB,QAAA,OAAO,IAAA,KAAS,IAAA;AAAA,MAClB;AAGA,MAAA,MAAM,MAAA,GAAS,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,SAAS,IAAA,GAAO,IAAA;AAClD,MAAA,MAAM,OAAA,GAAU,IAAA,CAAK,MAAA,GAAS,IAAA,CAAK,SAAS,IAAA,GAAO,IAAA;AAEnD,MAAA,IAAI,MAAA,CAAO,QAAA,CAAS,OAAO,CAAA,
EAAG;AAC5B,QAAA,OAAO,OAAA,CAAQ,MAAA,GAAS,MAAA,CAAO,MAAA,GAAS,GAAA;AAAA,MAC1C;AAEA,MAAA,OAAO,KAAA;AAAA,IACT,CAAC;AAAA,GACH;AACA,EAAA,OAAO,OAAA,CAAQ,SAAS,QAAA,CAAS,MAAA;AACnC;AAEO,SAAS,wBAAA,GAA2B;AACzC,EAAA,OAAO,YAAA,CAAa;AAAA,IAClB,IAAA,EAAM,qBAAA;AAAA,IACN,WAAA,EACE,qHAAA;AAAA,IACF,IAAA,EAAM;AAAA,GACP,CAAA,CACE,UAAA,CAAW,OAAO,EAAE,KAAI,KAAM;AAC7B,IAAA,MAAM,cAAA,GACJ,CAAC,GAAA,CAAI,KAAA,IACL,IAAI,KAAA,CAAM,aAAA,CAAc,IAAA,CAAK,CAAC,MAA2B,CAAA,CAAE,OAAA,KAAY,IAAA,IAAQ,CAAA,CAAE,YAAY,MAAS,CAAA;AAExG,IAAA,MAAM,eAAA,GACJ,CAAC,GAAA,CAAI,MAAA,IAAU,IAAI,MAAA,CAAO,IAAA,CAAK,CAAC,CAAA,KAA2B,CAAA,CAAE,OAAA,KAAY,IAAA,IAAQ,CAAA,CAAE,YAAY,MAAS,CAAA;AAE1G,IAAA,IAAI,kBAAkB,eAAA,EAAiB;AACrC,MAAA,MAAM,IAAI,MAAM,oCAAoC,CAAA;AAAA,IACtD;AAEA,IAAA,MAAM,KAAA,GAAQ,GAAA,CAAI,KAAA,EAAO,aAAA,CAAc,GAAA,CAAI,CAAC,CAAA,KAA2B,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAA,IAAK,EAAA;AAChG,IAAA,MAAM,MAAA,GAAS,GAAA,CAAI,MAAA,EAAQ,GAAA,CAAI,CAAC,EAAE,OAAA,EAAQ,KAA2B,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAA,IAAK,EAAA;AAE5F,IAAA,MAAM,cAAA,GAAiB,KAAA;AACvB,IAAA,MAAM,eAAA,GAAkB,MAAA;AAExB,IAAA,MAAM,QAAA,GAAW,GAAA,CAAI,cAAA,CAAe,IAAA,EAAM,CAAA;AAC1C,IAAA,MAAM,SAAA,GAAY,GAAA,CAAI,eAAA,CAAgB,IAAA,EAAM,CAAA;AAG5C,IAAA,MAAM,aAAA,GAAgB,gBAAgB,QAAQ,CAAA;AAC9C,IAAA,MAAM,cAAA,GAAiB,gBAAgB,SAAS,CAAA;AAEhD,IAAA,OAAO;AAAA,MACL,aAAA;AAAA,MACA,cAAA;AAAA,MACA,eAAA,EAAiB,cAAc,MAAA,CAAO,CAAA,CAAA,KAAK,CAAC,cAAA,CAAe,QAAA,CAAS,CAAC,CAAC,CAAA;AAAA,MACtE,aAAA,EAAe;AAAA,QACb,OAAO,aAAA,CAAc,MAAA;AAAA,QACrB,QAAQ,cAAA,CAAe;AAAA;AACzB,KACF;AAAA,EACF,CAAC,CAAA,CACA,aAAA,CAAc,CAAC,EAAE,SAAQ,KAAM;AAC9B,IAAA,MAAM,aAAA,GAAgB,QAAQ,oBAAA,EAAsB,aAAA;AACpD,IAAA,MAAM,cAAA,GAAiB,QAAQ,oBAAA,EAAsB,cAAA;AAErD,IAAA,OAAO,iBAAA,CAAkB;AAAA,MACvB,QAAA,EAAU,aAAA;AAAA,MACV,UAAA,EAAY;AAAA,KACb,CAAA;AAAA,EACH,CAAC,CAAA;AACL;AC1HO,SAAS,6BAAA,GAAgC;AAC9C,EAAA,OAAOA,YAAAA,CAAa;AAAA,IAClB,IAAA,EAAM,2BAAA;AAAA,IACN,WAAA,EAAa,2FAAA;AAAA,IACb,IAAA,EAAM;AAAA,GACP,CAAA,CACE,UAAA,CAAW,OAAO,EAAE,KAAI,KAAM;AAC7B,IAAA,MAAM,KAAA,GAAQ,GAAA,CAAI,KAAA,EAAO,aAAA,EAAe,GAAA,CAAI
,CAAC,CAAA,KAA2B,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAA,IAAK,EAAA;AACjG,IAAA,MAAM,MAAA,GAAS,GAAA,CAAI,MAAA,EAAQ,GAAA,CAAI,CAAC,CAAA,KAA2B,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAA,IAAK,EAAA;AAGpF,IAAA,MAAM,KAAA,GAAQ,cAAA,CAAe,KAAA,EAAO,MAAM,CAAA;AAG1C,IAAA,MAAM,OAAA,GAAU,YAAA,CAAa,KAAA,EAAO,MAAM,CAAA;AAG1C,IAAA,MAAM,YAAY,IAAA,CAAK,GAAA,CAAI,KAAA,CAAM,MAAA,EAAQ,OAAO,MAAM,CAAA;AACtD,IAAA,MAAM,UAAA,GAAa,SAAA,GAAY,CAAA,GAAI,IAAA,CAAK,GAAA,CAAI,MAAM,MAAA,GAAS,MAAA,CAAO,MAAM,CAAA,GAAI,SAAA,GAAY,CAAA;AACxF,IAAA,MAAM,aAAa,CAAA,GAAI,UAAA;AAEvB,IAAA,OAAO;AAAA,MACL,KAAA;AAAA,MACA,UAAA;AAAA,MACA,OAAA;AAAA,MACA;AAAA,KACF;AAAA,EACF,CAAC,CAAA,CACA,aAAA,CAAc,CAAC,EAAE,SAAQ,KAAM;AAC9B,IAAA,OAAO,QAAQ,oBAAA,EAAsB,KAAA;AAAA,EACvC,CAAC,CAAA;AACL;AC/BO,SAAS,2BAAA,GAA8B;AAC5C,EAAA,OAAOA,YAAAA,CAAa;AAAA,IAClB,IAAA,EAAM,yBAAA;AAAA,IACN,WAAA,EACE,qHAAA;AAAA,IACF,IAAA,EAAM;AAAA,GACP,CAAA,CACE,UAAA,CAAW,OAAO,EAAE,KAAI,KAAM;AAC7B,IAAA,MAAM,KAAA,GAAQ,GAAA,CAAI,KAAA,EAAO,aAAA,EAAe,GAAA,CAAI,CAAC,CAAA,KAA2B,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAA,IAAK,EAAA;AACjG,IAAA,MAAM,MAAA,GAAS,GAAA,CAAI,MAAA,EAAQ,GAAA,CAAI,CAAC,CAAA,KAA2B,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAA,IAAK,EAAA;AAEpF,IAAA,IAAI,CAAC,KAAA,IAAS,CAAC,MAAA,EAAQ;AACrB,MAAA,OAAO;AAAA,QACL,MAAA,EAAQ;AAAA,UACN,iBAAA,sBAAuB,GAAA,EAAY;AAAA,UACnC,gBAAA,sBAAsB,GAAA;AAAY;AACpC,OACF;AAAA,IACF;AAEA,IAAA,MAAM,eAAA,GAAkB,CAAC,IAAA,KAAiB;AACxC,MAAA,OAAO,iBAAA,CAAkB,QAAQ,IAAA,EAAM;AAAA,QACrC,QAAA,EAAU,SAAA;AAAA,QACV,aAAA,EAAe,IAAA;AAAA,QACf,mBAAA,EAAqB,IAAA;AAAA,QACrB,iBAAA,EAAmB;AAAA,OACpB,CAAA;AAAA,IACH,CAAA;AAEA,IAAA,MAAM,iBAAA,GAAoB,IAAI,GAAA,CAAI,eAAA,CAAgB,KAAK,CAAC,CAAA;AACxD,IAAA,MAAM,gBAAA,GAAmB,IAAI,GAAA,CAAI,eAAA,CAAgB,MAAM,CAAC,CAAA;AACxD,IAAA,OAAO;AAAA,MACL,iBAAA;AAAA,MACA;AAAA,KACF;AAAA,EACF,CAAC,CAAA,CACA,OAAA,CAAQ,OAAO,EAAE,SAAQ,KAAM;AAC9B,IAAA,IACE,CAAC,QAAQ,oBAAA,EAAsB,iBAAA,EAAmB,QAClD,CAAC,OAAA,CAAQ,oBAAA,EAAsB,gBAAA,EAAkB,IAAA,EACjD;AACA,MAAA,OAAO;AAAA,QACL,mBAAA,EAAqB,CAAA;AAAA,QACrB,qBAAA,EAAuB;AAAA,OACzB;AAAA,
IACF;AAEA,IAAA,MAAM,kBAAkB,CAAC,GAAG,OAAA,CAAQ,oBAAA,EAAsB,iBAAiB,CAAA,CAAE,MAAA;AAAA,MAAO,CAAA,CAAA,KAClF,OAAA,CAAQ,oBAAA,EAAsB,gBAAA,EAAkB,IAAI,CAAC;AAAA,KACvD;AAEA,IAAA,OAAO;AAAA,MACL,qBAAqB,KAAA,CAAM,IAAA,CAAK,QAAQ,oBAAA,EAAsB,iBAAiB,EAAE,MAAA,IAAU,CAAA;AAAA,MAC3F,qBAAA,EAAuB,gBAAgB,MAAA,IAAU;AAAA,KACnD;AAAA,EACF,CAAC,CAAA,CACA,aAAA,CAAc,CAAC,EAAE,SAAQ,KAAM;AAC9B,IAAA,IAAI,CAAC,OAAA,CAAQ,iBAAA,EAAmB,mBAAA,EAAqB;AACnD,MAAA,OAAO,CAAA;AAAA,IACT;AAEA,IAAA,MAAM,aAAA,GAAgB,QAAQ,iBAAA,EAAmB,mBAAA;AACjD,IAAA,MAAM,eAAA,GAAkB,QAAQ,iBAAA,EAAmB,qBAAA;AACnD,IAAA,OAAO,aAAA,GAAgB,CAAA,GAAI,eAAA,GAAkB,aAAA,GAAgB,CAAA;AAAA,EAC/D,CAAC,CAAA;AACL;AC5DO,SAAS,6BAAA,CACd,EAAE,UAAA,EAAY,gBAAA,EAAiB,GAA8B,EAAE,UAAA,EAAY,IAAA,EAAM,gBAAA,EAAkB,IAAA,EAAK,EACxG;AACA,EAAA,OAAOA,YAAAA,CAAa;AAAA,IAClB,IAAA,EAAM,2BAAA;AAAA,IACN,WAAA,EAAa,qGAAA;AAAA,IACb,IAAA,EAAM;AAAA,GACP,CAAA,CACE,UAAA,CAAW,OAAO,EAAE,KAAI,KAAM;AAC7B,IAAA,IAAI,cAAA,GAAiB,GAAA,CAAI,KAAA,EAAO,aAAA,CAAc,GAAA,CAAI,CAAC,CAAA,KAA2B,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAA,IAAK,EAAA;AACvG,IAAA,IAAI,eAAA,GAAkB,GAAA,CAAI,MAAA,CAAO,GAAA,CAAI,CAAC,CAAA,KAA2B,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAA,IAAK,EAAA;AAE1F,IAAA,IAAI,UAAA,EAAY;AACd,MAAA,cAAA,GAAiB,eAAe,WAAA,EAAY;AAC5C,MAAA,eAAA,GAAkB,gBAAgB,WAAA,EAAY;AAAA,IAChD;AAEA,IAAA,IAAI,gBAAA,EAAkB;AACpB,MAAA,cAAA,GAAiB,cAAA,CAAe,OAAA,CAAQ,MAAA,EAAQ,GAAG,EAAE,IAAA,EAAK;AAC1D,MAAA,eAAA,GAAkB,eAAA,CAAgB,OAAA,CAAQ,MAAA,EAAQ,GAAG,EAAE,IAAA,EAAK;AAAA,IAC9D;AAEA,IAAA,OAAO;AAAA,MACL,cAAA;AAAA,MACA;AAAA,KACF;AAAA,EACF,CAAC,CAAA,CACA,aAAA,CAAc,CAAC,EAAE,SAAQ,KAAM;AAC9B,IAAA,MAAM,aAAa,gBAAA,CAAiB,iBAAA;AAAA,MAClC,QAAQ,oBAAA,EAAsB,cAAA;AAAA,MAC9B,QAAQ,oBAAA,EAAsB;AAAA,KAChC;AAEA,IAAA,OAAO,UAAA;AAAA,EACT,CAAC,CAAA;AACL;ACpCO,SAAS,gBAAA,CAAiB,MAAA,GAA2B,EAAC,EAAG;AAC9D,EAAA,MAAM,EAAE,eAAc,GAAI,MAAA;AAE1B,EAAA,OAAOA,YAAAA,CAAa;AAAA,IAClB,IAAA,EAAM,aAAA;AAAA,IACN,WAAA,EACE,oJAAA;AAAA,IACF,IAAA,EAAM;AAAA,GACP,CAAA,CACE,UAAA,CAAW,OAAO,EAAE,KAAI,KAAM;AAC7B,IAAA,MAAM,SAAA,GAAY,IAAI,SAAA,EAAU;AAChC,IAAA
,MAAM,YAAA,GAAuB,GAAA,CAAI,MAAA,EAAQ,GAAA,CAAI,CAAC,CAAA,KAA2B,CAAA,CAAE,OAAO,CAAA,CAAE,IAAA,CAAK,IAAI,CAAA,IAAK,EAAA;AAClG,IAAA,MAAM,iBAAA,GAAoB,SAAA,CAAU,OAAA,CAAQ,YAAY,CAAA;AAExD,IAAA,IAAI,aAAA,EAAe;AAEjB,MAAA,MAAM,kBAAA,GAAqB,SAAA,CAAU,OAAA,CAAQ,aAAa,CAAA;AAC1D,MAAA,MAAM,gBAAgB,IAAA,CAAK,GAAA,CAAI,iBAAA,CAAkB,WAAA,GAAc,mBAAmB,WAAW,CAAA;AAC7F,MAAA,MAAM,eAAA,GAAkB,IAAA,CAAK,GAAA,CAAI,CAAA,EAAG,IAAI,aAAa,CAAA;AAErD,MAAA,OAAO;AAAA,QACL,KAAA,EAAO,eAAA;AAAA,QACP,mBAAmB,iBAAA,CAAkB,WAAA;AAAA,QACrC,oBAAoB,kBAAA,CAAmB,WAAA;AAAA,QACvC,UAAA,EAAY;AAAA,OACd;AAAA,IACF;AAGA,IAAA,MAAM,YAAY,YAAA,CAAa,KAAA,CAAM,gBAAgB,CAAA,IAAK,CAAC,YAAY,CAAA;AACvE,IAAA,MAAM,UAAA,GAAa,UAAU,GAAA,CAAI,CAAA,CAAA,KAAK,UAAU,OAAA,CAAQ,CAAC,EAAE,WAAW,CAAA;AACtE,IAAA,MAAM,YAAA,GAAe,UAAA,CAAW,MAAA,CAAO,CAAC,CAAA,EAAG,MAAM,CAAA,GAAI,CAAA,EAAG,CAAC,CAAA,GAAI,UAAA,CAAW,MAAA;AACxE,IAAA,MAAM,QAAA,GAAW,UAAA,CAAW,MAAA,CAAO,CAAC,KAAK,CAAA,KAAM,GAAA,GAAM,IAAA,CAAK,GAAA,CAAI,IAAI,YAAA,EAAc,CAAC,CAAA,EAAG,CAAC,IAAI,UAAA,CAAW,MAAA;AACpG,IAAA,MAAM,SAAA,GAAY,IAAA,CAAK,GAAA,CAAI,CAAA,EAAG,IAAI,QAAQ,CAAA;AAE1C,IAAA,OAAO;AAAA,MACL,KAAA,EAAO,SAAA;AAAA,MACP,YAAA;AAAA,MACA,iBAAA,EAAmB;AAAA,KACrB;AAAA,EACF,CAAC,CAAA,CACA,aAAA,CAAc,CAAC,EAAE,SAAQ,KAAM;AAC9B,IAAA,OAAO,QAAQ,oBAAA,EAAsB,KAAA;AAAA,EACvC,CAAC,CAAA;AACL;AC3CA,SAAS,cAAA,CAAe,WAAA,EAAuB,aAAA,EAAyB,UAAA,GAAsB,KAAA,EAAgB;AAC5G,EAAA,IAAI,UAAA,EAAY;AACd,IAAA,OAAO,KAAK,SAAA,CAAU,WAAW,CAAA,KAAM,IAAA,CAAK,UAAU,aAAa,CAAA;AAAA,EACrE;AAEA,EAAA,MAAM,kBAA4B,EAAC;AACnC,EAAA,KAAA,MAAW,gBAAgB,aAAA,EAAe;AACxC,IAAA,MAAM,KAAA,GAAQ,WAAA,CAAY,OAAA,CAAQ,YAAY,CAAA;AAC9C,IAAA,IAAI,UAAU,EAAA,EAAI;AAChB,MAAA,OAAO,KAAA;AAAA,IACT;AACA,IAAA,eAAA,CAAgB,KAAK,KAAK,CAAA;AAAA,EAC5B;AAEA,EAAA,KAAA,IAAS,CAAA,GAAI,CAAA,EAAG,CAAA,GAAI,eAAA,CAAgB,QAAQ,CAAA,EAAA,EAAK;AAC/C,IAAA,MAAM,YAAA,GAAe,gBAAgB,CAAC,CAAA;AACtC,IAAA,MAAM,SAAA,GAAY,eAAA,CAAgB,CAAA,GAAI,CAAC,CAAA;AACvC,IAAA,IAAI,YAAA,KAAiB,MAAA,IAAa,SAAA,KAAc,MAAA,IAAa,gBAAgB,SAAA,EAAW;AACtF,MAAA,OAAO,KAAA;AAAA,IACT;AAAA,EACF;AAEA,EAAA,OAAO,IAAA;AACT;AAE
A,SAAS,iBAAA,CAAkB;AAAA,EACzB,YAAA;AAAA,EACA,WAAA;AAAA,EACA,UAAA,GAAa,KAAA;AAAA,EACb;AACF,CAAA,EAKW;AACT,EAAA,IAAI,WAAA,CAAY,WAAW,CAAA,EAAG;AAC5B,IAAA,OAAO,CAAA;AAAA,EACT;AAEA,EAAA,IAAI,iBAAA,IAAqB,iBAAA,CAAkB,MAAA,GAAS,CAAA,EAAG;AACrD,IAAA,OAAO,cAAA,CAAe,WAAA,EAAa,iBAAA,EAAmB,UAAU,IAAI,CAAA,GAAI,CAAA;AAAA,EAC1E;AAEA,EAAA,IAAI,CAAC,YAAA,EAAc;AACjB,IAAA,OAAO,CAAA;AAAA,EACT;AAEA,EAAA,IAAI,UAAA,EAAY;AACd,IAAA,OAAO,YAAY,MAAA,KAAW,CAAA,IAAK,YAAY,CAAC,CAAA,KAAM,eAAe,CAAA,GAAI,CAAA;AAAA,EAC3E;AAEA,EAAA,OAAO,WAAA,CAAY,QAAA,CAAS,YAAY,CAAA,GAAI,CAAA,GAAI,CAAA;AAClD;AAEO,SAAS,iCAAiC,OAAA,EAAkC;AACjF,EAAA,MAAM,EAAE,YAAA,EAAc,UAAA,GAAa,KAAA,EAAO,mBAAkB,GAAI,OAAA;AAEhE,EAAA,IAAI,CAAC,YAAA,IAAgB,CAAC,iBAAA,EAAmB;AACvC,IAAA,MAAM,IAAI,MAAM,2DAA2D,CAAA;AAAA,EAC7E;AAEA,EAAA,MAAM,iBAAiB,MAAM;AAC3B,IAAA,OAAO,iBAAA,GACH,iEAAiE,iBAAA,CAAkB,IAAA,CAAK,IAAI,CAAC,CAAA,CAAA,CAAA,GAC7F,wDAAwD,YAAY,CAAA,0BAAA,CAAA;AAAA,EAC1E,CAAA;AAEA,EAAA,OAAOA,YAAAA,CAAa;AAAA,IAClB,IAAA,EAAM,2BAAA;AAAA,IACN,aAAa,cAAA,EAAe;AAAA,IAC5B,IAAA,EAAM;AAAA,GACP,CAAA,CACE,UAAA,CAAW,OAAO,EAAE,KAAI,KAAM;AAC7B,IAAA,MAAM,cAAA,GAAiB,CAAC,GAAA,CAAI,KAAA,IAAS,CAAC,GAAA,CAAI,KAAA,CAAM,aAAA,IAAiB,GAAA,CAAI,KAAA,CAAM,aAAA,CAAc,MAAA,KAAW,CAAA;AACpG,IAAA,MAAM,kBAAkB,CAAC,GAAA,CAAI,MAAA,IAAU,GAAA,CAAI,OAAO,MAAA,KAAW,CAAA;AAE7D,IAAA,IAAI,kBAAkB,eAAA,EAAiB;AACrC,MAAA,MAAM,IAAI,MAAM,mDAAmD,CAAA;AAAA,IACrE;AAEA,IAAA,MAAM,EAAE,KAAA,EAAO,WAAA,EAAa,eAAc,GAAI,gBAAA,CAAiB,IAAI,MAAM,CAAA;AAEzE,IAAA,MAAM,iBAAA,GAAoB,YAAA,GACtB,UAAA,GACE,WAAA,CAAY,MAAA,KAAW,CAAA,IAAK,WAAA,CAAY,CAAC,CAAA,KAAM,YAAA,GAC/C,WAAA,CAAY,QAAA,CAAS,YAAY,CAAA,GACnC,KAAA;AAEJ,IAAA,OAAO;AAAA,MACL,YAAA;AAAA,MACA,WAAA;AAAA,MACA,UAAA;AAAA,MACA,iBAAA;AAAA,MACA,YAAA,EAAc,YAAY,MAAA,GAAS,CAAA;AAAA,MACnC,iBAAA;AAAA,MACA,aAAA;AAAA,MACA,oBAAoB,iBAAA,GAAoB,cAAA,CAAe,WAAA,EAAa,iBAAA,EAAmB,UAAU,CAAA,GAAI;AAAA,KACvG;AAAA,EACF,CAAC,CAAA,CACA,aAAA,CAAc,CAAC,EAAE,SAAQ,KAAM;AAC9B,IAAA,MAAM,mBAAmB,OAAA,CAAQ,oBAAA;AACjC,IAAA,IAAI,CAAC,gBAAA,EAAkB;AACrB,MAAA,OAAO,CAAA;AAAA,IACT;AAEA,IAAA,OAAO,i
BAAA,CAAkB;AAAA,MACvB,cAAc,gBAAA,CAAiB,YAAA;AAAA,MAC/B,aAAa,gBAAA,CAAiB,WAAA;AAAA,MAC9B,YAAY,gBAAA,CAAiB,UAAA;AAAA,MAC7B,mBAAmB,gBAAA,CAAiB;AAAA,KACrC,CAAA;AAAA,EACH,CAAC,CAAA;AACL","file":"index.js","sourcesContent":["import { createScorer } from '@mastra/core/scores';\nimport nlp from 'compromise';\n\nfunction normalizeString(str: string): string {\n // Remove diacritics and convert to lowercase\n return str\n .normalize('NFD')\n .replace(/[\\u0300-\\u036f]/g, '')\n .toLowerCase();\n}\n\nfunction extractElements(doc: any): string[] {\n // Get more specific elements and ensure they're arrays\n const nouns = doc.nouns().out('array') || [];\n const verbs = doc.verbs().toInfinitive().out('array') || [];\n const topics = doc.topics().out('array') || [];\n const terms = doc.terms().out('array') || [];\n\n // Helper function to clean and split terms\n const cleanAndSplitTerm = (term: string): string[] => {\n // First normalize the string\n const normalized = normalizeString(term);\n\n // Split on word boundaries and filter out empty strings\n return normalized\n .replace(/([a-z])([A-Z])/g, '$1 $2') // Split camelCase\n .replace(/[^a-z0-9]+/g, ' ') // Replace non-alphanumeric with spaces\n .trim()\n .split(/\\s+/)\n .filter(word => word.length > 0);\n };\n\n // Process all elements\n const processedTerms = [\n ...nouns.flatMap(cleanAndSplitTerm),\n ...verbs.flatMap(cleanAndSplitTerm),\n ...topics.flatMap(cleanAndSplitTerm),\n ...terms.flatMap(cleanAndSplitTerm),\n ];\n\n // Remove duplicates\n return [...new Set(processedTerms)];\n}\n\nfunction calculateCoverage({ original, simplified }: { original: string[]; simplified: string[] }): number {\n if (original.length === 0) {\n return simplified.length === 0 ? 
1 : 0;\n }\n\n // Exact matching for short words (3 chars or less), substring matching for longer words\n const covered = original.filter(element =>\n simplified.some(s => {\n const elem = normalizeString(element);\n const simp = normalizeString(s);\n\n // For short words (3 chars or less), require exact match\n if (elem.length <= 3) {\n return elem === simp;\n }\n\n // For longer words, require substantial overlap (more than 60% of the longer word)\n const longer = elem.length > simp.length ? elem : simp;\n const shorter = elem.length > simp.length ? simp : elem;\n\n if (longer.includes(shorter)) {\n return shorter.length / longer.length > 0.6;\n }\n\n return false;\n }),\n );\n return covered.length / original.length;\n}\n\nexport function createCompletenessScorer() {\n return createScorer({\n name: 'Completeness Scorer',\n description:\n 'Leverage the nlp method from \"compromise\" to extract elements from the input and output and calculate the coverage.',\n type: 'agent',\n })\n .preprocess(async ({ run }) => {\n const isInputInvalid =\n !run.input ||\n run.input.inputMessages.some((i: { content: string }) => i.content === null || i.content === undefined);\n\n const isOutputInvalid =\n !run.output || run.output.some((i: { content: string }) => i.content === null || i.content === undefined);\n\n if (isInputInvalid || isOutputInvalid) {\n throw new Error('Inputs cannot be null or undefined');\n }\n\n const input = run.input?.inputMessages.map((i: { content: string }) => i.content).join(', ') || '';\n const output = run.output?.map(({ content }: { content: string }) => content).join(', ') || '';\n\n const inputToProcess = input;\n const outputToProcess = output;\n\n const inputDoc = nlp(inputToProcess.trim());\n const outputDoc = nlp(outputToProcess.trim());\n\n // Extract and log elements\n const inputElements = extractElements(inputDoc);\n const outputElements = extractElements(outputDoc);\n\n return {\n inputElements,\n outputElements,\n missingElements: 
inputElements.filter(e => !outputElements.includes(e)),\n elementCounts: {\n input: inputElements.length,\n output: outputElements.length,\n },\n };\n })\n .generateScore(({ results }) => {\n const inputElements = results.preprocessStepResult?.inputElements;\n const outputElements = results.preprocessStepResult?.outputElements;\n\n return calculateCoverage({\n original: inputElements,\n simplified: outputElements,\n });\n });\n}\n","import { createScorer } from '@mastra/core/scores';\nimport { calculateRatio, countChanges } from '../../../ratio';\n\nexport function createTextualDifferenceScorer() {\n return createScorer({\n name: 'Textual Difference Scorer',\n description: 'Calculate textual difference between input and output using sequence matching algorithms.',\n type: 'agent',\n })\n .preprocess(async ({ run }) => {\n const input = run.input?.inputMessages?.map((i: { content: string }) => i.content).join(', ') || '';\n const output = run.output?.map((i: { content: string }) => i.content).join(', ') || '';\n\n // Calculate similarity ratio using LCS approach (similar to SequenceMatcher.ratio())\n const ratio = calculateRatio(input, output);\n\n // Count changes by comparing words\n const changes = countChanges(input, output);\n\n // Calculate confidence based on text length difference\n const maxLength = Math.max(input.length, output.length);\n const lengthDiff = maxLength > 0 ? 
Math.abs(input.length - output.length) / maxLength : 0;\n const confidence = 1 - lengthDiff;\n\n return {\n ratio,\n confidence,\n changes,\n lengthDiff,\n };\n })\n .generateScore(({ results }) => {\n return results.preprocessStepResult?.ratio;\n });\n}\n","import { createScorer } from '@mastra/core/scores';\nimport keyword_extractor from 'keyword-extractor';\n\nexport function createKeywordCoverageScorer() {\n return createScorer({\n name: 'Keyword Coverage Scorer',\n description:\n 'Leverage the nlp method from \"compromise\" to extract elements from the input and output and calculate the coverage.',\n type: 'agent',\n })\n .preprocess(async ({ run }) => {\n const input = run.input?.inputMessages?.map((i: { content: string }) => i.content).join(', ') || '';\n const output = run.output?.map((i: { content: string }) => i.content).join(', ') || '';\n\n if (!input && !output) {\n return {\n result: {\n referenceKeywords: new Set<string>(),\n responseKeywords: new Set<string>(),\n },\n };\n }\n\n const extractKeywords = (text: string) => {\n return keyword_extractor.extract(text, {\n language: 'english',\n remove_digits: true,\n return_changed_case: true,\n remove_duplicates: true,\n });\n };\n\n const referenceKeywords = new Set(extractKeywords(input));\n const responseKeywords = new Set(extractKeywords(output));\n return {\n referenceKeywords,\n responseKeywords,\n };\n })\n .analyze(async ({ results }) => {\n if (\n !results.preprocessStepResult?.referenceKeywords?.size &&\n !results.preprocessStepResult?.responseKeywords?.size\n ) {\n return {\n totalKeywordsLength: 0,\n matchedKeywordsLength: 0,\n };\n }\n\n const matchedKeywords = [...results.preprocessStepResult?.referenceKeywords].filter(k =>\n results.preprocessStepResult?.responseKeywords?.has(k),\n );\n\n return {\n totalKeywordsLength: Array.from(results.preprocessStepResult?.referenceKeywords).length ?? 0,\n matchedKeywordsLength: matchedKeywords.length ?? 
0,\n };\n })\n .generateScore(({ results }) => {\n if (!results.analyzeStepResult?.totalKeywordsLength) {\n return 1;\n }\n\n const totalKeywords = results.analyzeStepResult?.totalKeywordsLength!;\n const matchedKeywords = results.analyzeStepResult?.matchedKeywordsLength!;\n return totalKeywords > 0 ? matchedKeywords / totalKeywords : 0;\n });\n}\n","import { createScorer } from '@mastra/core/scores';\nimport stringSimilarity from 'string-similarity';\n\ninterface ContentSimilarityOptions {\n ignoreCase?: boolean;\n ignoreWhitespace?: boolean;\n}\n\nexport function createContentSimilarityScorer(\n { ignoreCase, ignoreWhitespace }: ContentSimilarityOptions = { ignoreCase: true, ignoreWhitespace: true },\n) {\n return createScorer({\n name: 'Content Similarity Scorer',\n description: 'Calculates content similarity between input and output messages using string comparison algorithms.',\n type: 'agent',\n })\n .preprocess(async ({ run }) => {\n let processedInput = run.input?.inputMessages.map((i: { content: string }) => i.content).join(', ') || '';\n let processedOutput = run.output.map((i: { content: string }) => i.content).join(', ') || '';\n\n if (ignoreCase) {\n processedInput = processedInput.toLowerCase();\n processedOutput = processedOutput.toLowerCase();\n }\n\n if (ignoreWhitespace) {\n processedInput = processedInput.replace(/\\s+/g, ' ').trim();\n processedOutput = processedOutput.replace(/\\s+/g, ' ').trim();\n }\n\n return {\n processedInput,\n processedOutput,\n };\n })\n .generateScore(({ results }) => {\n const similarity = stringSimilarity.compareTwoStrings(\n results.preprocessStepResult?.processedInput,\n results.preprocessStepResult?.processedOutput,\n );\n\n return similarity;\n });\n}\n","import { createScorer } from '@mastra/core/scores';\nimport Sentiment from 'sentiment';\n\ninterface ToneScorerConfig {\n referenceTone?: string;\n}\n\nexport function createToneScorer(config: ToneScorerConfig = {}) {\n const { referenceTone } = config;\n\n 
return createScorer({\n name: 'Tone Scorer',\n description:\n 'Analyzes the tone and sentiment of agent responses using sentiment analysis. Can compare against a reference tone or evaluate sentiment stability.',\n type: 'agent',\n })\n .preprocess(async ({ run }) => {\n const sentiment = new Sentiment();\n const agentMessage: string = run.output?.map((i: { content: string }) => i.content).join(', ') || '';\n const responseSentiment = sentiment.analyze(agentMessage);\n\n if (referenceTone) {\n // Compare sentiment with reference\n const referenceSentiment = sentiment.analyze(referenceTone);\n const sentimentDiff = Math.abs(responseSentiment.comparative - referenceSentiment.comparative);\n const normalizedScore = Math.max(0, 1 - sentimentDiff);\n\n return {\n score: normalizedScore,\n responseSentiment: responseSentiment.comparative,\n referenceSentiment: referenceSentiment.comparative,\n difference: sentimentDiff,\n };\n }\n\n // Evaluate sentiment stability across response\n const sentences = agentMessage.match(/[^.!?]+[.!?]+/g) || [agentMessage];\n const sentiments = sentences.map(s => sentiment.analyze(s).comparative);\n const avgSentiment = sentiments.reduce((a, b) => a + b, 0) / sentiments.length;\n const variance = sentiments.reduce((sum, s) => sum + Math.pow(s - avgSentiment, 2), 0) / sentiments.length;\n const stability = Math.max(0, 1 - variance);\n\n return {\n score: stability,\n avgSentiment,\n sentimentVariance: variance,\n };\n })\n .generateScore(({ results }) => {\n return results.preprocessStepResult?.score;\n });\n}\n","import { createScorer } from '@mastra/core/scores';\nimport { extractToolCalls } from '../../utils';\ninterface ToolCallAccuracyOptions {\n expectedTool?: string;\n strictMode?: boolean;\n expectedToolOrder?: string[];\n}\n\nfunction checkToolOrder(actualTools: string[], expectedOrder: string[], strictMode: boolean = false): boolean {\n if (strictMode) {\n return JSON.stringify(actualTools) === JSON.stringify(expectedOrder);\n }\n\n 
const expectedIndices: number[] = [];\n for (const expectedTool of expectedOrder) {\n const index = actualTools.indexOf(expectedTool);\n if (index === -1) {\n return false;\n }\n expectedIndices.push(index);\n }\n\n for (let i = 1; i < expectedIndices.length; i++) {\n const currentIndex = expectedIndices[i];\n const prevIndex = expectedIndices[i - 1];\n if (currentIndex !== undefined && prevIndex !== undefined && currentIndex <= prevIndex) {\n return false;\n }\n }\n\n return true;\n}\n\nfunction calculateAccuracy({\n expectedTool,\n actualTools,\n strictMode = false,\n expectedToolOrder,\n}: {\n expectedTool?: string;\n actualTools: string[];\n strictMode?: boolean;\n expectedToolOrder?: string[];\n}): number {\n if (actualTools.length === 0) {\n return 0;\n }\n\n if (expectedToolOrder && expectedToolOrder.length > 0) {\n return checkToolOrder(actualTools, expectedToolOrder, strictMode) ? 1 : 0;\n }\n\n if (!expectedTool) {\n return 0;\n }\n\n if (strictMode) {\n return actualTools.length === 1 && actualTools[0] === expectedTool ? 1 : 0;\n }\n\n return actualTools.includes(expectedTool) ? 1 : 0;\n}\n\nexport function createToolCallAccuracyScorerCode(options: ToolCallAccuracyOptions) {\n const { expectedTool, strictMode = false, expectedToolOrder } = options;\n\n if (!expectedTool && !expectedToolOrder) {\n throw new Error('Either expectedTool or expectedToolOrder must be provided');\n }\n\n const getDescription = () => {\n return expectedToolOrder\n ? 
`Evaluates whether the LLM called tools in the correct order: [${expectedToolOrder.join(', ')}]`\n : `Evaluates whether the LLM selected the correct tool (${expectedTool}) from the available tools`;\n };\n\n return createScorer({\n name: 'Tool Call Accuracy Scorer',\n description: getDescription(),\n type: 'agent',\n })\n .preprocess(async ({ run }) => {\n const isInputInvalid = !run.input || !run.input.inputMessages || run.input.inputMessages.length === 0;\n const isOutputInvalid = !run.output || run.output.length === 0;\n\n if (isInputInvalid || isOutputInvalid) {\n throw new Error('Input and output messages cannot be null or empty');\n }\n\n const { tools: actualTools, toolCallInfos } = extractToolCalls(run.output);\n\n const correctToolCalled = expectedTool\n ? strictMode\n ? actualTools.length === 1 && actualTools[0] === expectedTool\n : actualTools.includes(expectedTool)\n : false;\n\n return {\n expectedTool,\n actualTools,\n strictMode,\n expectedToolOrder,\n hasToolCalls: actualTools.length > 0,\n correctToolCalled,\n toolCallInfos,\n correctOrderCalled: expectedToolOrder ? checkToolOrder(actualTools, expectedToolOrder, strictMode) : null,\n };\n })\n .generateScore(({ results }) => {\n const preprocessResult = results.preprocessStepResult;\n if (!preprocessResult) {\n return 0;\n }\n\n return calculateAccuracy({\n expectedTool: preprocessResult.expectedTool,\n actualTools: preprocessResult.actualTools,\n strictMode: preprocessResult.strictMode,\n expectedToolOrder: preprocessResult.expectedToolOrder,\n });\n });\n}\n"]}
|