@mastra/evals 0.10.5 → 0.10.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE.md +11 -42
- package/README.md +0 -7
- package/dist/_tsup-dts-rollup.d.cts +217 -0
- package/dist/_tsup-dts-rollup.d.ts +217 -0
- package/dist/chunk-2JVD5IX6.cjs +8 -0
- package/dist/chunk-UYXFD4VX.js +6 -0
- package/dist/{dist-M6SH7RKY.js → dist-5JXLPLM2.js} +8 -8
- package/dist/{dist-HYT46G4X.cjs → dist-IVAARSAW.cjs} +8 -8
- package/dist/index.cjs +1 -1
- package/dist/index.js +1 -1
- package/dist/{magic-string.es-WF7K5PCM.cjs → magic-string.es-66FD77JZ.cjs} +7 -13
- package/dist/{magic-string.es-2DLRP5BO.js → magic-string.es-LD4FLE5J.js} +7 -13
- package/dist/metrics/llm/index.cjs +13 -17
- package/dist/metrics/llm/index.js +2 -6
- package/dist/scorers/code/index.cjs +220 -0
- package/dist/scorers/code/index.d.cts +4 -0
- package/dist/scorers/code/index.d.ts +4 -0
- package/dist/scorers/code/index.js +209 -0
- package/dist/scorers/llm/index.cjs +1036 -0
- package/dist/scorers/llm/index.d.cts +11 -0
- package/dist/scorers/llm/index.d.ts +11 -0
- package/dist/scorers/llm/index.js +1028 -0
- package/package.json +28 -8
|
@@ -1,14 +1,10 @@
|
|
|
1
1
|
'use strict';
|
|
2
2
|
|
|
3
3
|
var chunkCOBCYVZ7_cjs = require('../../chunk-COBCYVZ7.cjs');
|
|
4
|
+
var chunk2JVD5IX6_cjs = require('../../chunk-2JVD5IX6.cjs');
|
|
4
5
|
var _eval = require('@mastra/core/eval');
|
|
5
6
|
var zod = require('zod');
|
|
6
7
|
|
|
7
|
-
// src/metrics/llm/utils.ts
|
|
8
|
-
var roundToTwoDecimals = (num) => {
|
|
9
|
-
return Math.round((num + Number.EPSILON) * 100) / 100;
|
|
10
|
-
};
|
|
11
|
-
|
|
12
8
|
// src/metrics/llm/answer-relevancy/prompts.ts
|
|
13
9
|
var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = `You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.
|
|
14
10
|
|
|
@@ -187,7 +183,7 @@ function generateReasonPrompt({
|
|
|
187
183
|
output,
|
|
188
184
|
scale
|
|
189
185
|
}) {
|
|
190
|
-
return `Explain the
|
|
186
|
+
return `Explain the relevancy score where 0 is the lowest and ${scale} is the highest for the LLM's response using this context:
|
|
191
187
|
Context:
|
|
192
188
|
Input: ${input}
|
|
193
189
|
Output: ${output}
|
|
@@ -288,7 +284,7 @@ var AnswerRelevancyMetric = class extends _eval.Metric {
|
|
|
288
284
|
}
|
|
289
285
|
}
|
|
290
286
|
const score = relevancyCount / numberOfVerdicts;
|
|
291
|
-
return roundToTwoDecimals(score * this.scale);
|
|
287
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
|
|
292
288
|
}
|
|
293
289
|
};
|
|
294
290
|
|
|
@@ -492,7 +488,7 @@ var ContextPositionMetric = class extends _eval.Metric {
|
|
|
492
488
|
return 0;
|
|
493
489
|
}
|
|
494
490
|
const finalScore = weightedSum / maxPossibleSum * this.scale;
|
|
495
|
-
return roundToTwoDecimals(finalScore);
|
|
491
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(finalScore);
|
|
496
492
|
}
|
|
497
493
|
};
|
|
498
494
|
|
|
@@ -700,7 +696,7 @@ var ContextPrecisionMetric = class extends _eval.Metric {
|
|
|
700
696
|
return 0;
|
|
701
697
|
}
|
|
702
698
|
const finalScore = weightedPrecisionSum / relevantCount;
|
|
703
|
-
return roundToTwoDecimals(finalScore * this.scale);
|
|
699
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(finalScore * this.scale);
|
|
704
700
|
}
|
|
705
701
|
};
|
|
706
702
|
|
|
@@ -938,7 +934,7 @@ var FaithfulnessMetric = class extends _eval.Metric {
|
|
|
938
934
|
return 0;
|
|
939
935
|
}
|
|
940
936
|
const score = supportedClaims / totalClaims * this.scale;
|
|
941
|
-
return roundToTwoDecimals(score);
|
|
937
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(score);
|
|
942
938
|
}
|
|
943
939
|
};
|
|
944
940
|
|
|
@@ -1155,7 +1151,7 @@ var HallucinationMetric = class extends _eval.Metric {
|
|
|
1155
1151
|
return 0;
|
|
1156
1152
|
}
|
|
1157
1153
|
const score = contradictedStatements / totalStatements * this.scale;
|
|
1158
|
-
return roundToTwoDecimals(score);
|
|
1154
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(score);
|
|
1159
1155
|
}
|
|
1160
1156
|
};
|
|
1161
1157
|
|
|
@@ -1459,7 +1455,7 @@ var PromptAlignmentMetric = class extends _eval.Metric {
|
|
|
1459
1455
|
},
|
|
1460
1456
|
{ naCount: 0, alignmentCount: 0, applicableCount: 0 }
|
|
1461
1457
|
);
|
|
1462
|
-
const score = counts.applicableCount > 0 ? roundToTwoDecimals(counts.alignmentCount / counts.applicableCount * this.scale) : 0;
|
|
1458
|
+
const score = counts.applicableCount > 0 ? chunk2JVD5IX6_cjs.roundToTwoDecimals(counts.alignmentCount / counts.applicableCount * this.scale) : 0;
|
|
1463
1459
|
return {
|
|
1464
1460
|
score,
|
|
1465
1461
|
totalInstructions,
|
|
@@ -1619,7 +1615,7 @@ var ToxicityMetric = class extends _eval.Metric {
|
|
|
1619
1615
|
}
|
|
1620
1616
|
}
|
|
1621
1617
|
const score = toxicityCount / numberOfVerdicts;
|
|
1622
|
-
return roundToTwoDecimals(score * this.scale);
|
|
1618
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
|
|
1623
1619
|
}
|
|
1624
1620
|
};
|
|
1625
1621
|
|
|
@@ -1795,7 +1791,7 @@ var ContextRelevancyMetric = class extends _eval.Metric {
|
|
|
1795
1791
|
}
|
|
1796
1792
|
const relevantVerdicts = verdicts.filter((v) => v.verdict.toLowerCase() === "yes");
|
|
1797
1793
|
const score = relevantVerdicts.length / totalVerdicts;
|
|
1798
|
-
return roundToTwoDecimals(score * this.scale);
|
|
1794
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
|
|
1799
1795
|
}
|
|
1800
1796
|
};
|
|
1801
1797
|
|
|
@@ -1941,7 +1937,7 @@ var ContextualRecallMetric = class extends _eval.Metric {
|
|
|
1941
1937
|
}
|
|
1942
1938
|
const justifiedVerdicts = verdicts.filter((v) => v.verdict === "yes");
|
|
1943
1939
|
const score = justifiedVerdicts.length / totalVerdicts;
|
|
1944
|
-
return roundToTwoDecimals(score * this.scale);
|
|
1940
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
|
|
1945
1941
|
}
|
|
1946
1942
|
};
|
|
1947
1943
|
|
|
@@ -2288,7 +2284,7 @@ var SummarizationMetric = class extends _eval.Metric {
|
|
|
2288
2284
|
}
|
|
2289
2285
|
}
|
|
2290
2286
|
const score = positiveCount / numberOfVerdicts;
|
|
2291
|
-
return roundToTwoDecimals(score * this.scale);
|
|
2287
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
|
|
2292
2288
|
}
|
|
2293
2289
|
};
|
|
2294
2290
|
|
|
@@ -2466,7 +2462,7 @@ var BiasMetric = class extends _eval.Metric {
|
|
|
2466
2462
|
}
|
|
2467
2463
|
const biasedVerdicts = evaluation.filter((v) => v.verdict.toLowerCase() === "yes");
|
|
2468
2464
|
const score = biasedVerdicts.length / numberOfVerdicts;
|
|
2469
|
-
return roundToTwoDecimals(score * this.scale);
|
|
2465
|
+
return chunk2JVD5IX6_cjs.roundToTwoDecimals(score * this.scale);
|
|
2470
2466
|
}
|
|
2471
2467
|
};
|
|
2472
2468
|
|
|
@@ -1,12 +1,8 @@
|
|
|
1
1
|
import { MastraAgentJudge } from '../../chunk-TXXJUIES.js';
|
|
2
|
+
import { roundToTwoDecimals } from '../../chunk-UYXFD4VX.js';
|
|
2
3
|
import { Metric } from '@mastra/core/eval';
|
|
3
4
|
import { z } from 'zod';
|
|
4
5
|
|
|
5
|
-
// src/metrics/llm/utils.ts
|
|
6
|
-
var roundToTwoDecimals = (num) => {
|
|
7
|
-
return Math.round((num + Number.EPSILON) * 100) / 100;
|
|
8
|
-
};
|
|
9
|
-
|
|
10
6
|
// src/metrics/llm/answer-relevancy/prompts.ts
|
|
11
7
|
var ANSWER_RELEVANCY_AGENT_INSTRUCTIONS = `You are a balanced and nuanced answer relevancy evaluator. Your job is to determine if LLM outputs are relevant to the input, including handling partially relevant or uncertain cases.
|
|
12
8
|
|
|
@@ -185,7 +181,7 @@ function generateReasonPrompt({
|
|
|
185
181
|
output,
|
|
186
182
|
scale
|
|
187
183
|
}) {
|
|
188
|
-
return `Explain the
|
|
184
|
+
return `Explain the relevancy score where 0 is the lowest and ${scale} is the highest for the LLM's response using this context:
|
|
189
185
|
Context:
|
|
190
186
|
Input: ${input}
|
|
191
187
|
Output: ${output}
|
|
@@ -0,0 +1,220 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var scores = require('@mastra/core/scores');
|
|
4
|
+
var nlp = require('compromise');
|
|
5
|
+
var difflib = require('difflib');
|
|
6
|
+
var keyword_extractor = require('keyword-extractor');
|
|
7
|
+
var stringSimilarity = require('string-similarity');
|
|
8
|
+
|
|
9
|
+
/**
 * Bundler interop helper: gives a required module a `.default` property.
 *
 * @param {*} e - The value returned by `require`.
 * @returns {*} `e` itself when it is truthy and flagged `__esModule`;
 *   otherwise a new object `{ default: e }` (this includes falsy values).
 */
function _interopDefault(e) {
  const isEsModule = Boolean(e) && Boolean(e.__esModule);
  if (isEsModule) {
    return e;
  }
  return { default: e };
}
|
|
10
|
+
|
|
11
|
+
var nlp__default = /*#__PURE__*/_interopDefault(nlp);
|
|
12
|
+
var keyword_extractor__default = /*#__PURE__*/_interopDefault(keyword_extractor);
|
|
13
|
+
var stringSimilarity__default = /*#__PURE__*/_interopDefault(stringSimilarity);
|
|
14
|
+
|
|
15
|
+
/**
 * Produces an accent-free, lowercase form of a string.
 *
 * The string is decomposed to NFD, code points in U+0300–U+036F are removed,
 * and the remainder is lowercased.
 *
 * @param {string} str - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeString(str) {
  const decomposed = str.normalize("NFD");
  const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, "");
  return withoutMarks.toLowerCase();
}
|
|
18
|
+
/**
 * Collects the distinct normalized words found in a parsed document.
 *
 * Reads four term lists from `doc` (`nouns()`, `verbs().toInfinitive()`,
 * `topics()` and `terms()`, each via `.out("array")`), splits every term into
 * lowercase alphanumeric words with diacritics removed, and de-duplicates the
 * result while keeping first-seen order.
 *
 * @param {object} doc - Object exposing `nouns()`, `verbs()`, `topics()` and
 *   `terms()` as used above.
 * @returns {string[]} Unique words, in order of first appearance.
 */
function extractElements(doc) {
  const nouns = doc.nouns().out("array") || [];
  const verbs = doc.verbs().toInfinitive().out("array") || [];
  const topics = doc.topics().out("array") || [];
  const terms = doc.terms().out("array") || [];
  const cleanAndSplitTerm = (term) => {
    // Strip diacritics first so accented letters take part in the camelCase
    // split below.
    const unaccented = term.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
    return unaccented
      // The camelCase split must run BEFORE lowercasing; once the text is
      // lowercase the [A-Z] half of the pattern can never match.
      .replace(/([a-z])([A-Z])/g, "$1 $2")
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, " ")
      .trim()
      .split(/\s+/)
      .filter((word) => word.length > 0);
  };
  const processedTerms = [nouns, verbs, topics, terms].flatMap(
    (group) => group.flatMap(cleanAndSplitTerm)
  );
  return [...new Set(processedTerms)];
}
|
|
35
|
+
/**
 * Computes the fraction of `original` elements that are matched by at least
 * one element of `simplified`.
 *
 * Both sides are compared after `normalizeString`. An original element of
 * three characters or fewer must match exactly; a longer one matches when one
 * string contains the other and the shorter is more than 60% of the longer's
 * length.
 *
 * @param {{ original: string[], simplified: string[] }} param0
 * @returns {number} Coverage ratio. When `original` is empty the result is 1
 *   if `simplified` is also empty, otherwise 0.
 */
function calculateCoverage({ original, simplified }) {
  if (original.length === 0) {
    return simplified.length === 0 ? 1 : 0;
  }
  const isCoveredBy = (element, candidate) => {
    const elem = normalizeString(element);
    const simp = normalizeString(candidate);
    if (elem.length <= 3) {
      return elem === simp;
    }
    const [shorter, longer] = elem.length > simp.length ? [simp, elem] : [elem, simp];
    return longer.includes(shorter) && shorter.length / longer.length > 0.6;
  };
  const coveredCount = original.reduce((count, element) => {
    const matched = simplified.some((candidate) => isCoveredBy(element, candidate));
    return matched ? count + 1 : count;
  }, 0);
  return coveredCount / original.length;
}
|
|
56
|
+
/**
 * Builds the "Completeness" scorer via `scores.createScorer`.
 *
 * - `extract` joins the input message contents with ", ", parses input and
 *   output text with the `compromise` default export, and records the elements
 *   extracted from each side plus the input elements absent from the output.
 *   It throws when the input, any message content, the output, or the output
 *   text is null or undefined.
 * - `analyze` scores the run with `calculateCoverage` over the extracted
 *   input and output elements.
 *
 * @returns {*} Whatever `scores.createScorer` returns for this configuration.
 */
function createCompletenessScorer() {
  const hasMissingContent = (message) => message.content == null;

  const extract = async (run) => {
    const inputIsInvalid = !run.input || run.input.some(hasMissingContent);
    const outputIsInvalid = !run.output || run.output.text == null;
    if (inputIsInvalid || outputIsInvalid) {
      throw new Error("Inputs cannot be null or undefined");
    }
    const inputText = run.input.map((message) => message.content).join(", ");
    const inputDoc = nlp__default.default(inputText.trim());
    const outputDoc = nlp__default.default(run.output.text.trim());
    const inputElements = extractElements(inputDoc);
    const outputElements = extractElements(outputDoc);
    const missingElements = inputElements.filter((element) => !outputElements.includes(element));
    return {
      result: {
        inputElements,
        outputElements,
        missingElements,
        elementCounts: {
          input: inputElements.length,
          output: outputElements.length
        }
      }
    };
  };

  const analyze = async (run) => {
    const original = run.extractStepResult?.inputElements;
    const simplified = run.extractStepResult?.outputElements;
    return { score: calculateCoverage({ original, simplified }) };
  };

  return scores.createScorer({
    name: "Completeness",
    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
    extract,
    analyze
  });
}
|
|
98
|
+
/**
 * Builds the textual-difference scorer via `scores.createScorer`.
 *
 * `analyze` joins the input message contents with ", " and compares that text
 * with `run.output.text` using `difflib.SequenceMatcher`. The score is the
 * matcher's `ratio()`. The result also reports:
 * - `changes`: number of opcodes whose tag is not "equal";
 * - `lengthDiff`: absolute length difference divided by the longer length
 *   (0 when both texts are empty);
 * - `confidence`: `1 - lengthDiff`.
 *
 * The name and description previously duplicated the Completeness scorer's;
 * they now describe this scorer.
 *
 * @returns {*} Whatever `scores.createScorer` returns for this configuration.
 */
function createTextualDifferenceScorer() {
  return scores.createScorer({
    name: "Textual Difference",
    description: 'Leverage the SequenceMatcher from "difflib" to compare the input and output text and calculate their similarity ratio.',
    analyze: async (run) => {
      const input = run.input.map((i) => i.content).join(", ");
      const output = run.output.text;
      const matcher = new difflib.SequenceMatcher(null, input, output);
      const ratio = matcher.ratio();
      const changes = matcher.getOpcodes().filter(([op]) => op !== "equal").length;
      const maxLength = Math.max(input.length, output.length);
      // Guard the division so two empty texts yield a difference of 0.
      const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
      const confidence = 1 - lengthDiff;
      return {
        score: ratio,
        result: {
          confidence,
          changes,
          lengthDiff
        }
      };
    }
  });
}
|
|
123
|
+
/**
 * Builds the keyword-coverage scorer via `scores.createScorer`.
 *
 * - `extract` joins the input message contents with ", " and runs the
 *   `keyword-extractor` default export over the input and output text
 *   (English, digits removed, case changed, duplicates removed), returning
 *   both keyword collections as Sets. When both texts are empty it returns
 *   two empty Sets without calling the extractor.
 * - `analyze` scores the share of reference (input) keywords that also appear
 *   in the response (output) keywords. When neither set has any entries the
 *   score is 1; when only the reference set is empty the score is 0.
 *
 * The name and description previously duplicated the Completeness scorer's;
 * they now describe this scorer.
 *
 * @returns {*} Whatever `scores.createScorer` returns for this configuration.
 */
function createKeywordCoverageScorer() {
  const extractKeywords = (text) => keyword_extractor__default.default.extract(text, {
    language: "english",
    remove_digits: true,
    return_changed_case: true,
    remove_duplicates: true
  });
  return scores.createScorer({
    name: "Keyword Coverage",
    description: 'Leverage the "keyword-extractor" package to extract keywords from the input and output and calculate how many input keywords the output covers.',
    extract: async (run) => {
      const input = run.input.map((i) => i.content).join(", ");
      const output = run.output.text;
      if (!input && !output) {
        return {
          result: {
            referenceKeywords: /* @__PURE__ */ new Set(),
            responseKeywords: /* @__PURE__ */ new Set()
          }
        };
      }
      return {
        result: {
          referenceKeywords: new Set(extractKeywords(input)),
          responseKeywords: new Set(extractKeywords(output))
        }
      };
    },
    analyze: async (run) => {
      // Read the extract-step output once instead of re-walking the optional
      // chain for every use.
      const referenceKeywords = run.extractStepResult?.referenceKeywords;
      const responseKeywords = run.extractStepResult?.responseKeywords;
      if (!referenceKeywords?.size && !responseKeywords?.size) {
        return {
          score: 1,
          result: {
            totalKeywords: 0,
            matchedKeywords: 0
          }
        };
      }
      const totalKeywords = referenceKeywords?.size ?? 0;
      const matchedKeywords = [...(referenceKeywords ?? [])].filter((k) => responseKeywords?.has(k)).length;
      return {
        score: totalKeywords > 0 ? matchedKeywords / totalKeywords : 0,
        result: {
          totalKeywords,
          matchedKeywords
        }
      };
    }
  });
}
|
|
180
|
+
/**
 * Builds the content-similarity scorer via `scores.createScorer`.
 *
 * - `extract` joins the input message contents with ", ", takes
 *   `run.output.text`, and applies the selected preprocessing to both:
 *   lowercasing when `ignoreCase` is set, and collapsing whitespace runs to a
 *   single space plus trimming when `ignoreWhitespace` is set.
 * - `analyze` passes the two processed strings to
 *   `string-similarity`'s `compareTwoStrings` and uses its return value as the
 *   score.
 *
 * Each option defaults to `true` independently. Previously the default applied
 * to the options object as a whole, so passing `{ ignoreCase: false }` also
 * disabled whitespace handling. The name and description previously duplicated
 * the Completeness scorer's; they now describe this scorer.
 *
 * @param {{ ignoreCase?: boolean, ignoreWhitespace?: boolean }} [options]
 * @returns {*} Whatever `scores.createScorer` returns for this configuration.
 */
function createContentSimilarityScorer({ ignoreCase = true, ignoreWhitespace = true } = {}) {
  const preprocess = (text) => {
    let processed = text;
    if (ignoreCase) {
      processed = processed.toLowerCase();
    }
    if (ignoreWhitespace) {
      processed = processed.replace(/\s+/g, " ").trim();
    }
    return processed;
  };
  return scores.createScorer({
    name: "Content Similarity",
    description: 'Leverage the "string-similarity" package to compare the input and output text, optionally ignoring case and whitespace differences.',
    extract: async (run) => {
      const rawInput = run.input.map((i) => i.content).join(", ");
      const rawOutput = run.output.text;
      return {
        result: {
          processedInput: preprocess(rawInput),
          processedOutput: preprocess(rawOutput)
        }
      };
    },
    analyze: async (run) => {
      const similarity = stringSimilarity__default.default.compareTwoStrings(
        run.extractStepResult?.processedInput,
        run.extractStepResult?.processedOutput
      );
      return {
        score: similarity,
        result: {
          similarity
        }
      };
    }
  });
}
|
|
216
|
+
|
|
217
|
+
exports.createCompletenessScorer = createCompletenessScorer;
|
|
218
|
+
exports.createContentSimilarityScorer = createContentSimilarityScorer;
|
|
219
|
+
exports.createKeywordCoverageScorer = createKeywordCoverageScorer;
|
|
220
|
+
exports.createTextualDifferenceScorer = createTextualDifferenceScorer;
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export { createCompletenessScorer_alias_1 as createCompletenessScorer } from '../../_tsup-dts-rollup.cjs';
|
|
2
|
+
export { createTextualDifferenceScorer_alias_1 as createTextualDifferenceScorer } from '../../_tsup-dts-rollup.cjs';
|
|
3
|
+
export { createKeywordCoverageScorer_alias_1 as createKeywordCoverageScorer } from '../../_tsup-dts-rollup.cjs';
|
|
4
|
+
export { createContentSimilarityScorer_alias_1 as createContentSimilarityScorer } from '../../_tsup-dts-rollup.cjs';
|
|
@@ -0,0 +1,4 @@
|
|
|
1
|
+
export { createCompletenessScorer_alias_1 as createCompletenessScorer } from '../../_tsup-dts-rollup.js';
|
|
2
|
+
export { createTextualDifferenceScorer_alias_1 as createTextualDifferenceScorer } from '../../_tsup-dts-rollup.js';
|
|
3
|
+
export { createKeywordCoverageScorer_alias_1 as createKeywordCoverageScorer } from '../../_tsup-dts-rollup.js';
|
|
4
|
+
export { createContentSimilarityScorer_alias_1 as createContentSimilarityScorer } from '../../_tsup-dts-rollup.js';
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
import { createScorer } from '@mastra/core/scores';
|
|
2
|
+
import nlp from 'compromise';
|
|
3
|
+
import { SequenceMatcher } from 'difflib';
|
|
4
|
+
import keyword_extractor from 'keyword-extractor';
|
|
5
|
+
import stringSimilarity from 'string-similarity';
|
|
6
|
+
|
|
7
|
+
/**
 * Produces an accent-free, lowercase form of a string.
 *
 * The string is decomposed to NFD, code points in U+0300–U+036F are removed,
 * and the remainder is lowercased.
 *
 * @param {string} str - Text to normalize.
 * @returns {string} The normalized text.
 */
function normalizeString(str) {
  const decomposed = str.normalize("NFD");
  const withoutMarks = decomposed.replace(/[\u0300-\u036f]/g, "");
  return withoutMarks.toLowerCase();
}
|
|
10
|
+
/**
 * Collects the distinct normalized words found in a parsed document.
 *
 * Reads four term lists from `doc` (`nouns()`, `verbs().toInfinitive()`,
 * `topics()` and `terms()`, each via `.out("array")`), splits every term into
 * lowercase alphanumeric words with diacritics removed, and de-duplicates the
 * result while keeping first-seen order.
 *
 * @param {object} doc - Object exposing `nouns()`, `verbs()`, `topics()` and
 *   `terms()` as used above.
 * @returns {string[]} Unique words, in order of first appearance.
 */
function extractElements(doc) {
  const nouns = doc.nouns().out("array") || [];
  const verbs = doc.verbs().toInfinitive().out("array") || [];
  const topics = doc.topics().out("array") || [];
  const terms = doc.terms().out("array") || [];
  const cleanAndSplitTerm = (term) => {
    // Strip diacritics first so accented letters take part in the camelCase
    // split below.
    const unaccented = term.normalize("NFD").replace(/[\u0300-\u036f]/g, "");
    return unaccented
      // The camelCase split must run BEFORE lowercasing; once the text is
      // lowercase the [A-Z] half of the pattern can never match.
      .replace(/([a-z])([A-Z])/g, "$1 $2")
      .toLowerCase()
      .replace(/[^a-z0-9]+/g, " ")
      .trim()
      .split(/\s+/)
      .filter((word) => word.length > 0);
  };
  const processedTerms = [nouns, verbs, topics, terms].flatMap(
    (group) => group.flatMap(cleanAndSplitTerm)
  );
  return [...new Set(processedTerms)];
}
|
|
27
|
+
/**
 * Computes the fraction of `original` elements that are matched by at least
 * one element of `simplified`.
 *
 * Both sides are compared after `normalizeString`. An original element of
 * three characters or fewer must match exactly; a longer one matches when one
 * string contains the other and the shorter is more than 60% of the longer's
 * length.
 *
 * @param {{ original: string[], simplified: string[] }} param0
 * @returns {number} Coverage ratio. When `original` is empty the result is 1
 *   if `simplified` is also empty, otherwise 0.
 */
function calculateCoverage({ original, simplified }) {
  if (original.length === 0) {
    return simplified.length === 0 ? 1 : 0;
  }
  const isCoveredBy = (element, candidate) => {
    const elem = normalizeString(element);
    const simp = normalizeString(candidate);
    if (elem.length <= 3) {
      return elem === simp;
    }
    const [shorter, longer] = elem.length > simp.length ? [simp, elem] : [elem, simp];
    return longer.includes(shorter) && shorter.length / longer.length > 0.6;
  };
  const coveredCount = original.reduce((count, element) => {
    const matched = simplified.some((candidate) => isCoveredBy(element, candidate));
    return matched ? count + 1 : count;
  }, 0);
  return coveredCount / original.length;
}
|
|
48
|
+
/**
 * Builds the "Completeness" scorer via `createScorer`.
 *
 * - `extract` joins the input message contents with ", ", parses input and
 *   output text with `nlp` (the `compromise` default import), and records the
 *   elements extracted from each side plus the input elements absent from the
 *   output. It throws when the input, any message content, the output, or the
 *   output text is null or undefined.
 * - `analyze` scores the run with `calculateCoverage` over the extracted
 *   input and output elements.
 *
 * @returns {*} Whatever `createScorer` returns for this configuration.
 */
function createCompletenessScorer() {
  const hasMissingContent = (message) => message.content == null;

  const extract = async (run) => {
    const inputIsInvalid = !run.input || run.input.some(hasMissingContent);
    const outputIsInvalid = !run.output || run.output.text == null;
    if (inputIsInvalid || outputIsInvalid) {
      throw new Error("Inputs cannot be null or undefined");
    }
    const inputText = run.input.map((message) => message.content).join(", ");
    const inputDoc = nlp(inputText.trim());
    const outputDoc = nlp(run.output.text.trim());
    const inputElements = extractElements(inputDoc);
    const outputElements = extractElements(outputDoc);
    const missingElements = inputElements.filter((element) => !outputElements.includes(element));
    return {
      result: {
        inputElements,
        outputElements,
        missingElements,
        elementCounts: {
          input: inputElements.length,
          output: outputElements.length
        }
      }
    };
  };

  const analyze = async (run) => {
    const original = run.extractStepResult?.inputElements;
    const simplified = run.extractStepResult?.outputElements;
    return { score: calculateCoverage({ original, simplified }) };
  };

  return createScorer({
    name: "Completeness",
    description: 'Leverage the nlp method from "compromise" to extract elements from the input and output and calculate the coverage.',
    extract,
    analyze
  });
}
|
|
90
|
+
/**
 * Builds the textual-difference scorer via `createScorer`.
 *
 * `analyze` joins the input message contents with ", " and compares that text
 * with `run.output.text` using difflib's `SequenceMatcher`. The score is the
 * matcher's `ratio()`. The result also reports:
 * - `changes`: number of opcodes whose tag is not "equal";
 * - `lengthDiff`: absolute length difference divided by the longer length
 *   (0 when both texts are empty);
 * - `confidence`: `1 - lengthDiff`.
 *
 * The name and description previously duplicated the Completeness scorer's;
 * they now describe this scorer.
 *
 * @returns {*} Whatever `createScorer` returns for this configuration.
 */
function createTextualDifferenceScorer() {
  return createScorer({
    name: "Textual Difference",
    description: 'Leverage the SequenceMatcher from "difflib" to compare the input and output text and calculate their similarity ratio.',
    analyze: async (run) => {
      const input = run.input.map((i) => i.content).join(", ");
      const output = run.output.text;
      const matcher = new SequenceMatcher(null, input, output);
      const ratio = matcher.ratio();
      const changes = matcher.getOpcodes().filter(([op]) => op !== "equal").length;
      const maxLength = Math.max(input.length, output.length);
      // Guard the division so two empty texts yield a difference of 0.
      const lengthDiff = maxLength > 0 ? Math.abs(input.length - output.length) / maxLength : 0;
      const confidence = 1 - lengthDiff;
      return {
        score: ratio,
        result: {
          confidence,
          changes,
          lengthDiff
        }
      };
    }
  });
}
|
|
115
|
+
/**
 * Builds the keyword-coverage scorer via `createScorer`.
 *
 * - `extract` joins the input message contents with ", " and runs
 *   `keyword_extractor.extract` over the input and output text (English,
 *   digits removed, case changed, duplicates removed), returning both keyword
 *   collections as Sets. When both texts are empty it returns two empty Sets
 *   without calling the extractor.
 * - `analyze` scores the share of reference (input) keywords that also appear
 *   in the response (output) keywords. When neither set has any entries the
 *   score is 1; when only the reference set is empty the score is 0.
 *
 * The name and description previously duplicated the Completeness scorer's;
 * they now describe this scorer.
 *
 * @returns {*} Whatever `createScorer` returns for this configuration.
 */
function createKeywordCoverageScorer() {
  const extractKeywords = (text) => keyword_extractor.extract(text, {
    language: "english",
    remove_digits: true,
    return_changed_case: true,
    remove_duplicates: true
  });
  return createScorer({
    name: "Keyword Coverage",
    description: 'Leverage the "keyword-extractor" package to extract keywords from the input and output and calculate how many input keywords the output covers.',
    extract: async (run) => {
      const input = run.input.map((i) => i.content).join(", ");
      const output = run.output.text;
      if (!input && !output) {
        return {
          result: {
            referenceKeywords: /* @__PURE__ */ new Set(),
            responseKeywords: /* @__PURE__ */ new Set()
          }
        };
      }
      return {
        result: {
          referenceKeywords: new Set(extractKeywords(input)),
          responseKeywords: new Set(extractKeywords(output))
        }
      };
    },
    analyze: async (run) => {
      // Read the extract-step output once instead of re-walking the optional
      // chain for every use.
      const referenceKeywords = run.extractStepResult?.referenceKeywords;
      const responseKeywords = run.extractStepResult?.responseKeywords;
      if (!referenceKeywords?.size && !responseKeywords?.size) {
        return {
          score: 1,
          result: {
            totalKeywords: 0,
            matchedKeywords: 0
          }
        };
      }
      const totalKeywords = referenceKeywords?.size ?? 0;
      const matchedKeywords = [...(referenceKeywords ?? [])].filter((k) => responseKeywords?.has(k)).length;
      return {
        score: totalKeywords > 0 ? matchedKeywords / totalKeywords : 0,
        result: {
          totalKeywords,
          matchedKeywords
        }
      };
    }
  });
}
|
|
172
|
+
/**
 * Builds the content-similarity scorer via `createScorer`.
 *
 * - `extract` joins the input message contents with ", ", takes
 *   `run.output.text`, and applies the selected preprocessing to both:
 *   lowercasing when `ignoreCase` is set, and collapsing whitespace runs to a
 *   single space plus trimming when `ignoreWhitespace` is set.
 * - `analyze` passes the two processed strings to
 *   `stringSimilarity.compareTwoStrings` and uses its return value as the
 *   score.
 *
 * Each option defaults to `true` independently. Previously the default applied
 * to the options object as a whole, so passing `{ ignoreCase: false }` also
 * disabled whitespace handling. The name and description previously duplicated
 * the Completeness scorer's; they now describe this scorer.
 *
 * @param {{ ignoreCase?: boolean, ignoreWhitespace?: boolean }} [options]
 * @returns {*} Whatever `createScorer` returns for this configuration.
 */
function createContentSimilarityScorer({ ignoreCase = true, ignoreWhitespace = true } = {}) {
  const preprocess = (text) => {
    let processed = text;
    if (ignoreCase) {
      processed = processed.toLowerCase();
    }
    if (ignoreWhitespace) {
      processed = processed.replace(/\s+/g, " ").trim();
    }
    return processed;
  };
  return createScorer({
    name: "Content Similarity",
    description: 'Leverage the "string-similarity" package to compare the input and output text, optionally ignoring case and whitespace differences.',
    extract: async (run) => {
      const rawInput = run.input.map((i) => i.content).join(", ");
      const rawOutput = run.output.text;
      return {
        result: {
          processedInput: preprocess(rawInput),
          processedOutput: preprocess(rawOutput)
        }
      };
    },
    analyze: async (run) => {
      const similarity = stringSimilarity.compareTwoStrings(
        run.extractStepResult?.processedInput,
        run.extractStepResult?.processedOutput
      );
      return {
        score: similarity,
        result: {
          similarity
        }
      };
    }
  });
}
|
|
208
|
+
|
|
209
|
+
export { createCompletenessScorer, createContentSimilarityScorer, createKeywordCoverageScorer, createTextualDifferenceScorer };
|