llm-testrunner-components 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -48
- package/dist/cjs/index.cjs.js +24610 -60
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.css +14 -2
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +38 -9
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/lib/evaluation/constant.js +4 -0
- package/dist/collection/lib/evaluation/constant.js.map +1 -0
- package/dist/collection/lib/evaluation/constants/evaluation-approach.js +6 -0
- package/dist/collection/lib/evaluation/constants/evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluation-engine.js +28 -44
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluators/exact/exact.js +51 -0
- package/dist/collection/lib/evaluation/evaluators/exact/exact.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.js +82 -0
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.js +73 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.test.js +313 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +63 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/evaluate-keywords.js +56 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/evaluate-keywords.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/index.js +7 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/index.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/model-loader.js +15 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/model-loader.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/similarity-utils.js +16 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/similarity-utils.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.js +65 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/text-utils.js +5 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/text-utils.js.map +1 -0
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js +118 -0
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/rate-limited-fetcher/rate-limited-fetcher.js +6 -6
- package/dist/collection/lib/rate-limited-fetcher/rate-limited-fetcher.js.map +1 -1
- package/dist/collection/types/evaluation.js +6 -0
- package/dist/collection/types/evaluation.js.map +1 -0
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-lpWX1sHl.js +26319 -0
- package/dist/components/p-lpWX1sHl.js.map +1 -0
- package/dist/esm/index.js +24609 -60
- package/dist/esm/index.js.map +1 -1
- package/dist/llm-testrunner/index.esm.js +6 -1
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +4 -1
- package/dist/types/lib/evaluation/constant.d.ts +3 -0
- package/dist/types/lib/evaluation/constants/evaluation-approach.d.ts +4 -0
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +0 -4
- package/dist/types/lib/evaluation/evaluators/exact/exact.d.ts +3 -0
- package/dist/types/lib/evaluation/evaluators/rouge1-evaluator.d.ts +17 -0
- package/dist/types/lib/evaluation/evaluators/rougeL-evaluator.d.ts +2 -0
- package/dist/types/lib/evaluation/evaluators/rougeL-evaluator.test.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/SemanticEvaluator.d.ts +6 -0
- package/dist/types/lib/evaluation/evaluators/semantic/evaluate-keywords.d.ts +7 -0
- package/dist/types/lib/evaluation/evaluators/semantic/index.d.ts +2 -0
- package/dist/types/lib/evaluation/evaluators/semantic/model-loader.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/similarity-utils.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/text-utils.d.ts +1 -0
- package/dist/types/lib/evaluation/rouge1-evaluator.test.d.ts +1 -0
- package/dist/types/lib/evaluation/types.d.ts +19 -0
- package/dist/types/lib/rate-limited-fetcher/rate-limited-fetcher.d.ts +1 -1
- package/dist/types/types/evaluation.d.ts +10 -0
- package/package.json +10 -6
- package/dist/components/p-CYUbsbxt.js +0 -1770
- package/dist/components/p-CYUbsbxt.js.map +0 -1
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import * as rouge from "js-rouge";
|
|
2
|
+
import { DEFAULT_ROUGE_PASS_SCORE, ROUGE } from "../constant";
|
|
3
|
+
/**
 * Scores one expected keyword against the response text with ROUGE-1 and
 * decides whether it counts as found.
 *
 * The score is whatever `rouge.n` returns for unigrams (`n: 1`); a NaN result
 * is treated as 0. When the trimmed keyword or the candidate is empty the
 * score stays 0 and a warning is logged. Any exception raised while scoring
 * is logged and also leaves the score at 0, so this function never throws.
 *
 * @param {string} keyword - The expected keyword (trimmed before scoring)
 * @param {string} candidate - The actual response text
 * @param {number} rougeThreshold - Minimum score for the keyword to be "found"
 * @returns {KeywordMatch} The original keyword, the found flag, and the score
 */
function evaluateKeyword(keyword, candidate, rougeThreshold) {
    // Returns the ROUGE-1 score, or 0 when there is nothing to compare.
    const computeScore = () => {
        const reference = keyword.trim();
        if (reference.length === 0 || candidate.length === 0) {
            console.warn(`ROUGE-1 not computed for keyword "${keyword}": Keyword or Candidate is missing.`);
            return 0;
        }
        const raw = rouge.n(candidate, reference, { n: 1 });
        return isNaN(raw) ? 0 : raw;
    };

    let score = 0;
    try {
        score = computeScore();
    }
    catch (err) {
        // Best effort: a scoring failure is reported and counted as a miss.
        console.error(`ROUGE-1 computation failed for keyword "${keyword}":`, err);
    }

    return {
        keyword,
        found: score >= rougeThreshold,
        evaluationApproachResult: {
            score,
            approachUsed: ROUGE
        }
    };
}
|
|
36
|
+
/**
 * Runs a ROUGE-1 keyword evaluation for one test case.
 *
 * Every entry of `request.expectedKeywords` is scored independently against
 * the trimmed `request.actualResponse` by `evaluateKeyword`. The test case
 * passes only when every keyword is found. The overall score is the fraction
 * of keywords that were found.
 *
 * An empty keyword list is vacuously passing and reports a score of 1. This
 * matches `performRougeLEvaluation` and avoids the NaN that `0 / 0` would
 * otherwise produce alongside `passed: true`.
 *
 * @param {EvaluationRequest} request - Test case id, actual response, expected
 *   keywords and evaluation parameters (an optional `threshold` is read from
 *   them; `DEFAULT_ROUGE_PASS_SCORE` is used when it is null or undefined)
 * @returns {Promise<EvaluationResult>} Per-keyword matches plus the overall
 *   verdict; `sourceLinkMatches` is always empty for this approach, and the
 *   returned `evaluationParameters.threshold` is the threshold actually used
 *
 * @example
 * const result = await performRouge1Evaluation({
 *   testCaseId: "test-001",
 *   actualResponse: "The quick brown fox",
 *   expectedKeywords: ["quick fox"],
 *   evaluationParameters: { approach: "rouge", threshold: 0.5 }
 * });
 * // result.keywordMatches[0].keyword === "quick fox"
 */
export async function performRouge1Evaluation(request) {
    const { testCaseId, actualResponse, expectedKeywords, evaluationParameters } = request;
    // A missing response is evaluated as an empty string.
    const candidate = (actualResponse || '').trim();
    const rougeThreshold = evaluationParameters.threshold ?? DEFAULT_ROUGE_PASS_SCORE;

    const keywordMatches = expectedKeywords.map(keyword => evaluateKeyword(keyword, candidate, rougeThreshold));
    const totalKeywords = keywordMatches.length;
    const keywordsPassed = keywordMatches.filter(match => match.found).length;

    const overallApproachResult = {
        // Guard the empty list: 0 / 0 is NaN, which would contradict passed === true.
        score: totalKeywords > 0 ? keywordsPassed / totalKeywords : 1,
        approachUsed: ROUGE
    };

    return {
        testCaseId: testCaseId,
        passed: keywordsPassed === totalKeywords,
        keywordMatches: keywordMatches,
        sourceLinkMatches: [],
        timestamp: new Date().toISOString(),
        evaluationParameters: {
            ...evaluationParameters,
            threshold: rougeThreshold
        },
        evaluationApproachResult: overallApproachResult
    };
}
|
|
82
|
+
//# sourceMappingURL=rouge1-evaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rouge1-evaluator.js","sourceRoot":"","sources":["../../../../src/lib/evaluation/evaluators/rouge1-evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,KAAK,MAAM,UAAU,CAAC;AAGlC,OAAO,EAAE,wBAAwB,EAAE,KAAK,EAAE,MAAM,aAAa,CAAC;AAE9D;;;;;;;GAOG;AACH,SAAS,eAAe,CAAC,OAAe,EAAE,SAAiB,EAAE,cAAsB;IAC/E,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,IAAI,CAAC;QACD,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACpD,MAAM,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,SAAS,EAAE,OAAO,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;YAC5D,WAAW,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAE7C,CAAC;aAAM,CAAC;YACJ,OAAO,CAAC,IAAI,CAAC,qCAAqC,OAAO,qCAAqC,CAAC,CAAC;QACpG,CAAC;IACL,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACX,OAAO,CAAC,KAAK,CAAC,2CAA2C,OAAO,IAAI,EAAE,GAAG,CAAC,CAAC;IAC/E,CAAC;IAED,MAAM,aAAa,GAAG,WAAW,IAAI,cAAc,CAAC;IAEpD,MAAM,qBAAqB,GAA6B;QACpD,KAAK,EAAE,WAAW;QAClB,YAAY,EAAE,KAAK;KACtB,CAAC;IAEF,OAAO;QACH,OAAO,EAAE,OAAO;QAChB,KAAK,EAAE,aAAa;QACpB,wBAAwB,EAAE,qBAAqB;KAClD,CAAC;AACN,CAAC;AAED;;;;;;;;;;;;;;GAcG;AAEH,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAAC,OAA0B;IACpE,MAAM,EACF,UAAU,EACV,cAAc,EACd,gBAAgB,EAChB,oBAAoB,EACvB,GAAG,OAAO,CAAC;IAEZ,MAAM,SAAS,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAChD,MAAM,cAAc,GAAG,oBAAoB,CAAC,SAAS,IAAI,wBAAwB,CAAC;IAElF,IAAI,cAAc,GAAG,CAAC,CAAC;IACvB,MAAM,aAAa,GAAG,gBAAgB,CAAC,MAAM,CAAC;IAE9C,MAAM,cAAc,GAAmB,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE;QAClE,MAAM,KAAK,GAAG,eAAe,CAAC,OAAO,EAAE,SAAS,EAAE,cAAc,CAAC,CAAC;QAElE,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;YACd,cAAc,EAAE,CAAC;QACrB,CAAC;QAED,OAAO,KAAK,CAAC;IACjB,CAAC,CAAC,CAAC;IAEH,MAAM,aAAa,GAAG,cAAc,KAAK,aAAa,CAAC;IAEvD,MAAM,qBAAqB,GAA6B;QACpD,KAAK,EAAE,cAAc,GAAG,aAAa;QACrC,YAAY,EAAE,KAAK;KACtB,CAAC;IAEF,OAAO;QACH,UAAU,EAAE,UAAU;QACtB,MAAM,EAAE,aAAa;QACrB,cAAc,EAAE,cAAc;QAC9B,iBAAiB,EAAE,EAAE;QACrB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,oBAAoB,EAAE;YAClB,GAAG,oBAAoB;YACvB,SAAS,EAAE,cAAc;SAC5B;QACD,wBAAwB,EAAE,qBAAqB;KAClD,CAAC;AACN,CAAC","source
sContent":["import * as rouge from 'js-rouge';\nimport { EvaluationApproachResult } from '../../../types/evaluation';\nimport { EvaluationRequest, EvaluationResult, KeywordMatch } from '../types';\nimport { DEFAULT_ROUGE_PASS_SCORE, ROUGE } from '../constant';\n\n/**\n * Evaluates a single keyword against the candidate text using ROUGE-1.\n * \n * @param {string} keyword - The expected keyword to evaluate\n * @param {string} candidate - The actual response text\n * @param {number} rougeThreshold - The minimum ROUGE-1 score required to pass\n * @returns {KeywordMatch} The evaluation result for this keyword\n */\nfunction evaluateKeyword(keyword: string, candidate: string, rougeThreshold: number): KeywordMatch {\n let rouge1Score = 0;\n\n try {\n if (keyword.trim().length > 0 && candidate.length > 0) {\n const rouge1 = rouge.n(candidate, keyword.trim(), { n: 1 });\n rouge1Score = isNaN(rouge1) ? 0 : rouge1;\n\n } else {\n console.warn(`ROUGE-1 not computed for keyword \"${keyword}\": Keyword or Candidate is missing.`);\n }\n } catch (err) {\n console.error(`ROUGE-1 computation failed for keyword \"${keyword}\":`, err);\n }\n\n const keywordPassed = rouge1Score >= rougeThreshold;\n\n const keywordApproachResult: EvaluationApproachResult = {\n score: rouge1Score,\n approachUsed: ROUGE\n };\n\n return {\n keyword: keyword,\n found: keywordPassed,\n evaluationApproachResult: keywordApproachResult\n };\n}\n\n/**\n * Computes the ROUGE-1 score for a single keyword against the candidate text.\n * \n * ROUGE-1 measures the overlap of unigrams (single words) between the candidate\n * and reference text. A score of 1.0 indicates perfect overlap.\n * \n * @example\n * const match = evaluateSingleKeyword(\n * \"The quick brown fox\",\n * \"quick fox\",\n * 0.5\n * );\n * // Returns: { keyword: \"quick fox\", found: true, score: 0.67, ... }\n * //general idea , here we are doing it. 
by word to word comparison\n */\n\nexport async function performRouge1Evaluation(request: EvaluationRequest): Promise<EvaluationResult> {\n const {\n testCaseId,\n actualResponse,\n expectedKeywords,\n evaluationParameters\n } = request;\n\n const candidate = (actualResponse || '').trim();\n const rougeThreshold = evaluationParameters.threshold ?? DEFAULT_ROUGE_PASS_SCORE;\n\n let keywordsPassed = 0;\n const totalKeywords = expectedKeywords.length;\n\n const keywordMatches: KeywordMatch[] = expectedKeywords.map(keyword => {\n const match = evaluateKeyword(keyword, candidate, rougeThreshold);\n \n if (match.found) {\n keywordsPassed++;\n }\n \n return match;\n });\n\n const overallPassed = keywordsPassed === totalKeywords;\n\n const overallApproachResult: EvaluationApproachResult = {\n score: keywordsPassed / totalKeywords,\n approachUsed: ROUGE\n };\n\n return {\n testCaseId: testCaseId,\n passed: overallPassed,\n keywordMatches: keywordMatches,\n sourceLinkMatches: [],\n timestamp: new Date().toISOString(),\n evaluationParameters: {\n ...evaluationParameters,\n threshold: rougeThreshold\n },\n evaluationApproachResult: overallApproachResult\n };\n}"]}
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
import * as rouge from "js-rouge";
|
|
2
|
+
import { DEFAULT_ROUGE_PASS_SCORE, ROUGE } from "../constant";
|
|
3
|
+
/**
 * Scores one expected keyword against the response text with a ROUGE-L style
 * F1 measure and decides whether it counts as found.
 *
 * Both texts are lower-cased and split on whitespace. A single-token keyword
 * that appears verbatim among the candidate tokens scores 1 directly.
 * Otherwise the score is the F1 of precision (LCS length / candidate tokens)
 * and recall (LCS length / keyword tokens), using `rouge.lcs` for the longest
 * common subsequence. Empty input logs a warning, and any exception while
 * scoring is logged; in both cases the score stays 0.
 *
 * @param {string} keyword - The expected keyword (trimmed before scoring)
 * @param {string} candidate - The actual response text
 * @param {number} rougeThreshold - Minimum score for the keyword to be "found"
 * @returns {KeywordMatch} The original keyword, the found flag, and the score
 */
function evaluateKeyword(keyword, candidate, rougeThreshold) {
    // Division that yields 0 instead of dividing by a non-positive count.
    const ratio = (numerator, count) => (count > 0 ? numerator / count : 0);

    const scoreKeyword = () => {
        const reference = keyword.trim();
        if (!(reference.length > 0 && candidate.length > 0)) {
            console.warn(`ROUGE-L not computed for keyword "${keyword}": Keyword or candidate missing.`);
            return 0;
        }

        const refTokens = reference.toLowerCase().split(/\s+/);
        const candTokens = candidate.toLowerCase().split(/\s+/);

        // Shortcut: an exact single-word hit is a full match.
        if (refTokens.length === 1 && candTokens.includes(refTokens[0])) {
            return 1;
        }

        // rouge.lcs may hand back a length or a sequence; accept either.
        const lcs = rouge.lcs(candTokens, refTokens);
        const lcsLength = typeof lcs === 'number' ? lcs : (lcs?.length ?? 0);

        const recall = ratio(lcsLength, refTokens.length);
        const precision = ratio(lcsLength, candTokens.length);
        const denominator = precision + recall;
        return denominator > 0 ? (2 * precision * recall) / denominator : 0;
    };

    let score = 0;
    try {
        score = scoreKeyword();
    }
    catch (err) {
        // Best effort: a scoring failure is reported and counted as a miss.
        console.error(`ROUGE-L computation failed for keyword "${keyword}":`, err);
    }

    return {
        keyword,
        found: score >= rougeThreshold,
        evaluationApproachResult: {
            score,
            approachUsed: ROUGE,
        },
    };
}
|
|
43
|
+
/**
 * Runs a ROUGE-L keyword evaluation for one test case.
 *
 * Every entry of `request.expectedKeywords` is scored against the trimmed
 * `request.actualResponse` by `evaluateKeyword`. The test case passes only
 * when every keyword is found. The overall score is the fraction of keywords
 * found, or 1 when there are no keywords.
 *
 * @param {EvaluationRequest} request - Test case id, actual response, expected
 *   keywords and evaluation parameters (an optional `threshold` is read from
 *   them; `DEFAULT_ROUGE_PASS_SCORE` is used when it is null or undefined)
 * @returns {EvaluationResult} Per-keyword matches plus the overall verdict;
 *   `sourceLinkMatches` is always empty for this approach
 */
export function performRougeLEvaluation(request) {
    const candidate = (request.actualResponse || '').trim();
    const params = request.evaluationParameters;
    const threshold = params.threshold ?? DEFAULT_ROUGE_PASS_SCORE;

    const keywords = request.expectedKeywords;
    const total = keywords.length;
    const matches = keywords.map((keyword) => evaluateKeyword(keyword, candidate, threshold));
    const passedCount = matches.filter((match) => match.found).length;

    // With no keywords there is nothing to miss, so the score defaults to 1.
    const overallScore = total > 0 ? passedCount / total : 1;

    return {
        testCaseId: request.testCaseId,
        passed: passedCount === total,
        keywordMatches: matches,
        sourceLinkMatches: [],
        timestamp: new Date().toISOString(),
        evaluationParameters: {
            ...params,
            threshold: threshold,
        },
        evaluationApproachResult: {
            score: overallScore,
            approachUsed: ROUGE,
        },
    };
}
|
|
73
|
+
//# sourceMappingURL=rougeL-evaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rougeL-evaluator.js","sourceRoot":"","sources":["../../../../src/lib/evaluation/evaluators/rougeL-evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,KAAK,MAAM,UAAU,CAAC;AAGlC,OAAO,EAAE,wBAAwB,EAAE,KAAK,EAAE,MAAM,aAAa,CAAC;AAG9D,SAAS,eAAe,CAAC,OAAe,EAAE,SAAiB,EAAE,cAAsB;IACjF,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,IAAI,CAAC;QACH,MAAM,cAAc,GAAG,OAAO,CAAC,IAAI,EAAE,CAAC;QACtC,IAAI,cAAc,CAAC,MAAM,GAAG,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtD,MAAM,eAAe,GAAG,cAAc,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAClE,MAAM,eAAe,GAAG,SAAS,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAE7D,IAAI,eAAe,CAAC,MAAM,KAAK,CAAC,IAAI,eAAe,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;gBACjF,WAAW,GAAG,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBAEN,MAAM,SAAS,GAAG,KAAK,CAAC,GAAG,CAAC,eAAe,EAAE,eAAe,CAAC,CAAC;gBAC9D,MAAM,SAAS,GACb,OAAO,SAAS,KAAK,QAAQ;oBAC3B,CAAC,CAAC,SAAS;oBACX,CAAC,CAAC,CAAC,SAAS,EAAE,MAAM,IAAI,CAAC,CAAC,CAAC;gBAE/B,MAAM,MAAM,GAAG,eAAe,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;gBACnF,MAAM,SAAS,GAAG,eAAe,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,SAAS,GAAG,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC;gBACtF,MAAM,WAAW,GAAG,SAAS,GAAG,MAAM,CAAC;gBAEvC,MAAM,OAAO,GAAG,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,SAAS,GAAG,MAAM,CAAC,GAAG,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC;gBAC7E,WAAW,GAAG,OAAO,CAAC;YACxB,CAAC;QACH,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CAAC,qCAAqC,OAAO,kCAAkC,CAAC,CAAC;QAC/F,CAAC;IACH,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CAAC,2CAA2C,OAAO,IAAI,EAAE,GAAG,CAAC,CAAC;IAC7E,CAAC;IAED,MAAM,aAAa,GAAG,WAAW,IAAI,cAAc,CAAC;IAEpD,MAAM,qBAAqB,GAA6B;QACtD,KAAK,EAAE,WAAW;QAClB,YAAY,EAAE,KAAK;KACpB,CAAC;IAEF,OAAO;QACL,OAAO;QACP,KAAK,EAAE,aAAa;QACpB,wBAAwB,EAAE,qBAAqB;KAChD,CAAC;AACJ,CAAC;AAED,MAAM,UAAU,uBAAuB,CAAC,OAA0B;IAChE,MAAM,EACJ,UAAU,EACV,cAAc,EACd,gBAAgB,EAChB,oBAAoB,GACrB,GAAG,OAAO,CAAC;IAEZ,MAAM,SAAS,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAChD,MAAM,cAAc,GAAG,oBAAoB,CAAC,SAAS,IAAI,wBAAwB,CAAC;IAElF,IAAI,cAA
c,GAAG,CAAC,CAAC;IACvB,MAAM,aAAa,GAAG,gBAAgB,CAAC,MAAM,CAAC;IAE9C,MAAM,cAAc,GAAmB,gBAAgB,CAAC,GAAG,CAAC,CAAC,OAAO,EAAE,EAAE;QACtE,MAAM,KAAK,GAAG,eAAe,CAAC,OAAO,EAAE,SAAS,EAAE,cAAc,CAAC,CAAC;QAClE,IAAI,KAAK,CAAC,KAAK;YAAE,cAAc,EAAE,CAAC;QAClC,OAAO,KAAK,CAAC;IACf,CAAC,CAAC,CAAC;IAEH,MAAM,aAAa,GAAG,cAAc,KAAK,aAAa,CAAC;IAEvD,MAAM,qBAAqB,GAA6B;QACtD,KAAK,EAAE,aAAa,GAAG,CAAC,CAAC,CAAC,CAAC,cAAc,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;QAC7D,YAAY,EAAE,KAAK;KACpB,CAAC;IAEF,OAAO;QACL,UAAU;QACV,MAAM,EAAE,aAAa;QACrB,cAAc;QACd,iBAAiB,EAAE,EAAE;QACrB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,oBAAoB,EAAE;YACpB,GAAG,oBAAoB;YACvB,SAAS,EAAE,cAAc;SAC1B;QACD,wBAAwB,EAAE,qBAAqB;KAChD,CAAC;AACJ,CAAC","sourcesContent":["import * as rouge from 'js-rouge';\nimport { EvaluationApproachResult } from '../../../types/evaluation';\nimport { EvaluationRequest, EvaluationResult, KeywordMatch } from '../types';\nimport { DEFAULT_ROUGE_PASS_SCORE, ROUGE } from '../constant';\n\n\nfunction evaluateKeyword(keyword: string, candidate: string, rougeThreshold: number): KeywordMatch {\n let rougeLScore = 0;\n\n try {\n const trimmedKeyword = keyword.trim();\n if (trimmedKeyword.length > 0 && candidate.length > 0) {\n const referenceTokens = trimmedKeyword.toLowerCase().split(/\\s+/);\n const candidateTokens = candidate.toLowerCase().split(/\\s+/);\n\n if (referenceTokens.length === 1 && candidateTokens.includes(referenceTokens[0])) {\n rougeLScore = 1;\n } else {\n\n const lcsResult = rouge.lcs(candidateTokens, referenceTokens);\n const lcsLength =\n typeof lcsResult === 'number'\n ? lcsResult\n : (lcsResult?.length ?? 0);\n\n const recall = referenceTokens.length > 0 ? lcsLength / referenceTokens.length : 0;\n const precision = candidateTokens.length > 0 ? lcsLength / candidateTokens.length : 0;\n const denominator = precision + recall;\n\n const f1Score = denominator > 0 ? 
(2 * precision * recall) / denominator : 0;\n rougeLScore = f1Score;\n }\n } else {\n console.warn(`ROUGE-L not computed for keyword \"${keyword}\": Keyword or candidate missing.`);\n }\n } catch (err) {\n console.error(`ROUGE-L computation failed for keyword \"${keyword}\":`, err);\n }\n\n const keywordPassed = rougeLScore >= rougeThreshold;\n\n const keywordApproachResult: EvaluationApproachResult = {\n score: rougeLScore,\n approachUsed: ROUGE,\n };\n\n return {\n keyword,\n found: keywordPassed,\n evaluationApproachResult: keywordApproachResult,\n };\n}\n\nexport function performRougeLEvaluation(request: EvaluationRequest): EvaluationResult { \n const {\n testCaseId,\n actualResponse,\n expectedKeywords,\n evaluationParameters,\n } = request;\n\n const candidate = (actualResponse || '').trim();\n const rougeThreshold = evaluationParameters.threshold ?? DEFAULT_ROUGE_PASS_SCORE;\n\n let keywordsPassed = 0;\n const totalKeywords = expectedKeywords.length;\n\n const keywordMatches: KeywordMatch[] = expectedKeywords.map((keyword) => {\n const match = evaluateKeyword(keyword, candidate, rougeThreshold);\n if (match.found) keywordsPassed++;\n return match;\n });\n\n const overallPassed = keywordsPassed === totalKeywords;\n\n const overallApproachResult: EvaluationApproachResult = {\n score: totalKeywords > 0 ? keywordsPassed / totalKeywords : 1,\n approachUsed: ROUGE,\n };\n\n return {\n testCaseId,\n passed: overallPassed,\n keywordMatches,\n sourceLinkMatches: [],\n timestamp: new Date().toISOString(),\n evaluationParameters: {\n ...evaluationParameters,\n threshold: rougeThreshold,\n },\n evaluationApproachResult: overallApproachResult,\n };\n}"]}
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
import { jest, describe, it, expect } from "@jest/globals";
|
|
2
|
+
import { performRougeLEvaluation } from "./rougeL-evaluator";
|
|
3
|
+
import { DEFAULT_ROUGE_PASS_SCORE, ROUGE } from "../constant";
|
|
4
|
+
describe('performRougeLEvaluation', () => {
|
|
5
|
+
// Helper function to create a base request with optional overrides
|
|
6
|
+
const createRequest = (overrides = {}) => {
|
|
7
|
+
const defaults = {
|
|
8
|
+
testCaseId: 'test-001',
|
|
9
|
+
question: 'Test question',
|
|
10
|
+
expectedKeywords: ['keyword'],
|
|
11
|
+
expectedSourceLinks: [],
|
|
12
|
+
actualResponse: 'response with keyword',
|
|
13
|
+
evaluationParameters: {
|
|
14
|
+
approach: 'rouge',
|
|
15
|
+
threshold: DEFAULT_ROUGE_PASS_SCORE,
|
|
16
|
+
},
|
|
17
|
+
};
|
|
18
|
+
return {
|
|
19
|
+
...defaults,
|
|
20
|
+
...overrides,
|
|
21
|
+
evaluationParameters: {
|
|
22
|
+
...defaults.evaluationParameters,
|
|
23
|
+
...overrides.evaluationParameters,
|
|
24
|
+
},
|
|
25
|
+
};
|
|
26
|
+
};
|
|
27
|
+
describe('basic functionality', () => {
|
|
28
|
+
it('should return a valid EvaluationResult structure', () => {
|
|
29
|
+
const request = createRequest({
|
|
30
|
+
actualResponse: 'AI stands for artificial intelligence',
|
|
31
|
+
expectedKeywords: ['artificial intelligence'],
|
|
32
|
+
});
|
|
33
|
+
const result = performRougeLEvaluation(request);
|
|
34
|
+
expect(result).toMatchObject({
|
|
35
|
+
testCaseId: 'test-001',
|
|
36
|
+
passed: expect.any(Boolean),
|
|
37
|
+
keywordMatches: expect.any(Array),
|
|
38
|
+
sourceLinkMatches: expect.any(Array),
|
|
39
|
+
timestamp: expect.any(String),
|
|
40
|
+
evaluationParameters: expect.any(Object),
|
|
41
|
+
evaluationApproachResult: expect.any(Object),
|
|
42
|
+
});
|
|
43
|
+
});
|
|
44
|
+
it('should use default threshold when not provided', () => {
|
|
45
|
+
const request = createRequest({
|
|
46
|
+
evaluationParameters: { approach: 'rouge' },
|
|
47
|
+
});
|
|
48
|
+
const result = performRougeLEvaluation(request);
|
|
49
|
+
expect(result.evaluationParameters.threshold).toBe(DEFAULT_ROUGE_PASS_SCORE);
|
|
50
|
+
});
|
|
51
|
+
it('should use provided threshold when specified', () => {
|
|
52
|
+
const customThreshold = 0.85;
|
|
53
|
+
const request = createRequest({
|
|
54
|
+
actualResponse: 'response',
|
|
55
|
+
evaluationParameters: {
|
|
56
|
+
approach: 'rouge',
|
|
57
|
+
threshold: customThreshold,
|
|
58
|
+
},
|
|
59
|
+
});
|
|
60
|
+
const result = performRougeLEvaluation(request);
|
|
61
|
+
expect(result.evaluationParameters.threshold).toBe(customThreshold);
|
|
62
|
+
});
|
|
63
|
+
});
|
|
64
|
+
describe('single keyword evaluation', () => {
|
|
65
|
+
it('should pass when single-word keyword is found in candidate', () => {
|
|
66
|
+
const request = createRequest({
|
|
67
|
+
expectedKeywords: ['machine'],
|
|
68
|
+
actualResponse: 'This is about machine learning',
|
|
69
|
+
});
|
|
70
|
+
const result = performRougeLEvaluation(request);
|
|
71
|
+
expect(result).toMatchObject({
|
|
72
|
+
passed: true,
|
|
73
|
+
keywordMatches: [
|
|
74
|
+
{
|
|
75
|
+
keyword: 'machine',
|
|
76
|
+
found: true,
|
|
77
|
+
evaluationApproachResult: {
|
|
78
|
+
score: 1,
|
|
79
|
+
approachUsed: ROUGE,
|
|
80
|
+
},
|
|
81
|
+
},
|
|
82
|
+
],
|
|
83
|
+
});
|
|
84
|
+
});
|
|
85
|
+
it('should fail when single-word keyword is not found in candidate', () => {
|
|
86
|
+
const request = createRequest({
|
|
87
|
+
expectedKeywords: ['quantum'],
|
|
88
|
+
actualResponse: 'This is about machine learning',
|
|
89
|
+
});
|
|
90
|
+
const result = performRougeLEvaluation(request);
|
|
91
|
+
expect(result).toMatchObject({
|
|
92
|
+
passed: false,
|
|
93
|
+
keywordMatches: [
|
|
94
|
+
{
|
|
95
|
+
found: false,
|
|
96
|
+
evaluationApproachResult: {
|
|
97
|
+
score: 0,
|
|
98
|
+
},
|
|
99
|
+
},
|
|
100
|
+
],
|
|
101
|
+
});
|
|
102
|
+
});
|
|
103
|
+
it('should calculate ROUGE-L score for multi-word keywords', () => {
|
|
104
|
+
const request = createRequest({
|
|
105
|
+
expectedKeywords: ['machine learning'],
|
|
106
|
+
actualResponse: 'AI and machine learning are related',
|
|
107
|
+
evaluationParameters: { approach: 'rouge', threshold: 0.5 },
|
|
108
|
+
});
|
|
109
|
+
const result = performRougeLEvaluation(request);
|
|
110
|
+
expect(result).toMatchObject({
|
|
111
|
+
keywordMatches: [
|
|
112
|
+
{
|
|
113
|
+
found: true,
|
|
114
|
+
evaluationApproachResult: {
|
|
115
|
+
score: expect.closeTo(0.5),
|
|
116
|
+
approachUsed: ROUGE,
|
|
117
|
+
},
|
|
118
|
+
},
|
|
119
|
+
],
|
|
120
|
+
});
|
|
121
|
+
});
|
|
122
|
+
it('should handle LCS result as object with length property', () => {
|
|
123
|
+
const request = createRequest({
|
|
124
|
+
expectedKeywords: ['deep learning'],
|
|
125
|
+
actualResponse: 'Deep learning is a subset of machine learning',
|
|
126
|
+
});
|
|
127
|
+
const result = performRougeLEvaluation(request);
|
|
128
|
+
expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThan(0);
|
|
129
|
+
});
|
|
130
|
+
});
|
|
131
|
+
describe('multiple keywords evaluation', () => {
|
|
132
|
+
it('should pass when all keywords meet threshold', () => {
|
|
133
|
+
const request = createRequest({
|
|
134
|
+
expectedKeywords: ['machine', 'learning', 'AI'],
|
|
135
|
+
actualResponse: 'Machine learning and AI are transformative technologies',
|
|
136
|
+
});
|
|
137
|
+
const result = performRougeLEvaluation(request);
|
|
138
|
+
expect(result).toMatchObject({
|
|
139
|
+
passed: true,
|
|
140
|
+
evaluationApproachResult: {
|
|
141
|
+
score: 1,
|
|
142
|
+
approachUsed: ROUGE,
|
|
143
|
+
},
|
|
144
|
+
});
|
|
145
|
+
expect(result.keywordMatches).toHaveLength(3);
|
|
146
|
+
expect(result.keywordMatches.every(match => match.found)).toBe(true);
|
|
147
|
+
});
|
|
148
|
+
it('should fail when not all keywords meet threshold', () => {
|
|
149
|
+
const request = createRequest({
|
|
150
|
+
expectedKeywords: ['machine', 'quantum', 'AI'],
|
|
151
|
+
actualResponse: 'Machine learning and AI are transformative',
|
|
152
|
+
});
|
|
153
|
+
const result = performRougeLEvaluation(request);
|
|
154
|
+
expect(result.passed).toBe(false);
|
|
155
|
+
expect(result.keywordMatches).toHaveLength(3);
|
|
156
|
+
expect(result.keywordMatches.filter(match => match.found)).toHaveLength(2);
|
|
157
|
+
expect(result.evaluationApproachResult.score).toBeCloseTo(2 / 3);
|
|
158
|
+
});
|
|
159
|
+
it('should calculate overall score as ratio of passed keywords', () => {
|
|
160
|
+
const request = createRequest({
|
|
161
|
+
expectedKeywords: ['alpha', 'beta', 'gamma', 'delta'],
|
|
162
|
+
actualResponse: 'alpha and beta are here',
|
|
163
|
+
});
|
|
164
|
+
const result = performRougeLEvaluation(request);
|
|
165
|
+
expect(result).toMatchObject({
|
|
166
|
+
passed: false,
|
|
167
|
+
evaluationApproachResult: {
|
|
168
|
+
score: 0.5, // 2 out of 4
|
|
169
|
+
},
|
|
170
|
+
});
|
|
171
|
+
});
|
|
172
|
+
});
|
|
173
|
+
describe('edge cases', () => {
|
|
174
|
+
it('should handle empty keywords array', () => {
|
|
175
|
+
const request = createRequest({
|
|
176
|
+
expectedKeywords: [],
|
|
177
|
+
actualResponse: 'Some response',
|
|
178
|
+
});
|
|
179
|
+
const result = performRougeLEvaluation(request);
|
|
180
|
+
expect(result).toMatchObject({
|
|
181
|
+
passed: true,
|
|
182
|
+
keywordMatches: [],
|
|
183
|
+
evaluationApproachResult: {
|
|
184
|
+
score: 1,
|
|
185
|
+
},
|
|
186
|
+
});
|
|
187
|
+
});
|
|
188
|
+
it('should handle empty actual response', () => {
|
|
189
|
+
const request = createRequest({
|
|
190
|
+
expectedKeywords: ['machine'],
|
|
191
|
+
actualResponse: '',
|
|
192
|
+
});
|
|
193
|
+
// Suppress expected warning
|
|
194
|
+
const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation(() => { });
|
|
195
|
+
const result = performRougeLEvaluation(request);
|
|
196
|
+
expect(result).toMatchObject({
|
|
197
|
+
passed: false,
|
|
198
|
+
keywordMatches: [
|
|
199
|
+
{
|
|
200
|
+
found: false,
|
|
201
|
+
evaluationApproachResult: {
|
|
202
|
+
score: 0,
|
|
203
|
+
},
|
|
204
|
+
},
|
|
205
|
+
],
|
|
206
|
+
});
|
|
207
|
+
consoleWarnSpy.mockRestore();
|
|
208
|
+
});
|
|
209
|
+
it('should handle whitespace-only keyword', () => {
|
|
210
|
+
const request = createRequest({
|
|
211
|
+
expectedKeywords: [' '],
|
|
212
|
+
actualResponse: 'Some response',
|
|
213
|
+
});
|
|
214
|
+
// Suppress expected warning
|
|
215
|
+
const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation(() => { });
|
|
216
|
+
const result = performRougeLEvaluation(request);
|
|
217
|
+
expect(result.keywordMatches[0]).toMatchObject({
|
|
218
|
+
found: false,
|
|
219
|
+
evaluationApproachResult: {
|
|
220
|
+
score: 0,
|
|
221
|
+
},
|
|
222
|
+
});
|
|
223
|
+
consoleWarnSpy.mockRestore();
|
|
224
|
+
});
|
|
225
|
+
// Edge case: a null response must not throw; the evaluation simply fails and
// the expected keyword is reported as not found.
it('should handle null/undefined actualResponse gracefully', () => {
    const request = createRequest({
        expectedKeywords: ['machine'],
        actualResponse: null,
    });
    // Suppress expected warning
    const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation(() => { });
    try {
        const result = performRougeLEvaluation(request);
        expect(result).toMatchObject({
            passed: false,
            keywordMatches: [
                {
                    found: false,
                },
            ],
        });
    }
    finally {
        // Restore console.warn even when an expectation above throws, so a
        // failure here cannot leave the mock installed for later tests.
        consoleWarnSpy.mockRestore();
    }
});
|
|
243
|
+
});
|
|
244
|
+
// Keyword matching must ignore letter case on both the keyword and the response.
describe('case insensitivity', () => {
    it('should perform case-insensitive matching', () => {
        // Upper-case keyword against a lower-case response.
        const { keywordMatches } = performRougeLEvaluation(createRequest({
            expectedKeywords: ['MACHINE'],
            actualResponse: 'machine learning is important',
        }));
        const expectedMatch = {
            found: true,
            evaluationApproachResult: {
                score: 1,
            },
        };
        expect(keywordMatches[0]).toMatchObject(expectedMatch);
    });
    it('should match keywords with mixed case', () => {
        // Lower threshold for real ROUGE-L behavior
        const lowThreshold = { approach: 'rouge', threshold: 0.4 };
        const evaluation = performRougeLEvaluation(createRequest({
            expectedKeywords: ['MaChInE LeArNiNg'],
            actualResponse: 'MACHINE LEARNING is a field of AI',
            evaluationParameters: lowThreshold,
        }));
        expect(evaluation.keywordMatches[0]).toMatchObject({ found: true });
        const keywordScore = evaluation.keywordMatches[0].evaluationApproachResult.score;
        expect(keywordScore).toBeGreaterThanOrEqual(0.4);
    });
});
|
|
271
|
+
// Pins the numeric ROUGE-L F-score reported for multi-word keywords.
describe('ROUGE-L score calculation', () => {
    // Harmonic mean of precision and recall, used to derive the expected scores.
    const fMeasure = (precision, recall) => (2 * precision * recall) / (precision + recall);
    it('should calculate correct F-score from precision and recall', () => {
        const evaluation = performRougeLEvaluation(createRequest({
            expectedKeywords: ['neural network'],
            actualResponse: 'A neural network processes data',
        }));
        // Both reference words ('neural', 'network') appear in the candidate:
        //   LCS length = 2, reference length = 2, candidate length = 5
        //   recall = 2/2 = 1.0, precision = 2/5 = 0.4
        //   F-score = 2 * (1.0 * 0.4) / (1.0 + 0.4) ≈ 0.571
        const expectedFScore = fMeasure(0.4, 1.0);
        const actualScore = evaluation.keywordMatches[0].evaluationApproachResult.score;
        expect(actualScore).toBeCloseTo(expectedFScore, 2);
    });
    it('should handle partial matches', () => {
        const evaluation = performRougeLEvaluation(createRequest({
            expectedKeywords: ['artificial intelligence systems'],
            actualResponse: 'Artificial intelligence is growing',
            evaluationParameters: { approach: 'rouge', threshold: 0.5 },
        }));
        // 'artificial' and 'intelligence' are found, 'systems' is not:
        //   LCS length = 2, reference length = 3, candidate length = 4
        //   recall = 2/3, precision = 2/4 = 0.5
        const expectedFScore = fMeasure(2 / 4, 2 / 3);
        const actualScore = evaluation.keywordMatches[0].evaluationApproachResult.score;
        expect(actualScore).toBeCloseTo(expectedFScore, 2);
    });
});
|
|
301
|
+
// The result must carry a timestamp that round-trips through Date#toISOString.
describe('timestamp', () => {
    it('should include a valid ISO timestamp', () => {
        const { timestamp } = performRougeLEvaluation(createRequest({
            expectedKeywords: ['test'],
            actualResponse: 'test response',
        }));
        expect(timestamp).toBeDefined();
        // Re-serialising the parsed value must reproduce the original string exactly.
        const roundTripped = new Date(timestamp).toISOString();
        expect(roundTripped).toBe(timestamp);
    });
});
|
|
312
|
+
});
|
|
313
|
+
//# sourceMappingURL=rougeL-evaluator.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rougeL-evaluator.test.js","sourceRoot":"","sources":["../../../../src/lib/evaluation/evaluators/rougeL-evaluator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAC3D,OAAO,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAC;AAE7D,OAAO,EAAE,wBAAwB,EAAE,KAAK,EAAE,MAAM,aAAa,CAAC;AAE9D,QAAQ,CAAC,yBAAyB,EAAE,GAAG,EAAE;IACvC,mEAAmE;IACnE,MAAM,aAAa,GAAG,CAAC,YAAwC,EAAE,EAAqB,EAAE;QACtF,MAAM,QAAQ,GAAsB;YAClC,UAAU,EAAE,UAAU;YACtB,QAAQ,EAAE,eAAe;YACzB,gBAAgB,EAAE,CAAC,SAAS,CAAC;YAC7B,mBAAmB,EAAE,EAAE;YACvB,cAAc,EAAE,uBAAuB;YACvC,oBAAoB,EAAE;gBACpB,QAAQ,EAAE,OAAO;gBACjB,SAAS,EAAE,wBAAwB;aACpC;SACF,CAAC;QAEF,OAAO;YACL,GAAG,QAAQ;YACX,GAAG,SAAS;YACZ,oBAAoB,EAAE;gBACpB,GAAG,QAAQ,CAAC,oBAAoB;gBAChC,GAAG,SAAS,CAAC,oBAAoB;aAClC;SACF,CAAC;IACJ,CAAC,CAAC;IAEF,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACnC,EAAE,CAAC,kDAAkD,EAAE,GAAG,EAAE;YAC1D,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,cAAc,EAAE,uCAAuC;gBACvD,gBAAgB,EAAE,CAAC,yBAAyB,CAAC;aAC9C,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,UAAU,EAAE,UAAU;gBACtB,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC;gBAC3B,cAAc,EAAE,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC;gBACjC,iBAAiB,EAAE,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC;gBACpC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;gBAC7B,oBAAoB,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;gBACxC,wBAAwB,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;aAC7C,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,GAAG,EAAE;YACxD,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,oBAAoB,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE;aAC5C,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QAC/E,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8CAA8C,EAAE,GAAG,EAAE;YACtD,MAAM,eAAe,GAAG,IAAI,CAAC;YAC7B,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,cAAc,EAAE,UAAU;gBAC1B,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,OAAO;oBACjB,SAAS,EAAE,eAAe;iBAC3B;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC
,IAAI,CAAC,eAAe,CAAC,CAAC;QACtE,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,2BAA2B,EAAE,GAAG,EAAE;QACzC,EAAE,CAAC,4DAA4D,EAAE,GAAG,EAAE;YACpE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,SAAS,CAAC;gBAC7B,cAAc,EAAE,gCAAgC;aACjD,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,IAAI;gBACZ,cAAc,EAAE;oBACd;wBACE,OAAO,EAAE,SAAS;wBAClB,KAAK,EAAE,IAAI;wBACX,wBAAwB,EAAE;4BACxB,KAAK,EAAE,CAAC;4BACR,YAAY,EAAE,KAAK;yBACpB;qBACF;iBACF;aACF,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gEAAgE,EAAE,GAAG,EAAE;YACxE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,SAAS,CAAC;gBAC7B,cAAc,EAAE,gCAAgC;aACjD,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE;oBACd;wBACE,KAAK,EAAE,KAAK;wBACZ,wBAAwB,EAAE;4BACxB,KAAK,EAAE,CAAC;yBACT;qBACF;iBACF;aACF,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wDAAwD,EAAE,GAAG,EAAE;YAChE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,kBAAkB,CAAC;gBACtC,cAAc,EAAE,qCAAqC;gBACrD,oBAAoB,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE;aAC5D,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,cAAc,EAAE;oBACd;wBACE,KAAK,EAAE,IAAI;wBACX,wBAAwB,EAAE;4BACxB,KAAK,EAAE,MAAM,CAAC,OAAO,CAAC,GAAG,CAAC;4BAC1B,YAAY,EAAE,KAAK;yBACpB;qBACF;iBACF;aACF,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,yDAAyD,EAAE,GAAG,EAAE;YACjE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,eAAe,CAAC;gBACnC,cAAc,EAAE,+CAA+C;aAChE,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QACrF,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,8BAA8B,EAAE,GAAG,EAAE;QAC5C,EAAE,CAAC,8CAA8C,EAAE,GAAG,EAAE;YACtD,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,SAAS,EAAE,UAAU,EAAE,IAAI,CAAC;gBAC/C,cAAc,EAAE,yDAAyD;aAC1E,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAA
C,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,IAAI;gBACZ,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,KAAK;iBACpB;aACF,CAAC,CAAC;YACH,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAC9C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,GAAG,EAAE;YAC1D,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,SAAS,EAAE,SAAS,EAAE,IAAI,CAAC;gBAC9C,cAAc,EAAE,4CAA4C;aAC7D,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAC9C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAC3E,MAAM,CAAC,MAAM,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,WAAW,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;QACnE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,GAAG,EAAE;YACpE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,OAAO,CAAC;gBACrD,cAAc,EAAE,yBAAyB;aAC1C,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,KAAK;gBACb,wBAAwB,EAAE;oBACxB,KAAK,EAAE,GAAG,EAAE,aAAa;iBAC1B;aACF,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;QAC1B,EAAE,CAAC,oCAAoC,EAAE,GAAG,EAAE;YAC5C,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,EAAE;gBACpB,cAAc,EAAE,eAAe;aAChC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,IAAI;gBACZ,cAAc,EAAE,EAAE;gBAClB,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;iBACT;aACF,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qCAAqC,EAAE,GAAG,EAAE;YAC7C,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,SAAS,CAAC;gBAC7B,cAAc,EAAE,EAAE;aACnB,CAAC,CAAC;YAEH,4BAA4B;YAC5B,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAEhF,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAA
M,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE;oBACd;wBACE,KAAK,EAAE,KAAK;wBACZ,wBAAwB,EAAE;4BACxB,KAAK,EAAE,CAAC;yBACT;qBACF;iBACF;aACF,CAAC,CAAC;YAEH,cAAc,CAAC,WAAW,EAAE,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,KAAK,CAAC;gBACzB,cAAc,EAAE,eAAe;aAChC,CAAC,CAAC;YAEH,4BAA4B;YAC5B,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAEhF,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;gBAC7C,KAAK,EAAE,KAAK;gBACZ,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;iBACT;aACF,CAAC,CAAC;YAEH,cAAc,CAAC,WAAW,EAAE,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wDAAwD,EAAE,GAAG,EAAE;YAChE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,SAAS,CAAC;gBAC7B,cAAc,EAAE,IAAyB;aAC1C,CAAC,CAAC;YAEH,4BAA4B;YAC5B,MAAM,cAAc,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC,CAAC,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAEhF,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE;oBACd;wBACE,KAAK,EAAE,KAAK;qBACb;iBACF;aACF,CAAC,CAAC;YAEH,cAAc,CAAC,WAAW,EAAE,CAAC;QAC/B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,EAAE,CAAC,0CAA0C,EAAE,GAAG,EAAE;YAClD,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,SAAS,CAAC;gBAC7B,cAAc,EAAE,+BAA+B;aAChD,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;gBAC7C,KAAK,EAAE,IAAI;gBACX,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;iBACT;aACF,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,GAAG,EAAE;YAC/C,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,kBAAkB,CAAC;gBACtC,cAAc,EAAE,mCAAmC;gBACnD,oBAAoB,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE,EAAE,4CAA4C;aAC1G,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;gBAC7C,KAAK,EAAE,IAAI;aACZ,CA
AC,CAAC;YACH,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC;QAC9F,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,2BAA2B,EAAE,GAAG,EAAE;QACzC,EAAE,CAAC,4DAA4D,EAAE,GAAG,EAAE;YACpE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,gBAAgB,CAAC;gBACpC,cAAc,EAAE,iCAAiC;aAClD,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,mEAAmE;YACnE,6DAA6D;YAC7D,4CAA4C;YAC5C,kDAAkD;YAClD,MAAM,cAAc,GAAG,CAAC,CAAC,GAAG,GAAG,GAAG,GAAG,CAAC,GAAG,CAAC,GAAG,GAAG,GAAG,CAAC,CAAC;YACrD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,WAAW,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC;QACjG,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,+BAA+B,EAAE,GAAG,EAAE;YACvC,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,iCAAiC,CAAC;gBACrD,cAAc,EAAE,oCAAoC;gBACpD,oBAAoB,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,SAAS,EAAE,GAAG,EAAE;aAC5D,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,mFAAmF;YACnF,6DAA6D;YAC7D,sCAAsC;YACtC,MAAM,MAAM,GAAG,CAAC,GAAG,CAAC,CAAC;YACrB,MAAM,SAAS,GAAG,CAAC,GAAG,CAAC,CAAC;YACxB,MAAM,cAAc,GAAG,CAAC,CAAC,GAAG,SAAS,GAAG,MAAM,CAAC,GAAG,CAAC,SAAS,GAAG,MAAM,CAAC,CAAC;YAEvE,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,WAAW,CAAC,cAAc,EAAE,CAAC,CAAC,CAAC;QACjG,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,WAAW,EAAE,GAAG,EAAE;QACzB,EAAE,CAAC,sCAAsC,EAAE,GAAG,EAAE;YAC9C,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,gBAAgB,EAAE,CAAC,MAAM,CAAC;gBAC1B,cAAc,EAAE,eAAe;aAChC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEhD,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE,CAAC;YACvC,MAAM,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,SAAU,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC3E,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC","sourcesContent":["import { jest, describe, it, expect } from '@jest/globals';\nimport { performRougeLEvaluation } from './rougeL-evaluator';\nimport { EvaluationRequest } from '../types';\nimport { DEFAULT_ROUGE_PASS_SCORE, ROUGE } from 
'../constant';\n\ndescribe('performRougeLEvaluation', () => {\n // Helper function to create a base request with optional overrides\n const createRequest = (overrides: Partial<EvaluationRequest> = {}): EvaluationRequest => {\n const defaults: EvaluationRequest = {\n testCaseId: 'test-001',\n question: 'Test question',\n expectedKeywords: ['keyword'],\n expectedSourceLinks: [],\n actualResponse: 'response with keyword',\n evaluationParameters: {\n approach: 'rouge',\n threshold: DEFAULT_ROUGE_PASS_SCORE,\n },\n };\n\n return {\n ...defaults,\n ...overrides,\n evaluationParameters: {\n ...defaults.evaluationParameters,\n ...overrides.evaluationParameters,\n },\n };\n };\n\n describe('basic functionality', () => {\n it('should return a valid EvaluationResult structure', () => {\n const request = createRequest({\n actualResponse: 'AI stands for artificial intelligence',\n expectedKeywords: ['artificial intelligence'],\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result).toMatchObject({\n testCaseId: 'test-001',\n passed: expect.any(Boolean),\n keywordMatches: expect.any(Array),\n sourceLinkMatches: expect.any(Array),\n timestamp: expect.any(String),\n evaluationParameters: expect.any(Object),\n evaluationApproachResult: expect.any(Object),\n });\n });\n\n it('should use default threshold when not provided', () => {\n const request = createRequest({\n evaluationParameters: { approach: 'rouge' },\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result.evaluationParameters.threshold).toBe(DEFAULT_ROUGE_PASS_SCORE);\n });\n\n it('should use provided threshold when specified', () => {\n const customThreshold = 0.85;\n const request = createRequest({\n actualResponse: 'response',\n evaluationParameters: {\n approach: 'rouge',\n threshold: customThreshold,\n },\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result.evaluationParameters.threshold).toBe(customThreshold);\n });\n });\n\n describe('single keyword 
evaluation', () => {\n it('should pass when single-word keyword is found in candidate', () => {\n const request = createRequest({\n expectedKeywords: ['machine'],\n actualResponse: 'This is about machine learning',\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result).toMatchObject({\n passed: true,\n keywordMatches: [\n {\n keyword: 'machine',\n found: true,\n evaluationApproachResult: {\n score: 1,\n approachUsed: ROUGE,\n },\n },\n ],\n });\n });\n\n it('should fail when single-word keyword is not found in candidate', () => {\n const request = createRequest({\n expectedKeywords: ['quantum'],\n actualResponse: 'This is about machine learning',\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result).toMatchObject({\n passed: false,\n keywordMatches: [\n {\n found: false,\n evaluationApproachResult: {\n score: 0,\n },\n },\n ],\n });\n });\n\n it('should calculate ROUGE-L score for multi-word keywords', () => {\n const request = createRequest({\n expectedKeywords: ['machine learning'],\n actualResponse: 'AI and machine learning are related',\n evaluationParameters: { approach: 'rouge', threshold: 0.5 },\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result).toMatchObject({\n keywordMatches: [\n {\n found: true,\n evaluationApproachResult: {\n score: expect.closeTo(0.5),\n approachUsed: ROUGE,\n },\n },\n ],\n });\n });\n\n it('should handle LCS result as object with length property', () => {\n const request = createRequest({\n expectedKeywords: ['deep learning'],\n actualResponse: 'Deep learning is a subset of machine learning',\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThan(0);\n });\n });\n\n describe('multiple keywords evaluation', () => {\n it('should pass when all keywords meet threshold', () => {\n const request = createRequest({\n expectedKeywords: ['machine', 'learning', 'AI'],\n actualResponse: 
'Machine learning and AI are transformative technologies',\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result).toMatchObject({\n passed: true,\n evaluationApproachResult: {\n score: 1,\n approachUsed: ROUGE,\n },\n });\n expect(result.keywordMatches).toHaveLength(3);\n expect(result.keywordMatches.every(match => match.found)).toBe(true);\n });\n\n it('should fail when not all keywords meet threshold', () => {\n const request = createRequest({\n expectedKeywords: ['machine', 'quantum', 'AI'],\n actualResponse: 'Machine learning and AI are transformative',\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches).toHaveLength(3);\n expect(result.keywordMatches.filter(match => match.found)).toHaveLength(2);\n expect(result.evaluationApproachResult.score).toBeCloseTo(2 / 3);\n });\n\n it('should calculate overall score as ratio of passed keywords', () => {\n const request = createRequest({\n expectedKeywords: ['alpha', 'beta', 'gamma', 'delta'],\n actualResponse: 'alpha and beta are here',\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result).toMatchObject({\n passed: false,\n evaluationApproachResult: {\n score: 0.5, // 2 out of 4\n },\n });\n });\n });\n\n describe('edge cases', () => {\n it('should handle empty keywords array', () => {\n const request = createRequest({\n expectedKeywords: [],\n actualResponse: 'Some response',\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result).toMatchObject({\n passed: true,\n keywordMatches: [],\n evaluationApproachResult: {\n score: 1,\n },\n });\n });\n\n it('should handle empty actual response', () => {\n const request = createRequest({\n expectedKeywords: ['machine'],\n actualResponse: '',\n });\n\n // Suppress expected warning\n const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation(() => {});\n\n const result = performRougeLEvaluation(request);\n\n 
expect(result).toMatchObject({\n passed: false,\n keywordMatches: [\n {\n found: false,\n evaluationApproachResult: {\n score: 0,\n },\n },\n ],\n });\n\n consoleWarnSpy.mockRestore();\n });\n\n it('should handle whitespace-only keyword', () => {\n const request = createRequest({\n expectedKeywords: [' '],\n actualResponse: 'Some response',\n });\n\n // Suppress expected warning\n const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation(() => {});\n\n const result = performRougeLEvaluation(request);\n\n expect(result.keywordMatches[0]).toMatchObject({\n found: false,\n evaluationApproachResult: {\n score: 0,\n },\n });\n\n consoleWarnSpy.mockRestore();\n });\n\n it('should handle null/undefined actualResponse gracefully', () => {\n const request = createRequest({\n expectedKeywords: ['machine'],\n actualResponse: null as unknown as string,\n });\n\n // Suppress expected warning\n const consoleWarnSpy = jest.spyOn(console, 'warn').mockImplementation(() => {});\n\n const result = performRougeLEvaluation(request);\n\n expect(result).toMatchObject({\n passed: false,\n keywordMatches: [\n {\n found: false,\n },\n ],\n });\n\n consoleWarnSpy.mockRestore();\n });\n });\n\n describe('case insensitivity', () => {\n it('should perform case-insensitive matching', () => {\n const request = createRequest({\n expectedKeywords: ['MACHINE'],\n actualResponse: 'machine learning is important',\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result.keywordMatches[0]).toMatchObject({\n found: true,\n evaluationApproachResult: {\n score: 1,\n },\n });\n });\n\n it('should match keywords with mixed case', () => {\n const request = createRequest({\n expectedKeywords: ['MaChInE LeArNiNg'],\n actualResponse: 'MACHINE LEARNING is a field of AI',\n evaluationParameters: { approach: 'rouge', threshold: 0.4 }, // Lower threshold for real ROUGE-L behavior\n });\n\n const result = performRougeLEvaluation(request);\n\n 
expect(result.keywordMatches[0]).toMatchObject({\n found: true,\n });\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThanOrEqual(0.4);\n });\n });\n\n describe('ROUGE-L score calculation', () => {\n it('should calculate correct F-score from precision and recall', () => {\n const request = createRequest({\n expectedKeywords: ['neural network'],\n actualResponse: 'A neural network processes data',\n });\n\n const result = performRougeLEvaluation(request);\n\n // With actual ROUGE-L: both words 'neural' and 'network' are found\n // LCS length = 2, reference length = 2, candidate length = 5\n // recall = 2/2 = 1.0, precision = 2/5 = 0.4\n // F-score = 2 * (1.0 * 0.4) / (1.0 + 0.4) ≈ 0.571\n const expectedFScore = (2 * 1.0 * 0.4) / (1.0 + 0.4);\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBeCloseTo(expectedFScore, 2);\n });\n\n it('should handle partial matches', () => {\n const request = createRequest({\n expectedKeywords: ['artificial intelligence systems'],\n actualResponse: 'Artificial intelligence is growing',\n evaluationParameters: { approach: 'rouge', threshold: 0.5 },\n });\n\n const result = performRougeLEvaluation(request);\n\n // With actual ROUGE-L: 'artificial' and 'intelligence' are found, 'systems' is not\n // LCS length = 2, reference length = 3, candidate length = 4\n // recall = 2/3, precision = 2/4 = 0.5\n const recall = 2 / 3;\n const precision = 2 / 4;\n const expectedFScore = (2 * precision * recall) / (precision + recall);\n\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBeCloseTo(expectedFScore, 2);\n });\n });\n\n describe('timestamp', () => {\n it('should include a valid ISO timestamp', () => {\n const request = createRequest({\n expectedKeywords: ['test'],\n actualResponse: 'test response',\n });\n\n const result = performRougeLEvaluation(request);\n\n expect(result.timestamp).toBeDefined();\n expect(new Date(result.timestamp!).toISOString()).toBe(result.timestamp);\n 
});\n });\n});\n"]}
|
|
@@ -0,0 +1,63 @@
|
|
|
1
|
+
import { loadSemanticModel } from "./model-loader";
|
|
2
|
+
import { evaluateKeywordsSemantically } from "./evaluate-keywords";
|
|
3
|
+
import { evaluateSourceLinks } from "../../evaluators/exact/exact";
|
|
4
|
+
import { DEFAULT_SEMANTIC_PASS_SCORE } from "../../constant";
|
|
5
|
+
import { EvaluationApproach } from "../../constants/evaluation-approach";
|
|
6
|
+
/**
 * Evaluates a response by scoring expected keywords with the semantic model
 * and checking expected source links, then combining both into one result.
 */
export class SemanticEvaluator {
    // TODO(LLM-39): Refactor SemanticEvaluator into a singleton pattern.
    /** Model handle shared by all instances; stays null until a load succeeds. */
    static extractor = null;
    /**
     * Loads the semantic model once and stores it on the class.
     * Returns immediately when a model is already stored.
     *
     * @throws Rethrows whatever loadSemanticModel rejects with, after logging it.
     */
    async initialize() {
        if (SemanticEvaluator.extractor)
            return;
        try {
            SemanticEvaluator.extractor = await loadSemanticModel();
        }
        catch (error) {
            console.error('Failed to load semantic evaluation model:', error);
            throw error;
        }
    }
    /**
     * Runs the semantic evaluation for one request.
     *
     * Never rejects: any failure (including a model load failure) is logged
     * and reported as a failed result with empty match lists and score 0.
     *
     * @param request Object providing testCaseId, actualResponse,
     *   expectedKeywords and expectedSourceLinks.
     * @returns The evaluation result: pass flag, per-item matches, the
     *   parameters used, the overall score and an ISO timestamp.
     */
    async performEvaluation(request) {
        // NOTE(review): the threshold is always DEFAULT_SEMANTIC_PASS_SCORE; any
        // threshold carried on the request itself is not consulted - confirm intended.
        try {
            await this.initialize();
            const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, request.expectedKeywords, DEFAULT_SEMANTIC_PASS_SCORE);
            const sourceLinkMatches = evaluateSourceLinks(request.expectedSourceLinks, request.actualResponse);
            const totalItems = keywordMatches.length + sourceLinkMatches.length;
            // Overall score is the mean of every keyword and source-link score.
            const keywordScore = keywordMatches.reduce((acc, curr) => acc + curr.evaluationApproachResult.score, 0);
            const sourceLinkScore = sourceLinkMatches.reduce((acc, curr) => acc + curr.evaluationApproachResult.score, 0);
            const totalScore = keywordScore + sourceLinkScore;
            // With nothing to evaluate, `passed` below is vacuously true, so the
            // score is 1 rather than 0 to keep the two fields consistent (this
            // also avoids dividing by zero).
            const overallScore = totalItems > 0 ? totalScore / totalItems : 1;
            const passed = keywordMatches.every(match => match.found) && sourceLinkMatches.every(match => match.found);
            const evaluationParameters = {
                approach: EvaluationApproach.SEMANTIC,
                threshold: DEFAULT_SEMANTIC_PASS_SCORE,
            };
            return {
                testCaseId: request.testCaseId,
                passed,
                keywordMatches,
                sourceLinkMatches,
                evaluationParameters,
                evaluationApproachResult: { score: overallScore, approachUsed: EvaluationApproach.SEMANTIC },
                timestamp: new Date().toISOString(),
            };
        }
        catch (error) {
            // Deliberate best-effort: report the failure as a failed evaluation
            // instead of propagating it to the caller.
            console.error('Failed to perform semantic evaluation:', error);
            return {
                testCaseId: request.testCaseId,
                passed: false,
                keywordMatches: [],
                sourceLinkMatches: [],
                evaluationParameters: {
                    approach: EvaluationApproach.SEMANTIC,
                    threshold: DEFAULT_SEMANTIC_PASS_SCORE,
                },
                evaluationApproachResult: { score: 0, approachUsed: EvaluationApproach.SEMANTIC },
                timestamp: new Date().toISOString(),
            };
        }
    }
}
|
|
63
|
+
//# sourceMappingURL=SemanticEvaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"SemanticEvaluator.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/SemanticEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,gBAAgB,CAAC;AACnD,OAAO,EAAE,4BAA4B,EAAE,MAAM,qBAAqB,CAAC;AAEnE,OAAO,EAAE,mBAAmB,EAAE,MAAM,8BAA8B,CAAC;AAEnE,OAAO,EAAE,2BAA2B,EAAE,MAAM,gBAAgB,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAE,MAAM,qCAAqC,CAAC;AAEzE,MAAM,OAAO,iBAAiB;IAC5B,qEAAqE;IAC7D,MAAM,CAAC,SAAS,GAA8B,IAAI,CAAC;IAE3D,KAAK,CAAC,UAAU;QACd,IAAI,iBAAiB,CAAC,SAAS;YAAE,OAAO;QACxC,IAAI,CAAC;YACH,iBAAiB,CAAC,SAAS,GAAG,MAAM,iBAAiB,EAAE,CAAC;QAC1D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,2CAA2C,EAAE,KAAK,CAAC,CAAC;YAClE,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,CAAC,iBAAiB,CAAC,OAA0B;QAChD,IAAI,CAAC;YACH,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;YACxB,MAAM,cAAc,GAAG,MAAM,4BAA4B,CAAC,iBAAiB,CAAC,SAAS,EAAE,OAAO,CAAC,cAAc,EAAE,OAAO,CAAC,gBAAgB,EAAE,2BAA2B,CAAC,CAAC;YACtK,MAAM,iBAAiB,GAAG,mBAAmB,CAAC,OAAO,CAAC,mBAAmB,EAAE,OAAO,CAAC,cAAc,CAAC,CAAC;YACnG,MAAM,UAAU,GAAG,cAAc,CAAC,MAAM,GAAG,iBAAiB,CAAC,MAAM,CAAC;YACpE,4FAA4F;YAC5F,MAAM,YAAY,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,wBAAwB,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;YACxG,MAAM,eAAe,GAAG,iBAAiB,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,wBAAwB,CAAC,KAAK,EAAE,CAAC,CAAC,CAAC;YAC9G,MAAM,UAAU,GAAG,YAAY,GAAG,eAAe,CAAC;YAClD,MAAM,YAAY,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,4BAA4B;YAC/F,MAAM,MAAM,GAAG,cAAc,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,iBAAiB,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAE3G,MAAM,oBAAoB,GAAG;gBAC3B,QAAQ,EAAE,kBAAkB,CAAC,QAAQ;gBACrC,SAAS,EAAE,2BAA2B;aACf,CAAC;YAE1B,OAAO;gBACL,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM;gBACN,cAAc;gBACd,iBAAiB;gBACjB,oBAAoB;gBACpB,wBAAwB,EAAE,EAAE,KAAK,EAAE,YAAY,EAAE,YAAY,EAAE,kBAAkB,CAAC,QAAQ,EAAE;gBAC5F,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YA
Cf,OAAO,CAAC,KAAK,CAAC,wCAAwC,EAAE,KAAK,CAAC,CAAC;YAC/D,OAAO;gBACL,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE,EAAE;gBAClB,iBAAiB,EAAE,EAAE;gBACrB,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,QAAQ;oBACrC,SAAS,EAAE,2BAA2B;iBACvC;gBACD,wBAAwB,EAAE,EAAE,KAAK,EAAE,CAAC,EAAE,YAAY,EAAE,kBAAkB,CAAC,QAAQ,EAAE;gBACjF,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC;QACJ,CAAC;IACH,CAAC","sourcesContent":["import { EvaluationResult, EvaluationRequest } from '../../types';\nimport { loadSemanticModel } from './model-loader';\nimport { evaluateKeywordsSemantically } from './evaluate-keywords';\nimport { FeatureExtractionPipeline } from '@xenova/transformers';\nimport { evaluateSourceLinks } from '../../evaluators/exact/exact';\nimport { EvaluationParameters } from '../../../../types/evaluation';\nimport { DEFAULT_SEMANTIC_PASS_SCORE } from '../../constant';\nimport { EvaluationApproach } from '../../constants/evaluation-approach';\n\nexport class SemanticEvaluator {\n // TODO(LLM-39): Refactor SemanticEvaluator into a singleton pattern.\n private static extractor: FeatureExtractionPipeline = null;\n\n async initialize(): Promise<void> {\n if (SemanticEvaluator.extractor) return;\n try {\n SemanticEvaluator.extractor = await loadSemanticModel();\n } catch (error) {\n console.error('Failed to load semantic evaluation model:', error);\n throw error;\n }\n }\n\n async performEvaluation(request: EvaluationRequest): Promise<EvaluationResult> {\n try {\n await this.initialize();\n const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, request.expectedKeywords, DEFAULT_SEMANTIC_PASS_SCORE);\n const sourceLinkMatches = evaluateSourceLinks(request.expectedSourceLinks, request.actualResponse);\n const totalItems = keywordMatches.length + sourceLinkMatches.length;\n // calculate the overall score by averaging the score of the keyword and source link matches\n const keywordScore = keywordMatches.reduce((acc, curr) => 
acc + curr.evaluationApproachResult.score, 0);\n const sourceLinkScore = sourceLinkMatches.reduce((acc, curr) => acc + curr.evaluationApproachResult.score, 0);\n const totalScore = keywordScore + sourceLinkScore;\n const overallScore = totalItems > 0 ? totalScore / totalItems : 0; // to avoid division by zero\n const passed = keywordMatches.every(match => match.found) && sourceLinkMatches.every(match => match.found);\n\n const evaluationParameters = {\n approach: EvaluationApproach.SEMANTIC,\n threshold: DEFAULT_SEMANTIC_PASS_SCORE,\n } as EvaluationParameters;\n\n return {\n testCaseId: request.testCaseId,\n passed,\n keywordMatches,\n sourceLinkMatches,\n evaluationParameters,\n evaluationApproachResult: { score: overallScore, approachUsed: EvaluationApproach.SEMANTIC },\n timestamp: new Date().toISOString(),\n };\n } catch (error) {\n console.error('Failed to perform semantic evaluation:', error);\n return {\n testCaseId: request.testCaseId,\n passed: false,\n keywordMatches: [],\n sourceLinkMatches: [],\n evaluationParameters: {\n approach: EvaluationApproach.SEMANTIC,\n threshold: DEFAULT_SEMANTIC_PASS_SCORE,\n },\n evaluationApproachResult: { score: 0, approachUsed: EvaluationApproach.SEMANTIC },\n timestamp: new Date().toISOString(),\n };\n }\n }\n}\n"]}
|