llm-testrunner-components 1.0.5 → 1.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -48
- package/dist/cjs/index.cjs.js +24610 -60
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.css +14 -2
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +38 -9
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/lib/evaluation/constant.js +4 -0
- package/dist/collection/lib/evaluation/constant.js.map +1 -0
- package/dist/collection/lib/evaluation/constants/evaluation-approach.js +6 -0
- package/dist/collection/lib/evaluation/constants/evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluation-engine.js +28 -44
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluators/exact/exact.js +51 -0
- package/dist/collection/lib/evaluation/evaluators/exact/exact.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.js +82 -0
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.js +73 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.test.js +313 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +63 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/evaluate-keywords.js +56 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/evaluate-keywords.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/index.js +7 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/index.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/model-loader.js +15 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/model-loader.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/similarity-utils.js +16 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/similarity-utils.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.js +65 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/text-utils.js +5 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/text-utils.js.map +1 -0
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js +118 -0
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/rate-limited-fetcher/rate-limited-fetcher.js +6 -6
- package/dist/collection/lib/rate-limited-fetcher/rate-limited-fetcher.js.map +1 -1
- package/dist/collection/types/evaluation.js +6 -0
- package/dist/collection/types/evaluation.js.map +1 -0
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-lpWX1sHl.js +26319 -0
- package/dist/components/p-lpWX1sHl.js.map +1 -0
- package/dist/esm/index.js +24609 -60
- package/dist/esm/index.js.map +1 -1
- package/dist/llm-testrunner/index.esm.js +6 -1
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +4 -1
- package/dist/types/lib/evaluation/constant.d.ts +3 -0
- package/dist/types/lib/evaluation/constants/evaluation-approach.d.ts +4 -0
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +0 -4
- package/dist/types/lib/evaluation/evaluators/exact/exact.d.ts +3 -0
- package/dist/types/lib/evaluation/evaluators/rouge1-evaluator.d.ts +17 -0
- package/dist/types/lib/evaluation/evaluators/rougeL-evaluator.d.ts +2 -0
- package/dist/types/lib/evaluation/evaluators/rougeL-evaluator.test.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/SemanticEvaluator.d.ts +6 -0
- package/dist/types/lib/evaluation/evaluators/semantic/evaluate-keywords.d.ts +7 -0
- package/dist/types/lib/evaluation/evaluators/semantic/index.d.ts +2 -0
- package/dist/types/lib/evaluation/evaluators/semantic/model-loader.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/similarity-utils.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/text-utils.d.ts +1 -0
- package/dist/types/lib/evaluation/rouge1-evaluator.test.d.ts +1 -0
- package/dist/types/lib/evaluation/types.d.ts +19 -0
- package/dist/types/lib/rate-limited-fetcher/rate-limited-fetcher.d.ts +1 -1
- package/dist/types/types/evaluation.d.ts +10 -0
- package/package.json +10 -6
- package/dist/components/p-CYUbsbxt.js +0 -1770
- package/dist/components/p-CYUbsbxt.js.map +0 -1
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { splitIntoWords } from "./text-utils";
import { cosineSimilarity } from "./similarity-utils";
import { EvaluationApproach } from "../../constants/evaluation-approach";
/**
 * Evaluates whether each keyword is semantically present in the response text.
 * Uses embeddings and cosine similarity instead of direct string matching.
 *
 * @param extractor  feature-extraction pipeline; called per text and returns a
 *                   tensor-like object exposing a numeric `data` buffer
 * @param response   response text whose words are compared against keywords
 * @param keywords   keywords to locate semantically in the response
 * @param threshold  minimum best cosine similarity for a keyword to count as found
 * @returns one match record per keyword; empty array when no keywords are given
 */
export async function evaluateKeywordsSemantically(extractor, response, keywords, threshold) {
    if (keywords.length === 0)
        return [];
    const words = splitIntoWords(response);
    // Generate embeddings for both response words and keywords in parallel
    const [wordsEmbeddings, keywordsEmbeddings] = await Promise.all([
        Promise.all(words.map(async (word) => ({
            word,
            emb: await extractor(word, { pooling: 'mean', normalize: true }),
        }))),
        Promise.all(keywords.map(async (keyword) => ({
            keyword,
            emb: await extractor(keyword, { pooling: 'mean', normalize: true }),
        }))),
    ]);
    // Convert each word embedding to a plain array once up front; previously
    // this conversion was repeated for every keyword inside the loop below.
    const wordVectors = wordsEmbeddings.map(({ emb }) => Array.from(emb.data));
    // For each keyword, find the most semantically similar word in the response
    const matches = keywordsEmbeddings.map(({ keyword, emb: keywordEmb }) => {
        let bestSimilarity = 0;
        try {
            // Loop-invariant: convert the keyword embedding once per keyword,
            // not once per (keyword, word) pair as before.
            const keywordVector = Array.from(keywordEmb.data);
            for (const wordVector of wordVectors) {
                const similarity = cosineSimilarity(keywordVector, wordVector);
                if (similarity > bestSimilarity)
                    bestSimilarity = similarity;
            }
            // Consider the keyword "found" if similarity exceeds the threshold
            return {
                keyword,
                found: bestSimilarity >= threshold,
                evaluationApproachResult: {
                    score: bestSimilarity,
                    approachUsed: EvaluationApproach.SEMANTIC,
                },
            };
        }
        catch (err) {
            // Fail closed: any per-keyword evaluation error is logged and the
            // keyword is reported as not found with score 0.
            console.error(`Error evaluating "${keyword}":`, err);
            return {
                keyword,
                found: false,
                evaluationApproachResult: {
                    score: 0,
                    approachUsed: EvaluationApproach.SEMANTIC,
                },
            };
        }
    });
    return matches;
}
//# sourceMappingURL=evaluate-keywords.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluate-keywords.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/evaluate-keywords.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,cAAc,EAAE,MAAM,cAAc,CAAC;AAC9C,OAAO,EAAE,gBAAgB,EAAE,MAAM,oBAAoB,CAAC;AAEtD,OAAO,EAAE,kBAAkB,EAAE,MAAM,qCAAqC,CAAC;AAEzE;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,4BAA4B,CAAC,SAAoC,EAAE,QAAgB,EAAE,QAAkB,EAAE,SAAiB;IAC9I,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAErC,MAAM,KAAK,GAAG,cAAc,CAAC,QAAQ,CAAC,CAAC;IAEvC,uEAAuE;IACvE,MAAM,CAAC,eAAe,EAAE,kBAAkB,CAAC,GAAG,MAAM,OAAO,CAAC,GAAG,CAAC;QAC9D,OAAO,CAAC,GAAG,CACT,KAAK,CAAC,GAAG,CAAC,KAAK,EAAC,IAAI,EAAC,EAAE,CAAC,CAAC;YACvB,IAAI;YACJ,GAAG,EAAE,MAAM,SAAS,CAAC,IAAI,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;SACjE,CAAC,CAAC,CACJ;QAED,OAAO,CAAC,GAAG,CACT,QAAQ,CAAC,GAAG,CAAC,KAAK,EAAC,OAAO,EAAC,EAAE,CAAC,CAAC;YAC7B,OAAO;YACP,GAAG,EAAE,MAAM,SAAS,CAAC,OAAO,EAAE,EAAE,OAAO,EAAE,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC;SACpE,CAAC,CAAC,CACJ;KACF,CAAC,CAAC;IAEH,4EAA4E;IAC5E,MAAM,OAAO,GAAmB,kBAAkB,CAAC,GAAG,CAAC,CAAC,EAAE,OAAO,EAAE,GAAG,EAAE,UAAU,EAAE,EAAE,EAAE;QACtF,IAAI,cAAc,GAAG,CAAC,CAAC;QAEvB,IAAI,CAAC;YACH,KAAK,MAAM,EAAE,GAAG,EAAE,OAAO,EAAE,IAAI,eAAe,EAAE,CAAC;gBAC/C,MAAM,UAAU,GAAG,gBAAgB,CAAC,KAAK,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC;gBAC3F,IAAI,UAAU,GAAG,cAAc;oBAAE,cAAc,GAAG,UAAU,CAAC;YAC/D,CAAC;YAED,mEAAmE;YACnE,OAAO;gBACL,OAAO;gBACP,KAAK,EAAE,cAAc,IAAI,SAAS;gBAClC,wBAAwB,EAAE;oBACxB,KAAK,EAAE,cAAc;oBACrB,YAAY,EAAE,kBAAkB,CAAC,QAAQ;iBAC1C;aACF,CAAC;QACJ,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,OAAO,CAAC,KAAK,CAAC,qBAAqB,OAAO,IAAI,EAAE,GAAG,CAAC,CAAC;YACrD,OAAO;gBACL,OAAO;gBACP,KAAK,EAAE,KAAK;gBACZ,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,kBAAkB,CAAC,QAAQ;iBAC1C;aACF,CAAC;QACJ,CAAC;IACH,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC;AACjB,CAAC","sourcesContent":["import { KeywordMatch } from '../../types';\nimport { splitIntoWords } from './text-utils';\nimport { cosineSimilarity } from 
'./similarity-utils';\nimport { FeatureExtractionPipeline } from '@xenova/transformers';\nimport { EvaluationApproach } from '../../constants/evaluation-approach';\n\n/**\n * Evaluates whether each keyword is semantically present in the response text.\n * Uses embeddings and cosine similarity instead of direct string matching.\n */\nexport async function evaluateKeywordsSemantically(extractor: FeatureExtractionPipeline, response: string, keywords: string[], threshold: number): Promise<KeywordMatch[]> {\n if (keywords.length === 0) return [];\n\n const words = splitIntoWords(response);\n\n // Generate embeddings for both response words and keywords in parallel\n const [wordsEmbeddings, keywordsEmbeddings] = await Promise.all([\n Promise.all(\n words.map(async word => ({\n word,\n emb: await extractor(word, { pooling: 'mean', normalize: true }),\n })),\n ),\n\n Promise.all(\n keywords.map(async keyword => ({\n keyword,\n emb: await extractor(keyword, { pooling: 'mean', normalize: true }),\n })),\n ),\n ]);\n\n // For each keyword, find the most semantically similar word in the response\n const matches: KeywordMatch[] = keywordsEmbeddings.map(({ keyword, emb: keywordEmb }) => {\n let bestSimilarity = 0;\n\n try {\n for (const { emb: wordEmb } of wordsEmbeddings) {\n const similarity = cosineSimilarity(Array.from(keywordEmb.data), Array.from(wordEmb.data));\n if (similarity > bestSimilarity) bestSimilarity = similarity;\n }\n\n // Consider the keyword \"found\" if similarity exceeds the threshold\n return {\n keyword,\n found: bestSimilarity >= threshold,\n evaluationApproachResult: {\n score: bestSimilarity,\n approachUsed: EvaluationApproach.SEMANTIC,\n },\n };\n } catch (err) {\n console.error(`Error evaluating \"${keyword}\":`, err);\n return {\n keyword,\n found: false,\n evaluationApproachResult: {\n score: 0,\n approachUsed: EvaluationApproach.SEMANTIC,\n },\n };\n }\n });\n\n return matches;\n}\n"]}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import { SemanticEvaluator } from "./SemanticEvaluator";
// Shared module-level evaluator instance, reused across invocations so the
// underlying model is loaded at most once per process.
const sharedEvaluator = new SemanticEvaluator();
/**
 * Ensures the shared semantic evaluator is initialized, then delegates
 * evaluation of the given request to it.
 */
export async function performSemanticEvaluation(request) {
    await sharedEvaluator.initialize();
    return sharedEvaluator.performEvaluation(request);
}
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,iBAAiB,EAAE,MAAM,qBAAqB,CAAC;AAGxD,MAAM,iBAAiB,GAAG,IAAI,iBAAiB,EAAE,CAAC;AAElD,MAAM,CAAC,KAAK,UAAU,yBAAyB,CAC7C,OAA0B;IAE1B,MAAM,iBAAiB,CAAC,UAAU,EAAE,CAAC;IACrC,OAAO,iBAAiB,CAAC,iBAAiB,CAAC,OAAO,CAAC,CAAC;AACtD,CAAC","sourcesContent":["import { SemanticEvaluator } from './SemanticEvaluator';\nimport { EvaluationRequest, EvaluationResult } from '../../types';\n\nconst semanticEvaluator = new SemanticEvaluator();\n\nexport async function performSemanticEvaluation(\n request: EvaluationRequest\n): Promise<EvaluationResult> {\n await semanticEvaluator.initialize();\n return semanticEvaluator.performEvaluation(request);\n}"]}
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
import { pipeline } from "@xenova/transformers";
/**
 * Loads the feature-extraction model used to generate embeddings for
 * semantic evaluation. Logs and rethrows if the model cannot be loaded.
 */
export async function loadSemanticModel() {
    try {
        // Quantized variant is requested to reduce memory usage.
        return await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', { quantized: true });
    }
    catch (error) {
        console.error('Failed to load semantic evaluation model:', error);
        throw error;
    }
}
//# sourceMappingURL=model-loader.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"model-loader.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/model-loader.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,MAAM,sBAAsB,CAAC;AAEhD,mEAAmE;AACnE,MAAM,CAAC,KAAK,UAAU,iBAAiB;IACrC,IAAI,CAAC;QACH,MAAM,SAAS,GAAG,MAAM,QAAQ,CAAC,oBAAoB,EAAE,yBAAyB,EAAE;YAChF,SAAS,EAAE,IAAI,EAAE,6CAA6C;SAC/D,CAAC,CAAC;QACH,OAAO,SAAS,CAAC;IACnB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,OAAO,CAAC,KAAK,CAAC,2CAA2C,EAAE,KAAK,CAAC,CAAC;QAClE,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC","sourcesContent":["import { pipeline } from '@xenova/transformers';\n\n// Loads a semantic feature extraction model to generate embeddings\nexport async function loadSemanticModel() {\n try {\n const extractor = await pipeline('feature-extraction', 'Xenova/all-MiniLM-L6-v2', {\n quantized: true, // use quantized model to reduce memory usage\n });\n return extractor;\n } catch (error) {\n console.error('Failed to load semantic evaluation model:', error);\n throw error;\n }\n}"]}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
 * Computes the cosine similarity of two equal-length numeric vectors.
 * The result lies in [-1, 1]: 1 means identical direction, 0 means
 * orthogonal, and negative values mean opposite directions. Vectors with
 * zero magnitude yield 0 rather than dividing by zero.
 *
 * Throws an Error when the vectors differ in length.
 */
export function cosineSimilarity(vecA, vecB) {
    if (vecA.length !== vecB.length)
        throw new Error('Vectors must have the same length');
    let dotProduct = 0;
    let magASquared = 0;
    let magBSquared = 0;
    vecA.forEach((a, i) => {
        const b = vecB[i];
        dotProduct += a * b;
        magASquared += a * a;
        magBSquared += b * b;
    });
    // Guard: a zero-magnitude vector has no direction, so similarity is 0.
    if (magASquared === 0 || magBSquared === 0)
        return 0;
    return dotProduct / (Math.sqrt(magASquared) * Math.sqrt(magBSquared));
}
//# sourceMappingURL=similarity-utils.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"similarity-utils.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/similarity-utils.ts"],"names":[],"mappings":"AAAA,0DAA0D;AAC1D,wJAAwJ;AACxJ,MAAM,UAAU,gBAAgB,CAAC,IAAc,EAAE,IAAc;IAC3D,IAAI,IAAI,CAAC,MAAM,KAAK,IAAI,CAAC,MAAM;QAAE,MAAM,IAAI,KAAK,CAAC,mCAAmC,CAAC,CAAC;IAEtF,IAAI,GAAG,GAAG,CAAC,EACP,KAAK,GAAG,CAAC,EACT,KAAK,GAAG,CAAC,CAAC;IAEd,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACnC,GAAG,IAAI,IAAI,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,CAAC,CAAC,CAAC;QACzB,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;QACtB,KAAK,IAAI,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC;IAC1B,CAAC;IACD,IAAI,KAAK,KAAK,CAAC,IAAI,KAAK,KAAK,CAAC;QAAE,OAAO,CAAC,CAAC;IACzC,OAAO,GAAG,GAAG,CAAC,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;AACvD,CAAC","sourcesContent":["// Computes cosine similarity between two numeric vectors.\n// Returns a value between -1 and 1 indicating similarity.(1 means identical, 0 means completely different and negative values mean opposite directions)\nexport function cosineSimilarity(vecA: number[], vecB: number[]): number {\n if (vecA.length !== vecB.length) throw new Error('Vectors must have the same length');\n\n let dot = 0,\n normA = 0,\n normB = 0;\n\n for (let i = 0; i < vecA.length; i++) {\n dot += vecA[i] * vecB[i];\n normA += vecA[i] ** 2;\n normB += vecB[i] ** 2;\n }\n if (normA === 0 || normB === 0) return 0; \n return dot / (Math.sqrt(normA) * Math.sqrt(normB));\n}"]}
|
|
@@ -0,0 +1,65 @@
|
|
|
1
|
+
import { jest, describe, it, expect, afterEach } from "@jest/globals";
import { evaluateKeywordsSemantically } from "../evaluate-keywords";
import { DEFAULT_SEMANTIC_PASS_SCORE as DEFAULT_SEMANTIC_THRESHOLD } from "../../../constant";
describe('evaluateKeywordsSemantically (only extractor mocked)', () => {
    // Fix: spies created with jest.spyOn were never restored, so each test
    // wrapped the already-spied cosineSimilarity again and mock state leaked
    // across tests. Restoring after each test keeps them isolated.
    afterEach(() => {
        jest.restoreAllMocks();
    });
    it('should return empty array when no keywords provided', async () => {
        const mockExtractor = jest.fn();
        const result = await evaluateKeywordsSemantically(mockExtractor, 'some response', [], DEFAULT_SEMANTIC_THRESHOLD);
        expect(result).toEqual([]);
    });
    it('should return matches above threshold', async () => {
        const response = 'The quick brown fox';
        const keywords = ['fast', 'animal'];
        // Extractor returns a deterministic dummy embedding per text; the actual
        // similarity values are supplied by the cosineSimilarity spy below.
        const mockExtractor = jest.fn();
        mockExtractor.mockImplementation(async (text) => {
            const data = new Float32Array(text.length).fill(1);
            return { data };
        });
        const cosSpy = jest.spyOn(require('../similarity-utils'), 'cosineSimilarity');
        cosSpy
            .mockReturnValueOnce(0.91) // these are the similarity scores for the keyword 'fast' in the response.
            .mockReturnValueOnce(0.4)
            .mockReturnValueOnce(0.3)
            .mockReturnValueOnce(0.85)
            .mockReturnValueOnce(0.6) // these are the similarity scores for the keyword 'animal' in the response.
            .mockReturnValueOnce(0.5)
            .mockReturnValueOnce(0.7)
            .mockReturnValueOnce(0.8);
        const result = await evaluateKeywordsSemantically(mockExtractor, response, keywords, DEFAULT_SEMANTIC_THRESHOLD);
        expect(result).toHaveLength(2);
        expect(result).toEqual([
            {
                keyword: 'fast',
                found: true,
                evaluationApproachResult: { score: 0.91, approachUsed: 'semantic' }
            },
            {
                keyword: 'animal',
                found: true,
                evaluationApproachResult: { score: 0.8, approachUsed: 'semantic' }
            }
        ]);
    });
    it('should mark below-threshold as not found', async () => {
        const response = 'A sunny day';
        const keywords = ['rain'];
        const mockExtractor = jest.fn();
        mockExtractor.mockImplementation(async (text) => {
            return { data: new Float32Array(text.length).fill(1) };
        });
        const cosSpy = jest.spyOn(require('../similarity-utils'), 'cosineSimilarity');
        cosSpy
            .mockReturnValueOnce(0.5) // this is the similarity score for the keyword 'rain' in the response.
            .mockReturnValueOnce(0.49)
            .mockReturnValueOnce(0.4);
        const result = await evaluateKeywordsSemantically(mockExtractor, response, keywords, DEFAULT_SEMANTIC_THRESHOLD);
        expect(result).toEqual([
            {
                keyword: 'rain',
                found: false,
                evaluationApproachResult: { score: 0.5, approachUsed: 'semantic' }
            }
        ]);
    });
});
//# sourceMappingURL=evaluate-keywords.test.js.map
|
package/dist/collection/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.js.map
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluate-keywords.test.js","sourceRoot":"","sources":["../../../../../../src/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAC3D,OAAO,EAAE,4BAA4B,EAAE,MAAM,sBAAsB,CAAC;AAEpE,OAAO,EAAE,2BAA2B,IAAI,0BAA0B,EAAE,MAAM,mBAAmB,CAAC;AAG9F,QAAQ,CAAC,sDAAsD,EAAE,GAAG,EAAE;IAEpE,EAAE,CAAC,qDAAqD,EAAE,KAAK,IAAI,EAAE;QACnE,MAAM,aAAa,GAAI,IAAI,CAAC,EAAE,EAAe,CAAC;QAC9C,MAAM,MAAM,GAAG,MAAM,4BAA4B,CAC/C,aAAqD,EACrD,eAAe,EACf,EAAE,EACF,0BAA0B,CAC3B,CAAC;QAEF,MAAM,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC,CAAC;IAC7B,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;QACrD,MAAM,QAAQ,GAAG,qBAAqB,CAAC;QACvC,MAAM,QAAQ,GAAG,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;QACpC,MAAM,aAAa,GAAI,IAAI,CAAC,EAAE,EAAe,CAAC;QAC9C,aAAa,CAAC,kBAAkB,CAAC,KAAK,EAAE,IAAY,EAAE,EAAE;YACtD,MAAM,IAAI,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACnD,OAAO,EAAE,IAAI,EAAE,CAAC;QAClB,CAAC,CAAC,CAAC;QAEH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,qBAAqB,CAAC,EAAE,kBAAkB,CAAC,CAAC;QAC9E,MAAM;aACH,mBAAmB,CAAC,IAAI,CAAC,CAAC,0EAA0E;aACpG,mBAAmB,CAAC,GAAG,CAAC;aACxB,mBAAmB,CAAC,GAAG,CAAC;aACxB,mBAAmB,CAAC,IAAI,CAAC;aACzB,mBAAmB,CAAC,GAAG,CAAC,CAAC,4EAA4E;aACrG,mBAAmB,CAAC,GAAG,CAAC;aACxB,mBAAmB,CAAC,GAAG,CAAC;aACxB,mBAAmB,CAAC,GAAG,CAAC,CAAC;QAE5B,MAAM,MAAM,GAAG,MAAM,4BAA4B,CAC/C,aAAqD,EACrD,QAAQ,EACR,QAAQ,EACR,0BAA0B,CAC3B,CAAC;QAEF,MAAM,CAAC,MAAM,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;QAC/B,MAAM,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC;YACrB;gBACE,OAAO,EAAE,MAAM;gBACf,KAAK,EAAE,IAAI;gBACX,wBAAwB,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,YAAY,EAAE,UAAU,EAAE;aACpE;YACD;gBACE,OAAO,EAAE,QAAQ;gBACjB,KAAK,EAAE,IAAI;gBACX,wBAAwB,EAAE,EAAE,KAAK,EAAE,GAAG,EAAE,YAAY,EAAE,UAAU,EAAE;aACnE;SACF,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,EAAE,CAAC,0CAA0C,EAAE,KAAK,IAAI,EAAE;QACxD,MAAM,QAAQ,GAAG,aAAa,CAAC;QAC/B,MAAM,QAAQ,GAAG,CAAC,MAAM,CAAC,CAAC;QAC1B,MAAM,aAAa,GAAI,IAAI,CAAC,EAAE,EAAe,CAAC;QAC9C,aAAa,CAAC,kBAAkB
,CAAC,KAAK,EAAE,IAAY,EAAE,EAAE;YACtD,OAAO,EAAE,IAAI,EAAE,IAAI,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC;QACzD,CAAC,CAAC,CAAC;QAGH,MAAM,MAAM,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,qBAAqB,CAAC,EAAE,kBAAkB,CAAC,CAAC;QAC9E,MAAM;aACH,mBAAmB,CAAC,GAAG,CAAC,CAAC,uEAAuE;aAChG,mBAAmB,CAAC,IAAI,CAAC;aACzB,mBAAmB,CAAC,GAAG,CAAC,CAAC;QAE5B,MAAM,MAAM,GAAG,MAAM,4BAA4B,CAC/C,aAAqD,EACrD,QAAQ,EACR,QAAQ,EACR,0BAA0B,CAC3B,CAAC;QAEF,MAAM,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC;YACrB;gBACE,OAAO,EAAE,MAAM;gBACf,KAAK,EAAE,KAAK;gBACZ,wBAAwB,EAAE,EAAE,KAAK,EAAE,GAAG,EAAE,YAAY,EAAE,UAAU,EAAE;aACnE;SACF,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC","sourcesContent":["import { jest, describe, it, expect } from '@jest/globals';\nimport { evaluateKeywordsSemantically } from '../evaluate-keywords';\nimport { FeatureExtractionPipeline } from '@xenova/transformers';\nimport { DEFAULT_SEMANTIC_PASS_SCORE as DEFAULT_SEMANTIC_THRESHOLD } from '../../../constant';\n\n\ndescribe('evaluateKeywordsSemantically (only extractor mocked)', () => {\n\n it('should return empty array when no keywords provided', async () => {\n const mockExtractor = jest.fn() as jest.Mock;\n const result = await evaluateKeywordsSemantically(\n mockExtractor as unknown as FeatureExtractionPipeline,\n 'some response',\n [],\n DEFAULT_SEMANTIC_THRESHOLD\n );\n\n expect(result).toEqual([]);\n });\n\n it('should return matches above threshold', async () => {\n const response = 'The quick brown fox';\n const keywords = ['fast', 'animal'];\n const mockExtractor = jest.fn() as jest.Mock;\n mockExtractor.mockImplementation(async (text: string) => {\n const data = new Float32Array(text.length).fill(1);\n return { data };\n });\n\n const cosSpy = jest.spyOn(require('../similarity-utils'), 'cosineSimilarity');\n cosSpy\n .mockReturnValueOnce(0.91) // these are the similarity scores for the keyword 'fast' in the response.\n .mockReturnValueOnce(0.4) \n .mockReturnValueOnce(0.3) \n .mockReturnValueOnce(0.85)\n 
.mockReturnValueOnce(0.6) // these are the similarity scores for the keyword 'animal' in the response.\n .mockReturnValueOnce(0.5)\n .mockReturnValueOnce(0.7)\n .mockReturnValueOnce(0.8);\n\n const result = await evaluateKeywordsSemantically(\n mockExtractor as unknown as FeatureExtractionPipeline,\n response,\n keywords,\n DEFAULT_SEMANTIC_THRESHOLD\n );\n\n expect(result).toHaveLength(2);\n expect(result).toEqual([\n {\n keyword: 'fast',\n found: true,\n evaluationApproachResult: { score: 0.91, approachUsed: 'semantic' }\n },\n {\n keyword: 'animal',\n found: true,\n evaluationApproachResult: { score: 0.8, approachUsed: 'semantic' }\n }\n ]);\n });\n\n it('should mark below-threshold as not found', async () => {\n const response = 'A sunny day';\n const keywords = ['rain'];\n const mockExtractor = jest.fn() as jest.Mock;\n mockExtractor.mockImplementation(async (text: string) => {\n return { data: new Float32Array(text.length).fill(1) };\n });\n\n \n const cosSpy = jest.spyOn(require('../similarity-utils'), 'cosineSimilarity');\n cosSpy\n .mockReturnValueOnce(0.5) // this is the similarity score for the keyword 'rain' in the response.\n .mockReturnValueOnce(0.49)\n .mockReturnValueOnce(0.4); \n\n const result = await evaluateKeywordsSemantically(\n mockExtractor as unknown as FeatureExtractionPipeline,\n response,\n keywords,\n DEFAULT_SEMANTIC_THRESHOLD\n );\n \n expect(result).toEqual([\n {\n keyword: 'rain',\n found: false,\n evaluationApproachResult: { score: 0.5, approachUsed: 'semantic' }\n }\n ]);\n });\n});"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"text-utils.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/text-utils.ts"],"names":[],"mappings":"AAAA,mDAAmD;AACnD,MAAM,UAAU,cAAc,CAAC,IAAY;IACvC,OAAO,CAAC,GAAG,IAAI,GAAG,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,QAAQ,CAAC,CAAC,CAAC,CAAC;AAC5D,CAAC","sourcesContent":["// Splits the response into unique lowercase words.\nexport function splitIntoWords(text: string): string[] {\n return [...new Set(text.toLowerCase().split(/[^\\w]+/))];\n}"]}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
import { describe, it, expect } from "@jest/globals";
import { DEFAULT_ROUGE_PASS_SCORE } from "./constant";
// Using integration tests with actual js-rouge library (no mocks).
// This approach tests the real ROUGE-1 scoring behavior rather than just orchestration logic.
import { performRouge1Evaluation } from "./evaluators/rouge1-evaluator";
// Baseline request; individual tests spread-override the fields they exercise.
const mockRequest = {
    testCaseId: 'test-000',
    question: 'What is your name?',
    actualResponse: 'I am a large language model',
    expectedKeywords: ['model', 'language'],
    expectedSourceLinks: [],
    evaluationParameters: {
        approach: 'rouge',
        threshold: 0.5,
    }
};
const mockRequestNoThreshold = {
    ...mockRequest,
    evaluationParameters: {
        approach: 'rouge',
        threshold: undefined,
    }
};
describe('performRouge1Evaluation', () => {
    describe('Basic functionality', () => {
        it('should pass when response contains exact keyword matches', async () => {
            const request = {
                ...mockRequest,
                actualResponse: 'This is a language model system',
                expectedKeywords: ['language', 'model'],
            };
            const result = await performRouge1Evaluation(request);
            expect(result.passed).toBe(true);
            expect(result.keywordMatches.length).toBe(2);
            expect(result.keywordMatches[0].found).toBe(true);
            expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThan(0.5);
            expect(result.keywordMatches[1].found).toBe(true);
            expect(result.keywordMatches[1].evaluationApproachResult.score).toBeGreaterThan(0.5);
        });
        it('should fail when keywords are not sufficiently present', async () => {
            const request = {
                ...mockRequest,
                actualResponse: 'This is completely unrelated content about cooking',
                expectedKeywords: ['machine learning', 'artificial intelligence'],
            };
            const result = await performRouge1Evaluation(request);
            expect(result.passed).toBe(false);
            expect(result.keywordMatches[0].found).toBe(false);
            expect(result.keywordMatches[0].evaluationApproachResult.score).toBeLessThan(0.5);
            expect(result.keywordMatches[1].found).toBe(false);
            expect(result.keywordMatches[1].evaluationApproachResult.score).toBeLessThan(0.5);
        });
        it('should partially pass when only some keywords meet threshold', async () => {
            const request = {
                ...mockRequest,
                actualResponse: 'Machine learning is fascinating',
                expectedKeywords: ['machine learning', 'database systems'],
            };
            const result = await performRouge1Evaluation(request);
            // Overall result fails because not every keyword meets the threshold.
            expect(result.passed).toBe(false);
            expect(result.keywordMatches[0].found).toBe(true);
            expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThanOrEqual(0.5);
            expect(result.keywordMatches[1].found).toBe(false);
            expect(result.keywordMatches[1].evaluationApproachResult.score).toBeLessThan(0.5);
        });
    });
    describe('Threshold handling', () => {
        it('should use default threshold when not provided', async () => {
            const result = await performRouge1Evaluation(mockRequestNoThreshold);
            expect(result.evaluationParameters.threshold).toBe(DEFAULT_ROUGE_PASS_SCORE);
        });
        it('should pass all keywords with threshold 0.0', async () => {
            const request = {
                ...mockRequest,
                actualResponse: 'completely unrelated text about cooking',
                expectedKeywords: ['quantum physics', 'mathematics'],
                evaluationParameters: {
                    approach: 'rouge',
                    threshold: 0.0,
                },
            };
            const result = await performRouge1Evaluation(request);
            expect(result.passed).toBe(true);
            expect(result.keywordMatches.every(m => m.found)).toBe(true);
            expect(result.evaluationParameters.threshold).toBe(0.0);
        });
        it('should fail when threshold is 1.0 and match is not perfect', async () => {
            const request = {
                ...mockRequest,
                actualResponse: 'This is about learning concepts',
                expectedKeywords: ['machine learning'],
                evaluationParameters: {
                    approach: 'rouge',
                    threshold: 1.0,
                },
            };
            const result = await performRouge1Evaluation(request);
            expect(result.evaluationParameters.threshold).toBe(1.0);
            expect(result.keywordMatches[0].evaluationApproachResult.score).toBeLessThan(1.0);
            // Fix: the test previously never asserted the failing outcome it
            // describes; a sub-1.0 score at threshold 1.0 must mean not found.
            expect(result.keywordMatches[0].found).toBe(false);
            expect(result.passed).toBe(false);
        });
    });
    describe('Edge cases', () => {
        it('should handle empty actualResponse', async () => {
            const request = { ...mockRequest, actualResponse: '' };
            const result = await performRouge1Evaluation(request);
            expect(result.passed).toBe(false);
            expect(result.keywordMatches[0].evaluationApproachResult.score).toBe(0);
            expect(result.keywordMatches[1].evaluationApproachResult.score).toBe(0);
        });
        it('should handle empty expectedKeywords array', async () => {
            const request = { ...mockRequest, expectedKeywords: [] };
            const result = await performRouge1Evaluation(request);
            // No keywords to check: evaluation passes vacuously.
            expect(result.passed).toBe(true);
            expect(result.keywordMatches.length).toBe(0);
        });
    });
});
//# sourceMappingURL=rouge1-evaluator.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rouge1-evaluator.test.js","sourceRoot":"","sources":["../../../src/lib/evaluation/rouge1-evaluator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAC,MAAM,eAAe,CAAC;AAEpD,OAAO,EAAE,wBAAwB,EAAE,MAAM,YAAY,CAAC;AACtD,mEAAmE;AACnE,8FAA8F;AAC9F,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AAExE,MAAM,WAAW,GAAsB;IACnC,UAAU,EAAE,UAAU;IACtB,QAAQ,EAAE,oBAAoB;IAC9B,cAAc,EAAE,6BAA6B;IAC7C,gBAAgB,EAAE,CAAC,OAAO,EAAE,UAAU,CAAC;IACvC,mBAAmB,EAAE,EAAE;IACvB,oBAAoB,EAAE;QAClB,QAAQ,EAAE,OAAO;QACjB,SAAS,EAAE,GAAG;KACjB;CACJ,CAAC;AAEF,MAAM,sBAAsB,GAAsB;IAC9C,GAAG,WAAW;IACd,oBAAoB,EAAE;QAClB,QAAQ,EAAE,OAAO;QACjB,SAAS,EAAE,SAAS;KACvB;CACJ,CAAC;AAEF,QAAQ,CAAC,yBAAyB,EAAE,GAAG,EAAE;IAErC,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACjC,EAAE,CAAC,0DAA0D,EAAE,KAAK,IAAI,EAAE;YACtE,MAAM,OAAO,GAAsB;gBAC/B,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,gBAAgB,EAAE,CAAC,UAAU,EAAE,OAAO,CAAC;aAC1C,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC7C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;YACrF,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QACzF,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;YACpE,MAAM,OAAO,GAAsB;gBAC/B,GAAG,WAAW;gBACd,cAAc,EAAE,oDAAoD;gBACpE,gBAAgB,EAAE,CAAC,kBAAkB,EAAE,yBAAyB,CAAC;aACpE,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;YAClF,MAAM,CAAC
,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtF,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;YAC1E,MAAM,OAAO,GAAsB;gBAC/B,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,gBAAgB,EAAE,CAAC,kBAAkB,EAAE,kBAAkB,CAAC;aAC7D,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC;YAC5F,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtF,CAAC,CAAC,CAAC;IACP,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAChC,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC5D,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,sBAAsB,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,wBAAwB,CAAC,CAAC;QACjF,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YACzD,MAAM,OAAO,GAAsB;gBAC/B,GAAG,WAAW;gBACd,cAAc,EAAE,yCAAyC;gBACzD,gBAAgB,EAAE,CAAC,iBAAiB,EAAE,aAAa,CAAC;gBACpD,oBAAoB,EAAE;oBAClB,QAAQ,EAAE,OAAO;oBACjB,SAAS,EAAE,GAAG;iBACjB;aACJ,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7D,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC5D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;YACxE,MAAM,OAAO,GAAsB;gBAC/B,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,gBAAgB,EAAE,CAAC,kBAAkB,CAAC;gBACtC,oBAAoB,EAAE;oBAClB,QAAQ,EAAE,OAAO;oBACjB,SAAS,EAAE,GAAG;iBACjB;aACJ,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,C
AAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACxD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtF,CAAC,CAAC,CAAC;IACP,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;QACxB,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAChD,MAAM,OAAO,GAAG,EAAE,GAAG,WAAW,EAAE,cAAc,EAAE,EAAE,EAAE,CAAC;YAEvD,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACxE,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC5E,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YACxD,MAAM,OAAO,GAAG,EAAE,GAAG,WAAW,EAAE,gBAAgB,EAAE,EAAE,EAAE,CAAC;YAEzD,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QACjD,CAAC,CAAC,CAAC;IACP,CAAC,CAAC,CAAC;AACP,CAAC,CAAC,CAAC","sourcesContent":["import { describe, it, expect} from '@jest/globals';\nimport { EvaluationRequest } from './types';\nimport { DEFAULT_ROUGE_PASS_SCORE } from './constant';\n// Using integration tests with actual js-rouge library (no mocks).\n// This approach tests the real ROUGE-1 scoring behavior rather than just orchestration logic.\nimport { performRouge1Evaluation } from './evaluators/rouge1-evaluator';\n\nconst mockRequest: EvaluationRequest = {\n testCaseId: 'test-000',\n question: 'What is your name?',\n actualResponse: 'I am a large language model',\n expectedKeywords: ['model', 'language'],\n expectedSourceLinks: [],\n evaluationParameters: {\n approach: 'rouge',\n threshold: 0.5,\n }\n};\n\nconst mockRequestNoThreshold: EvaluationRequest = {\n ...mockRequest,\n evaluationParameters: {\n approach: 'rouge',\n threshold: undefined, \n }\n};\n\ndescribe('performRouge1Evaluation', () => {\n\n describe('Basic 
functionality', () => {\n it('should pass when response contains exact keyword matches', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is a language model system',\n expectedKeywords: ['language', 'model'],\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.length).toBe(2);\n expect(result.keywordMatches[0].found).toBe(true);\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThan(0.5);\n expect(result.keywordMatches[1].found).toBe(true);\n expect(result.keywordMatches[1].evaluationApproachResult.score).toBeGreaterThan(0.5);\n });\n\n it('should fail when keywords are not sufficiently present', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is completely unrelated content about cooking',\n expectedKeywords: ['machine learning', 'artificial intelligence'],\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].found).toBe(false);\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBeLessThan(0.5);\n expect(result.keywordMatches[1].found).toBe(false);\n expect(result.keywordMatches[1].evaluationApproachResult.score).toBeLessThan(0.5);\n });\n\n it('should partially pass when only some keywords meet threshold', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'Machine learning is fascinating',\n expectedKeywords: ['machine learning', 'database systems'],\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].found).toBe(true);\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThanOrEqual(0.5);\n expect(result.keywordMatches[1].found).toBe(false);\n 
expect(result.keywordMatches[1].evaluationApproachResult.score).toBeLessThan(0.5);\n });\n });\n\n describe('Threshold handling', () => {\n it('should use default threshold when not provided', async () => {\n const result = await performRouge1Evaluation(mockRequestNoThreshold);\n\n expect(result.evaluationParameters.threshold).toBe(DEFAULT_ROUGE_PASS_SCORE);\n });\n\n it('should pass all keywords with threshold 0.0', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'completely unrelated text about cooking',\n expectedKeywords: ['quantum physics', 'mathematics'],\n evaluationParameters: {\n approach: 'rouge',\n threshold: 0.0,\n },\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.every(m => m.found)).toBe(true);\n expect(result.evaluationParameters.threshold).toBe(0.0);\n });\n\n it('should fail when threshold is 1.0 and match is not perfect', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is about learning concepts', \n expectedKeywords: ['machine learning'],\n evaluationParameters: {\n approach: 'rouge',\n threshold: 1.0,\n },\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.evaluationParameters.threshold).toBe(1.0);\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBeLessThan(1.0);\n });\n });\n\n describe('Edge cases', () => {\n it('should handle empty actualResponse', async () => {\n const request = { ...mockRequest, actualResponse: '' };\n\n const result = await performRouge1Evaluation(request);\n \n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBe(0);\n expect(result.keywordMatches[1].evaluationApproachResult.score).toBe(0);\n });\n\n it('should handle empty expectedKeywords array', async () => {\n const request = { ...mockRequest, expectedKeywords: [] };\n\n const result = await 
performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.length).toBe(0);\n });\n });\n});"]}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["import {\n EvaluationParameters,\n EvaluationApproachResult\n} from '../../types/evaluation';\n\nexport interface EvaluationRequest {\n testCaseId: string;\n question: string;\n expectedKeywords: string[];\n expectedSourceLinks: string[];\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface EvaluationResult {\n testCaseId: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n sourceLinkMatches: SourceLinkMatch[];\n timestamp?: string;\n evaluationParameters: EvaluationParameters;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport interface KeywordMatch {\n keyword: string;\n found: boolean;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport interface SourceLinkMatch {\n link: string;\n found: boolean;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport type EvaluationCallback = (result: EvaluationResult) => void;\n\nexport interface RougeKeywordDetails {\n rouge1: number;\n rougeL: number;\n scoreUsed: string;\n approach: string;\n}\n\nexport interface Rouge1OverallDetails {\n keywordsPassed: number;\n totalKeywords: number;\n passRate: string;\n thresholdUsed: number;\n approach: string;\n}"]}
|
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
export class RateLimitedFetcher {
|
|
2
2
|
queue = [];
|
|
3
3
|
delay; // delay in milliseconds
|
|
4
|
-
|
|
4
|
+
timer;
|
|
5
5
|
constructor(delayMs) {
|
|
6
6
|
this.delay = delayMs;
|
|
7
7
|
}
|
|
8
8
|
startQueue() {
|
|
9
|
-
if (this.
|
|
9
|
+
if (this.timer)
|
|
10
10
|
return;
|
|
11
|
-
this.
|
|
11
|
+
this.timer = setInterval(() => {
|
|
12
12
|
const task = this.queue.shift();
|
|
13
13
|
if (task)
|
|
14
14
|
task();
|
|
@@ -26,9 +26,9 @@ export class RateLimitedFetcher {
|
|
|
26
26
|
});
|
|
27
27
|
}
|
|
28
28
|
stop() {
|
|
29
|
-
if (this.
|
|
30
|
-
clearInterval(this.
|
|
31
|
-
this.
|
|
29
|
+
if (this.timer) {
|
|
30
|
+
clearInterval(this.timer);
|
|
31
|
+
this.timer = undefined;
|
|
32
32
|
}
|
|
33
33
|
}
|
|
34
34
|
async runAll(tasks) {
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"rate-limited-fetcher.js","sourceRoot":"","sources":["../../../src/lib/rate-limited-fetcher/rate-limited-fetcher.ts"],"names":[],"mappings":"AAAA,MAAM,OAAO,kBAAkB;IACrB,KAAK,GAAmB,EAAE,CAAC;IAC3B,KAAK,CAAS,CAAC,wBAAwB;IACvC,
|
|
1
|
+
{"version":3,"file":"rate-limited-fetcher.js","sourceRoot":"","sources":["../../../src/lib/rate-limited-fetcher/rate-limited-fetcher.ts"],"names":[],"mappings":"AAAA,MAAM,OAAO,kBAAkB;IACrB,KAAK,GAAmB,EAAE,CAAC;IAC3B,KAAK,CAAS,CAAC,wBAAwB;IACvC,KAAK,CAAkC;IAE/C,YAAY,OAAe;QACzB,IAAI,CAAC,KAAK,GAAG,OAAO,CAAC;IACvB,CAAC;IAEO,UAAU;QAChB,IAAI,IAAI,CAAC,KAAK;YAAE,OAAO;QACvB,IAAI,CAAC,KAAK,GAAG,WAAW,CAAC,GAAG,EAAE;YAC5B,MAAM,IAAI,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,EAAE,CAAC;YAChC,IAAI,IAAI;gBAAE,IAAI,EAAE,CAAC;YACjB,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;gBAC5B,IAAI,CAAC,IAAI,EAAE,CAAC;YACd,CAAC;QACH,CAAC,EAAE,IAAI,CAAC,KAAK,CAAC,CAAC;IACjB,CAAC;IAEM,QAAQ,CAAI,IAAsB;QACvC,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;YACrC,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,EAAE;gBACnB,IAAI,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC;YACrC,CAAC,CAAC,CAAC;YACH,IAAI,CAAC,UAAU,EAAE,CAAC;QACpB,CAAC,CAAC,CAAC;IACL,CAAC;IAEM,IAAI;QACT,IAAI,IAAI,CAAC,KAAK,EAAE,CAAC;YACf,aAAa,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAC1B,IAAI,CAAC,KAAK,GAAG,SAAS,CAAC;QACzB,CAAC;IACH,CAAC;IAEM,KAAK,CAAC,MAAM,CAAI,KAA8B;QACnD,MAAM,QAAQ,GAAG,KAAK,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,IAAI,CAAC,CAAC,CAAC;QACxD,OAAO,OAAO,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;IAC/B,CAAC;CACF","sourcesContent":["export class RateLimitedFetcher {\n private queue: (() => void)[] = [];\n private delay: number; // delay in milliseconds\n private timer?: ReturnType<typeof setInterval>;\n\n constructor(delayMs: number) {\n this.delay = delayMs;\n }\n\n private startQueue() {\n if (this.timer) return;\n this.timer = setInterval(() => {\n const task = this.queue.shift();\n if (task) task();\n if (this.queue.length === 0) {\n this.stop();\n }\n }, this.delay);\n }\n\n public schedule<T>(task: () => Promise<T>): Promise<T> {\n return new Promise((resolve, reject) => {\n this.queue.push(() => {\n task().then(resolve).catch(reject);\n });\n this.startQueue(); \n });\n }\n\n public stop() {\n if (this.timer) {\n 
clearInterval(this.timer);\n this.timer = undefined;\n }\n }\n\n public async runAll<T>(tasks: Array<() => Promise<T>>): Promise<T[]> {\n const promises = tasks.map(task => this.schedule(task));\n return Promise.all(promises);\n }\n}\n"]}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluation.js","sourceRoot":"","sources":["../../src/types/evaluation.ts"],"names":[],"mappings":"AAKA,MAAM,CAAC,MAAM,wBAAwB,GAAG;IACpC,OAAO;IACP,UAAU;IACV,OAAO;CACD,CAAC","sourcesContent":["export interface EvaluationParameters {\n approach: EvaluationApproach;\n threshold?: number; \n}\n\nexport const EvaluationApproachValues = [\n 'exact',\n 'semantic',\n 'rouge'\n] as const;\n\nexport type EvaluationApproach = typeof EvaluationApproachValues[number];\n\nexport interface EvaluationApproachResult {\n score: number; // 0-1\n approachUsed: EvaluationApproach;\n}"]}
|
package/dist/components/index.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
export { L as LLMTestRunner, g as getAssetPath, r as render, s as setAssetPath, a as setNonce, b as setPlatformOptions } from './p-
|
|
1
|
+
export { L as LLMTestRunner, g as getAssetPath, r as render, s as setAssetPath, a as setNonce, b as setPlatformOptions } from './p-lpWX1sHl.js';
|
|
2
2
|
|
|
3
3
|
function appGlobalScript () {
|
|
4
4
|
window.env = {
|