npm - llm-testrunner-components - Versions diffs - 1.0.6 → 1.0.9 - Mend

llm-testrunner-components 1.0.6 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (266) hide show

package/dist/collection/lib/evaluation/evaluation-engine.js CHANGED Viewed

@@ -1,12 +1,45 @@
+import { performEvaluation } from "./evaluators/exact/exact";
+import { EvaluationApproach } from "./constants";
+import { performRouge1Evaluation } from "./evaluators/rouge1-evaluator";
+import { performSemanticEvaluation } from "./evaluators/semantic/index";
+import { performRougeLEvaluation } from "./evaluators/rougeL-evaluator";
+import { performBleuEvaluation } from "./evaluators/bleu/bleu-evaluator";
 export class LLMEvaluationEngine {
-    constructor() { }
     async evaluateResponse(request, callback) {
         try {
-            console.log('🔍 Starting evaluation for test case:', request.testCaseId);
-            const result = await this.performEvaluation(request);
-            console.log('Evaluation completed for test case:', request.testCaseId);
-            console.log('Result:', result);
-            callback(result);
+            const approach = request.evaluationParameters.approach;
+            switch (approach) {
+                case EvaluationApproach.BLEU: {
+                    const bleuResult = performBleuEvaluation(request);
+                    callback(bleuResult);
+                    break;
+                }
+                case EvaluationApproach.EXACT: {
+                    const exactResult = await performEvaluation(request);
+                    callback(exactResult);
+                    break;
+                }
+                case EvaluationApproach.ROUGE_1: {
+                    const rougeResult = await performRouge1Evaluation(request);
+                    callback(rougeResult);
+                    break;
+                }
+                case EvaluationApproach.ROUGE_L: {
+                    const rougeLResult = await performRougeLEvaluation(request);
+                    callback(rougeLResult);
+                    break;
+                }
+                case EvaluationApproach.SEMANTIC: {
+                    const semanticResult = await performSemanticEvaluation(request);
+                    callback(semanticResult);
+                    break;
+                }
+                default: {
+                    console.warn(`Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`);
+                    const fallbackResult = await performEvaluation(request);
+                    callback(fallbackResult);
+                }
+            }
         }
         catch (error) {
             console.error('Evaluation failed:', error);
@@ -14,48 +47,15 @@ export class LLMEvaluationEngine {
                 testCaseId: request.testCaseId,
                 passed: false,
                 keywordMatches: [],
-                sourceLinkMatches: [],
-                timestamp: new Date().toISOString()
+                timestamp: new Date().toISOString(),
+                evaluationParameters: request.evaluationParameters,
+                evaluationApproachResult: {
+                    score: 0,
+                    approachUsed: EvaluationApproach.EXACT,
+                },
             };
             callback(errorResult);
         }
     }
-    async performEvaluation(request) {
-        const { testCaseId, expectedKeywords, expectedSourceLinks, actualResponse } = request;
-        const keywordMatches = this.evaluateKeywords(expectedKeywords, actualResponse);
-        const sourceLinkMatches = this.evaluateSourceLinks(expectedSourceLinks, actualResponse);
-        // Test passes only if ALL expected keywords and source links are found
-        const totalItems = keywordMatches.length + sourceLinkMatches.length;
-        const foundItems = keywordMatches.filter(m => m.found).length + sourceLinkMatches.filter(m => m.found).length;
-        const passed = foundItems === totalItems;
-        return {
-            testCaseId,
-            passed,
-            keywordMatches,
-            sourceLinkMatches,
-            timestamp: new Date().toISOString()
-        };
-    }
-    evaluateKeywords(expectedKeywords, actualResponse) {
-        // Case-insensitive keyword matching
-        const response = actualResponse.toLowerCase();
-        return expectedKeywords.map(keyword => {
-            const keywordToMatch = keyword.toLowerCase();
-            const found = response.includes(keywordToMatch);
-            return {
-                keyword,
-                found
-            };
-        });
-    }
-    evaluateSourceLinks(expectedSourceLinks, actualResponse) {
-        return expectedSourceLinks.map(link => {
-            const found = actualResponse.includes(link);
-            return {
-                link,
-                found
-            };
-        });
-    }
 }
 //# sourceMappingURL=evaluation-engine.js.map

package/dist/collection/lib/evaluation/evaluation-engine.js.map CHANGED Viewed

	@@ -1 +1 @@
1	- {"version":3,"file":"evaluation-engine.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-engine.ts"],"names":[],"mappings":"~~AAQA~~,~~MAAM,~~OAAO,~~mBAAmB;IAC9B~~,~~gBAAgB~~,~~CAAC;IAEjB~~,~~KAAK~~,CAAC,~~gBAAgB~~,~~CACpB~~,~~OAA0B~~,~~EAC1B~~,~~QAA4B;QAE5B~~,~~IAAI~~,CAAC;~~YACH~~,OAAO,~~CAAC~~,~~GAAG~~,CAAC,~~uCAAuC~~,EAAE,~~OAAO~~,~~CAAC~~,~~UAAU~~,~~CAAC~~,CAAC;~~YAEzE~~,~~MAAM~~,~~MAAM~~,~~GAAG~~,MAAM,~~IAAI,~~CAAC,~~iBAAiB~~,~~CAAC~~,~~OAAO~~,~~CAAC~~,CAAC;~~YAErD~~,OAAO,~~CAAC~~,~~GAAG~~,CAAC,~~qCAAqC~~,~~EAAE~~,~~OAAO~~,~~CAAC~~,~~UAAU~~,~~CAAC~~,CAAC;~~YACvE~~,OAAO,CAAC,~~GAAG~~,CAAC,~~SAAS~~,~~EAAE~~,~~MAAM~~,~~CAAC~~,CAAC;~~YAE/B~~,~~QAAQ~~,CAAC,~~MAAM~~,CAAC,CAAC~~;QAEnB~~,CAAC;~~QAAC~~,~~OAAO~~,~~KAAK~~,~~EAAE~~,CAAC~~;YACf~~,OAAO,CAAC,~~KAAK,~~CAAC,~~oBAAoB~~,~~EAAE~~,~~KAAK~~,CAAC,CAAC;~~YAE3C~~,MAAM~~,WAAW,GAAqB~~;~~gBACpC~~,~~UAAU,EAAE,OAAO,~~CAAC~~,UAAU~~;~~gBAC9B~~,~~MAAM,EAAE,~~KAAK~~;gBACb~~,~~cAAc~~,~~EAAE~~,~~EAAE;gBAClB~~,~~iBAAiB~~,~~EAAE~~,~~EAAE~~;~~gBACrB~~,~~SAAS~~,~~EAAE~~,~~IAAI~~,~~IAAI~~,~~EAAE~~,CAAC,~~WAAW~~,~~EAAE;aACpC~~,CAAC;~~YAEF~~,QAAQ,CAAC,WAAW,CAAC,CAAC;~~QACxB~~,~~CAAC~~;~~IACH~~,CAAC;~~IAEO~~,KAAK,CAAC,~~iBAAiB~~,CAAC,~~OAA0B~~;~~QACxD~~,MAAM,~~EAAE~~,~~UAAU~~,~~EAAE~~,~~gBAAgB~~,~~EAAE~~,~~mBAAmB~~,~~EAAE~~,~~cAAc~~,~~EAAE~~,~~GAAG~~,~~OAAO~~,CAAC;~~QAEtF~~,MAAM,~~cAAc~~,~~GAAG~~,~~IAAI~~,CAAC,~~gBAAgB~~,CAAC,~~gBAAgB,EAAE,cAAc,~~CAAC,CAAC;~~QAC/E~~,MAAM,~~iBAAiB~~,GAAG,~~IAAI~~,CAAC,~~mBAAmB~~,CAAC,~~mBAAmB~~,~~EAAE~~,~~cAAc~~,CAAC,CAAC;~~QAExF~~,~~uEAAuE~~;~~QACvE~~,~~MAAM~~,~~UAAU~~,~~GAAG~~,~~cAAc,~~CAAC,~~MAAM~~,~~GAAG~~,~~iBAAiB,~~CAAC,~~MAAM,~~CAAC;~~QACpE~~,MAAM,~~UAAU~~,GAAG,~~cAAc,CAAC,~~MAAM,~~CAAC~~,CAAC,~~CAAC~~,~~EAAE,~~CAAC,CAAC,CAAC,~~KAAK~~,CAAC,CAAC,MAAM,~~GAAG,iBAAiB,~~CAAC,~~MAAM~~,CAAC,CAAC,CAAC,~~EAAE~~,CAAC,~~CAAC~~,~~CAAC~~,~~KAAK~~,CAAC,CAAC,~~MAAM~~,CAAC;~~QAC9G~~,MAAM,~~MAAM~~,GAAG,~~UAAU~~,~~KAAK~~,~~UAAU,~~CAAC~~;QAEzC~~,OAAO~~;YACL~~,~~UAAU;YACV~~,~~MAAM~~;~~YACN~~,
~~cAAc;YACd~~,~~iBAAiB;YACjB~~,~~SAAS~~,~~EAAE~~,~~IAAI,IAAI,EAAE,~~CAAC~~,WAAW,EAAE~~;~~SACpC~~,CAAC;~~IACJ~~,CAAC;~~IAEO~~,~~gBAAgB,~~CAAC~~,gBAA0B,EAAE,cAAsB~~;~~QACzE~~,~~oCAAoC;QACpC~~,~~MAAM~~,~~QAAQ,GAAG,cAAc,CAAC,WAAW,~~EAAE,CAAC;~~QAE9C~~,OAAO,~~gBAAgB,~~CAAC,~~GAAG~~,CAAC,~~OAAO~~,~~CAAC,~~EAAE~~;YACpC~~,~~MAAM~~,~~cAAc,GAAG,OAAO,~~CAAC,~~WAAW,EAAE,~~CAAC;~~YAC7C~~,MAAM,~~KAAK~~,~~GAAG~~,~~QAAQ~~,~~CAAC~~,~~QAAQ~~,CAAC,~~cAAc,CAAC,CAAC~~;~~YAEhD~~,~~OAAO;gBACL~~,~~OAAO;gBACP~~,KAAK;~~aACN~~,~~CAAC~~;~~QACJ~~,~~CAAC~~,~~CAAC~~,~~CAAC;IACL~~,~~CAAC;IAEO~~,~~mBAAmB~~,CAAC,~~mBAA6B~~,EAAE~~,cAAsB~~;~~QAC/E~~,~~OAAO~~,~~mBAAmB~~,~~CAAC~~,~~GAAG,~~CAAC,~~IAAI~~,~~CAAC~~,EAAE;~~YACpC~~,~~MAAM,~~KAAK,~~GAAG~~,~~cAAc,~~CAAC,~~QAAQ~~,~~CAAC~~,~~IAAI~~,CAAC,~~CAAC~~;~~YAE5C,OAAO~~;~~gBACL~~,~~IAAI~~;~~gBACJ~~,~~KAAK;aACN~~,CAAC~~;QACJ~~,CAAC,CAAC,CAAC;~~IACL~~,CAAC;~~CAIF~~","sourcesContent":["import {\n EvaluationRequest,\n EvaluationResult,\n ~~KeywordMatch~~,\n ~~SourceLinkMatch,\n EvaluationCallback\n~~} from './types';\n\nexport class LLMEvaluationEngine {\n ~~constructor() { }\n\n~~ async evaluateResponse(\n request: EvaluationRequest,\n callback: EvaluationCallback\n ): Promise<void> {\n try {\n ~~console.log('🔍~~ ~~Starting~~ ~~evaluation~~ ~~for test case:',~~ request.~~testCaseId)~~;\n\n ~~const~~ ~~result = await this.performEvaluation~~(~~request~~)~~;\n~~\n ~~console~~.~~log('Evaluation~~ ~~completed~~ ~~for~~ ~~test~~ ~~case:',~~ request~~.testCaseId~~);\n ~~console.log~~(~~'Result:', result~~);\n~~\n callback(result)~~;\n\n ~~} catch (error) {~~\n ~~console~~.~~error('Evaluation failed~~:', ~~error);\n~~\n const ~~errorResult:~~ ~~EvaluationResult~~ = ~~{\n testCaseId:~~ request~~.testCaseId,\n passed: false,\n keywordMatches: [],\n sourceLinkMatches: [],\n timestamp: new Date(~~)~~.toISOString()\n }~~;\n\n callback(~~errorResult~~);\n }\n }\n\n ~~private~~ ~~async performEvaluation(request~~: ~~EvaluationRequest): Promise<EvaluationResult>~~ {\n 
const { ~~testCaseId, expectedKeywords, expectedSourceLinks, actualResponse }~~ = ~~request;\n\n const~~ ~~keywordMatches = this.evaluateKeywords~~(~~expectedKeywords, actualResponse~~);\n ~~const sourceLinkMatches = this.evaluateSourceLinks~~(~~expectedSourceLinks, actualResponse~~);\n\n ~~// Test passes only if ALL expected keywords and source links are found~~\n ~~const~~ ~~totalItems = keywordMatches~~.~~length~~ ~~+ sourceLinkMatches.length;\~~n const ~~foundItems~~ = ~~keywordMatches.filter~~(~~m => m.found~~)~~.length + sourceLinkMatches.filter~~(~~m => m.found~~)~~.length~~;\n ~~const passed = foundItems === totalItems~~;\n\n ~~return~~ {\n ~~testCaseId,\n passed,\n keywordMatches,\n sourceLinkMatches,\n timestamp:~~ ~~new~~ ~~Date~~()~~.toISOString~~()\n };\n }\n\n ~~private~~ ~~evaluateKeywords~~(~~expectedKeywords:~~ ~~string[],~~ ~~actualResponse~~: ~~string):~~ ~~KeywordMatch[]~~ ~~{\n //~~ ~~Case-insensitive~~ ~~keyword~~ matching\n const ~~response~~ = ~~actualResponse.toLowerCase~~();\n\n ~~return~~ ~~expectedKeywords.map~~(~~keyword~~ => {\n ~~const~~ ~~keywordToMatch~~ ~~= keyword.toLowerCase(~~);\n const ~~found~~ = ~~response.includes(keywordToMatch);\n~~\n ~~return~~ {\n ~~keyword~~,\n ~~found\n };\n });\n }\n\n private evaluateSourceLinks(expectedSourceLinks~~: ~~string~~[]~~, actualResponse~~: ~~string~~): ~~SourceLinkMatch[]~~ {\n ~~return~~ ~~expectedSourceLinks.map(link => {\~~n ~~const~~ ~~found = actualResponse~~.~~includes(link);\~~n\n ~~return {\~~n ~~link,\n found~~\n };\n }~~);\~~n }\n~~\n\n\n~~}\n\n"]}
1	+ {"version":3,"file":"evaluation-engine.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-engine.ts"],"names":[],"mappings":"AAKA,OAAO,EAAE,iBAAiB,EAAE,MAAM,0BAA0B,CAAC;AAC7D,OAAO,EAAE,kBAAkB,EAAE,MAAM,aAAa,CAAC;AACjD,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,yBAAyB,EAAE,MAAM,6BAA6B,CAAC;AACxE,OAAO,EAAE,uBAAuB,EAAE,MAAM,+BAA+B,CAAC;AACxE,OAAO,EAAE,qBAAqB,EAAE,MAAM,kCAAkC,CAAC;AAEzE,MAAM,OAAO,mBAAmB;IAC9B,KAAK,CAAC,gBAAgB,CACpB,OAA0B,EAC1B,QAA4B;QAE5B,IAAI,CAAC;YACH,MAAM,QAAQ,GACZ,OAAO,CAAC,oBAAoB,CAAC,QAAQ,CAAC;YACxC,QAAQ,QAAQ,EAAE,CAAC;gBACjB,KAAK,kBAAkB,CAAC,IAAI,CAAC,CAAC,CAAC;oBAC7B,MAAM,UAAU,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;oBAClD,QAAQ,CAAC,UAAU,CAAC,CAAC;oBACrB,MAAM;gBACR,CAAC;gBAED,KAAK,kBAAkB,CAAC,KAAK,CAAC,CAAC,CAAC;oBAC9B,MAAM,WAAW,GAAG,MAAM,iBAAiB,CAAC,OAAO,CAAC,CAAC;oBACrD,QAAQ,CAAC,WAAW,CAAC,CAAC;oBACtB,MAAM;gBACR,CAAC;gBAED,KAAK,kBAAkB,CAAC,OAAO,CAAC,CAAC,CAAC;oBAChC,MAAM,WAAW,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;oBAC3D,QAAQ,CAAC,WAAW,CAAC,CAAC;oBACtB,MAAM;gBACR,CAAC;gBAED,KAAK,kBAAkB,CAAC,OAAO,CAAC,CAAC,CAAC;oBAChC,MAAM,YAAY,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;oBAC5D,QAAQ,CAAC,YAAY,CAAC,CAAC;oBACvB,MAAM;gBACR,CAAC;gBAED,KAAK,kBAAkB,CAAC,QAAQ,CAAC,CAAC,CAAC;oBACjC,MAAM,cAAc,GAAG,MAAM,yBAAyB,CAAC,OAAO,CAAC,CAAC;oBAChE,QAAQ,CAAC,cAAc,CAAC,CAAC;oBACzB,MAAM;gBACR,CAAC;gBAED,OAAO,CAAC,CAAC,CAAC;oBACR,OAAO,CAAC,IAAI,CACV,8BAA8B,OAAO,CAAC,oBAAoB,CAAC,QAAQ,kCAAkC,CACtG,CAAC;oBACF,MAAM,cAAc,GAAG,MAAM,iBAAiB,CAAC,OAAO,CAAC,CAAC;oBACxD,QAAQ,CAAC,cAAc,CAAC,CAAC;gBAC3B,CAAC;YACH,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,oBAAoB,EAAE,KAAK,CAAC,CAAC;YAE3C,MAAM,WAAW,GAAqB;gBACpC,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE,EAAE;gBAClB,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;gBACnC,oBAAoB,EAAE,OAAO,CAAC,oBAAoB;gBAClD,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,kBAAkB,CAAC,KAAK;iBACvC;aACF,CAAC;YAEF,QAAQ,CAAC,WAAW,CAAC,CAAC;QACxB,CAAC;IACH,CAAC;CACF","sourcesContent":["import {\n 
EvaluationRequest,\n EvaluationResult,\n EvaluationCallback,\n} from './types';\nimport { performEvaluation } from './evaluators/exact/exact';\nimport { EvaluationApproach } from './constants';\nimport { performRouge1Evaluation } from './evaluators/rouge1-evaluator';\nimport { performSemanticEvaluation } from './evaluators/semantic/index';\nimport { performRougeLEvaluation } from './evaluators/rougeL-evaluator';\nimport { performBleuEvaluation } from './evaluators/bleu/bleu-evaluator';\n\nexport class LLMEvaluationEngine {\n async evaluateResponse(\n request: EvaluationRequest,\n callback: EvaluationCallback,\n ): Promise<void> {\n try {\n const approach: EvaluationApproach =\n request.evaluationParameters.approach;\n switch (approach) {\n case EvaluationApproach.BLEU: {\n const bleuResult = performBleuEvaluation(request);\n callback(bleuResult);\n break;\n }\n\n case EvaluationApproach.EXACT: {\n const exactResult = await performEvaluation(request);\n callback(exactResult);\n break;\n }\n\n case EvaluationApproach.ROUGE_1: {\n const rougeResult = await performRouge1Evaluation(request);\n callback(rougeResult);\n break;\n }\n\n case EvaluationApproach.ROUGE_L: {\n const rougeLResult = await performRougeLEvaluation(request);\n callback(rougeLResult);\n break;\n }\n\n case EvaluationApproach.SEMANTIC: {\n const semanticResult = await performSemanticEvaluation(request);\n callback(semanticResult);\n break;\n }\n\n default: {\n console.warn(\n `Unknown matching approach: ${request.evaluationParameters.approach}, falling back to exact matching`,\n );\n const fallbackResult = await performEvaluation(request);\n callback(fallbackResult);\n }\n }\n } catch (error) {\n console.error('Evaluation failed:', error);\n\n const errorResult: EvaluationResult = {\n testCaseId: request.testCaseId,\n passed: false,\n keywordMatches: [],\n timestamp: new Date().toISOString(),\n evaluationParameters: request.evaluationParameters,\n evaluationApproachResult: {\n score: 0,\n 
approachUsed: EvaluationApproach.EXACT,\n },\n };\n\n callback(errorResult);\n }\n }\n}\n"]}

package/dist/collection/lib/evaluation/evaluation-service.js ADDED Viewed

@@ -0,0 +1,33 @@
+import { LLMEvaluationEngine } from "./evaluation-engine";
+/**
+ * Service for evaluating test case responses
+ */
+export class EvaluationService {
+    engine;
+    constructor() {
+        this.engine = new LLMEvaluationEngine();
+    }
+    /**
+     * Evaluates a test case response; returns without invoking onResult when testCase.output is empty or missing
+     * @param testCase - The test case to evaluate
+     * @param onResult - Callback to handle the evaluation result
+     */
+    async evaluateTestCase(testCase, onResult) {
+        if (!testCase.output) {
+            console.warn('⚠️ No output to evaluate for test case:', testCase.id);
+            return;
+        }
+        const evaluationRequest = {
+            testCaseId: testCase.id,
+            question: testCase.question,
+            expectedOutcome: testCase.expectedOutcome,
+            actualResponse: testCase.output,
+            evaluationParameters: testCase.evaluationParameters,
+        };
+        await this.engine.evaluateResponse(evaluationRequest, (result) => {
+            console.log('📊 Evaluation result received:', result);
+            onResult(result);
+        });
+    }
+}
+//# sourceMappingURL=evaluation-service.js.map

package/dist/collection/lib/evaluation/evaluation-service.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"evaluation-service.js","sourceRoot":"","sources":["../../../src/lib/evaluation/evaluation-service.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAI1D;;GAEG;AACH,MAAM,OAAO,iBAAiB;IACpB,MAAM,CAAsB;IAEpC;QACE,IAAI,CAAC,MAAM,GAAG,IAAI,mBAAmB,EAAE,CAAC;IAC1C,CAAC;IAED;;;;OAIG;IACH,KAAK,CAAC,gBAAgB,CACpB,QAAkB,EAClB,QAA4C;QAE5C,IAAI,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC;YACrB,OAAO,CAAC,IAAI,CAAC,yCAAyC,EAAE,QAAQ,CAAC,EAAE,CAAC,CAAC;YACrE,OAAO;QACT,CAAC;QAED,MAAM,iBAAiB,GAAsB;YAC3C,UAAU,EAAE,QAAQ,CAAC,EAAE;YACvB,QAAQ,EAAE,QAAQ,CAAC,QAAQ;YAC3B,eAAe,EAAE,QAAQ,CAAC,eAAe;YACzC,cAAc,EAAE,QAAQ,CAAC,MAAM;YAC/B,oBAAoB,EAAE,QAAQ,CAAC,oBAAoB;SACpD,CAAC;QAEF,MAAM,IAAI,CAAC,MAAM,CAAC,gBAAgB,CAChC,iBAAiB,EACjB,CAAC,MAAwB,EAAE,EAAE;YAC3B,OAAO,CAAC,GAAG,CAAC,gCAAgC,EAAE,MAAM,CAAC,CAAC;YACtD,QAAQ,CAAC,MAAM,CAAC,CAAC;QACnB,CAAC,CACF,CAAC;IACJ,CAAC;CACF","sourcesContent":["import { LLMEvaluationEngine } from './evaluation-engine';\nimport { EvaluationRequest, EvaluationResult } from './types';\nimport { TestCase } from '../../types/llm-test-runner';\n\n/**\n * Service for evaluating test case responses\n */\nexport class EvaluationService {\n private engine: LLMEvaluationEngine;\n\n constructor() {\n this.engine = new LLMEvaluationEngine();\n }\n\n /**\n * Evaluates a test case response\n * @param testCase - The test case to evaluate\n * @param onResult - Callback to handle the evaluation result\n */\n async evaluateTestCase(\n testCase: TestCase,\n onResult: (result: EvaluationResult) => void,\n ): Promise<void> {\n if (!testCase.output) {\n console.warn('⚠️ No output to evaluate for test case:', testCase.id);\n return;\n }\n\n const evaluationRequest: EvaluationRequest = {\n testCaseId: testCase.id,\n question: testCase.question,\n expectedOutcome: testCase.expectedOutcome,\n actualResponse: testCase.output,\n evaluationParameters: testCase.evaluationParameters,\n };\n\n await this.engine.evaluateResponse(\n evaluationRequest,\n (result: 
EvaluationResult) => {\n console.log('📊 Evaluation result received:', result);\n onResult(result);\n },\n );\n }\n}\n"]}

package/dist/collection/lib/evaluation/evaluators/bleu/bleu-evaluator.js ADDED Viewed

@@ -0,0 +1,116 @@
+import { bleu } from "bleu-score";
+import { DEFAULT_BLEU_PASS_SCORE, EvaluationApproach } from "../../constants";
+/**
+ * Normalizes text by converting to lowercase and normalizing whitespace.
+ * Also replaces punctuation (. , ! ? ; : and parentheses) with spaces so it cannot interfere with n-gram matching.
+ *
+ * @param {string} text - The text to normalize
+ * @returns {string} The normalized text
+ */
+function normalizeText(text) {
+    return text
+        .trim()
+        .toLowerCase()
+        .replace(/[.,!?;:()]/g, ' ') // Replace punctuation with spaces
+        .replace(/\s+/g, ' '); // Replace multiple whitespace with single space
+}
+/**
+ * Evaluates a single keyword against the candidate text using BLEU score.
+ *
+ * @param {string} keyword - The expected keyword (reference text)
+ * @param {string} candidate - The actual response text (candidate text)
+ * @param {number} bleuThreshold - The minimum BLEU score required to pass
+ * @returns {KeywordMatch} The evaluation result for this keyword
+ */
+function evaluateKeyword(keyword, candidate, bleuThreshold) {
+    let bleuScore = 0;
+    try {
+        const normalizedKeyword = normalizeText(keyword);
+        const normalizedCandidate = normalizeText(candidate);
+        if (normalizedKeyword.length > 0 && normalizedCandidate.length > 0) {
+            // BLEU function signature: bleu(reference, candidate, maxN)
+            // reference: the expected keyword (ground truth)
+            // candidate: the actual response text
+            // maxN: maximum n-gram order (typically 4 for standard BLEU)
+            // Adjust maxN based on keyword length - it should not exceed the number of words in the keyword
+            const keywordTokens = normalizedKeyword.split(/\s+/).length;
+            const maxN = Math.min(4, Math.max(1, keywordTokens)); // Use up to 4-grams, but respect keyword length
+            const bleuResult = bleu(normalizedKeyword, normalizedCandidate, maxN);
+            bleuScore = isNaN(bleuResult) ? 0 : bleuResult;
+        }
+        else {
+            console.warn(`BLEU not computed for keyword "${keyword}": Keyword or Candidate is missing.`);
+        }
+    }
+    catch (err) {
+        console.error(`BLEU computation failed for keyword "${keyword}":`, err);
+    }
+    const keywordPassed = bleuScore >= bleuThreshold;
+    const keywordApproachResult = {
+        score: bleuScore,
+        approachUsed: EvaluationApproach.BLEU,
+    };
+    return {
+        keyword: keyword,
+        found: keywordPassed,
+        evaluationApproachResult: keywordApproachResult,
+    };
+}
+/**
+ * Computes the BLEU score for keywords against the candidate text.
+ *
+ * BLEU measures the precision of n-grams (typically 1-4 grams) between the candidate
+ * and reference text. A score of 1.0 indicates perfect match.
+ *
+ * @example
+ * const match = performBleuEvaluation({
+ *   testCaseId: 'test-1',
+ *   question: 'What is the capital?',
+ *   expectedOutcome: 'Paris',
+ *   actualResponse: 'The capital is Paris.',
+ *   evaluationParameters: { approach: 'bleu', threshold: 0.7 }
+ * });
+ * // Returns evaluation result with BLEU scores for each keyword
+ */
+export function performBleuEvaluation(request) {
+    const { testCaseId, actualResponse, expectedOutcome, evaluationParameters } = request;
+    // Split expectedOutcome by newlines, commas, and periods to create keywords array
+    let expectedKeywords = expectedOutcome
+        ? expectedOutcome
+            .split(/[\n,.]+/)
+            .map(k => k.trim())
+            .filter(k => k.length > 0)
+        : [];
+    // If no keywords after filtering (e.g., whitespace-only input), treat the original input as a single keyword
+    if (expectedKeywords.length === 0 && expectedOutcome) {
+        expectedKeywords = [expectedOutcome];
+    }
+    const candidate = (actualResponse || '').trim();
+    const bleuThreshold = evaluationParameters.threshold ?? DEFAULT_BLEU_PASS_SCORE;
+    let keywordsPassed = 0;
+    const totalKeywords = expectedKeywords.length;
+    const keywordMatches = expectedKeywords.map(keyword => {
+        const match = evaluateKeyword(keyword, candidate, bleuThreshold);
+        if (match.found) {
+            keywordsPassed++;
+        }
+        return match;
+    });
+    const overallPassed = keywordsPassed === totalKeywords;
+    const overallApproachResult = {
+        score: totalKeywords > 0 ? keywordsPassed / totalKeywords : 1,
+        approachUsed: EvaluationApproach.BLEU,
+    };
+    return {
+        testCaseId: testCaseId,
+        passed: overallPassed,
+        keywordMatches: keywordMatches,
+        timestamp: new Date().toISOString(),
+        evaluationParameters: {
+            ...evaluationParameters,
+            threshold: bleuThreshold,
+        },
+        evaluationApproachResult: overallApproachResult,
+    };
+}
+//# sourceMappingURL=bleu-evaluator.js.map

package/dist/collection/lib/evaluation/evaluators/bleu/bleu-evaluator.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"bleu-evaluator.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/bleu/bleu-evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,MAAM,YAAY,CAAC;AAGlC,OAAO,EAAE,uBAAuB,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AAE9E;;;;;;GAMG;AACH,SAAS,aAAa,CAAC,IAAY;IACjC,OAAO,IAAI;SACR,IAAI,EAAE;SACN,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC,CAAC,kCAAkC;SAC9D,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC,CAAC,gDAAgD;AAC3E,CAAC;AAED;;;;;;;GAOG;AACH,SAAS,eAAe,CACtB,OAAe,EACf,SAAiB,EACjB,aAAqB;IAErB,IAAI,SAAS,GAAG,CAAC,CAAC;IAElB,IAAI,CAAC;QACH,MAAM,iBAAiB,GAAG,aAAa,CAAC,OAAO,CAAC,CAAC;QACjD,MAAM,mBAAmB,GAAG,aAAa,CAAC,SAAS,CAAC,CAAC;QAErD,IAAI,iBAAiB,CAAC,MAAM,GAAG,CAAC,IAAI,mBAAmB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACnE,4DAA4D;YAC5D,iDAAiD;YACjD,sCAAsC;YACtC,6DAA6D;YAC7D,gGAAgG;YAChG,MAAM,aAAa,GAAG,iBAAiB,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;YAC5D,MAAM,IAAI,GAAG,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,aAAa,CAAC,CAAC,CAAC,CAAC,gDAAgD;YACtG,MAAM,UAAU,GAAG,IAAI,CAAC,iBAAiB,EAAE,mBAAmB,EAAE,IAAI,CAAC,CAAC;YACtE,SAAS,GAAG,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC;QACjD,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CACV,kCAAkC,OAAO,qCAAqC,CAC/E,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CAAC,wCAAwC,OAAO,IAAI,EAAE,GAAG,CAAC,CAAC;IAC1E,CAAC;IAED,MAAM,aAAa,GAAG,SAAS,IAAI,aAAa,CAAC;IAEjD,MAAM,qBAAqB,GAA6B;QACtD,KAAK,EAAE,SAAS;QAChB,YAAY,EAAE,kBAAkB,CAAC,IAAI;KACtC,CAAC;IAEF,OAAO;QACL,OAAO,EAAE,OAAO;QAChB,KAAK,EAAE,aAAa;QACpB,wBAAwB,EAAE,qBAAqB;KAChD,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;;;GAeG;AAEH,MAAM,UAAU,qBAAqB,CACnC,OAA0B;IAE1B,MAAM,EAAE,UAAU,EAAE,cAAc,EAAE,eAAe,EAAE,oBAAoB,EAAE,GACzE,OAAO,CAAC;IAEV,kFAAkF;IAClF,IAAI,gBAAgB,GAAG,eAAe;QACpC,CAAC,CAAC,eAAe;aACZ,KAAK,CAAC,SAAS,CAAC;aAChB,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QAC9B,CAAC,CAAC,EAAE,CAAC;IAEP,6GAA6G;IAC7G,IAAI,gBAAgB,CAAC,MAAM,KAAK,CAAC,IAAI,eAAe,EAAE,CA
AC;QACrD,gBAAgB,GAAG,CAAC,eAAe,CAAC,CAAC;IACvC,CAAC;IAED,MAAM,SAAS,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAChD,MAAM,aAAa,GACjB,oBAAoB,CAAC,SAAS,IAAI,uBAAuB,CAAC;IAE5D,IAAI,cAAc,GAAG,CAAC,CAAC;IACvB,MAAM,aAAa,GAAG,gBAAgB,CAAC,MAAM,CAAC;IAE9C,MAAM,cAAc,GAAmB,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE;QACpE,MAAM,KAAK,GAAG,eAAe,CAAC,OAAO,EAAE,SAAS,EAAE,aAAa,CAAC,CAAC;QAEjE,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,cAAc,EAAE,CAAC;QACnB,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC,CAAC,CAAC;IAEH,MAAM,aAAa,GAAG,cAAc,KAAK,aAAa,CAAC;IAEvD,MAAM,qBAAqB,GAA6B;QACtD,KAAK,EAAE,aAAa,GAAG,CAAC,CAAC,CAAC,CAAC,cAAc,GAAG,aAAa,CAAC,CAAC,CAAC,CAAC;QAC7D,YAAY,EAAE,kBAAkB,CAAC,IAAI;KACtC,CAAC;IAEF,OAAO;QACL,UAAU,EAAE,UAAU;QACtB,MAAM,EAAE,aAAa;QACrB,cAAc,EAAE,cAAc;QAC9B,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,oBAAoB,EAAE;YACpB,GAAG,oBAAoB;YACvB,SAAS,EAAE,aAAa;SACzB;QACD,wBAAwB,EAAE,qBAAqB;KAChD,CAAC;AACJ,CAAC","sourcesContent":["import { bleu } from 'bleu-score';\nimport { EvaluationApproachResult } from '../../../../types/evaluation';\nimport { EvaluationRequest, EvaluationResult, KeywordMatch } from '../../types';\nimport { DEFAULT_BLEU_PASS_SCORE, EvaluationApproach } from '../../constants';\n\n/**\n * Normalizes text by converting to lowercase and normalizing whitespace.\n * Also removes punctuation that would interfere with n-gram matching.\n *\n * @param {string} text - The text to normalize\n * @returns {string} The normalized text\n */\nfunction normalizeText(text: string): string {\n return text\n .trim()\n .toLowerCase()\n .replace(/[.,!?;:()]/g, ' ') // Replace punctuation with spaces\n .replace(/\\s+/g, ' '); // Replace multiple whitespace with single space\n}\n\n/**\n * Evaluates a single keyword against the candidate text using BLEU score.\n *\n * @param {string} keyword - The expected keyword (reference text)\n * @param {string} candidate - The actual response text (candidate text)\n * @param {number} bleuThreshold - The minimum BLEU score required to pass\n * @returns 
{KeywordMatch} The evaluation result for this keyword\n */\nfunction evaluateKeyword(\n keyword: string,\n candidate: string,\n bleuThreshold: number,\n): KeywordMatch {\n let bleuScore = 0;\n\n try {\n const normalizedKeyword = normalizeText(keyword);\n const normalizedCandidate = normalizeText(candidate);\n\n if (normalizedKeyword.length > 0 && normalizedCandidate.length > 0) {\n // BLEU function signature: bleu(reference, candidate, maxN)\n // reference: the expected keyword (ground truth)\n // candidate: the actual response text\n // maxN: maximum n-gram order (typically 4 for standard BLEU)\n // Adjust maxN based on keyword length - it should not exceed the number of words in the keyword\n const keywordTokens = normalizedKeyword.split(/\\s+/).length;\n const maxN = Math.min(4, Math.max(1, keywordTokens)); // Use up to 4-grams, but respect keyword length\n const bleuResult = bleu(normalizedKeyword, normalizedCandidate, maxN);\n bleuScore = isNaN(bleuResult) ? 0 : bleuResult;\n } else {\n console.warn(\n `BLEU not computed for keyword \"${keyword}\": Keyword or Candidate is missing.`,\n );\n }\n } catch (err) {\n console.error(`BLEU computation failed for keyword \"${keyword}\":`, err);\n }\n\n const keywordPassed = bleuScore >= bleuThreshold;\n\n const keywordApproachResult: EvaluationApproachResult = {\n score: bleuScore,\n approachUsed: EvaluationApproach.BLEU,\n };\n\n return {\n keyword: keyword,\n found: keywordPassed,\n evaluationApproachResult: keywordApproachResult,\n };\n}\n\n/**\n * Computes the BLEU score for keywords against the candidate text.\n *\n * BLEU measures the precision of n-grams (typically 1-4 grams) between the candidate\n * and reference text. 
A score of 1.0 indicates perfect match.\n *\n * @example\n * const match = performBleuEvaluation({\n * testCaseId: 'test-1',\n * question: 'What is the capital?',\n * expectedKeywords: ['Paris'],\n * actualResponse: 'The capital is Paris.',\n * evaluationParameters: { approach: 'bleu', threshold: 0.7 }\n * });\n * // Returns evaluation result with BLEU scores for each keyword\n */\n\nexport function performBleuEvaluation(\n request: EvaluationRequest,\n): EvaluationResult {\n const { testCaseId, actualResponse, expectedOutcome, evaluationParameters } =\n request;\n\n // Split expectedOutcome by newlines, commas, and periods to create keywords array\n let expectedKeywords = expectedOutcome\n ? expectedOutcome\n .split(/[\\n,.]+/)\n .map(k => k.trim())\n .filter(k => k.length > 0)\n : [];\n\n // If no keywords after filtering (e.g., whitespace-only input), treat the original input as a single keyword\n if (expectedKeywords.length === 0 && expectedOutcome) {\n expectedKeywords = [expectedOutcome];\n }\n\n const candidate = (actualResponse || '').trim();\n const bleuThreshold =\n evaluationParameters.threshold ?? DEFAULT_BLEU_PASS_SCORE;\n\n let keywordsPassed = 0;\n const totalKeywords = expectedKeywords.length;\n\n const keywordMatches: KeywordMatch[] = expectedKeywords.map(keyword => {\n const match = evaluateKeyword(keyword, candidate, bleuThreshold);\n\n if (match.found) {\n keywordsPassed++;\n }\n\n return match;\n });\n\n const overallPassed = keywordsPassed === totalKeywords;\n\n const overallApproachResult: EvaluationApproachResult = {\n score: totalKeywords > 0 ? keywordsPassed / totalKeywords : 1,\n approachUsed: EvaluationApproach.BLEU,\n };\n\n return {\n testCaseId: testCaseId,\n passed: overallPassed,\n keywordMatches: keywordMatches,\n timestamp: new Date().toISOString(),\n evaluationParameters: {\n ...evaluationParameters,\n threshold: bleuThreshold,\n },\n evaluationApproachResult: overallApproachResult,\n };\n}\n"]}