llm-testrunner-components 1.0.6 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +101 -100
- package/dist/cjs/app-chips_5.cjs.entry.js +158 -0
- package/dist/cjs/app-chips_5.cjs.entry.js.map +1 -0
- package/dist/cjs/app-globals-Chb-oJtg.js +34 -0
- package/dist/cjs/app-globals-Chb-oJtg.js.map +1 -0
- package/dist/cjs/index-By1scwl6.js +25542 -0
- package/dist/cjs/index-By1scwl6.js.map +1 -0
- package/dist/cjs/index-CgmLNwZO.js +21460 -0
- package/dist/cjs/index-CgmLNwZO.js.map +1 -0
- package/dist/cjs/index.cjs.js +5 -483
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/cjs/llm-testrunner.cjs.js +6 -4
- package/dist/cjs/llm-testrunner.cjs.js.map +1 -1
- package/dist/cjs/loader.cjs.js +5 -3
- package/dist/collection/collection-manifest.json +8 -3
- package/dist/collection/components/error-message/error-message.css +34 -0
- package/dist/collection/components/error-message/error-message.js +2 -2
- package/dist/collection/components/error-message/error-message.js.map +1 -1
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.css +60 -0
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js +18 -0
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js.map +1 -0
- package/dist/collection/components/llm-test-runner/llm-test-runner.css +17 -657
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +253 -0
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -0
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +191 -200
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/actions/row-actions.css +28 -0
- package/dist/collection/components/llm-test-runner/test-cases/actions/row-actions.js +6 -0
- package/dist/collection/components/llm-test-runner/test-cases/actions/row-actions.js.map +1 -0
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +67 -0
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +5 -0
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +42 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +39 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.css +39 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +7 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -0
- package/dist/collection/components/llm-test-runner/test-cases/output/response-output.css +51 -0
- package/dist/collection/components/llm-test-runner/test-cases/output/response-output.js +5 -0
- package/dist/collection/components/llm-test-runner/test-cases/output/response-output.js.map +1 -0
- package/dist/collection/global/env.js +3 -1
- package/dist/collection/global/env.js.map +1 -1
- package/dist/collection/index.js.map +1 -1
- package/dist/collection/lib/evaluation/constants.js +14 -0
- package/dist/collection/lib/evaluation/constants.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluation-engine.js +45 -45
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +33 -0
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/bleu/bleu-evaluator.js +116 -0
- package/dist/collection/lib/evaluation/evaluators/bleu/bleu-evaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/bleu/tests/bleu.test.js +352 -0
- package/dist/collection/lib/evaluation/evaluators/bleu/tests/bleu.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/exact/exact.js +44 -0
- package/dist/collection/lib/evaluation/evaluators/exact/exact.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.js +88 -0
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.js +82 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.test.js +326 -0
- package/dist/collection/lib/evaluation/evaluators/rougeL-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +69 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/evaluate-keywords.js +56 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/evaluate-keywords.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/index.js +7 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/index.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/model-loader.js +19 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/model-loader.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/similarity-utils.js +16 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/similarity-utils.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.js +65 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/text-utils.js +5 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/text-utils.js.map +1 -0
- package/dist/collection/lib/evaluation/index.js.map +1 -1
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js +117 -0
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/file/file-download.js +18 -0
- package/dist/collection/lib/file/file-download.js.map +1 -0
- package/dist/collection/lib/file/file-reader.js +14 -0
- package/dist/collection/lib/file/file-reader.js.map +1 -0
- package/dist/collection/lib/form/components/app-chips.css +97 -0
- package/dist/collection/lib/form/components/app-chips.js +155 -0
- package/dist/collection/lib/form/components/app-chips.js.map +1 -0
- package/dist/collection/lib/form/components/app-select.css +28 -0
- package/dist/collection/lib/form/components/app-select.js +101 -0
- package/dist/collection/lib/form/components/app-select.js.map +1 -0
- package/dist/collection/lib/form/components/app-textarea.css +38 -0
- package/dist/collection/lib/form/components/app-textarea.js +126 -0
- package/dist/collection/lib/form/components/app-textarea.js.map +1 -0
- package/dist/collection/lib/form/form-builder.js +171 -0
- package/dist/collection/lib/form/form-builder.js.map +1 -0
- package/dist/collection/lib/form/schema/base-input-field-config.js +2 -0
- package/dist/collection/lib/form/schema/base-input-field-config.js.map +1 -0
- package/dist/collection/lib/form/schema/form-control-config.js +2 -0
- package/dist/collection/lib/form/schema/form-control-config.js.map +1 -0
- package/dist/collection/lib/form/schema/index.js +8 -0
- package/dist/collection/lib/form/schema/index.js.map +1 -0
- package/dist/collection/lib/import-export/test-results-csv.js +65 -0
- package/dist/collection/lib/import-export/test-results-csv.js.map +1 -0
- package/dist/collection/lib/import-export/test-suite-exporter.js +15 -0
- package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -0
- package/dist/collection/lib/import-export/test-suite-importer.js +44 -0
- package/dist/collection/lib/import-export/test-suite-importer.js.map +1 -0
- package/dist/collection/lib/rate-limited-fetcher/rate-limited-fetcher.js +6 -6
- package/dist/collection/lib/rate-limited-fetcher/rate-limited-fetcher.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +56 -0
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -0
- package/dist/collection/lib/test-cases/test-case-mutations.js +16 -0
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -0
- package/dist/collection/lib/ui/button/button.css +113 -0
- package/dist/collection/lib/ui/button/button.js +21 -0
- package/dist/collection/lib/ui/button/button.js.map +1 -0
- package/dist/collection/lib/ui/button/index.js +2 -0
- package/dist/collection/lib/ui/button/index.js.map +1 -0
- package/dist/collection/lib/ui/icon-button/icon-button.css +77 -0
- package/dist/collection/lib/ui/icon-button/icon-button.js +19 -0
- package/dist/collection/lib/ui/icon-button/icon-button.js.map +1 -0
- package/dist/collection/lib/ui/icon-button/index.js +2 -0
- package/dist/collection/lib/ui/icon-button/index.js.map +1 -0
- package/dist/collection/services/adapters.js +2 -0
- package/dist/collection/services/adapters.js.map +1 -0
- package/dist/collection/services/models/gemini.js +17 -0
- package/dist/collection/services/models/gemini.js.map +1 -0
- package/dist/collection/styles/tokens.css +180 -0
- package/dist/collection/types/evaluation.js +2 -0
- package/dist/collection/types/evaluation.js.map +1 -0
- package/dist/collection/types/llm-test-runner.js +2 -0
- package/dist/collection/types/llm-test-runner.js.map +1 -0
- package/dist/components/app-chips.d.ts +11 -0
- package/dist/components/app-chips.js +2 -0
- package/dist/components/app-chips.js.map +1 -0
- package/dist/components/app-select.d.ts +11 -0
- package/dist/components/app-select.js +2 -0
- package/dist/components/app-select.js.map +1 -0
- package/dist/components/app-textarea.d.ts +11 -0
- package/dist/components/app-textarea.js +2 -0
- package/dist/components/app-textarea.js.map +1 -0
- package/dist/components/form-builder.d.ts +11 -0
- package/dist/components/form-builder.js +2 -0
- package/dist/components/form-builder.js.map +1 -0
- package/dist/components/index.d.ts +2 -0
- package/dist/components/index.js +1 -13
- package/dist/components/index.js.map +1 -1
- package/dist/components/llm-test-runner.js +1 -8
- package/dist/components/llm-test-runner.js.map +1 -1
- package/dist/components/p--2rdv_J9.js +2 -0
- package/dist/components/p--2rdv_J9.js.map +1 -0
- package/dist/components/p-B7J48VNq.js +2 -0
- package/dist/components/p-B7J48VNq.js.map +1 -0
- package/dist/components/p-BCB1rjPS.js +7 -0
- package/dist/components/p-BCB1rjPS.js.map +1 -0
- package/dist/components/p-BQhb2H_a.js +2 -0
- package/dist/components/p-BQhb2H_a.js.map +1 -0
- package/dist/components/p-D9BrlHdP.js +297 -0
- package/dist/components/p-D9BrlHdP.js.map +1 -0
- package/dist/components/p-DtCkZ1g2.js +2 -0
- package/dist/components/p-DtCkZ1g2.js.map +1 -0
- package/dist/esm/app-chips_5.entry.js +153 -0
- package/dist/esm/app-chips_5.entry.js.map +1 -0
- package/dist/esm/app-globals-DbR5vV7d.js +32 -0
- package/dist/esm/app-globals-DbR5vV7d.js.map +1 -0
- package/dist/esm/index-Bvg6mh1M.js +25539 -0
- package/dist/esm/index-Bvg6mh1M.js.map +1 -0
- package/dist/esm/index-DxzhGhec.js +21450 -0
- package/dist/esm/index-DxzhGhec.js.map +1 -0
- package/dist/esm/index.js +4 -486
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/llm-testrunner.js +7 -5
- package/dist/esm/llm-testrunner.js.map +1 -1
- package/dist/esm/loader.js +6 -4
- package/dist/llm-testrunner/index.esm.js +1 -1
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js.map +1 -1
- package/dist/llm-testrunner/p-3f04b0fb.entry.js +2 -0
- package/dist/llm-testrunner/p-3f04b0fb.entry.js.map +1 -0
- package/dist/llm-testrunner/p-DFds8y01.js +7 -0
- package/dist/llm-testrunner/p-DFds8y01.js.map +1 -0
- package/dist/llm-testrunner/p-DxzhGhec.js +298 -0
- package/dist/llm-testrunner/p-DxzhGhec.js.map +1 -0
- package/dist/llm-testrunner/p-GQwFOmwJ.js +2 -0
- package/dist/llm-testrunner/p-GQwFOmwJ.js.map +1 -0
- package/dist/react/components.d.ts +32 -2
- package/dist/react/components.d.ts.map +1 -1
- package/dist/react/components.js +44 -2
- package/dist/types/components/llm-test-runner/header/llm-test-runner-header.d.ts +14 -0
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +13 -29
- package/dist/types/components/llm-test-runner/llm-test-runner.import-export.test.d.ts +1 -0
- package/dist/types/components/llm-test-runner/test-cases/actions/row-actions.d.ts +8 -0
- package/dist/types/components/llm-test-runner/test-cases/evaluation/evaluation-summary.d.ts +7 -0
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +25 -0
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +26 -0
- package/dist/types/components/llm-test-runner/test-cases/output/response-output.d.ts +6 -0
- package/dist/types/components.d.ts +199 -4
- package/dist/types/global/env.d.ts +2 -0
- package/dist/types/index.d.ts +1 -1
- package/dist/types/lib/evaluation/constants.d.ts +11 -0
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +0 -4
- package/dist/types/lib/evaluation/evaluation-service.d.ts +15 -0
- package/dist/types/lib/evaluation/evaluators/bleu/bleu-evaluator.d.ts +18 -0
- package/dist/types/lib/evaluation/evaluators/bleu/tests/bleu.test.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/exact/exact.d.ts +2 -0
- package/dist/types/lib/evaluation/evaluators/rouge1-evaluator.d.ts +17 -0
- package/dist/types/lib/evaluation/evaluators/rougeL-evaluator.d.ts +2 -0
- package/dist/types/lib/evaluation/evaluators/rougeL-evaluator.test.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/SemanticEvaluator.d.ts +6 -0
- package/dist/types/lib/evaluation/evaluators/semantic/evaluate-keywords.d.ts +7 -0
- package/dist/types/lib/evaluation/evaluators/semantic/index.d.ts +2 -0
- package/dist/types/lib/evaluation/evaluators/semantic/model-loader.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/similarity-utils.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/tests/evaluate-keywords.test.d.ts +1 -0
- package/dist/types/lib/evaluation/evaluators/semantic/text-utils.d.ts +1 -0
- package/dist/types/lib/evaluation/index.d.ts +2 -2
- package/dist/types/lib/evaluation/rouge1-evaluator.test.d.ts +1 -0
- package/dist/types/lib/evaluation/types.d.ts +19 -7
- package/dist/types/lib/file/file-download.d.ts +7 -0
- package/dist/types/lib/file/file-reader.d.ts +6 -0
- package/dist/types/lib/form/components/app-chips.d.ts +20 -0
- package/dist/types/lib/form/components/app-select.d.ts +7 -0
- package/dist/types/lib/form/components/app-textarea.d.ts +14 -0
- package/dist/types/lib/form/form-builder.d.ts +24 -0
- package/dist/types/lib/form/schema/base-input-field-config.d.ts +37 -0
- package/dist/types/lib/form/schema/form-control-config.d.ts +13 -0
- package/dist/types/lib/form/schema/index.d.ts +9 -0
- package/dist/types/lib/import-export/test-results-csv.d.ts +13 -0
- package/dist/types/lib/import-export/test-suite-exporter.d.ts +16 -0
- package/dist/types/lib/import-export/test-suite-importer.d.ts +12 -0
- package/dist/types/lib/rate-limited-fetcher/rate-limited-fetcher.d.ts +1 -1
- package/dist/types/lib/test-cases/test-case-factory.d.ts +12 -0
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +9 -0
- package/dist/types/lib/ui/button/button.d.ts +13 -0
- package/dist/types/lib/ui/button/index.d.ts +2 -0
- package/dist/types/lib/ui/icon-button/icon-button.d.ts +11 -0
- package/dist/types/lib/ui/icon-button/index.d.ts +2 -0
- package/dist/types/services/adapters.d.ts +3 -0
- package/dist/types/services/models/gemini.d.ts +11 -0
- package/dist/types/stencil-public-runtime.d.ts +110 -6
- package/dist/types/types/evaluation.d.ts +9 -0
- package/dist/types/types/llm-test-runner.d.ts +22 -0
- package/package.json +30 -6
- package/dist/cjs/app-globals-CbbEbofA.js +0 -14
- package/dist/cjs/app-globals-CbbEbofA.js.map +0 -1
- package/dist/cjs/index-D-FySkoV.js +0 -1470
- package/dist/cjs/index-D-FySkoV.js.map +0 -1
- package/dist/cjs/llm-test-runner.cjs.entry.js +0 -9
- package/dist/cjs/llm-test-runner.entry.cjs.js.map +0 -1
- package/dist/components/p-CYUbsbxt.js +0 -1770
- package/dist/components/p-CYUbsbxt.js.map +0 -1
- package/dist/esm/app-globals-BOQOUavG.js +0 -12
- package/dist/esm/app-globals-BOQOUavG.js.map +0 -1
- package/dist/esm/index-cncubhtM.js +0 -1463
- package/dist/esm/index-cncubhtM.js.map +0 -1
- package/dist/esm/llm-test-runner.entry.js +0 -3
- package/dist/esm/llm-test-runner.entry.js.map +0 -1
- package/dist/llm-testrunner/llm-test-runner.entry.esm.js.map +0 -1
- package/dist/llm-testrunner/loader.esm.js.map +0 -1
- package/dist/llm-testrunner/p-BOQOUavG.js +0 -2
- package/dist/llm-testrunner/p-BOQOUavG.js.map +0 -1
- package/dist/llm-testrunner/p-cncubhtM.js +0 -3
- package/dist/llm-testrunner/p-cncubhtM.js.map +0 -1
- package/dist/llm-testrunner/p-f68fd660.entry.js +0 -2
- package/dist/llm-testrunner/p-f68fd660.entry.js.map +0 -1
|
@@ -0,0 +1,352 @@
|
|
|
1
|
+
import { jest, describe, it, expect } from "@jest/globals";
|
|
2
|
+
import { performBleuEvaluation } from "../bleu-evaluator";
|
|
3
|
+
import { DEFAULT_BLEU_PASS_SCORE, EvaluationApproach, } from "../../../constants";
|
|
4
|
+
describe('performBleuEvaluation', () => {
|
|
5
|
+
// Helper function to create a base request with optional overrides
|
|
6
|
+
const createRequest = (overrides = {}) => {
|
|
7
|
+
const defaults = {
|
|
8
|
+
testCaseId: 'test-001',
|
|
9
|
+
question: 'Test question',
|
|
10
|
+
expectedOutcome: 'keyword',
|
|
11
|
+
actualResponse: 'response with keyword',
|
|
12
|
+
evaluationParameters: {
|
|
13
|
+
approach: EvaluationApproach.BLEU,
|
|
14
|
+
threshold: DEFAULT_BLEU_PASS_SCORE,
|
|
15
|
+
},
|
|
16
|
+
};
|
|
17
|
+
return {
|
|
18
|
+
...defaults,
|
|
19
|
+
...overrides,
|
|
20
|
+
evaluationParameters: {
|
|
21
|
+
...defaults.evaluationParameters,
|
|
22
|
+
...overrides.evaluationParameters,
|
|
23
|
+
},
|
|
24
|
+
};
|
|
25
|
+
};
|
|
26
|
+
describe('basic functionality', () => {
|
|
27
|
+
it('should return a valid EvaluationResult structure', async () => {
|
|
28
|
+
const request = createRequest({
|
|
29
|
+
actualResponse: 'AI stands for artificial intelligence',
|
|
30
|
+
expectedOutcome: 'artificial intelligence',
|
|
31
|
+
});
|
|
32
|
+
const result = performBleuEvaluation(request);
|
|
33
|
+
expect(result).toMatchObject({
|
|
34
|
+
testCaseId: 'test-001',
|
|
35
|
+
passed: expect.any(Boolean),
|
|
36
|
+
keywordMatches: expect.any(Array),
|
|
37
|
+
timestamp: expect.any(String),
|
|
38
|
+
evaluationParameters: expect.any(Object),
|
|
39
|
+
evaluationApproachResult: expect.any(Object),
|
|
40
|
+
});
|
|
41
|
+
});
|
|
42
|
+
it('should use default threshold when not provided', async () => {
|
|
43
|
+
const request = createRequest({
|
|
44
|
+
expectedOutcome: 'test evaluation system works',
|
|
45
|
+
actualResponse: 'test evaluation system works well',
|
|
46
|
+
evaluationParameters: { approach: EvaluationApproach.BLEU },
|
|
47
|
+
});
|
|
48
|
+
const result = performBleuEvaluation(request);
|
|
49
|
+
expect(result.evaluationParameters.threshold).toBe(DEFAULT_BLEU_PASS_SCORE);
|
|
50
|
+
});
|
|
51
|
+
it('should use provided threshold when specified', async () => {
|
|
52
|
+
const customThreshold = 0.85;
|
|
53
|
+
const request = createRequest({
|
|
54
|
+
actualResponse: 'response text with multiple words',
|
|
55
|
+
expectedOutcome: 'response text with multiple',
|
|
56
|
+
evaluationParameters: {
|
|
57
|
+
approach: EvaluationApproach.BLEU,
|
|
58
|
+
threshold: customThreshold,
|
|
59
|
+
},
|
|
60
|
+
});
|
|
61
|
+
const result = performBleuEvaluation(request);
|
|
62
|
+
expect(result.evaluationParameters.threshold).toBe(customThreshold);
|
|
63
|
+
});
|
|
64
|
+
});
|
|
65
|
+
describe('single keyword evaluation', () => {
|
|
66
|
+
it('should pass when keyword with 4+ words matches exactly', async () => {
|
|
67
|
+
const request = createRequest({
|
|
68
|
+
expectedOutcome: 'the machine learning algorithm works',
|
|
69
|
+
actualResponse: 'the machine learning algorithm works',
|
|
70
|
+
});
|
|
71
|
+
const result = performBleuEvaluation(request);
|
|
72
|
+
expect(result).toMatchObject({
|
|
73
|
+
passed: true,
|
|
74
|
+
keywordMatches: [
|
|
75
|
+
{
|
|
76
|
+
keyword: 'the machine learning algorithm works',
|
|
77
|
+
found: true,
|
|
78
|
+
evaluationApproachResult: {
|
|
79
|
+
score: expect.any(Number),
|
|
80
|
+
approachUsed: EvaluationApproach.BLEU,
|
|
81
|
+
},
|
|
82
|
+
},
|
|
83
|
+
],
|
|
84
|
+
});
|
|
85
|
+
expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThanOrEqual(0.7);
|
|
86
|
+
});
|
|
87
|
+
it('should pass when keyword with 4+ words is found with high n-gram overlap', async () => {
|
|
88
|
+
const request = createRequest({
|
|
89
|
+
expectedOutcome: 'the cat sat on',
|
|
90
|
+
actualResponse: 'the cat sat on the mat',
|
|
91
|
+
evaluationParameters: {
|
|
92
|
+
approach: EvaluationApproach.BLEU,
|
|
93
|
+
threshold: 0.5,
|
|
94
|
+
},
|
|
95
|
+
});
|
|
96
|
+
const result = performBleuEvaluation(request);
|
|
97
|
+
expect(result.keywordMatches[0]).toMatchObject({
|
|
98
|
+
found: true,
|
|
99
|
+
evaluationApproachResult: {
|
|
100
|
+
score: expect.any(Number),
|
|
101
|
+
approachUsed: EvaluationApproach.BLEU,
|
|
102
|
+
},
|
|
103
|
+
});
|
|
104
|
+
expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThan(0);
|
|
105
|
+
});
|
|
106
|
+
it('should fail when keyword has low n-gram overlap', async () => {
|
|
107
|
+
const request = createRequest({
|
|
108
|
+
expectedOutcome: 'quantum physics research continues',
|
|
109
|
+
actualResponse: 'This is about machine learning algorithms',
|
|
110
|
+
evaluationParameters: {
|
|
111
|
+
approach: EvaluationApproach.BLEU,
|
|
112
|
+
threshold: 0.7,
|
|
113
|
+
},
|
|
114
|
+
});
|
|
115
|
+
const result = performBleuEvaluation(request);
|
|
116
|
+
expect(result).toMatchObject({
|
|
117
|
+
passed: false,
|
|
118
|
+
keywordMatches: [
|
|
119
|
+
{
|
|
120
|
+
found: false,
|
|
121
|
+
evaluationApproachResult: {
|
|
122
|
+
score: expect.any(Number),
|
|
123
|
+
},
|
|
124
|
+
},
|
|
125
|
+
],
|
|
126
|
+
});
|
|
127
|
+
expect(result.keywordMatches[0].evaluationApproachResult.score).toBeLessThan(0.7);
|
|
128
|
+
});
|
|
129
|
+
});
|
|
130
|
+
describe('4-gram limitation', () => {
|
|
131
|
+
it('should fail when keyword has fewer than 4 words', async () => {
|
|
132
|
+
const request = createRequest({
|
|
133
|
+
expectedOutcome: 'machine learning',
|
|
134
|
+
actualResponse: 'machine learning is important',
|
|
135
|
+
evaluationParameters: {
|
|
136
|
+
approach: EvaluationApproach.BLEU,
|
|
137
|
+
threshold: 0.7,
|
|
138
|
+
},
|
|
139
|
+
});
|
|
140
|
+
const result = performBleuEvaluation(request);
|
|
141
|
+
// BLEU uses 4-gram matching, so keywords with fewer than 4 words will have very low scores
|
|
142
|
+
expect(result.passed).toBe(false);
|
|
143
|
+
expect(result.keywordMatches[0].evaluationApproachResult.score).toBeLessThan(0.7);
|
|
144
|
+
});
|
|
145
|
+
it('should work correctly with 4-word keywords', async () => {
|
|
146
|
+
const request = createRequest({
|
|
147
|
+
expectedOutcome: 'the cat sat on',
|
|
148
|
+
actualResponse: 'the cat sat on the mat',
|
|
149
|
+
evaluationParameters: {
|
|
150
|
+
approach: EvaluationApproach.BLEU,
|
|
151
|
+
threshold: 0.5,
|
|
152
|
+
},
|
|
153
|
+
});
|
|
154
|
+
const result = performBleuEvaluation(request);
|
|
155
|
+
// 4-word keywords can produce proper 4-gram BLEU scores
|
|
156
|
+
expect(result.keywordMatches[0].found).toBe(true);
|
|
157
|
+
expect(result.keywordMatches[0].evaluationApproachResult.score).toBeGreaterThan(0);
|
|
158
|
+
});
|
|
159
|
+
});
|
|
160
|
+
describe('multiple keywords evaluation', () => {
|
|
161
|
+
it('should pass when all keywords meet threshold', async () => {
|
|
162
|
+
const request = createRequest({
|
|
163
|
+
expectedOutcome: 'Machine learning algorithms work. Artificial intelligence systems are. Neural network models perform.',
|
|
164
|
+
actualResponse: 'Machine learning algorithms work. Artificial intelligence systems are. Neural network models perform.',
|
|
165
|
+
evaluationParameters: {
|
|
166
|
+
approach: EvaluationApproach.BLEU,
|
|
167
|
+
threshold: 0.15,
|
|
168
|
+
},
|
|
169
|
+
});
|
|
170
|
+
const result = performBleuEvaluation(request);
|
|
171
|
+
expect(result).toMatchObject({
|
|
172
|
+
passed: true,
|
|
173
|
+
evaluationApproachResult: {
|
|
174
|
+
score: 1,
|
|
175
|
+
approachUsed: EvaluationApproach.BLEU,
|
|
176
|
+
},
|
|
177
|
+
});
|
|
178
|
+
expect(result.keywordMatches).toHaveLength(3);
|
|
179
|
+
expect(result.keywordMatches.every(match => match.found)).toBe(true);
|
|
180
|
+
});
|
|
181
|
+
it('should fail when not all keywords meet threshold', async () => {
|
|
182
|
+
const request = createRequest({
|
|
183
|
+
expectedOutcome: 'machine learning algorithms work\nquantum computing research continues\nartificial intelligence systems',
|
|
184
|
+
actualResponse: 'Machine learning algorithms work. Artificial intelligence systems are growing.',
|
|
185
|
+
evaluationParameters: {
|
|
186
|
+
approach: EvaluationApproach.BLEU,
|
|
187
|
+
threshold: 0.7,
|
|
188
|
+
},
|
|
189
|
+
});
|
|
190
|
+
const result = performBleuEvaluation(request);
|
|
191
|
+
expect(result.passed).toBe(false);
|
|
192
|
+
expect(result.keywordMatches).toHaveLength(3);
|
|
193
|
+
const foundCount = result.keywordMatches.filter(match => match.found).length;
|
|
194
|
+
expect(foundCount).toBeLessThan(3);
|
|
195
|
+
expect(result.evaluationApproachResult.score).toBe(foundCount / 3);
|
|
196
|
+
});
|
|
197
|
+
it('should calculate overall score as ratio of passed keywords', async () => {
|
|
198
|
+
const request = createRequest({
|
|
199
|
+
expectedOutcome: 'alpha beta gamma delta\nepsilon zeta eta theta\niota kappa lambda mu\nnu xi omicron pi',
|
|
200
|
+
actualResponse: 'alpha beta gamma delta and epsilon zeta eta theta are here',
|
|
201
|
+
evaluationParameters: {
|
|
202
|
+
approach: EvaluationApproach.BLEU,
|
|
203
|
+
threshold: 0.3,
|
|
204
|
+
},
|
|
205
|
+
});
|
|
206
|
+
const result = performBleuEvaluation(request);
|
|
207
|
+
const foundCount = result.keywordMatches.filter(match => match.found).length;
|
|
208
|
+
expect(result).toMatchObject({
|
|
209
|
+
passed: false,
|
|
210
|
+
evaluationApproachResult: {
|
|
211
|
+
score: foundCount / 4,
|
|
212
|
+
},
|
|
213
|
+
});
|
|
214
|
+
});
|
|
215
|
+
});
|
|
216
|
+
describe('threshold handling', () => {
|
|
217
|
+
it('should pass all keywords with threshold 0.0', async () => {
|
|
218
|
+
const request = createRequest({
|
|
219
|
+
actualResponse: 'completely unrelated text about cooking',
|
|
220
|
+
expectedOutcome: 'quantum physics research continues\nmathematics and statistics are',
|
|
221
|
+
evaluationParameters: {
|
|
222
|
+
approach: EvaluationApproach.BLEU,
|
|
223
|
+
threshold: 0.0,
|
|
224
|
+
},
|
|
225
|
+
});
|
|
226
|
+
const result = performBleuEvaluation(request);
|
|
227
|
+
expect(result.passed).toBe(true);
|
|
228
|
+
expect(result.keywordMatches.every(m => m.found)).toBe(true);
|
|
229
|
+
expect(result.evaluationParameters.threshold).toBe(0.0);
|
|
230
|
+
});
|
|
231
|
+
it('should fail when threshold is 1.0 and match is not perfect', async () => {
|
|
232
|
+
const request = createRequest({
|
|
233
|
+
actualResponse: 'This is about machine learning concepts',
|
|
234
|
+
expectedOutcome: 'machine learning algorithms work well',
|
|
235
|
+
evaluationParameters: {
|
|
236
|
+
approach: EvaluationApproach.BLEU,
|
|
237
|
+
threshold: 1.0,
|
|
238
|
+
},
|
|
239
|
+
});
|
|
240
|
+
const result = performBleuEvaluation(request);
|
|
241
|
+
expect(result.evaluationParameters.threshold).toBe(1.0);
|
|
242
|
+
expect(result.keywordMatches[0].evaluationApproachResult.score).toBeLessThan(1.0);
|
|
243
|
+
expect(result.keywordMatches[0].found).toBe(false);
|
|
244
|
+
});
|
|
245
|
+
});
|
|
246
|
+
describe('edge cases', () => {
|
|
247
|
+
it('should handle empty keywords array', async () => {
|
|
248
|
+
const request = createRequest({
|
|
249
|
+
expectedOutcome: '',
|
|
250
|
+
actualResponse: 'Some response',
|
|
251
|
+
});
|
|
252
|
+
const result = performBleuEvaluation(request);
|
|
253
|
+
expect(result).toMatchObject({
|
|
254
|
+
passed: true,
|
|
255
|
+
keywordMatches: [],
|
|
256
|
+
evaluationApproachResult: {
|
|
257
|
+
score: 1,
|
|
258
|
+
},
|
|
259
|
+
});
|
|
260
|
+
});
|
|
261
|
+
it('should handle empty actual response', async () => {
|
|
262
|
+
const request = createRequest({
|
|
263
|
+
expectedOutcome: 'machine learning algorithms work',
|
|
264
|
+
actualResponse: '',
|
|
265
|
+
});
|
|
266
|
+
// Suppress expected warning
|
|
267
|
+
const consoleWarnSpy = jest
|
|
268
|
+
.spyOn(console, 'warn')
|
|
269
|
+
.mockImplementation(() => { });
|
|
270
|
+
const result = performBleuEvaluation(request);
|
|
271
|
+
expect(result).toMatchObject({
|
|
272
|
+
passed: false,
|
|
273
|
+
keywordMatches: [
|
|
274
|
+
{
|
|
275
|
+
found: false,
|
|
276
|
+
evaluationApproachResult: {
|
|
277
|
+
score: 0,
|
|
278
|
+
},
|
|
279
|
+
},
|
|
280
|
+
],
|
|
281
|
+
});
|
|
282
|
+
consoleWarnSpy.mockRestore();
|
|
283
|
+
});
|
|
284
|
+
it('should handle whitespace-only keyword', async () => {
|
|
285
|
+
const request = createRequest({
|
|
286
|
+
expectedOutcome: ' ',
|
|
287
|
+
actualResponse: 'Some response',
|
|
288
|
+
});
|
|
289
|
+
// Suppress expected warning
|
|
290
|
+
const consoleWarnSpy = jest
|
|
291
|
+
.spyOn(console, 'warn')
|
|
292
|
+
.mockImplementation(() => { });
|
|
293
|
+
const result = performBleuEvaluation(request);
|
|
294
|
+
expect(result.keywordMatches[0]).toMatchObject({
|
|
295
|
+
found: false,
|
|
296
|
+
evaluationApproachResult: {
|
|
297
|
+
score: 0,
|
|
298
|
+
},
|
|
299
|
+
});
|
|
300
|
+
consoleWarnSpy.mockRestore();
|
|
301
|
+
});
|
|
302
|
+
it('should handle null/undefined actualResponse gracefully', async () => {
|
|
303
|
+
const request = createRequest({
|
|
304
|
+
expectedOutcome: 'machine learning algorithms work',
|
|
305
|
+
actualResponse: null,
|
|
306
|
+
});
|
|
307
|
+
// Suppress expected warning
|
|
308
|
+
const consoleWarnSpy = jest
|
|
309
|
+
.spyOn(console, 'warn')
|
|
310
|
+
.mockImplementation(() => { });
|
|
311
|
+
const result = performBleuEvaluation(request);
|
|
312
|
+
expect(result).toMatchObject({
|
|
313
|
+
passed: false,
|
|
314
|
+
keywordMatches: [
|
|
315
|
+
{
|
|
316
|
+
found: false,
|
|
317
|
+
},
|
|
318
|
+
],
|
|
319
|
+
});
|
|
320
|
+
consoleWarnSpy.mockRestore();
|
|
321
|
+
});
|
|
322
|
+
});
|
|
323
|
+
describe('BLEU score calculation', () => {
    // A response covering only part of the expected text must score strictly between 0 and 1.
    it('should calculate BLEU score for partial match', async () => {
        const partialMatchRequest = createRequest({
            expectedOutcome: 'the cat sat on the mat',
            actualResponse: 'the cat sat on',
            evaluationParameters: {
                approach: EvaluationApproach.BLEU,
                threshold: 0.3,
            },
        });
        const { keywordMatches } = performBleuEvaluation(partialMatchRequest);
        const { score: partialScore } = keywordMatches[0].evaluationApproachResult;
        expect(partialScore).toBeGreaterThan(0);
        expect(partialScore).toBeLessThan(1.0);
    });
});
|
|
340
|
+
describe('timestamp', () => {
    it('should include a valid ISO timestamp', async () => {
        const evaluation = performBleuEvaluation(createRequest({
            expectedOutcome: 'test evaluation system works',
            actualResponse: 'test evaluation system works well',
        }));
        expect(evaluation.timestamp).toBeDefined();
        // Parsing the timestamp and serialising it again must reproduce the same string.
        const roundTripped = new Date(evaluation.timestamp).toISOString();
        expect(roundTripped).toBe(evaluation.timestamp);
    });
});
|
|
351
|
+
});
|
|
352
|
+
//# sourceMappingURL=bleu.test.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"bleu.test.js","sourceRoot":"","sources":["../../../../../../src/lib/evaluation/evaluators/bleu/tests/bleu.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAC3D,OAAO,EAAE,qBAAqB,EAAE,MAAM,mBAAmB,CAAC;AAE1D,OAAO,EACL,uBAAuB,EACvB,kBAAkB,GACnB,MAAM,oBAAoB,CAAC;AAE5B,QAAQ,CAAC,uBAAuB,EAAE,GAAG,EAAE;IACrC,mEAAmE;IACnE,MAAM,aAAa,GAAG,CACpB,YAAwC,EAAE,EACvB,EAAE;QACrB,MAAM,QAAQ,GAAsB;YAClC,UAAU,EAAE,UAAU;YACtB,QAAQ,EAAE,eAAe;YACzB,eAAe,EAAE,SAAS;YAC1B,cAAc,EAAE,uBAAuB;YACvC,oBAAoB,EAAE;gBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;gBACjC,SAAS,EAAE,uBAAuB;aACnC;SACF,CAAC;QAEF,OAAO;YACL,GAAG,QAAQ;YACX,GAAG,SAAS;YACZ,oBAAoB,EAAE;gBACpB,GAAG,QAAQ,CAAC,oBAAoB;gBAChC,GAAG,SAAS,CAAC,oBAAoB;aAClC;SACF,CAAC;IACJ,CAAC,CAAC;IAEF,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACnC,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;YAChE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,cAAc,EAAE,uCAAuC;gBACvD,eAAe,EAAE,yBAAyB;aAC3C,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,UAAU,EAAE,UAAU;gBACtB,MAAM,EAAE,MAAM,CAAC,GAAG,CAAC,OAAO,CAAC;gBAC3B,cAAc,EAAE,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC;gBACjC,SAAS,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;gBAC7B,oBAAoB,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;gBACxC,wBAAwB,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;aAC7C,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,8BAA8B;gBAC/C,cAAc,EAAE,mCAAmC;gBACnD,oBAAoB,EAAE,EAAE,QAAQ,EAAE,kBAAkB,CAAC,IAAI,EAAE;aAC5D,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAChD,uBAAuB,CACxB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;YAC5D,MAAM,eAAe,GAAG,IAAI,CAAC;YAC7B,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,cAAc,EAAE,mCAAmC;gBACnD,eAAe,EAAE,6BAA6B;gBAC9C,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,eAAe;iBAC3B;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,C
AAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QACtE,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,2BAA2B,EAAE,GAAG,EAAE;QACzC,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;YACtE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,sCAAsC;gBACvD,cAAc,EAAE,sCAAsC;aACvD,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,IAAI;gBACZ,cAAc,EAAE;oBACd;wBACE,OAAO,EAAE,sCAAsC;wBAC/C,KAAK,EAAE,IAAI;wBACX,wBAAwB,EAAE;4BACxB,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;4BACzB,YAAY,EAAE,kBAAkB,CAAC,IAAI;yBACtC;qBACF;iBACF;aACF,CAAC,CAAC;YACH,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC;QAChC,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,0EAA0E,EAAE,KAAK,IAAI,EAAE;YACxF,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,gBAAgB;gBACjC,cAAc,EAAE,wBAAwB;gBACxC,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;gBAC7C,KAAK,EAAE,IAAI;gBACX,wBAAwB,EAAE;oBACxB,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;oBACzB,YAAY,EAAE,kBAAkB,CAAC,IAAI;iBACtC;aACF,CAAC,CAAC;YACH,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QACvB,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;YAC/D,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,oCAAoC;gBACrD,cAAc,EAAE,2CAA2C;gBAC3D,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE;oBACd;wBACE,KAAK,EAAE,KAAK;wBACZ,wBAAwB,EAAE;4BACxB,KAAK,EAAE,MAAM,CAAC,GAAG,CAAC,MAAM,CAAC;yBAC1B;qBACF;iBACF;aACF,CAAC,CAAC;YACH,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,mBAAmB,EAAE,GAAG,EAAE;QACjC,EAAE,CAAC,iDAAiD,EAAE,KAAK,IAAI,EAAE;YA
C/D,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,kBAAkB;gBACnC,cAAc,EAAE,+BAA+B;gBAC/C,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,2FAA2F;YAC3F,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,gBAAgB;gBACjC,cAAc,EAAE,wBAAwB;gBACxC,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,wDAAwD;YACxD,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;QACvB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,8BAA8B,EAAE,GAAG,EAAE;QAC5C,EAAE,CAAC,8CAA8C,EAAE,KAAK,IAAI,EAAE;YAC5D,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EACb,uGAAuG;gBACzG,cAAc,EACZ,uGAAuG;gBACzG,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,IAAI;iBAChB;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,IAAI;gBACZ,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,kBAAkB,CAAC,IAAI;iBACtC;aACF,CAAC,CAAC;YACH,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YAC9C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACvE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,kDAAkD,EAAE,KAAK,IAAI,EAAE;YAChE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EACb,yGAAyG;gBAC3G,cAAc,EACZ,gFAAgF;gBAClF,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC
;YAC9C,MAAM,UAAU,GAAG,MAAM,CAAC,cAAc,CAAC,MAAM,CAC7C,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CACrB,CAAC,MAAM,CAAC;YACT,MAAM,CAAC,UAAU,CAAC,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC;YACnC,MAAM,CAAC,MAAM,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,UAAU,GAAG,CAAC,CAAC,CAAC;QACrE,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;YAC1E,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EACb,wFAAwF;gBAC1F,cAAc,EACZ,4DAA4D;gBAC9D,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,UAAU,GAAG,MAAM,CAAC,cAAc,CAAC,MAAM,CAC7C,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CACrB,CAAC,MAAM,CAAC;YACT,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,KAAK;gBACb,wBAAwB,EAAE;oBACxB,KAAK,EAAE,UAAU,GAAG,CAAC;iBACtB;aACF,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,cAAc,EAAE,yCAAyC;gBACzD,eAAe,EACb,oEAAoE;gBACtE,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7D,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;YAC1E,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,cAAc,EAAE,yCAAyC;gBACzD,eAAe,EAAE,uCAAuC;gBACxD,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACxD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;YACpB,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrD,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,
YAAY,EAAE,GAAG,EAAE;QAC1B,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,EAAE;gBACnB,cAAc,EAAE,eAAe;aAChC,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,IAAI;gBACZ,cAAc,EAAE,EAAE;gBAClB,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;iBACT;aACF,CAAC,CAAC;QACL,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,qCAAqC,EAAE,KAAK,IAAI,EAAE;YACnD,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,kCAAkC;gBACnD,cAAc,EAAE,EAAE;aACnB,CAAC,CAAC;YAEH,4BAA4B;YAC5B,MAAM,cAAc,GAAG,IAAI;iBACxB,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC;iBACtB,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAEhC,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE;oBACd;wBACE,KAAK,EAAE,KAAK;wBACZ,wBAAwB,EAAE;4BACxB,KAAK,EAAE,CAAC;yBACT;qBACF;iBACF;aACF,CAAC,CAAC;YAEH,cAAc,CAAC,WAAW,EAAE,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,uCAAuC,EAAE,KAAK,IAAI,EAAE;YACrD,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,KAAK;gBACtB,cAAc,EAAE,eAAe;aAChC,CAAC,CAAC;YAEH,4BAA4B;YAC5B,MAAM,cAAc,GAAG,IAAI;iBACxB,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC;iBACtB,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAEhC,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,CAAC,aAAa,CAAC;gBAC7C,KAAK,EAAE,KAAK;gBACZ,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;iBACT;aACF,CAAC,CAAC;YAEH,cAAc,CAAC,WAAW,EAAE,CAAC;QAC/B,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;YACtE,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,kCAAkC;gBACnD,cAAc,EAAE,IAAyB;aAC1C,CAAC,CAAC;YAEH,4BAA4B;YAC5B,MAAM,cAAc,GAAG,IAAI;iBACxB,KAAK,CAAC,OAAO,EAAE,MAAM,CAAC;iBACtB,kBAAkB,CAAC,GAAG,EAAE,GAAE,CAAC,CAAC,CAAC;YAEhC,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,CAAC,aAAa,CAAC;gBAC3B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE;oBACd;wBACE,KAAK,EAAE,KAAK;qBACb;iBACF;aACF,CAAC,CAAC;YAEH,cAAc,CAAC,WAAW,EAAE,CAAC;QAC/B,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,wBAAwB,EAAE,GAAG,EAAE;QACtC,EAAE,CAAC,+CAA+C,EAAE,KAAK,IAAI,EAAE
;YAC7D,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,wBAAwB;gBACzC,cAAc,EAAE,gBAAgB;gBAChC,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,IAAI;oBACjC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,gEAAgE;YAChE,MAAM,KAAK,GAAG,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC;YACtE,MAAM,CAAC,KAAK,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC;YACjC,MAAM,CAAC,KAAK,CAAC,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QAClC,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,WAAW,EAAE,GAAG,EAAE;QACzB,EAAE,CAAC,sCAAsC,EAAE,KAAK,IAAI,EAAE;YACpD,MAAM,OAAO,GAAG,aAAa,CAAC;gBAC5B,eAAe,EAAE,8BAA8B;gBAC/C,cAAc,EAAE,mCAAmC;aACpD,CAAC,CAAC;YAEH,MAAM,MAAM,GAAG,qBAAqB,CAAC,OAAO,CAAC,CAAC;YAE9C,MAAM,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,WAAW,EAAE,CAAC;YACvC,MAAM,CAAC,IAAI,IAAI,CAAC,MAAM,CAAC,SAAU,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,IAAI,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC;QAC3E,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC","sourcesContent":["import { jest, describe, it, expect } from '@jest/globals';\nimport { performBleuEvaluation } from '../bleu-evaluator';\nimport { EvaluationRequest } from '../../../types';\nimport {\n DEFAULT_BLEU_PASS_SCORE,\n EvaluationApproach,\n} from '../../../constants';\n\ndescribe('performBleuEvaluation', () => {\n // Helper function to create a base request with optional overrides\n const createRequest = (\n overrides: Partial<EvaluationRequest> = {},\n ): EvaluationRequest => {\n const defaults: EvaluationRequest = {\n testCaseId: 'test-001',\n question: 'Test question',\n expectedOutcome: 'keyword',\n actualResponse: 'response with keyword',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: DEFAULT_BLEU_PASS_SCORE,\n },\n };\n\n return {\n ...defaults,\n ...overrides,\n evaluationParameters: {\n ...defaults.evaluationParameters,\n ...overrides.evaluationParameters,\n },\n };\n };\n\n describe('basic functionality', () => {\n it('should return a valid EvaluationResult structure', async () => {\n const request = createRequest({\n 
actualResponse: 'AI stands for artificial intelligence',\n expectedOutcome: 'artificial intelligence',\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result).toMatchObject({\n testCaseId: 'test-001',\n passed: expect.any(Boolean),\n keywordMatches: expect.any(Array),\n timestamp: expect.any(String),\n evaluationParameters: expect.any(Object),\n evaluationApproachResult: expect.any(Object),\n });\n });\n\n it('should use default threshold when not provided', async () => {\n const request = createRequest({\n expectedOutcome: 'test evaluation system works',\n actualResponse: 'test evaluation system works well',\n evaluationParameters: { approach: EvaluationApproach.BLEU },\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result.evaluationParameters.threshold).toBe(\n DEFAULT_BLEU_PASS_SCORE,\n );\n });\n\n it('should use provided threshold when specified', async () => {\n const customThreshold = 0.85;\n const request = createRequest({\n actualResponse: 'response text with multiple words',\n expectedOutcome: 'response text with multiple',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: customThreshold,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result.evaluationParameters.threshold).toBe(customThreshold);\n });\n });\n\n describe('single keyword evaluation', () => {\n it('should pass when keyword with 4+ words matches exactly', async () => {\n const request = createRequest({\n expectedOutcome: 'the machine learning algorithm works',\n actualResponse: 'the machine learning algorithm works',\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result).toMatchObject({\n passed: true,\n keywordMatches: [\n {\n keyword: 'the machine learning algorithm works',\n found: true,\n evaluationApproachResult: {\n score: expect.any(Number),\n approachUsed: EvaluationApproach.BLEU,\n },\n },\n ],\n });\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n 
).toBeGreaterThanOrEqual(0.7);\n });\n\n it('should pass when keyword with 4+ words is found with high n-gram overlap', async () => {\n const request = createRequest({\n expectedOutcome: 'the cat sat on',\n actualResponse: 'the cat sat on the mat',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 0.5,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result.keywordMatches[0]).toMatchObject({\n found: true,\n evaluationApproachResult: {\n score: expect.any(Number),\n approachUsed: EvaluationApproach.BLEU,\n },\n });\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeGreaterThan(0);\n });\n\n it('should fail when keyword has low n-gram overlap', async () => {\n const request = createRequest({\n expectedOutcome: 'quantum physics research continues',\n actualResponse: 'This is about machine learning algorithms',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 0.7,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result).toMatchObject({\n passed: false,\n keywordMatches: [\n {\n found: false,\n evaluationApproachResult: {\n score: expect.any(Number),\n },\n },\n ],\n });\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeLessThan(0.7);\n });\n });\n\n describe('4-gram limitation', () => {\n it('should fail when keyword has fewer than 4 words', async () => {\n const request = createRequest({\n expectedOutcome: 'machine learning',\n actualResponse: 'machine learning is important',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 0.7,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n // BLEU uses 4-gram matching, so keywords with fewer than 4 words will have very low scores\n expect(result.passed).toBe(false);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeLessThan(0.7);\n });\n\n it('should work correctly with 4-word keywords', async () => {\n const 
request = createRequest({\n expectedOutcome: 'the cat sat on',\n actualResponse: 'the cat sat on the mat',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 0.5,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n // 4-word keywords can produce proper 4-gram BLEU scores\n expect(result.keywordMatches[0].found).toBe(true);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeGreaterThan(0);\n });\n });\n\n describe('multiple keywords evaluation', () => {\n it('should pass when all keywords meet threshold', async () => {\n const request = createRequest({\n expectedOutcome:\n 'Machine learning algorithms work. Artificial intelligence systems are. Neural network models perform.',\n actualResponse:\n 'Machine learning algorithms work. Artificial intelligence systems are. Neural network models perform.',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 0.15,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result).toMatchObject({\n passed: true,\n evaluationApproachResult: {\n score: 1,\n approachUsed: EvaluationApproach.BLEU,\n },\n });\n expect(result.keywordMatches).toHaveLength(3);\n expect(result.keywordMatches.every(match => match.found)).toBe(true);\n });\n\n it('should fail when not all keywords meet threshold', async () => {\n const request = createRequest({\n expectedOutcome:\n 'machine learning algorithms work\\nquantum computing research continues\\nartificial intelligence systems',\n actualResponse:\n 'Machine learning algorithms work. 
Artificial intelligence systems are growing.',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 0.7,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches).toHaveLength(3);\n const foundCount = result.keywordMatches.filter(\n match => match.found,\n ).length;\n expect(foundCount).toBeLessThan(3);\n expect(result.evaluationApproachResult.score).toBe(foundCount / 3);\n });\n\n it('should calculate overall score as ratio of passed keywords', async () => {\n const request = createRequest({\n expectedOutcome:\n 'alpha beta gamma delta\\nepsilon zeta eta theta\\niota kappa lambda mu\\nnu xi omicron pi',\n actualResponse:\n 'alpha beta gamma delta and epsilon zeta eta theta are here',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 0.3,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n const foundCount = result.keywordMatches.filter(\n match => match.found,\n ).length;\n expect(result).toMatchObject({\n passed: false,\n evaluationApproachResult: {\n score: foundCount / 4,\n },\n });\n });\n });\n\n describe('threshold handling', () => {\n it('should pass all keywords with threshold 0.0', async () => {\n const request = createRequest({\n actualResponse: 'completely unrelated text about cooking',\n expectedOutcome:\n 'quantum physics research continues\\nmathematics and statistics are',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 0.0,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.every(m => m.found)).toBe(true);\n expect(result.evaluationParameters.threshold).toBe(0.0);\n });\n\n it('should fail when threshold is 1.0 and match is not perfect', async () => {\n const request = createRequest({\n actualResponse: 'This is about machine learning concepts',\n expectedOutcome: 'machine learning algorithms work well',\n 
evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 1.0,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result.evaluationParameters.threshold).toBe(1.0);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeLessThan(1.0);\n expect(result.keywordMatches[0].found).toBe(false);\n });\n });\n\n describe('edge cases', () => {\n it('should handle empty keywords array', async () => {\n const request = createRequest({\n expectedOutcome: '',\n actualResponse: 'Some response',\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result).toMatchObject({\n passed: true,\n keywordMatches: [],\n evaluationApproachResult: {\n score: 1,\n },\n });\n });\n\n it('should handle empty actual response', async () => {\n const request = createRequest({\n expectedOutcome: 'machine learning algorithms work',\n actualResponse: '',\n });\n\n // Suppress expected warning\n const consoleWarnSpy = jest\n .spyOn(console, 'warn')\n .mockImplementation(() => {});\n\n const result = performBleuEvaluation(request);\n\n expect(result).toMatchObject({\n passed: false,\n keywordMatches: [\n {\n found: false,\n evaluationApproachResult: {\n score: 0,\n },\n },\n ],\n });\n\n consoleWarnSpy.mockRestore();\n });\n\n it('should handle whitespace-only keyword', async () => {\n const request = createRequest({\n expectedOutcome: ' ',\n actualResponse: 'Some response',\n });\n\n // Suppress expected warning\n const consoleWarnSpy = jest\n .spyOn(console, 'warn')\n .mockImplementation(() => {});\n\n const result = performBleuEvaluation(request);\n\n expect(result.keywordMatches[0]).toMatchObject({\n found: false,\n evaluationApproachResult: {\n score: 0,\n },\n });\n\n consoleWarnSpy.mockRestore();\n });\n\n it('should handle null/undefined actualResponse gracefully', async () => {\n const request = createRequest({\n expectedOutcome: 'machine learning algorithms work',\n actualResponse: null as unknown as string,\n });\n\n // 
Suppress expected warning\n const consoleWarnSpy = jest\n .spyOn(console, 'warn')\n .mockImplementation(() => {});\n\n const result = performBleuEvaluation(request);\n\n expect(result).toMatchObject({\n passed: false,\n keywordMatches: [\n {\n found: false,\n },\n ],\n });\n\n consoleWarnSpy.mockRestore();\n });\n });\n\n describe('BLEU score calculation', () => {\n it('should calculate BLEU score for partial match', async () => {\n const request = createRequest({\n expectedOutcome: 'the cat sat on the mat',\n actualResponse: 'the cat sat on',\n evaluationParameters: {\n approach: EvaluationApproach.BLEU,\n threshold: 0.3,\n },\n });\n\n const result = performBleuEvaluation(request);\n\n // Partial match should have lower BLEU score than perfect match\n const score = result.keywordMatches[0].evaluationApproachResult.score;\n expect(score).toBeGreaterThan(0);\n expect(score).toBeLessThan(1.0);\n });\n });\n\n describe('timestamp', () => {\n it('should include a valid ISO timestamp', async () => {\n const request = createRequest({\n expectedOutcome: 'test evaluation system works',\n actualResponse: 'test evaluation system works well',\n });\n\n const result = performBleuEvaluation(request);\n\n expect(result.timestamp).toBeDefined();\n expect(new Date(result.timestamp!).toISOString()).toBe(result.timestamp);\n });\n });\n});\n"]}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { EvaluationApproach } from "../../constants";
|
|
2
|
+
/**
 * Evaluates a response by checking that every expected keyword occurs in it.
 *
 * `expectedOutcome` is split on newlines and commas; each trimmed, non-empty
 * piece becomes one keyword. The test case passes only when all keywords are
 * found. With no keywords it passes with a score of 1.
 *
 * @param {object} request - The evaluation request.
 * @param {string} request.testCaseId - Identifier copied into the result.
 * @param {string} request.expectedOutcome - Newline/comma separated keywords.
 * @param {?string} request.actualResponse - Text to search; null/undefined is
 *   treated as an empty response instead of rejecting with a TypeError.
 * @param {object} [request.evaluationParameters] - Returned unchanged in the result.
 * @returns {Promise<object>} Result with `testCaseId`, `passed`,
 *   `keywordMatches`, `timestamp`, `evaluationParameters` and
 *   `evaluationApproachResult` (`score` is the fraction of keywords found).
 */
export async function performEvaluation(request) {
    const { testCaseId, expectedOutcome, actualResponse } = request;
    // Split expectedOutcome on newlines and commas to create the keywords array
    const expectedKeywords = expectedOutcome
        ? expectedOutcome
            .split(/[\n,]+/)
            .map(k => k.trim())
            .filter(k => k.length > 0)
        : [];
    // A missing response is evaluated as empty text so every keyword is simply "not found".
    const keywordMatches = evaluateKeywords(expectedKeywords, actualResponse ?? '');
    // Test passes only if ALL expected keywords are found
    const totalItems = keywordMatches.length;
    const foundItems = keywordMatches.filter(m => m.found).length;
    const passed = foundItems === totalItems;
    return {
        testCaseId,
        passed,
        keywordMatches,
        timestamp: new Date().toISOString(),
        evaluationParameters: request.evaluationParameters,
        evaluationApproachResult: {
            // Avoid 0/0: with nothing to look for the evaluation trivially succeeds.
            score: totalItems > 0 ? foundItems / totalItems : 1,
            approachUsed: EvaluationApproach.EXACT,
        },
    };
}
|
|
28
|
+
/**
 * Checks, case-insensitively, whether each expected keyword occurs as a
 * substring of the response.
 *
 * @param {string[]} expectedKeywords - Keywords to look for.
 * @param {?string} actualResponse - Text to search. null/undefined is treated
 *   as an empty string so the call reports "not found" instead of throwing.
 * @returns {object[]} One entry per keyword with `keyword`, `found` and an
 *   `evaluationApproachResult` whose `score` is 1.0 when found, otherwise 0.0.
 */
function evaluateKeywords(expectedKeywords, actualResponse) {
    // Case-insensitive keyword matching; normalise a missing response to ''.
    const response = String(actualResponse ?? '').toLowerCase();
    return expectedKeywords.map(keyword => {
        const keywordToMatch = keyword.toLowerCase();
        const found = response.includes(keywordToMatch);
        return {
            keyword,
            found,
            evaluationApproachResult: {
                score: found ? 1.0 : 0.0,
                approachUsed: EvaluationApproach.EXACT,
            },
        };
    });
}
|
|
44
|
+
//# sourceMappingURL=exact.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"exact.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/exact/exact.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,kBAAkB,EAAE,MAAM,iBAAiB,CAAC;AAErD,MAAM,CAAC,KAAK,UAAU,iBAAiB,CACrC,OAA0B;IAE1B,MAAM,EAAE,UAAU,EAAE,eAAe,EAAE,cAAc,EAAE,GAAG,OAAO,CAAC;IAEhE,6DAA6D;IAC7D,MAAM,gBAAgB,GAAG,eAAe;QACtC,CAAC,CAAC,eAAe;aACZ,KAAK,CAAC,QAAQ,CAAC;aACf,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QAC9B,CAAC,CAAC,EAAE,CAAC;IAEP,MAAM,cAAc,GAAG,gBAAgB,CAAC,gBAAgB,EAAE,cAAc,CAAC,CAAC;IAE1E,sDAAsD;IACtD,MAAM,UAAU,GAAG,cAAc,CAAC,MAAM,CAAC;IACzC,MAAM,UAAU,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC;IAC9D,MAAM,MAAM,GAAG,UAAU,KAAK,UAAU,CAAC;IAEzC,OAAO;QACL,UAAU;QACV,MAAM;QACN,cAAc;QACd,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,oBAAoB,EAAE,OAAO,CAAC,oBAAoB;QAClD,wBAAwB,EAAE;YACxB,KAAK,EAAE,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,UAAU,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;YACnD,YAAY,EAAE,kBAAkB,CAAC,KAAK;SACvC;KACF,CAAC;AACJ,CAAC;AAED,SAAS,gBAAgB,CACvB,gBAA0B,EAC1B,cAAsB;IAEtB,oCAAoC;IACpC,MAAM,QAAQ,GAAG,cAAc,CAAC,WAAW,EAAE,CAAC;IAE9C,OAAO,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE;QACpC,MAAM,cAAc,GAAG,OAAO,CAAC,WAAW,EAAE,CAAC;QAC7C,MAAM,KAAK,GAAG,QAAQ,CAAC,QAAQ,CAAC,cAAc,CAAC,CAAC;QAEhD,OAAO;YACL,OAAO;YACP,KAAK;YACL,wBAAwB,EAAE;gBACxB,KAAK,EAAE,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG;gBACxB,YAAY,EAAE,kBAAkB,CAAC,KAAK;aACvC;SACF,CAAC;IACJ,CAAC,CAAC,CAAC;AACL,CAAC","sourcesContent":["import { EvaluationRequest, EvaluationResult, KeywordMatch } from '../../types';\nimport { EvaluationApproach } from '../../constants';\n\nexport async function performEvaluation(\n request: EvaluationRequest,\n): Promise<EvaluationResult> {\n const { testCaseId, expectedOutcome, actualResponse } = request;\n\n // Split expectedOutcome by newlines to create keywords array\n const expectedKeywords = expectedOutcome\n ? 
expectedOutcome\n .split(/[\\n,]+/)\n .map(k => k.trim())\n .filter(k => k.length > 0)\n : [];\n\n const keywordMatches = evaluateKeywords(expectedKeywords, actualResponse);\n\n // Test passes only if ALL expected keywords are found\n const totalItems = keywordMatches.length;\n const foundItems = keywordMatches.filter(m => m.found).length;\n const passed = foundItems === totalItems;\n\n return {\n testCaseId,\n passed,\n keywordMatches,\n timestamp: new Date().toISOString(),\n evaluationParameters: request.evaluationParameters,\n evaluationApproachResult: {\n score: totalItems > 0 ? foundItems / totalItems : 1,\n approachUsed: EvaluationApproach.EXACT,\n },\n };\n}\n\nfunction evaluateKeywords(\n expectedKeywords: string[],\n actualResponse: string,\n): KeywordMatch[] {\n // Case-insensitive keyword matching\n const response = actualResponse.toLowerCase();\n\n return expectedKeywords.map(keyword => {\n const keywordToMatch = keyword.toLowerCase();\n const found = response.includes(keywordToMatch);\n\n return {\n keyword,\n found,\n evaluationApproachResult: {\n score: found ? 1.0 : 0.0,\n approachUsed: EvaluationApproach.EXACT,\n },\n };\n });\n}\n"]}
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
import * as rouge from "js-rouge";
|
|
2
|
+
import { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from "../constants";
|
|
3
|
+
/**
 * Evaluates a single keyword against the candidate text using ROUGE-1.
 *
 * The score comes from `rouge.n(candidate, keyword, { n: 1 })`. It falls back
 * to 0 when the keyword or candidate is empty (a warning is logged), when the
 * computation throws (the error is logged), or when the library returns
 * anything other than a finite number.
 *
 * @param {string} keyword - The expected keyword to evaluate
 * @param {string} candidate - The actual response text
 * @param {number} rougeThreshold - The minimum ROUGE-1 score required to pass
 * @returns {KeywordMatch} The evaluation result for this keyword: `keyword`,
 *   `found` (score >= threshold) and `evaluationApproachResult`
 */
function evaluateKeyword(keyword, candidate, rougeThreshold) {
    let rouge1Score = 0;
    try {
        // Trim inside the try block so a non-string keyword is logged below rather than thrown.
        const reference = keyword.trim();
        if (reference.length > 0 && candidate.length > 0) {
            const rouge1 = rouge.n(candidate, reference, { n: 1 });
            // Number.isFinite rejects NaN, +/-Infinity and non-numbers without coercion,
            // so a degenerate result can never be reported as a passing score.
            rouge1Score = Number.isFinite(rouge1) ? rouge1 : 0;
        }
        else {
            console.warn(`ROUGE-1 not computed for keyword "${keyword}": Keyword or Candidate is missing.`);
        }
    }
    catch (err) {
        // Best effort: a failed computation counts as a score of 0 for this keyword.
        console.error(`ROUGE-1 computation failed for keyword "${keyword}":`, err);
    }
    const keywordPassed = rouge1Score >= rougeThreshold;
    const keywordApproachResult = {
        score: rouge1Score,
        approachUsed: EvaluationApproach.ROUGE_1,
    };
    return {
        keyword: keyword,
        found: keywordPassed,
        evaluationApproachResult: keywordApproachResult,
    };
}
|
|
36
|
+
/**
 * Evaluates a response against the expected outcome using per-keyword ROUGE-1.
 *
 * ROUGE-1 measures the overlap of unigrams (single words) between the candidate
 * and reference text. `expectedOutcome` is split on newlines and commas; each
 * trimmed, non-empty piece is scored against the trimmed response with
 * `evaluateKeyword`. The test case passes only when every keyword reaches the
 * threshold; the overall score is the fraction of keywords that did.
 *
 * @param {object} request - The evaluation request.
 * @param {string} request.testCaseId - Identifier copied into the result.
 * @param {?string} request.actualResponse - Response text; a falsy value is treated as ''.
 * @param {string} request.expectedOutcome - Newline/comma separated keywords.
 * @param {object} [request.evaluationParameters] - Optional parameters; when
 *   `threshold` is absent DEFAULT_ROUGE_PASS_SCORE is used.
 * @returns {Promise<object>} Result with `testCaseId`, `passed`,
 *   `keywordMatches`, `timestamp`, `evaluationParameters` (including the
 *   threshold actually applied) and `evaluationApproachResult`.
 */
export async function performRouge1Evaluation(request) {
    const { testCaseId, actualResponse, expectedOutcome, evaluationParameters } = request;
    // Split expectedOutcome on newlines and commas to create the keywords array
    const expectedKeywords = expectedOutcome
        ? expectedOutcome
            .split(/[\n,]+/)
            .map(k => k.trim())
            .filter(k => k.length > 0)
        : [];
    const candidate = (actualResponse || '').trim();
    // Optional chaining keeps a request without evaluationParameters from throwing.
    const rougeThreshold = evaluationParameters?.threshold ?? DEFAULT_ROUGE_PASS_SCORE;
    const keywordMatches = expectedKeywords.map(keyword => evaluateKeyword(keyword, candidate, rougeThreshold));
    const totalKeywords = keywordMatches.length;
    const keywordsPassed = keywordMatches.filter(match => match.found).length;
    const overallPassed = keywordsPassed === totalKeywords;
    const overallApproachResult = {
        // With no keywords the evaluation trivially passes; report 1 instead of 0/0 (NaN).
        score: totalKeywords > 0 ? keywordsPassed / totalKeywords : 1,
        approachUsed: EvaluationApproach.ROUGE_1,
    };
    return {
        testCaseId: testCaseId,
        passed: overallPassed,
        keywordMatches: keywordMatches,
        timestamp: new Date().toISOString(),
        evaluationParameters: {
            ...evaluationParameters,
            threshold: rougeThreshold,
        },
        evaluationApproachResult: overallApproachResult,
    };
}
|
|
88
|
+
//# sourceMappingURL=rouge1-evaluator.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rouge1-evaluator.js","sourceRoot":"","sources":["../../../../src/lib/evaluation/evaluators/rouge1-evaluator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,KAAK,MAAM,UAAU,CAAC;AAGlC,OAAO,EAAE,wBAAwB,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;AAE5E;;;;;;;GAOG;AACH,SAAS,eAAe,CACtB,OAAe,EACf,SAAiB,EACjB,cAAsB;IAEtB,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,IAAI,CAAC;QACH,IAAI,OAAO,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtD,MAAM,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,SAAS,EAAE,OAAO,CAAC,IAAI,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,EAAE,CAAC,CAAC;YAC5D,WAAW,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;QAC3C,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CACV,qCAAqC,OAAO,qCAAqC,CAClF,CAAC;QACJ,CAAC;IACH,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,OAAO,CAAC,KAAK,CAAC,2CAA2C,OAAO,IAAI,EAAE,GAAG,CAAC,CAAC;IAC7E,CAAC;IAED,MAAM,aAAa,GAAG,WAAW,IAAI,cAAc,CAAC;IAEpD,MAAM,qBAAqB,GAA6B;QACtD,KAAK,EAAE,WAAW;QAClB,YAAY,EAAE,kBAAkB,CAAC,OAAO;KACzC,CAAC;IAEF,OAAO;QACL,OAAO,EAAE,OAAO;QAChB,KAAK,EAAE,aAAa;QACpB,wBAAwB,EAAE,qBAAqB;KAChD,CAAC;AACJ,CAAC;AAED;;;;;;;;;;;;;;GAcG;AAEH,MAAM,CAAC,KAAK,UAAU,uBAAuB,CAC3C,OAA0B;IAE1B,MAAM,EAAE,UAAU,EAAE,cAAc,EAAE,eAAe,EAAE,oBAAoB,EAAE,GACzE,OAAO,CAAC;IAEV,6DAA6D;IAC7D,MAAM,gBAAgB,GAAG,eAAe;QACtC,CAAC,CAAC,eAAe;aACZ,KAAK,CAAC,QAAQ,CAAC;aACf,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;aAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;QAC9B,CAAC,CAAC,EAAE,CAAC;IAEP,MAAM,SAAS,GAAG,CAAC,cAAc,IAAI,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IAChD,MAAM,cAAc,GAClB,oBAAoB,CAAC,SAAS,IAAI,wBAAwB,CAAC;IAE7D,IAAI,cAAc,GAAG,CAAC,CAAC;IACvB,MAAM,aAAa,GAAG,gBAAgB,CAAC,MAAM,CAAC;IAE9C,MAAM,cAAc,GAAmB,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAC,EAAE;QACpE,MAAM,KAAK,GAAG,eAAe,CAAC,OAAO,EAAE,SAAS,EAAE,cAAc,CAAC,CAAC;QAElE,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;YAChB,cAAc,EAAE,CAAC;QACnB,CAAC;QAED,OAAO,KAAK,CAAC;IACf,CAAC,CAAC,CAAC;IAEH,MAAM,aAAa,GAAG,cAAc,KAAK,aAAa,CAAC;IAEvD,MAAM,qBAAqB,GAA6B;QACtD,KAAK,EAAE,cAAc,GAAG,aAAa;QACrC,YAAY,EAAE,kBAAkB,CAAC,OAAO;KAC
zC,CAAC;IAEF,OAAO;QACL,UAAU,EAAE,UAAU;QACtB,MAAM,EAAE,aAAa;QACrB,cAAc,EAAE,cAAc;QAC9B,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;QACnC,oBAAoB,EAAE;YACpB,GAAG,oBAAoB;YACvB,SAAS,EAAE,cAAc;SAC1B;QACD,wBAAwB,EAAE,qBAAqB;KAChD,CAAC;AACJ,CAAC","sourcesContent":["import * as rouge from 'js-rouge';\nimport { EvaluationApproachResult } from '../../../types/evaluation';\nimport { EvaluationRequest, EvaluationResult, KeywordMatch } from '../types';\nimport { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from '../constants';\n\n/**\n * Evaluates a single keyword against the candidate text using ROUGE-1.\n *\n * @param {string} keyword - The expected keyword to evaluate\n * @param {string} candidate - The actual response text\n * @param {number} rougeThreshold - The minimum ROUGE-1 score required to pass\n * @returns {KeywordMatch} The evaluation result for this keyword\n */\nfunction evaluateKeyword(\n keyword: string,\n candidate: string,\n rougeThreshold: number,\n): KeywordMatch {\n let rouge1Score = 0;\n\n try {\n if (keyword.trim().length > 0 && candidate.length > 0) {\n const rouge1 = rouge.n(candidate, keyword.trim(), { n: 1 });\n rouge1Score = isNaN(rouge1) ? 0 : rouge1;\n } else {\n console.warn(\n `ROUGE-1 not computed for keyword \"${keyword}\": Keyword or Candidate is missing.`,\n );\n }\n } catch (err) {\n console.error(`ROUGE-1 computation failed for keyword \"${keyword}\":`, err);\n }\n\n const keywordPassed = rouge1Score >= rougeThreshold;\n\n const keywordApproachResult: EvaluationApproachResult = {\n score: rouge1Score,\n approachUsed: EvaluationApproach.ROUGE_1,\n };\n\n return {\n keyword: keyword,\n found: keywordPassed,\n evaluationApproachResult: keywordApproachResult,\n };\n}\n\n/**\n * Computes the ROUGE-1 score for a single keyword against the candidate text.\n *\n * ROUGE-1 measures the overlap of unigrams (single words) between the candidate\n * and reference text. 
A score of 1.0 indicates perfect overlap.\n *\n * @example\n * const match = evaluateSingleKeyword(\n * \"The quick brown fox\",\n * \"quick fox\",\n * 0.5\n * );\n * // Returns: { keyword: \"quick fox\", found: true, score: 0.67, ... }\n * //general idea , here we are doing it. by word to word comparison\n */\n\nexport async function performRouge1Evaluation(\n request: EvaluationRequest,\n): Promise<EvaluationResult> {\n const { testCaseId, actualResponse, expectedOutcome, evaluationParameters } =\n request;\n\n // Split expectedOutcome by newlines to create keywords array\n const expectedKeywords = expectedOutcome\n ? expectedOutcome\n .split(/[\\n,]+/)\n .map(k => k.trim())\n .filter(k => k.length > 0)\n : [];\n\n const candidate = (actualResponse || '').trim();\n const rougeThreshold =\n evaluationParameters.threshold ?? DEFAULT_ROUGE_PASS_SCORE;\n\n let keywordsPassed = 0;\n const totalKeywords = expectedKeywords.length;\n\n const keywordMatches: KeywordMatch[] = expectedKeywords.map(keyword => {\n const match = evaluateKeyword(keyword, candidate, rougeThreshold);\n\n if (match.found) {\n keywordsPassed++;\n }\n\n return match;\n });\n\n const overallPassed = keywordsPassed === totalKeywords;\n\n const overallApproachResult: EvaluationApproachResult = {\n score: keywordsPassed / totalKeywords,\n approachUsed: EvaluationApproach.ROUGE_1,\n };\n\n return {\n testCaseId: testCaseId,\n passed: overallPassed,\n keywordMatches: keywordMatches,\n timestamp: new Date().toISOString(),\n evaluationParameters: {\n ...evaluationParameters,\n threshold: rougeThreshold,\n },\n evaluationApproachResult: overallApproachResult,\n };\n}\n"]}
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
import * as rouge from "js-rouge";
|
|
2
|
+
import { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from "../constants";
|
|
3
|
+
/**
 * Scores a single expected keyword against the candidate text with a
 * ROUGE-L style measure and reports whether it meets the threshold.
 *
 * Both texts are lower-cased and split on whitespace. A one-word keyword that
 * appears verbatim among the candidate tokens scores 1. Otherwise the score
 * is the F1 of precision and recall derived from the longest common
 * subsequence returned by `rouge.lcs`. If either text is empty a warning is
 * logged, and if anything throws the error is logged; in both cases the
 * score stays 0.
 *
 * @param {string} keyword - Expected keyword or phrase.
 * @param {string} candidate - Response text to search.
 * @param {number} rougeThreshold - Minimum score for the keyword to count as found.
 * @returns {{keyword: string, found: boolean, evaluationApproachResult: object}}
 */
function evaluateKeyword(keyword, candidate, rougeThreshold) {
    // Division helper that yields 0 instead of dividing by a non-positive count.
    const safeRatio = (numerator, count) => (count > 0 ? numerator / count : 0);
    const computeScore = () => {
        const reference = keyword.trim();
        if (!(reference.length > 0 && candidate.length > 0)) {
            console.warn(`ROUGE-L not computed for keyword "${keyword}": Keyword or candidate missing.`);
            return 0;
        }
        const refTokens = reference.toLowerCase().split(/\s+/);
        const candTokens = candidate.toLowerCase().split(/\s+/);
        // Shortcut: a one-word keyword present as an exact token is a full match.
        if (refTokens.length === 1 && candTokens.includes(refTokens[0])) {
            return 1;
        }
        const common = rouge.lcs(candTokens, refTokens);
        // Accept either a numeric length or a sequence from rouge.lcs.
        const commonLength = typeof common === 'number' ? common : (common?.length ?? 0);
        const recall = safeRatio(commonLength, refTokens.length);
        const precision = safeRatio(commonLength, candTokens.length);
        const sum = precision + recall;
        return sum > 0 ? (2 * precision * recall) / sum : 0;
    };
    let rougeLScore = 0;
    try {
        rougeLScore = computeScore();
    }
    catch (err) {
        console.error(`ROUGE-L computation failed for keyword "${keyword}":`, err);
    }
    return {
        keyword,
        found: rougeLScore >= rougeThreshold,
        evaluationApproachResult: {
            score: rougeLScore,
            approachUsed: EvaluationApproach.ROUGE_L,
        },
    };
}
|
|
42
|
+
/**
 * Runs a ROUGE-L evaluation of a response against a list of expected keywords.
 *
 * `expectedOutcome` is split on newlines, commas and periods; each non-empty,
 * trimmed piece becomes a keyword scored by `evaluateKeyword`. If a non-empty
 * `expectedOutcome` yields no pieces, the raw value is used as the only
 * keyword. The evaluation passes when every keyword is found; the overall
 * score is the fraction of keywords found, or 1 when there are none.
 *
 * @param {object} request - Evaluation request with `testCaseId`,
 *   `actualResponse`, `expectedOutcome` and `evaluationParameters`.
 * @returns {object} Result with `testCaseId`, `passed`, `keywordMatches`,
 *   `timestamp`, the effective `evaluationParameters` and the overall
 *   `evaluationApproachResult`.
 */
export function performRougeLEvaluation(request) {
    const { testCaseId, actualResponse, expectedOutcome, evaluationParameters } = request;
    // Build the keyword list from newline/comma/period separated pieces.
    const expectedKeywords = [];
    if (expectedOutcome) {
        for (const piece of expectedOutcome.split(/[\n,.]+/)) {
            const cleaned = piece.trim();
            if (cleaned.length > 0) {
                expectedKeywords.push(cleaned);
            }
        }
        // Nothing survived filtering: fall back to the original input as one keyword.
        if (expectedKeywords.length === 0) {
            expectedKeywords.push(expectedOutcome);
        }
    }
    const candidate = (actualResponse || '').trim();
    const rougeThreshold = evaluationParameters.threshold ?? DEFAULT_ROUGE_PASS_SCORE;
    const keywordMatches = expectedKeywords.map(keyword => evaluateKeyword(keyword, candidate, rougeThreshold));
    const totalKeywords = expectedKeywords.length;
    const keywordsPassed = keywordMatches.reduce((count, match) => (match.found ? count + 1 : count), 0);
    return {
        testCaseId,
        passed: keywordsPassed === totalKeywords,
        keywordMatches,
        timestamp: new Date().toISOString(),
        evaluationParameters: {
            ...evaluationParameters,
            threshold: rougeThreshold,
        },
        evaluationApproachResult: {
            // With no keywords the ratio is undefined, so report a score of 1.
            score: totalKeywords > 0 ? keywordsPassed / totalKeywords : 1,
            approachUsed: EvaluationApproach.ROUGE_L,
        },
    };
}
|
|
82
|
+
//# sourceMappingURL=rougeL-evaluator.js.map
|