llm-testrunner-components 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +165 -242
- package/dist/cjs/index.cjs.js +305 -237
- package/dist/cjs/index.cjs.js.map +1 -1
- package/dist/cjs/llm-testrunner.cjs.js +1 -1
- package/dist/cjs/loader.cjs.js +1 -1
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js +2 -2
- package/dist/collection/components/llm-test-runner/header/llm-test-runner-header.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js +25 -54
- package/dist/collection/components/llm-test-runner/llm-test-runner.import-export.test.js.map +1 -1
- package/dist/collection/components/llm-test-runner/llm-test-runner.js +27 -49
- package/dist/collection/components/llm-test-runner/llm-test-runner.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.css +60 -21
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js +3 -1
- package/dist/collection/components/llm-test-runner/test-cases/evaluation/evaluation-summary.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js +31 -11
- package/dist/collection/components/llm-test-runner/test-cases/expected-outcome-renderer.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.css +17 -0
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js +2 -12
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-case-row.js.map +1 -1
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js +2 -2
- package/dist/collection/components/llm-test-runner/test-cases/llm-test-cases.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-engine.js +63 -42
- package/dist/collection/lib/evaluation/evaluation-engine.js.map +1 -1
- package/dist/collection/lib/evaluation/evaluation-service.js +15 -3
- package/dist/collection/lib/evaluation/evaluation-service.js.map +1 -1
- package/dist/collection/lib/evaluation/{rouge1-evaluator.test.js → evaluators/rouge1-evaluator.test.js} +2 -2
- package/dist/collection/lib/evaluation/evaluators/rouge1-evaluator.test.js.map +1 -0
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js +4 -3
- package/dist/collection/lib/evaluation/evaluators/semantic/SemanticEvaluator.js.map +1 -1
- package/dist/collection/lib/evaluation/field-evaluation-approach.js +24 -0
- package/dist/collection/lib/evaluation/field-evaluation-approach.js.map +1 -0
- package/dist/collection/lib/evaluation/index.js +0 -4
- package/dist/collection/lib/evaluation/index.js.map +1 -1
- package/dist/collection/lib/evaluation/types.js.map +1 -1
- package/dist/collection/lib/import-export/test-results-csv.js +47 -33
- package/dist/collection/lib/import-export/test-results-csv.js.map +1 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js +0 -1
- package/dist/collection/lib/import-export/test-suite-exporter.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-factory.js +17 -27
- package/dist/collection/lib/test-cases/test-case-factory.js.map +1 -1
- package/dist/collection/lib/test-cases/test-case-mutations.js +60 -9
- package/dist/collection/lib/test-cases/test-case-mutations.js.map +1 -1
- package/dist/collection/schemas/expected-outcome.js +20 -2
- package/dist/collection/schemas/expected-outcome.js.map +1 -1
- package/dist/collection/schemas/test-case.js +2 -20
- package/dist/collection/schemas/test-case.js.map +1 -1
- package/dist/collection/types/llm-test-runner.js.map +1 -1
- package/dist/collection/types/test-case.js.map +1 -1
- package/dist/components/index.js +1 -1
- package/dist/components/llm-test-runner.js +1 -1
- package/dist/components/p-JPMPoOC8.js +7 -0
- package/dist/components/p-JPMPoOC8.js.map +1 -0
- package/dist/esm/index.js +305 -237
- package/dist/esm/index.js.map +1 -1
- package/dist/esm/llm-testrunner.js +1 -1
- package/dist/esm/loader.js +1 -1
- package/dist/llm-testrunner/index.esm.js +2 -2
- package/dist/llm-testrunner/index.esm.js.map +1 -1
- package/dist/llm-testrunner/llm-testrunner.esm.js +1 -1
- package/dist/types/components/llm-test-runner/header/llm-test-runner-header.d.ts +1 -0
- package/dist/types/components/llm-test-runner/llm-test-runner.d.ts +1 -1
- package/dist/types/components/llm-test-runner/test-cases/expected-outcome-renderer.d.ts +3 -6
- package/dist/types/components/llm-test-runner/test-cases/llm-test-case-row.d.ts +0 -2
- package/dist/types/components/llm-test-runner/test-cases/llm-test-cases.d.ts +0 -2
- package/dist/types/components.d.ts +9 -0
- package/dist/types/lib/evaluation/evaluation-engine.d.ts +4 -2
- package/dist/types/lib/evaluation/field-evaluation-approach.d.ts +6 -0
- package/dist/types/lib/evaluation/index.d.ts +0 -1
- package/dist/types/lib/evaluation/types.d.ts +26 -0
- package/dist/types/lib/import-export/test-suite-exporter.d.ts +0 -4
- package/dist/types/lib/test-cases/test-case-factory.d.ts +2 -3
- package/dist/types/lib/test-cases/test-case-mutations.d.ts +21 -5
- package/dist/types/schemas/expected-outcome.d.ts +65 -17
- package/dist/types/schemas/test-case.d.ts +51 -95
- package/dist/types/types/llm-test-runner.d.ts +1 -1
- package/dist/types/types/test-case.d.ts +1 -1
- package/package.json +9 -2
- package/dist/collection/lib/evaluation/rouge1-evaluator.test.js.map +0 -1
- package/dist/components/p-BF90yb1z.js +0 -7
- package/dist/components/p-BF90yb1z.js.map +0 -1
- /package/dist/types/lib/evaluation/{rouge1-evaluator.test.d.ts → evaluators/rouge1-evaluator.test.d.ts} +0 -0
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { describe, it, expect } from "@jest/globals";
|
|
2
|
-
import { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from "
|
|
2
|
+
import { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from "../constants";
|
|
3
3
|
// Using integration tests with actual js-rouge library (no mocks).
|
|
4
4
|
// This approach tests the real ROUGE-1 scoring behavior rather than just orchestration logic.
|
|
5
|
-
import { performRouge1Evaluation } from "./
|
|
5
|
+
import { performRouge1Evaluation } from "./rouge1-evaluator";
|
|
6
6
|
const mockRequest = {
|
|
7
7
|
testCaseId: 'test-000',
|
|
8
8
|
question: 'What is your name?',
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"rouge1-evaluator.test.js","sourceRoot":"","sources":["../../../../src/lib/evaluation/evaluators/rouge1-evaluator.test.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,QAAQ,EAAE,EAAE,EAAE,MAAM,EAAE,MAAM,eAAe,CAAC;AAErD,OAAO,EAAE,wBAAwB,EAAE,kBAAkB,EAAE,MAAM,cAAc,CAAC;AAC5E,mEAAmE;AACnE,8FAA8F;AAC9F,OAAO,EAAE,uBAAuB,EAAE,MAAM,oBAAoB,CAAC;AAE7D,MAAM,WAAW,GAAsB;IACrC,UAAU,EAAE,UAAU;IACtB,QAAQ,EAAE,oBAAoB;IAC9B,cAAc,EAAE,6BAA6B;IAC7C,eAAe,EAAE,iBAAiB;IAClC,oBAAoB,EAAE;QACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;QACpC,SAAS,EAAE,GAAG;KACf;CACF,CAAC;AAEF,MAAM,sBAAsB,GAAsB;IAChD,GAAG,WAAW;IACd,oBAAoB,EAAE;QACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;QACpC,SAAS,EAAE,SAAS;KACrB;CACF,CAAC;AAEF,QAAQ,CAAC,yBAAyB,EAAE,GAAG,EAAE;IACvC,QAAQ,CAAC,qBAAqB,EAAE,GAAG,EAAE;QACnC,EAAE,CAAC,0DAA0D,EAAE,KAAK,IAAI,EAAE;YACxE,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,eAAe,EAAE,iBAAiB;aACnC,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YAC7C,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;YACvB,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,eAAe,CAAC,GAAG,CAAC,CAAC;QACzB,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,wDAAwD,EAAE,KAAK,IAAI,EAAE;YACtE,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,oDAAoD;gBACpE,eAAe,EAAE,2CAA2C;aAC7D,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;YACpB,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,8DAA8D,EAAE,KAAK,IAAI,EAAE;YAC5E,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,eAAe,EAAE,oCAAoC;aACtD,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAClD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,sBAAsB,CAAC,GAAG,CAAC,CAAC;YAC9B,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACnD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,oBAAoB,EAAE,GAAG,EAAE;QAClC,EAAE,CAAC,gDAAgD,EAAE,KAAK,IAAI,EAAE;YAC9D,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,sBAAsB,CAAC,CAAC;YAErE,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAChD,wBAAwB,CACzB,CAAC;QACJ,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,6CAA6C,EAAE,KAAK,IAAI,EAAE;YAC3D,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,yCAAyC;gBACzD,eAAe,EAAE,8BAA8B;gBAC/C,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;oBACpC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YAC7D,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;QAC1D,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4DAA4D,EAAE,KAAK,IAAI,EAAE;YAC1E,MAAM,OAAO,GAAsB;gBACjC,GAAG,WAAW;gBACd,cAAc,EAAE,iCAAiC;gBACjD,eAAe,EAAE,kBAAkB;gBACnC,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,OAAO;oBACpC,SAAS,EAAE,GAAG;iBACf;aACF,CAAC;YAEF,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,oBAAoB,CAAC,SAAS,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;YACxD,MAAM,CACJ,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CACxD,CAAC,YAAY,CAAC,GAAG,CAAC,CAAC;QACtB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;IAEH,QAAQ,CAAC,YAAY,EAAE,GAAG,EAAE;QAC1B,EAAE,CAAC,oCAAoC,EAAE,KAAK,IAAI,EAAE;YAClD,MAAM,OAAO,GAAG,EAAE,GAAG,WAAW,EAAE,cAAc,EAAE,EAAE,EAAE,CAAC;YAEvD,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAClC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;YACxE,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,CAAC,CAAC,CAAC,wBAAwB,CAAC,KAAK,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC1E,CAAC,CAAC,CAAC;QAEH,EAAE,CAAC,4CAA4C,EAAE,KAAK,IAAI,EAAE;YAC1D,MAAM,OAAO,GAAG,EAAE,GAAG,WAAW,EAAE,eAAe,EAAE,EAAE,EAAE,CAAC;YAExD,MAAM,MAAM,GAAG,MAAM,uBAAuB,CAAC,OAAO,CAAC,CAAC;YAEtD,MAAM,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;YACjC,MAAM,CAAC,MAAM,CAAC,cAAc,CAAC,MAAM,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;QAC/C,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC,CAAC,CAAC","sourcesContent":["import { describe, it, expect } from '@jest/globals';\nimport { EvaluationRequest } from '../types';\nimport { DEFAULT_ROUGE_PASS_SCORE, EvaluationApproach } from '../constants';\n// Using integration tests with actual js-rouge library (no mocks).\n// This approach tests the real ROUGE-1 scoring behavior rather than just orchestration logic.\nimport { performRouge1Evaluation } from './rouge1-evaluator';\n\nconst mockRequest: EvaluationRequest = {\n testCaseId: 'test-000',\n question: 'What is your name?',\n actualResponse: 'I am a large language model',\n expectedOutcome: 'model\\nlanguage',\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: 0.5,\n },\n};\n\nconst mockRequestNoThreshold: EvaluationRequest = {\n ...mockRequest,\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: undefined,\n },\n};\n\ndescribe('performRouge1Evaluation', () => {\n describe('Basic functionality', () => {\n it('should pass when response contains exact keyword matches', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is a language model system',\n expectedOutcome: 'language\\nmodel',\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.length).toBe(2);\n expect(result.keywordMatches[0].found).toBe(true);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeGreaterThan(0.5);\n expect(result.keywordMatches[1].found).toBe(true);\n expect(\n result.keywordMatches[1].evaluationApproachResult.score,\n ).toBeGreaterThan(0.5);\n });\n\n it('should fail when keywords are not sufficiently present', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is completely unrelated content about cooking',\n expectedOutcome: 'machine learning\\nartificial intelligence',\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].found).toBe(false);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeLessThan(0.5);\n expect(result.keywordMatches[1].found).toBe(false);\n expect(\n result.keywordMatches[1].evaluationApproachResult.score,\n ).toBeLessThan(0.5);\n });\n\n it('should partially pass when only some keywords meet threshold', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'Machine learning is fascinating',\n expectedOutcome: 'machine learning\\ndatabase systems',\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].found).toBe(true);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeGreaterThanOrEqual(0.5);\n expect(result.keywordMatches[1].found).toBe(false);\n expect(\n result.keywordMatches[1].evaluationApproachResult.score,\n ).toBeLessThan(0.5);\n });\n });\n\n describe('Threshold handling', () => {\n it('should use default threshold when not provided', async () => {\n const result = await performRouge1Evaluation(mockRequestNoThreshold);\n\n expect(result.evaluationParameters.threshold).toBe(\n DEFAULT_ROUGE_PASS_SCORE,\n );\n });\n\n it('should pass all keywords with threshold 0.0', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'completely unrelated text about cooking',\n expectedOutcome: 'quantum physics\\nmathematics',\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: 0.0,\n },\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.every(m => m.found)).toBe(true);\n expect(result.evaluationParameters.threshold).toBe(0.0);\n });\n\n it('should fail when threshold is 1.0 and match is not perfect', async () => {\n const request: EvaluationRequest = {\n ...mockRequest,\n actualResponse: 'This is about learning concepts',\n expectedOutcome: 'machine learning',\n evaluationParameters: {\n approach: EvaluationApproach.ROUGE_1,\n threshold: 1.0,\n },\n };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.evaluationParameters.threshold).toBe(1.0);\n expect(\n result.keywordMatches[0].evaluationApproachResult.score,\n ).toBeLessThan(1.0);\n });\n });\n\n describe('Edge cases', () => {\n it('should handle empty actualResponse', async () => {\n const request = { ...mockRequest, actualResponse: '' };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(false);\n expect(result.keywordMatches[0].evaluationApproachResult.score).toBe(0);\n expect(result.keywordMatches[1].evaluationApproachResult.score).toBe(0);\n });\n\n it('should handle empty expectedOutcome string', async () => {\n const request = { ...mockRequest, expectedOutcome: '' };\n\n const result = await performRouge1Evaluation(request);\n\n expect(result.passed).toBe(true);\n expect(result.keywordMatches.length).toBe(0);\n });\n });\n});\n"]}
|
|
@@ -16,6 +16,7 @@ export class SemanticEvaluator {
|
|
|
16
16
|
}
|
|
17
17
|
}
|
|
18
18
|
async performEvaluation(request) {
|
|
19
|
+
const threshold = request.evaluationParameters?.threshold ?? DEFAULT_SEMANTIC_PASS_SCORE;
|
|
19
20
|
try {
|
|
20
21
|
await this.initialize();
|
|
21
22
|
// Split expectedOutcome by newlines to create keywords array
|
|
@@ -25,7 +26,7 @@ export class SemanticEvaluator {
|
|
|
25
26
|
.map(k => k.trim())
|
|
26
27
|
.filter(k => k.length > 0)
|
|
27
28
|
: [];
|
|
28
|
-
const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords,
|
|
29
|
+
const keywordMatches = await evaluateKeywordsSemantically(SemanticEvaluator.extractor, request.actualResponse, expectedKeywords, threshold);
|
|
29
30
|
const totalItems = keywordMatches.length;
|
|
30
31
|
// calculate the overall score by averaging the score of the keyword matches
|
|
31
32
|
const keywordScore = keywordMatches.reduce((acc, curr) => acc + curr.evaluationApproachResult.score, 0);
|
|
@@ -33,7 +34,7 @@ export class SemanticEvaluator {
|
|
|
33
34
|
const passed = keywordMatches.every(match => match.found);
|
|
34
35
|
const evaluationParameters = {
|
|
35
36
|
approach: EvaluationApproach.SEMANTIC,
|
|
36
|
-
threshold
|
|
37
|
+
threshold,
|
|
37
38
|
};
|
|
38
39
|
return {
|
|
39
40
|
testCaseId: request.testCaseId,
|
|
@@ -55,7 +56,7 @@ export class SemanticEvaluator {
|
|
|
55
56
|
keywordMatches: [],
|
|
56
57
|
evaluationParameters: {
|
|
57
58
|
approach: EvaluationApproach.SEMANTIC,
|
|
58
|
-
threshold
|
|
59
|
+
threshold,
|
|
59
60
|
},
|
|
60
61
|
evaluationApproachResult: {
|
|
61
62
|
score: 0,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"SemanticEvaluator.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/SemanticEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,gBAAgB,CAAC;AACnD,OAAO,EAAE,4BAA4B,EAAE,MAAM,qBAAqB,CAAC;AAGnE,OAAO,EACL,2BAA2B,EAC3B,kBAAkB,GACnB,MAAM,iBAAiB,CAAC;AAEzB,MAAM,OAAO,iBAAiB;IAC5B,qEAAqE;IAC7D,MAAM,CAAC,SAAS,GAA8B,IAAI,CAAC;IAE3D,KAAK,CAAC,UAAU;QACd,IAAI,iBAAiB,CAAC,SAAS;YAAE,OAAO;QACxC,IAAI,CAAC;YACH,iBAAiB,CAAC,SAAS,GAAG,MAAM,iBAAiB,EAAE,CAAC;QAC1D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,2CAA2C,EAAE,KAAK,CAAC,CAAC;YAClE,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,CAAC,iBAAiB,CACrB,OAA0B;QAE1B,IAAI,CAAC;YACH,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;YAExB,6DAA6D;YAC7D,MAAM,gBAAgB,GAAG,OAAO,CAAC,eAAe;gBAC9C,CAAC,CAAC,OAAO,CAAC,eAAe;qBACpB,KAAK,CAAC,QAAQ,CAAC;qBACf,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;qBAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;gBAC9B,CAAC,CAAC,EAAE,CAAC;YAEP,MAAM,cAAc,GAAG,MAAM,4BAA4B,CACvD,iBAAiB,CAAC,SAAS,EAC3B,OAAO,CAAC,cAAc,EACtB,gBAAgB,EAChB,
|
|
1
|
+
{"version":3,"file":"SemanticEvaluator.js","sourceRoot":"","sources":["../../../../../src/lib/evaluation/evaluators/semantic/SemanticEvaluator.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,gBAAgB,CAAC;AACnD,OAAO,EAAE,4BAA4B,EAAE,MAAM,qBAAqB,CAAC;AAGnE,OAAO,EACL,2BAA2B,EAC3B,kBAAkB,GACnB,MAAM,iBAAiB,CAAC;AAEzB,MAAM,OAAO,iBAAiB;IAC5B,qEAAqE;IAC7D,MAAM,CAAC,SAAS,GAA8B,IAAI,CAAC;IAE3D,KAAK,CAAC,UAAU;QACd,IAAI,iBAAiB,CAAC,SAAS;YAAE,OAAO;QACxC,IAAI,CAAC;YACH,iBAAiB,CAAC,SAAS,GAAG,MAAM,iBAAiB,EAAE,CAAC;QAC1D,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,2CAA2C,EAAE,KAAK,CAAC,CAAC;YAClE,MAAM,KAAK,CAAC;QACd,CAAC;IACH,CAAC;IAED,KAAK,CAAC,iBAAiB,CACrB,OAA0B;QAE1B,MAAM,SAAS,GACb,OAAO,CAAC,oBAAoB,EAAE,SAAS,IAAI,2BAA2B,CAAC;QAEzE,IAAI,CAAC;YACH,MAAM,IAAI,CAAC,UAAU,EAAE,CAAC;YAExB,6DAA6D;YAC7D,MAAM,gBAAgB,GAAG,OAAO,CAAC,eAAe;gBAC9C,CAAC,CAAC,OAAO,CAAC,eAAe;qBACpB,KAAK,CAAC,QAAQ,CAAC;qBACf,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;qBAClB,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC;gBAC9B,CAAC,CAAC,EAAE,CAAC;YAEP,MAAM,cAAc,GAAG,MAAM,4BAA4B,CACvD,iBAAiB,CAAC,SAAS,EAC3B,OAAO,CAAC,cAAc,EACtB,gBAAgB,EAChB,SAAS,CACV,CAAC;YAEF,MAAM,UAAU,GAAG,cAAc,CAAC,MAAM,CAAC;YACzC,4EAA4E;YAC5E,MAAM,YAAY,GAAG,cAAc,CAAC,MAAM,CACxC,CAAC,GAAG,EAAE,IAAI,EAAE,EAAE,CAAC,GAAG,GAAG,IAAI,CAAC,wBAAwB,CAAC,KAAK,EACxD,CAAC,CACF,CAAC;YACF,MAAM,YAAY,GAAG,UAAU,GAAG,CAAC,CAAC,CAAC,CAAC,YAAY,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,4BAA4B;YACjG,MAAM,MAAM,GAAG,cAAc,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC;YAE1D,MAAM,oBAAoB,GAAG;gBAC3B,QAAQ,EAAE,kBAAkB,CAAC,QAAQ;gBACrC,SAAS;aACc,CAAC;YAE1B,OAAO;gBACL,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM;gBACN,cAAc;gBACd,oBAAoB;gBACpB,wBAAwB,EAAE;oBACxB,KAAK,EAAE,YAAY;oBACnB,YAAY,EAAE,kBAAkB,CAAC,QAAQ;iBAC1C;gBACD,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC;QACJ,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,OAAO,CAAC,KAAK,CAAC,wCAAwC,EAAE,KAAK,CAAC,CAAC;YAC/D,OAAO;gBACL,UAAU,EAAE,OAAO,CAAC,UAAU;gBAC9B,MAAM,EAAE,KAAK;gBACb,cAAc,EAAE,EAAE;gBAClB,oBAAoB,EAAE;oBACpB,QAAQ,EAAE,kBAAkB,CAAC,QAAQ;oBACrC,SAAS;iBACV;gBACD,wBAAwB,EAAE;oBACxB,KAAK,EAAE,CAAC;oBACR,YAAY,EAAE,kBAAkB,CAAC,QAAQ;iBAC1C;gBACD,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;aACpC,CAAC;QACJ,CAAC;IACH,CAAC","sourcesContent":["import { EvaluationResult, EvaluationRequest } from '../../types';\nimport { loadSemanticModel } from './model-loader';\nimport { evaluateKeywordsSemantically } from './evaluate-keywords';\nimport { FeatureExtractionPipeline } from '@xenova/transformers';\nimport { EvaluationParameters } from '../../../../types/evaluation';\nimport {\n DEFAULT_SEMANTIC_PASS_SCORE,\n EvaluationApproach,\n} from '../../constants';\n\nexport class SemanticEvaluator {\n // TODO(LLM-39): Refactor SemanticEvaluator into a singleton pattern.\n private static extractor: FeatureExtractionPipeline = null;\n\n async initialize(): Promise<void> {\n if (SemanticEvaluator.extractor) return;\n try {\n SemanticEvaluator.extractor = await loadSemanticModel();\n } catch (error) {\n console.error('Failed to load semantic evaluation model:', error);\n throw error;\n }\n }\n\n async performEvaluation(\n request: EvaluationRequest,\n ): Promise<EvaluationResult> {\n const threshold =\n request.evaluationParameters?.threshold ?? DEFAULT_SEMANTIC_PASS_SCORE;\n\n try {\n await this.initialize();\n\n // Split expectedOutcome by newlines to create keywords array\n const expectedKeywords = request.expectedOutcome\n ? request.expectedOutcome\n .split(/[\\n,]+/)\n .map(k => k.trim())\n .filter(k => k.length > 0)\n : [];\n\n const keywordMatches = await evaluateKeywordsSemantically(\n SemanticEvaluator.extractor,\n request.actualResponse,\n expectedKeywords,\n threshold,\n );\n\n const totalItems = keywordMatches.length;\n // calculate the overall score by averaging the score of the keyword matches\n const keywordScore = keywordMatches.reduce(\n (acc, curr) => acc + curr.evaluationApproachResult.score,\n 0,\n );\n const overallScore = totalItems > 0 ? keywordScore / totalItems : 0; // to avoid division by zero\n const passed = keywordMatches.every(match => match.found);\n\n const evaluationParameters = {\n approach: EvaluationApproach.SEMANTIC,\n threshold,\n } as EvaluationParameters;\n\n return {\n testCaseId: request.testCaseId,\n passed,\n keywordMatches,\n evaluationParameters,\n evaluationApproachResult: {\n score: overallScore,\n approachUsed: EvaluationApproach.SEMANTIC,\n },\n timestamp: new Date().toISOString(),\n };\n } catch (error) {\n console.error('Failed to perform semantic evaluation:', error);\n return {\n testCaseId: request.testCaseId,\n passed: false,\n keywordMatches: [],\n evaluationParameters: {\n approach: EvaluationApproach.SEMANTIC,\n threshold,\n },\n evaluationApproachResult: {\n score: 0,\n approachUsed: EvaluationApproach.SEMANTIC,\n },\n timestamp: new Date().toISOString(),\n };\n }\n }\n}\n"]}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { EvaluationApproach, EvaluationApproachValues } from "./constants";
|
|
2
|
+
const SELECT_ONLY_APPROACHES = [EvaluationApproach.EXACT];
|
|
3
|
+
export function getAllowedApproachesForFieldType(fieldType) {
|
|
4
|
+
if (fieldType === 'select') {
|
|
5
|
+
return SELECT_ONLY_APPROACHES;
|
|
6
|
+
}
|
|
7
|
+
return EvaluationApproachValues;
|
|
8
|
+
}
|
|
9
|
+
export function isApproachAllowedForFieldType(fieldType, approach) {
|
|
10
|
+
return getAllowedApproachesForFieldType(fieldType).includes(approach);
|
|
11
|
+
}
|
|
12
|
+
export function normalizeEvaluationParametersForField(fieldType, evaluationParameters) {
|
|
13
|
+
const allowedApproaches = getAllowedApproachesForFieldType(fieldType);
|
|
14
|
+
const fallbackApproach = allowedApproaches[0];
|
|
15
|
+
const rawApproach = evaluationParameters?.approach;
|
|
16
|
+
const approach = rawApproach && allowedApproaches.includes(rawApproach)
|
|
17
|
+
? rawApproach
|
|
18
|
+
: fallbackApproach;
|
|
19
|
+
return {
|
|
20
|
+
...evaluationParameters,
|
|
21
|
+
approach,
|
|
22
|
+
};
|
|
23
|
+
}
|
|
24
|
+
//# sourceMappingURL=field-evaluation-approach.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"field-evaluation-approach.js","sourceRoot":"","sources":["../../../src/lib/evaluation/field-evaluation-approach.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,kBAAkB,EAAE,wBAAwB,EAAE,MAAM,aAAa,CAAC;AAK3E,MAAM,sBAAsB,GAAyB,CAAC,kBAAkB,CAAC,KAAK,CAAC,CAAC;AAEhF,MAAM,UAAU,gCAAgC,CAC9C,SAA8B;IAE9B,IAAI,SAAS,KAAK,QAAQ,EAAE,CAAC;QAC3B,OAAO,sBAAsB,CAAC;IAChC,CAAC;IACD,OAAO,wBAAwB,CAAC;AAClC,CAAC;AAED,MAAM,UAAU,6BAA6B,CAC3C,SAA8B,EAC9B,QAA4B;IAE5B,OAAO,gCAAgC,CAAC,SAAS,CAAC,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;AACxE,CAAC;AAED,MAAM,UAAU,qCAAqC,CACnD,SAA8B,EAC9B,oBAA2C;IAE3C,MAAM,iBAAiB,GAAG,gCAAgC,CAAC,SAAS,CAAC,CAAC;IACtE,MAAM,gBAAgB,GAAG,iBAAiB,CAAC,CAAC,CAAC,CAAC;IAC9C,MAAM,WAAW,GAAG,oBAAoB,EAAE,QAAQ,CAAC;IACnD,MAAM,QAAQ,GACZ,WAAW,IAAI,iBAAiB,CAAC,QAAQ,CAAC,WAAW,CAAC;QACpD,CAAC,CAAC,WAAW;QACb,CAAC,CAAC,gBAAgB,CAAC;IAEvB,OAAO;QACL,GAAG,oBAAoB;QACvB,QAAQ;KACT,CAAC;AACJ,CAAC","sourcesContent":["import { EvaluationApproach, EvaluationApproachValues } from './constants';\nimport type { EvaluationParameters } from '../../types/evaluation';\n\nexport type EvaluationFieldType = 'text' | 'textarea' | 'chips-input' | 'select';\n\nconst SELECT_ONLY_APPROACHES: EvaluationApproach[] = [EvaluationApproach.EXACT];\n\nexport function getAllowedApproachesForFieldType(\n fieldType: EvaluationFieldType,\n): EvaluationApproach[] {\n if (fieldType === 'select') {\n return SELECT_ONLY_APPROACHES;\n }\n return EvaluationApproachValues;\n}\n\nexport function isApproachAllowedForFieldType(\n fieldType: EvaluationFieldType,\n approach: EvaluationApproach,\n): boolean {\n return getAllowedApproachesForFieldType(fieldType).includes(approach);\n}\n\nexport function normalizeEvaluationParametersForField(\n fieldType: EvaluationFieldType,\n evaluationParameters?: EvaluationParameters,\n): EvaluationParameters {\n const allowedApproaches = getAllowedApproachesForFieldType(fieldType);\n const fallbackApproach = allowedApproaches[0];\n const rawApproach = evaluationParameters?.approach;\n const approach =\n rawApproach && allowedApproaches.includes(rawApproach)\n ? rawApproach\n : fallbackApproach;\n\n return {\n ...evaluationParameters,\n approach,\n };\n}\n\n"]}
|
|
@@ -1,7 +1,3 @@
|
|
|
1
1
|
import { LLMEvaluationEngine } from "./evaluation-engine";
|
|
2
2
|
export { LLMEvaluationEngine };
|
|
3
|
-
export async function evaluateLLMResponse(request, callback) {
|
|
4
|
-
const engine = new LLMEvaluationEngine();
|
|
5
|
-
await engine.evaluateResponse(request, callback);
|
|
6
|
-
}
|
|
7
3
|
//# sourceMappingURL=index.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/lib/evaluation/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAQ1D,OAAO,EAAE,mBAAmB,EAAE,CAAC
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../../src/lib/evaluation/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,MAAM,qBAAqB,CAAC;AAQ1D,OAAO,EAAE,mBAAmB,EAAE,CAAC","sourcesContent":["import { LLMEvaluationEngine } from './evaluation-engine';\nimport type {\n EvaluationRequest,\n EvaluationResult,\n KeywordMatch,\n EvaluationCallback,\n} from './types';\n\nexport { LLMEvaluationEngine };\nexport type {\n EvaluationRequest,\n EvaluationResult,\n KeywordMatch,\n EvaluationCallback,\n};"]}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["import {\n EvaluationParameters,\n EvaluationApproachResult,\n} from '../../types/evaluation';\n\nexport interface EvaluationRequest {\n testCaseId: string;\n question: string;\n expectedOutcome: string;\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface EvaluationResult {\n testCaseId: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n timestamp?: string;\n evaluationParameters: EvaluationParameters;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport interface KeywordMatch {\n keyword: string;\n found: boolean;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport type EvaluationCallback = (result: EvaluationResult) => void;\n\nexport interface RougeKeywordDetails {\n rouge1: number;\n rougeL: number;\n scoreUsed: string;\n approach: string;\n}\n\nexport interface Rouge1OverallDetails {\n keywordsPassed: number;\n totalKeywords: number;\n passRate: string;\n thresholdUsed: number;\n approach: string;\n}\n"]}
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../src/lib/evaluation/types.ts"],"names":[],"mappings":"","sourcesContent":["import {\n EvaluationParameters,\n EvaluationApproachResult,\n} from '../../types/evaluation';\nimport type { ExpectedOutcomeFieldType } from '../../types/llm-test-runner';\n\nexport interface EvaluationRequest {\n testCaseId: string;\n question: string;\n expectedOutcome: string;\n actualResponse: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface FieldEvaluationInput {\n index: number;\n label: string;\n type: ExpectedOutcomeFieldType;\n expectedValue: string;\n evaluationParameters: EvaluationParameters;\n}\n\nexport interface EvaluationRequestV2 {\n testCaseId: string;\n question: string;\n actualResponse: string;\n fields: FieldEvaluationInput[];\n}\n\nexport interface EvaluationResult {\n testCaseId: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n fieldResults?: FieldEvaluationResult[];\n timestamp?: string;\n evaluationParameters?: EvaluationParameters;\n evaluationApproachResult?: EvaluationApproachResult;\n}\n\nexport interface FieldEvaluationResult {\n index: number;\n label: string;\n type: ExpectedOutcomeFieldType;\n expectedValue: string;\n passed: boolean;\n keywordMatches: KeywordMatch[];\n evaluationParameters: EvaluationParameters;\n evaluationApproachResult: EvaluationApproachResult;\n error?: string;\n}\n\nexport interface KeywordMatch {\n keyword: string;\n found: boolean;\n evaluationApproachResult: EvaluationApproachResult;\n}\n\nexport type EvaluationCallback = (result: EvaluationResult) => void;\n\nexport interface RougeKeywordDetails {\n rouge1: number;\n rougeL: number;\n scoreUsed: string;\n approach: string;\n}\n\nexport interface Rouge1OverallDetails {\n keywordsPassed: number;\n totalKeywords: number;\n passRate: string;\n thresholdUsed: number;\n approach: string;\n}\n"]}
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { serializeExpectedOutcome } from "../expected-outcome-serializer";
|
|
2
1
|
/**
|
|
3
2
|
* Escapes a CSV field by wrapping it in quotes if it contains special characters
|
|
4
3
|
* @param field - The field to escape
|
|
@@ -17,48 +16,63 @@ export function escapeCsvField(field) {
|
|
|
17
16
|
*/
|
|
18
17
|
export function exportTestResultsToCsv(testCases) {
|
|
19
18
|
const csvRows = [];
|
|
19
|
+
const maxFieldCount = testCases.reduce((max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length), 0);
|
|
20
20
|
// Add header row
|
|
21
21
|
const headers = [
|
|
22
22
|
'Question',
|
|
23
|
-
'Expected Keywords',
|
|
24
|
-
'Generated Keywords',
|
|
25
|
-
'Keywords Match',
|
|
26
23
|
'Response Time (s)',
|
|
27
|
-
'Evaluation Approach',
|
|
28
|
-
'Evaluation Score',
|
|
29
24
|
];
|
|
25
|
+
for (let i = 1; i <= maxFieldCount; i++) {
|
|
26
|
+
headers.push('Field Name');
|
|
27
|
+
headers.push('Expected Keywords');
|
|
28
|
+
headers.push('Generated Keywords');
|
|
29
|
+
headers.push('Evaluation Strategy');
|
|
30
|
+
headers.push('Passed Evaluation');
|
|
31
|
+
headers.push('Keyword Match');
|
|
32
|
+
headers.push('Score');
|
|
33
|
+
if (i < maxFieldCount) {
|
|
34
|
+
headers.push('');
|
|
35
|
+
}
|
|
36
|
+
}
|
|
30
37
|
csvRows.push(headers.join(','));
|
|
31
|
-
// Add data rows
|
|
38
|
+
// Add data rows (one row per test case)
|
|
32
39
|
testCases.forEach(testCase => {
|
|
33
|
-
const expectedOutcome = serializeExpectedOutcome(testCase.expectedOutcome || [], ' | ');
|
|
34
|
-
const evaluationApproach = testCase.evaluationParameters?.approach || '';
|
|
35
|
-
const score = testCase.evaluationResult?.evaluationApproachResult?.score;
|
|
36
|
-
const evaluationScore = score !== undefined ? score.toString() : '';
|
|
37
|
-
let generatedKeywords = '';
|
|
38
|
-
let keywordsMatch = '';
|
|
39
|
-
if (testCase.evaluationResult) {
|
|
40
|
-
const foundKeywords = testCase.evaluationResult.keywordMatches
|
|
41
|
-
.filter(match => match.found)
|
|
42
|
-
.map(match => match.keyword);
|
|
43
|
-
generatedKeywords = foundKeywords.join('; ');
|
|
44
|
-
// Calculate match percentages
|
|
45
|
-
const keywordMatchCount = testCase.evaluationResult.keywordMatches.filter(m => m.found).length;
|
|
46
|
-
const totalKeywords = testCase.evaluationResult.keywordMatches.length;
|
|
47
|
-
keywordsMatch =
|
|
48
|
-
totalKeywords > 0 ? `${keywordMatchCount}/${totalKeywords}` : 'N/A';
|
|
49
|
-
}
|
|
50
40
|
const responseTime = testCase.responseTime
|
|
51
41
|
? (testCase.responseTime / 1000).toFixed(3)
|
|
52
42
|
: 'N/A';
|
|
53
|
-
const row = [
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
43
|
+
const row = [escapeCsvField(testCase.question), responseTime];
|
|
44
|
+
for (let i = 0; i < maxFieldCount; i++) {
|
|
45
|
+
const field = testCase.expectedOutcome?.[i];
|
|
46
|
+
const fieldResult = testCase.evaluationResult?.fieldResults?.find(result => result.index === i);
|
|
47
|
+
const expectedKeywords = fieldResult?.expectedValue ??
|
|
48
|
+
(field
|
|
49
|
+
? field.type === 'chips-input'
|
|
50
|
+
? field.value.join(', ')
|
|
51
|
+
: field.value
|
|
52
|
+
: '');
|
|
53
|
+
const generatedKeywords = (fieldResult?.keywordMatches || [])
|
|
54
|
+
.filter(match => match.found)
|
|
55
|
+
.map(match => match.keyword)
|
|
56
|
+
.join('; ');
|
|
57
|
+
const matchedCount = (fieldResult?.keywordMatches || []).filter(match => match.found).length;
|
|
58
|
+
const totalMatches = fieldResult?.keywordMatches?.length || 0;
|
|
59
|
+
const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';
|
|
60
|
+
const score = fieldResult?.evaluationApproachResult?.score !== undefined
|
|
61
|
+
? fieldResult.evaluationApproachResult.score.toFixed(2)
|
|
62
|
+
: '';
|
|
63
|
+
row.push(escapeCsvField(field?.label || ''));
|
|
64
|
+
row.push(escapeCsvField(expectedKeywords || ''));
|
|
65
|
+
row.push(escapeCsvField(generatedKeywords));
|
|
66
|
+
row.push(escapeCsvField(fieldResult?.evaluationParameters.approach ||
|
|
67
|
+
field?.evaluationParameters?.approach ||
|
|
68
|
+
''));
|
|
69
|
+
row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');
|
|
70
|
+
row.push(keywordMatch);
|
|
71
|
+
row.push(score);
|
|
72
|
+
if (i < maxFieldCount - 1) {
|
|
73
|
+
row.push('');
|
|
74
|
+
}
|
|
75
|
+
}
|
|
62
76
|
csvRows.push(row.join(','));
|
|
63
77
|
});
|
|
64
78
|
return csvRows.join('\n');
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"test-results-csv.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-results-csv.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"test-results-csv.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-results-csv.ts"],"names":[],"mappings":"AAEA;;;;GAIG;AACH,MAAM,UAAU,cAAc,CAAC,KAAa;IAC1C,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,IAAI,KAAK,CAAC,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC;QACvE,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC;IAC1C,CAAC;IACD,OAAO,KAAK,CAAC;AACf,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,sBAAsB,CAAC,SAAqB;IAC1D,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,MAAM,aAAa,GAAG,SAAS,CAAC,MAAM,CACpC,CAAC,GAAG,EAAE,QAAQ,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,GAAG,EAAE,CAAC,QAAQ,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,EACzE,CAAC,CACF,CAAC;IAEF,iBAAiB;IACjB,MAAM,OAAO,GAAa;QACxB,UAAU;QACV,mBAAmB;KACpB,CAAC;IACF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,IAAI,aAAa,EAAE,CAAC,EAAE,EAAE,CAAC;QACxC,OAAO,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;QAC3B,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAClC,OAAO,CAAC,IAAI,CAAC,oBAAoB,CAAC,CAAC;QACnC,OAAO,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QACpC,OAAO,CAAC,IAAI,CAAC,mBAAmB,CAAC,CAAC;QAClC,OAAO,CAAC,IAAI,CAAC,eAAe,CAAC,CAAC;QAC9B,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;QACtB,IAAI,CAAC,GAAG,aAAa,EAAE,CAAC;YACtB,OAAO,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QACnB,CAAC;IACH,CAAC;IACD,OAAO,CAAC,IAAI,CAAC,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAEhC,wCAAwC;IACxC,SAAS,CAAC,OAAO,CAAC,QAAQ,CAAC,EAAE;QAC3B,MAAM,YAAY,GAAG,QAAQ,CAAC,YAAY;YACxC,CAAC,CAAC,CAAC,QAAQ,CAAC,YAAY,GAAG,IAAI,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC;YAC3C,CAAC,CAAC,KAAK,CAAC;QACV,MAAM,GAAG,GAAa,CAAC,cAAc,CAAC,QAAQ,CAAC,QAAQ,CAAC,EAAE,YAAY,CAAC,CAAC;QAExE,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,aAAa,EAAE,CAAC,EAAE,EAAE,CAAC;YACvC,MAAM,KAAK,GAAG,QAAQ,CAAC,eAAe,EAAE,CAAC,CAAC,CAAC,CAAC;YAC5C,MAAM,WAAW,GAAG,QAAQ,CAAC,gBAAgB,EAAE,YAAY,EAAE,IAAI,CAC/D,MAAM,CAAC,EAAE,CAAC,MAAM,CAAC,KAAK,KAAK,CAAC,CAC7B,CAAC;YAEF,MAAM,gBAAgB,GACpB,WAAW,EAAE,aAAa;gBAC1B,CAAC,KAAK;oBACJ,CAAC,CAAC,KAAK,CAAC,IAAI,KAAK,aAAa;wBAC5B,CAAC,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC;wBACxB,CAAC,CAAC,KAAK,CAAC,KAAK;oBACf,CAAC,CAAC,EAAE,CAAC,CAAC;YACV,MAAM,iBAAiB,GAAG,CAAC,WAAW,EAAE,cAAc,IAAI,EAAE,CAAC;iBAC1D,MAAM,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC;iBAC5B,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC;iBAC3B,IAAI,CAAC,IAAI,CAAC,CAAC;YACd,MAAM,YAAY,GAAG,CAAC,WAAW,EAAE,cAAc,IAAI,EAAE,CAAC,CAAC,MAAM,CAC7D,KAAK,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CACrB,CAAC,MAAM,CAAC;YACT,MAAM,YAAY,GAAG,WAAW,EAAE,cAAc,EAAE,MAAM,IAAI,CAAC,CAAC;YAC9D,MAAM,YAAY,GAAG,YAAY,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,YAAY,IAAI,YAAY,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;YAC/E,MAAM,KAAK,GACT,WAAW,EAAE,wBAAwB,EAAE,KAAK,KAAK,SAAS;gBACxD,CAAC,CAAC,WAAW,CAAC,wBAAwB,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC;gBACvD,CAAC,CAAC,EAAE,CAAC;YAET,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,KAAK,EAAE,KAAK,IAAI,EAAE,CAAC,CAAC,CAAC;YAC7C,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,gBAAgB,IAAI,EAAE,CAAC,CAAC,CAAC;YACjD,GAAG,CAAC,IAAI,CAAC,cAAc,CAAC,iBAAiB,CAAC,CAAC,CAAC;YAC5C,GAAG,CAAC,IAAI,CACN,cAAc,CACZ,WAAW,EAAE,oBAAoB,CAAC,QAAQ;gBACxC,KAAK,EAAE,oBAAoB,EAAE,QAAQ;gBACrC,EAAE,CACL,CACF,CAAC;YACF,GAAG,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC;YACrE,GAAG,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;YACvB,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YAEhB,IAAI,CAAC,GAAG,aAAa,GAAG,CAAC,EAAE,CAAC;gBAC1B,GAAG,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACf,CAAC;QACH,CAAC;QAED,OAAO,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IAC9B,CAAC,CAAC,CAAC;IAEH,OAAO,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC5B,CAAC","sourcesContent":["import { TestCase } from '../../types/llm-test-runner';\n\n/**\n * Escapes a CSV field by wrapping it in quotes if it contains special characters\n * @param field - The field to escape\n * @returns Escaped field string\n */\nexport function escapeCsvField(field: string): string {\n if (field.includes(',') || field.includes('\"') || field.includes('\\n')) {\n return `\"${field.replace(/\"/g, '\"\"')}\"`;\n }\n return field;\n}\n\n/**\n * Exports test results to a CSV string\n * @param testCases - Array of test cases with results to export\n * @returns CSV string representation of the test results\n */\nexport function exportTestResultsToCsv(testCases: TestCase[]): string {\n const csvRows: string[] = [];\n const maxFieldCount = testCases.reduce(\n (max, testCase) => Math.max(max, (testCase.expectedOutcome || []).length),\n 0,\n );\n\n // Add header row\n const headers: string[] = [\n 'Question',\n 'Response Time (s)',\n ];\n for (let i = 1; i <= maxFieldCount; i++) {\n headers.push('Field Name');\n headers.push('Expected Keywords');\n headers.push('Generated Keywords');\n headers.push('Evaluation Strategy');\n headers.push('Passed Evaluation');\n headers.push('Keyword Match');\n headers.push('Score');\n if (i < maxFieldCount) {\n headers.push('');\n }\n }\n csvRows.push(headers.join(','));\n\n // Add data rows (one row per test case)\n testCases.forEach(testCase => {\n const responseTime = testCase.responseTime\n ? (testCase.responseTime / 1000).toFixed(3)\n : 'N/A';\n const row: string[] = [escapeCsvField(testCase.question), responseTime];\n\n for (let i = 0; i < maxFieldCount; i++) {\n const field = testCase.expectedOutcome?.[i];\n const fieldResult = testCase.evaluationResult?.fieldResults?.find(\n result => result.index === i,\n );\n\n const expectedKeywords =\n fieldResult?.expectedValue ??\n (field\n ? field.type === 'chips-input'\n ? field.value.join(', ')\n : field.value\n : '');\n const generatedKeywords = (fieldResult?.keywordMatches || [])\n .filter(match => match.found)\n .map(match => match.keyword)\n .join('; ');\n const matchedCount = (fieldResult?.keywordMatches || []).filter(\n match => match.found,\n ).length;\n const totalMatches = fieldResult?.keywordMatches?.length || 0;\n const keywordMatch = totalMatches > 0 ? `${matchedCount}/${totalMatches}` : '';\n const score =\n fieldResult?.evaluationApproachResult?.score !== undefined\n ? fieldResult.evaluationApproachResult.score.toFixed(2)\n : '';\n\n row.push(escapeCsvField(field?.label || ''));\n row.push(escapeCsvField(expectedKeywords || ''));\n row.push(escapeCsvField(generatedKeywords));\n row.push(\n escapeCsvField(\n fieldResult?.evaluationParameters.approach ||\n field?.evaluationParameters?.approach ||\n '',\n ),\n );\n row.push(fieldResult ? (fieldResult.passed ? 'TRUE' : 'FALSE') : '');\n row.push(keywordMatch);\n row.push(score);\n\n if (i < maxFieldCount - 1) {\n row.push('');\n }\n }\n\n csvRows.push(row.join(','));\n });\n\n return csvRows.join('\\n');\n}\n\n"]}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"test-suite-exporter.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-suite-exporter.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"test-suite-exporter.js","sourceRoot":"","sources":["../../../src/lib/import-export/test-suite-exporter.ts"],"names":[],"mappings":"AAQA;;;;GAIG;AACH,MAAM,UAAU,qBAAqB,CAAC,SAAqB;IACzD,MAAM,UAAU,GAA0B,SAAS,CAAC,GAAG,CAAC,QAAQ,CAAC,EAAE,CAAC,CAAC;QACnE,EAAE,EAAE,QAAQ,CAAC,EAAE;QACf,QAAQ,EAAE,QAAQ,CAAC,QAAQ;QAC3B,eAAe,EAAE,QAAQ,CAAC,eAAe;KAC1C,CAAC,CAAC,CAAC;IAEJ,OAAO,IAAI,CAAC,SAAS,CAAC,UAAU,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;AAC7C,CAAC","sourcesContent":["import { ExpectedOutcomeField, TestCase } from '../../types/llm-test-runner';\n\nexport interface TestSuiteExportData {\n id: string;\n question: string;\n expectedOutcome: ExpectedOutcomeField[];\n}\n\n/**\n * Formats test cases as a JSON string suitable for saving as a test suite\n * @param testCases - Array of test cases to format\n * @returns JSON string representation of the test suite\n */\nexport function formatTestSuiteAsJson(testCases: TestCase[]): string {\n const exportData: TestSuiteExportData[] = testCases.map(testCase => ({\n id: testCase.id,\n question: testCase.question,\n expectedOutcome: testCase.expectedOutcome,\n }));\n\n return JSON.stringify(exportData, null, 2);\n}\n"]}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { v4 as uuidv4 } from "uuid";
|
|
2
|
-
import {
|
|
2
|
+
import { normalizeEvaluationParametersForField } from "../evaluation/field-evaluation-approach";
|
|
3
3
|
export const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
4
4
|
{
|
|
5
5
|
type: 'textarea',
|
|
@@ -8,6 +8,12 @@ export const DEFAULT_EXPECTED_OUTCOME_SCHEMA = [
|
|
|
8
8
|
rows: 2,
|
|
9
9
|
},
|
|
10
10
|
];
|
|
11
|
+
function normalizeExpectedOutcomeField(field) {
|
|
12
|
+
return {
|
|
13
|
+
...field,
|
|
14
|
+
evaluationParameters: normalizeEvaluationParametersForField(field.type, field.evaluationParameters),
|
|
15
|
+
};
|
|
16
|
+
}
|
|
11
17
|
/**
|
|
12
18
|
* Creates a new test case with default values
|
|
13
19
|
* @returns A new TestCase object with a unique ID
|
|
@@ -17,9 +23,6 @@ export function createTestCase(expectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_
|
|
|
17
23
|
id: uuidv4(),
|
|
18
24
|
question: '',
|
|
19
25
|
expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),
|
|
20
|
-
evaluationParameters: {
|
|
21
|
-
approach: EvaluationApproach.EXACT,
|
|
22
|
-
},
|
|
23
26
|
isRunning: false,
|
|
24
27
|
};
|
|
25
28
|
}
|
|
@@ -29,35 +32,35 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
29
32
|
return {
|
|
30
33
|
type: 'text',
|
|
31
34
|
label: schemaField.label,
|
|
32
|
-
required: schemaField.required,
|
|
33
35
|
placeholder: schemaField.placeholder,
|
|
34
36
|
value: '',
|
|
37
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
35
38
|
};
|
|
36
39
|
case 'textarea':
|
|
37
40
|
return {
|
|
38
41
|
type: 'textarea',
|
|
39
42
|
label: schemaField.label,
|
|
40
|
-
required: schemaField.required,
|
|
41
43
|
placeholder: schemaField.placeholder,
|
|
42
44
|
rows: schemaField.rows,
|
|
43
45
|
value: '',
|
|
46
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
44
47
|
};
|
|
45
48
|
case 'chips-input':
|
|
46
49
|
return {
|
|
47
50
|
type: 'chips-input',
|
|
48
51
|
label: schemaField.label,
|
|
49
|
-
required: schemaField.required,
|
|
50
52
|
placeholder: schemaField.placeholder,
|
|
51
53
|
value: [],
|
|
54
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
52
55
|
};
|
|
53
56
|
case 'select':
|
|
54
57
|
return {
|
|
55
58
|
type: 'select',
|
|
56
59
|
label: schemaField.label,
|
|
57
|
-
required: schemaField.required,
|
|
58
60
|
placeholder: schemaField.placeholder,
|
|
59
61
|
value: '',
|
|
60
62
|
options: schemaField.options,
|
|
63
|
+
evaluationParameters: normalizeEvaluationParametersForField(schemaField.type, schemaField.evaluationParameters),
|
|
61
64
|
};
|
|
62
65
|
default: {
|
|
63
66
|
const _exhaustiveCheck = schemaField;
|
|
@@ -68,31 +71,18 @@ function createExpectedOutcomeFieldFromSchema(schemaField) {
|
|
|
68
71
|
export function createExpectedOutcomeFromSchema(expectedOutcomeSchema) {
|
|
69
72
|
return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);
|
|
70
73
|
}
|
|
71
|
-
export function migrateLegacyExpectedOutcomeString(value) {
|
|
72
|
-
return [
|
|
73
|
-
{
|
|
74
|
-
type: 'textarea',
|
|
75
|
-
label: 'Expected Outcome',
|
|
76
|
-
value,
|
|
77
|
-
},
|
|
78
|
-
];
|
|
79
|
-
}
|
|
80
74
|
/**
|
|
81
75
|
* Creates a runtime test case from validated input data.
|
|
82
|
-
* The input is expected to already satisfy `TestCaseInput
|
|
83
|
-
* and this function only performs normalization/defaulting
|
|
76
|
+
* The input is expected to already satisfy `TestCaseInput`,
|
|
77
|
+
* and this function only performs normalization/defaulting.
|
|
84
78
|
*
|
|
85
79
|
* @param data - Validated test case input
|
|
86
80
|
* @returns A normalized TestCase object with runtime defaults applied
|
|
87
81
|
*/
|
|
88
82
|
export function createTestCaseFromInput(data) {
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
expectedOutcome
|
|
92
|
-
}
|
|
93
|
-
else {
|
|
94
|
-
expectedOutcome = data.expectedOutcome;
|
|
95
|
-
}
|
|
96
|
-
return { ...data, expectedOutcome };
|
|
83
|
+
return {
|
|
84
|
+
...data,
|
|
85
|
+
expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),
|
|
86
|
+
};
|
|
97
87
|
}
|
|
98
88
|
//# sourceMappingURL=test-case-factory.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"test-case-factory.js","sourceRoot":"","sources":["../../../src/lib/test-cases/test-case-factory.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,EAAE,IAAI,MAAM,EAAE,MAAM,MAAM,CAAC;
|
|
1
|
+
{"version":3,"file":"test-case-factory.js","sourceRoot":"","sources":["../../../src/lib/test-cases/test-case-factory.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,EAAE,IAAI,MAAM,EAAE,MAAM,MAAM,CAAC;AASpC,OAAO,EAAE,qCAAqC,EAAE,MAAM,yCAAyC,CAAC;AAEhG,MAAM,CAAC,MAAM,+BAA+B,GAA0B;IACpE;QACE,IAAI,EAAE,UAAU;QAChB,KAAK,EAAE,kBAAkB;QACzB,WAAW,EAAE,2BAA2B;QACxC,IAAI,EAAE,CAAC;KACR;CACF,CAAC;AAEF,SAAS,6BAA6B,CACpC,KAA2B;IAE3B,OAAO;QACL,GAAG,KAAK;QACR,oBAAoB,EAAE,qCAAqC,CACzD,KAAK,CAAC,IAAI,EACV,KAAK,CAAC,oBAAoB,CAC3B;KACF,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,cAAc,CAC5B,wBAA+C,+BAA+B;IAE9E,OAAO;QACL,EAAE,EAAE,MAAM,EAAE;QACZ,QAAQ,EAAE,EAAE;QACZ,eAAe,EAAE,+BAA+B,CAAC,qBAAqB,CAAC;QACvE,SAAS,EAAE,KAAK;KACjB,CAAC;AACJ,CAAC;AAED,SAAS,oCAAoC,CAC3C,WAAuC;IAEvC,QAAQ,WAAW,CAAC,IAAI,EAAE,CAAC;QACzB,KAAK,MAAM;YACT,OAAO;gBACL,IAAI,EAAE,MAAM;gBACZ,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,KAAK,EAAE,EAAE;gBACT,oBAAoB,EAAE,qCAAqC,CACzD,WAAW,CAAC,IAAI,EAChB,WAAW,CAAC,oBAAoB,CACjC;aACF,CAAC;QAEJ,KAAK,UAAU;YACb,OAAO;gBACL,IAAI,EAAE,UAAU;gBAChB,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,IAAI,EAAE,WAAW,CAAC,IAAI;gBACtB,KAAK,EAAE,EAAE;gBACT,oBAAoB,EAAE,qCAAqC,CACzD,WAAW,CAAC,IAAI,EAChB,WAAW,CAAC,oBAAoB,CACjC;aACF,CAAC;QAEJ,KAAK,aAAa;YAChB,OAAO;gBACL,IAAI,EAAE,aAAa;gBACnB,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,KAAK,EAAE,EAAE;gBACT,oBAAoB,EAAE,qCAAqC,CACzD,WAAW,CAAC,IAAI,EAChB,WAAW,CAAC,oBAAoB,CACjC;aACF,CAAC;QAEJ,KAAK,QAAQ;YACX,OAAO;gBACL,IAAI,EAAE,QAAQ;gBACd,KAAK,EAAE,WAAW,CAAC,KAAK;gBACxB,WAAW,EAAE,WAAW,CAAC,WAAW;gBACpC,KAAK,EAAE,EAAE;gBACT,OAAO,EAAE,WAAW,CAAC,OAAO;gBAC5B,oBAAoB,EAAE,qCAAqC,CACzD,WAAW,CAAC,IAAI,EAChB,WAAW,CAAC,oBAAoB,CAC6B;aAChE,CAAC;QAEJ,OAAO,CAAC,CAAC,CAAC;YACR,MAAM,gBAAgB,GAAU,WAAW,CAAC;YAC5C,OAAO,gBAAgB,CAAC;QAC1B,CAAC;IACH,CAAC;AACH,CAAC;AAED,MAAM,UAAU,+BAA+B,CAC7C,qBAA4C;IAE5C,OAAO,qBAAqB,CAAC,GAAG,CAAC,oCAAoC,CAAC,CAAC;AACzE,CAAC;AAED;;;;;;;GAOG;AACH,MAAM,UAAU,uBAAuB,CAAC,IAAmB;IACzD,OAAO;QACL,GAAG,IAAI;QACP,eAAe,EAAE,IAAI,CAAC,eAAe,CAAC,GAAG,CAAC,6BAA6B,CAAC;KACzE,CAAC;AACJ,CAAC","sourcesContent":["import { v4 as uuidv4 } from 'uuid';\nimport {\n ExpectedOutcomeField,\n ExpectedOutcomeSchema,\n ExpectedOutcomeSchemaField,\n TestCase,\n TestCaseInput,\n} from '../../types/llm-test-runner';\nimport { EvaluationApproach } from '../evaluation/constants';\nimport { normalizeEvaluationParametersForField } from '../evaluation/field-evaluation-approach';\n\nexport const DEFAULT_EXPECTED_OUTCOME_SCHEMA: ExpectedOutcomeSchema = [\n {\n type: 'textarea',\n label: 'Expected Outcome',\n placeholder: 'Enter expected outcome...',\n rows: 2,\n },\n];\n\nfunction normalizeExpectedOutcomeField(\n field: ExpectedOutcomeField,\n): ExpectedOutcomeField {\n return {\n ...field,\n evaluationParameters: normalizeEvaluationParametersForField(\n field.type,\n field.evaluationParameters,\n ),\n };\n}\n\n/**\n * Creates a new test case with default values\n * @returns A new TestCase object with a unique ID\n */\nexport function createTestCase(\n expectedOutcomeSchema: ExpectedOutcomeSchema = DEFAULT_EXPECTED_OUTCOME_SCHEMA,\n): TestCase {\n return {\n id: uuidv4(),\n question: '',\n expectedOutcome: createExpectedOutcomeFromSchema(expectedOutcomeSchema),\n isRunning: false,\n };\n}\n\nfunction createExpectedOutcomeFieldFromSchema(\n schemaField: ExpectedOutcomeSchemaField,\n): ExpectedOutcomeField {\n switch (schemaField.type) {\n case 'text':\n return {\n type: 'text',\n label: schemaField.label,\n placeholder: schemaField.placeholder,\n value: '',\n evaluationParameters: normalizeEvaluationParametersForField(\n schemaField.type,\n schemaField.evaluationParameters,\n ),\n };\n\n case 'textarea':\n return {\n type: 'textarea',\n label: schemaField.label,\n placeholder: schemaField.placeholder,\n rows: schemaField.rows,\n value: '',\n evaluationParameters: normalizeEvaluationParametersForField(\n schemaField.type,\n schemaField.evaluationParameters,\n ),\n };\n\n case 'chips-input':\n return {\n type: 'chips-input',\n label: schemaField.label,\n placeholder: schemaField.placeholder,\n value: [],\n evaluationParameters: normalizeEvaluationParametersForField(\n schemaField.type,\n schemaField.evaluationParameters,\n ),\n };\n\n case 'select':\n return {\n type: 'select',\n label: schemaField.label,\n placeholder: schemaField.placeholder,\n value: '',\n options: schemaField.options,\n evaluationParameters: normalizeEvaluationParametersForField(\n schemaField.type,\n schemaField.evaluationParameters,\n ) as { approach: EvaluationApproach.EXACT; threshold?: number },\n };\n\n default: {\n const _exhaustiveCheck: never = schemaField;\n return _exhaustiveCheck;\n }\n }\n}\n\nexport function createExpectedOutcomeFromSchema(\n expectedOutcomeSchema: ExpectedOutcomeSchema,\n): ExpectedOutcomeField[] {\n return expectedOutcomeSchema.map(createExpectedOutcomeFieldFromSchema);\n}\n\n/**\n * Creates a runtime test case from validated input data.\n * The input is expected to already satisfy `TestCaseInput`,\n * and this function only performs normalization/defaulting.\n *\n * @param data - Validated test case input\n * @returns A normalized TestCase object with runtime defaults applied\n */\nexport function createTestCaseFromInput(data: TestCaseInput): TestCase {\n return {\n ...data,\n expectedOutcome: data.expectedOutcome.map(normalizeExpectedOutcomeField),\n };\n}\n"]}
|
|
@@ -1,16 +1,67 @@
|
|
|
1
|
+
import { normalizeEvaluationParametersForField } from "../evaluation/field-evaluation-approach";
|
|
2
|
+
export function applyExpectedOutcomeChange(testCase, change) {
|
|
3
|
+
const { index } = change;
|
|
4
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
5
|
+
const target = expectedOutcome[index];
|
|
6
|
+
if (!target) {
|
|
7
|
+
return testCase;
|
|
8
|
+
}
|
|
9
|
+
switch (change.operation) {
|
|
10
|
+
case 'set-value': {
|
|
11
|
+
if (target.type === 'chips-input') {
|
|
12
|
+
return testCase;
|
|
13
|
+
}
|
|
14
|
+
expectedOutcome[index] = {
|
|
15
|
+
...target,
|
|
16
|
+
value: change.value,
|
|
17
|
+
};
|
|
18
|
+
return { ...testCase, expectedOutcome };
|
|
19
|
+
}
|
|
20
|
+
case 'add-chip': {
|
|
21
|
+
if (target.type !== 'chips-input') {
|
|
22
|
+
return testCase;
|
|
23
|
+
}
|
|
24
|
+
expectedOutcome[index] = {
|
|
25
|
+
...target,
|
|
26
|
+
value: [...target.value, change.value],
|
|
27
|
+
};
|
|
28
|
+
return { ...testCase, expectedOutcome };
|
|
29
|
+
}
|
|
30
|
+
case 'remove-chip': {
|
|
31
|
+
if (target.type !== 'chips-input') {
|
|
32
|
+
return testCase;
|
|
33
|
+
}
|
|
34
|
+
expectedOutcome[index] = {
|
|
35
|
+
...target,
|
|
36
|
+
value: target.value.filter(chip => chip !== change.value),
|
|
37
|
+
};
|
|
38
|
+
return { ...testCase, expectedOutcome };
|
|
39
|
+
}
|
|
40
|
+
case 'set-evaluation-approach':
|
|
41
|
+
return updateExpectedOutcomeFieldApproach(testCase, index, change.value);
|
|
42
|
+
}
|
|
43
|
+
}
|
|
1
44
|
/**
|
|
2
|
-
* Updates the evaluation approach for a
|
|
3
|
-
*
|
|
4
|
-
* @param approach - The new evaluation approach
|
|
5
|
-
* @returns Updated test case with the new evaluation approach
|
|
45
|
+
* Updates the evaluation approach for a specific expected outcome field.
|
|
46
|
+
* Select fields always use exact matching.
|
|
6
47
|
*/
|
|
7
|
-
export function
|
|
48
|
+
export function updateExpectedOutcomeFieldApproach(testCase, fieldIndex, approach) {
|
|
49
|
+
const expectedOutcome = [...(testCase.expectedOutcome || [])];
|
|
50
|
+
const target = expectedOutcome[fieldIndex];
|
|
51
|
+
if (!target) {
|
|
52
|
+
return testCase;
|
|
53
|
+
}
|
|
54
|
+
const currentEvaluationParameters = target.evaluationParameters;
|
|
55
|
+
expectedOutcome[fieldIndex] = {
|
|
56
|
+
...target,
|
|
57
|
+
evaluationParameters: normalizeEvaluationParametersForField(target.type, {
|
|
58
|
+
...currentEvaluationParameters,
|
|
59
|
+
approach,
|
|
60
|
+
}),
|
|
61
|
+
};
|
|
8
62
|
return {
|
|
9
63
|
...testCase,
|
|
10
|
-
|
|
11
|
-
...testCase.evaluationParameters,
|
|
12
|
-
approach: approach,
|
|
13
|
-
},
|
|
64
|
+
expectedOutcome,
|
|
14
65
|
};
|
|
15
66
|
}
|
|
16
67
|
//# sourceMappingURL=test-case-mutations.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"test-case-mutations.js","sourceRoot":"","sources":["../../../src/lib/test-cases/test-case-mutations.ts"],"names":[],"mappings":"
|
|
1
|
+
{"version":3,"file":"test-case-mutations.js","sourceRoot":"","sources":["../../../src/lib/test-cases/test-case-mutations.ts"],"names":[],"mappings":"AAEA,OAAO,EAAE,qCAAqC,EAAE,MAAM,yCAAyC,CAAC;AAwBhG,MAAM,UAAU,0BAA0B,CACxC,QAAkB,EAClB,MAA6B;IAE7B,MAAM,EAAE,KAAK,EAAE,GAAG,MAAM,CAAC;IACzB,MAAM,eAAe,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,CAAC;IAC9D,MAAM,MAAM,GAAG,eAAe,CAAC,KAAK,CAAC,CAAC;IAEtC,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,QAAQ,MAAM,CAAC,SAAS,EAAE,CAAC;QACzB,KAAK,WAAW,CAAC,CAAC,CAAC;YACjB,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;gBAClC,OAAO,QAAQ,CAAC;YAClB,CAAC;YACD,eAAe,CAAC,KAAK,CAAC,GAAG;gBACvB,GAAG,MAAM;gBACT,KAAK,EAAE,MAAM,CAAC,KAAK;aACpB,CAAC;YACF,OAAO,EAAE,GAAG,QAAQ,EAAE,eAAe,EAAE,CAAC;QAC1C,CAAC;QACD,KAAK,UAAU,CAAC,CAAC,CAAC;YAChB,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;gBAClC,OAAO,QAAQ,CAAC;YAClB,CAAC;YACD,eAAe,CAAC,KAAK,CAAC,GAAG;gBACvB,GAAG,MAAM;gBACT,KAAK,EAAE,CAAC,GAAG,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC;aACvC,CAAC;YACF,OAAO,EAAE,GAAG,QAAQ,EAAE,eAAe,EAAE,CAAC;QAC1C,CAAC;QACD,KAAK,aAAa,CAAC,CAAC,CAAC;YACnB,IAAI,MAAM,CAAC,IAAI,KAAK,aAAa,EAAE,CAAC;gBAClC,OAAO,QAAQ,CAAC;YAClB,CAAC;YACD,eAAe,CAAC,KAAK,CAAC,GAAG;gBACvB,GAAG,MAAM;gBACT,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI,KAAK,MAAM,CAAC,KAAK,CAAC;aAC1D,CAAC;YACF,OAAO,EAAE,GAAG,QAAQ,EAAE,eAAe,EAAE,CAAC;QAC1C,CAAC;QACD,KAAK,yBAAyB;YAC5B,OAAO,kCAAkC,CAAC,QAAQ,EAAE,KAAK,EAAE,MAAM,CAAC,KAAK,CAAC,CAAC;IAC7E,CAAC;AACH,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,kCAAkC,CAChD,QAAkB,EAClB,UAAkB,EAClB,QAA4B;IAE5B,MAAM,eAAe,GAAG,CAAC,GAAG,CAAC,QAAQ,CAAC,eAAe,IAAI,EAAE,CAAC,CAAC,CAAC;IAC9D,MAAM,MAAM,GAAG,eAAe,CAAC,UAAU,CAAC,CAAC;IAE3C,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,OAAO,QAAQ,CAAC;IAClB,CAAC;IAED,MAAM,2BAA2B,GAAG,MAAM,CAAC,oBAAoB,CAAC;IAChE,eAAe,CAAC,UAAU,CAAC,GAAG;QAC5B,GAAG,MAAM;QACT,oBAAoB,EAAE,qCAAqC,CAAC,MAAM,CAAC,IAAI,EAAE;YACvE,GAAG,2BAA2B;YAC9B,QAAQ;SACT,CAAC;KACH,CAAC;IAEF,OAAO;QACL,GAAG,QAAQ;QACX,eAAe;KAChB,CAAC;AACJ,CAAC","sourcesContent":["import { TestCase } from '../../types/llm-test-runner';\nimport { EvaluationApproach } from '../evaluation/constants';\nimport { normalizeEvaluationParametersForField } from '../evaluation/field-evaluation-approach';\n\nexport type ExpectedOutcomeChange =\n | {\n index: number;\n operation: 'set-value';\n value: string;\n }\n | {\n index: number;\n operation: 'add-chip';\n value: string;\n }\n | {\n index: number;\n operation: 'remove-chip';\n value: string;\n }\n | {\n index: number;\n operation: 'set-evaluation-approach';\n value: EvaluationApproach;\n };\n\nexport function applyExpectedOutcomeChange(\n testCase: TestCase,\n change: ExpectedOutcomeChange,\n): TestCase {\n const { index } = change;\n const expectedOutcome = [...(testCase.expectedOutcome || [])];\n const target = expectedOutcome[index];\n\n if (!target) {\n return testCase;\n }\n\n switch (change.operation) {\n case 'set-value': {\n if (target.type === 'chips-input') {\n return testCase;\n }\n expectedOutcome[index] = {\n ...target,\n value: change.value,\n };\n return { ...testCase, expectedOutcome };\n }\n case 'add-chip': {\n if (target.type !== 'chips-input') {\n return testCase;\n }\n expectedOutcome[index] = {\n ...target,\n value: [...target.value, change.value],\n };\n return { ...testCase, expectedOutcome };\n }\n case 'remove-chip': {\n if (target.type !== 'chips-input') {\n return testCase;\n }\n expectedOutcome[index] = {\n ...target,\n value: target.value.filter(chip => chip !== change.value),\n };\n return { ...testCase, expectedOutcome };\n }\n case 'set-evaluation-approach':\n return updateExpectedOutcomeFieldApproach(testCase, index, change.value);\n }\n}\n\n/**\n * Updates the evaluation approach for a specific expected outcome field.\n * Select fields always use exact matching.\n */\nexport function updateExpectedOutcomeFieldApproach(\n testCase: TestCase,\n fieldIndex: number,\n approach: EvaluationApproach,\n): TestCase {\n const expectedOutcome = [...(testCase.expectedOutcome || [])];\n const target = expectedOutcome[fieldIndex];\n\n if (!target) {\n return testCase;\n }\n\n const currentEvaluationParameters = target.evaluationParameters;\n expectedOutcome[fieldIndex] = {\n ...target,\n evaluationParameters: normalizeEvaluationParametersForField(target.type, {\n ...currentEvaluationParameters,\n approach,\n }),\n };\n\n return {\n ...testCase,\n expectedOutcome,\n };\n}\n"]}
|
|
@@ -1,28 +1,46 @@
|
|
|
1
1
|
import { z } from "zod";
|
|
2
|
+
import { EvaluationApproach } from "../lib/evaluation/constants";
|
|
3
|
+
import { isApproachAllowedForFieldType } from "../lib/evaluation/field-evaluation-approach";
|
|
2
4
|
const nonEmptyString = z.string().trim().min(1);
|
|
3
5
|
const optionalPositiveInt = z.number().int().positive().optional();
|
|
4
6
|
const optionalString = z.string().optional();
|
|
5
|
-
const optionalBoolean = z.boolean().optional();
|
|
6
7
|
const selectOptionsSchema = z.array(nonEmptyString).min(1);
|
|
8
|
+
const optionalNumber = z.number().optional();
|
|
9
|
+
const evaluationParametersSchema = z.object({
|
|
10
|
+
approach: z.enum(EvaluationApproach),
|
|
11
|
+
threshold: optionalNumber,
|
|
12
|
+
});
|
|
13
|
+
const selectEvaluationParametersSchema = evaluationParametersSchema.superRefine((parameters, ctx) => {
|
|
14
|
+
if (!isApproachAllowedForFieldType('select', parameters.approach)) {
|
|
15
|
+
ctx.addIssue({
|
|
16
|
+
code: 'custom',
|
|
17
|
+
path: ['approach'],
|
|
18
|
+
message: `select fields only support "${EvaluationApproach.EXACT}" evaluation approach.`,
|
|
19
|
+
});
|
|
20
|
+
}
|
|
21
|
+
});
|
|
7
22
|
const defaultExpectedOutcomeBaseSchema = z.object({
|
|
8
23
|
label: nonEmptyString,
|
|
9
|
-
required: optionalBoolean,
|
|
10
24
|
placeholder: optionalString,
|
|
11
25
|
});
|
|
12
26
|
const createDefaultExpectedOutcomeFieldSchemas = (baseSchema) => ({
|
|
13
27
|
text: baseSchema.extend({
|
|
14
28
|
type: z.literal('text'),
|
|
29
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
15
30
|
}),
|
|
16
31
|
textarea: baseSchema.extend({
|
|
17
32
|
type: z.literal('textarea'),
|
|
18
33
|
rows: optionalPositiveInt,
|
|
34
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
19
35
|
}),
|
|
20
36
|
chipsInput: baseSchema.extend({
|
|
21
37
|
type: z.literal('chips-input'),
|
|
38
|
+
evaluationParameters: evaluationParametersSchema.optional(),
|
|
22
39
|
}),
|
|
23
40
|
select: baseSchema.extend({
|
|
24
41
|
type: z.literal('select'),
|
|
25
42
|
options: selectOptionsSchema,
|
|
43
|
+
evaluationParameters: selectEvaluationParametersSchema.optional(),
|
|
26
44
|
}),
|
|
27
45
|
});
|
|
28
46
|
function hasDuplicateChips(values) {
|